Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 3bbd6af

Browse files
committed
Adjust btbulkdelete logic so that only one WAL record is issued while
deleting multiple index entries on a single index page. This makes for a very substantial reduction in the amount of WAL traffic during a large delete operation.
1 parent 13dadef commit 3bbd6af

File tree

4 files changed

+135
-107
lines changed

4 files changed

+135
-107
lines changed

src/backend/access/nbtree/nbtpage.c

+33-12
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/23 06:17:13 tgl Exp $
12+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/23 22:43:08 tgl Exp $
1313
*
1414
* NOTES
1515
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
618618
}
619619

620620
/*
621-
* Delete an item from a btree page.
621+
* Delete item(s) from a btree page.
622622
*
623623
* This must only be used for deleting leaf items. Deleting an item on a
624624
* non-leaf page has to be done as part of an atomic action that includes
625625
* deleting the page it points to.
626626
*
627627
* This routine assumes that the caller has pinned and locked the buffer,
628-
* and will write the buffer afterwards.
628+
* and will write the buffer afterwards. Also, the given itemnos *must*
629+
* appear in increasing order in the array.
629630
*/
630631
void
631-
_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
632+
_bt_delitems(Relation rel, Buffer buf,
633+
OffsetNumber *itemnos, int nitems)
632634
{
633635
Page page = BufferGetPage(buf);
634-
OffsetNumber offno;
635-
636-
offno = ItemPointerGetOffsetNumber(tid);
636+
int i;
637637

638+
/* No elog(ERROR) until changes are logged */
638639
START_CRIT_SECTION();
639640

640-
PageIndexTupleDelete(page, offno);
641+
/*
642+
* Delete the items in reverse order so we don't have to think about
643+
* adjusting item numbers for previous deletions.
644+
*/
645+
for (i = nitems - 1; i >= 0; i--)
646+
{
647+
PageIndexTupleDelete(page, itemnos[i]);
648+
}
641649

642650
/* XLOG stuff */
643651
if (!rel->rd_istemp)
@@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
646654
XLogRecPtr recptr;
647655
XLogRecData rdata[2];
648656

649-
xlrec.target.node = rel->rd_node;
650-
xlrec.target.tid = *tid;
657+
xlrec.node = rel->rd_node;
658+
xlrec.block = BufferGetBlockNumber(buf);
651659

652660
rdata[0].buffer = InvalidBuffer;
653661
rdata[0].data = (char *) &xlrec;
654662
rdata[0].len = SizeOfBtreeDelete;
655663
rdata[0].next = &(rdata[1]);
656664

665+
/*
666+
* The target-offsets array is not in the buffer, but pretend
667+
* that it is. When XLogInsert stores the whole buffer, the offsets
668+
* array need not be stored too.
669+
*/
657670
rdata[1].buffer = buf;
658-
rdata[1].data = NULL;
659-
rdata[1].len = 0;
671+
if (nitems > 0)
672+
{
673+
rdata[1].data = (char *) itemnos;
674+
rdata[1].len = nitems * sizeof(OffsetNumber);
675+
}
676+
else
677+
{
678+
rdata[1].data = NULL;
679+
rdata[1].len = 0;
680+
}
660681
rdata[1].next = NULL;
661682

662683
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

src/backend/access/nbtree/nbtree.c

+72-83
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
1414
* IDENTIFICATION
15-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $
15+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $
1616
*
1717
*-------------------------------------------------------------------------
1818
*/
@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS)
572572
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
573573
void *callback_state = (void *) PG_GETARG_POINTER(2);
574574
IndexBulkDeleteResult *result;
575-
BlockNumber num_pages;
576575
double tuples_removed;
577576
double num_index_tuples;
578-
IndexScanDesc scan;
579-
BTScanOpaque so;
580-
ItemPointer current;
577+
OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)];
578+
int ndeletable;
579+
Buffer buf;
580+
BlockNumber num_pages;
581581

582582
tuples_removed = 0;
583583
num_index_tuples = 0;
584584

585585
/*
586-
* We use a standard IndexScanDesc scan object, but to speed up the
587-
* loop, we skip most of the wrapper layers of index_getnext and
588-
* instead call _bt_step directly. This implies holding buffer lock
589-
* on a target page throughout the loop over the page's tuples.
590-
*
591-
* Whenever we step onto a new page, we have to trade in the read
592-
* lock acquired by _bt_first or _bt_step for an exclusive write lock
593-
* (fortunately, _bt_relbuf doesn't care which kind of lock it's
594-
* releasing when it comes time for _bt_step to release our lock).
586+
* The outer loop iterates over index leaf pages, the inner over items
587+
* on a leaf page. We issue just one _bt_delitems() call per page,
588+
* so as to minimize WAL traffic.
595589
*
596-
* Note that we exclusive-lock every leaf page, or at least every one
597-
* containing data items. It sounds attractive to only exclusive-lock
590+
* Note that we exclusive-lock every leaf page containing data items,
591+
* in sequence left to right. It sounds attractive to only exclusive-lock
598592
* those containing items we need to delete, but unfortunately that
599593
* is not safe: we could then pass a stopped indexscan, which could
600594
* in rare cases lead to deleting the item it needs to find when it
601595
* resumes. (See _bt_restscan --- this could only happen if an indexscan
602596
* stops on a deletable item and then a page split moves that item
603597
* into a page further to its right, which the indexscan will have no
604-
* pin on.)
598+
* pin on.) We can skip obtaining exclusive lock on empty pages
599+
* though, since no indexscan could be stopped on those.
605600
*/
606-
scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
607-
so = (BTScanOpaque) scan->opaque;
608-
current = &(scan->currentItemData);
609-
610-
/* Use _bt_first to get started, then _bt_step to remaining tuples */
611-
if (_bt_first(scan, ForwardScanDirection))
601+
buf = _bt_get_endpoint(rel, 0, false);
602+
if (BufferIsValid(buf)) /* check for empty index */
612603
{
613-
Buffer buf;
614-
BlockNumber lockedBlock = InvalidBlockNumber;
615-
616-
/* we have the buffer pinned and read-locked */
617-
buf = so->btso_curbuf;
618-
Assert(BufferIsValid(buf));
619-
620-
do
604+
for (;;)
621605
{
622606
Page page;
623-
BlockNumber blkno;
624-
OffsetNumber offnum;
625-
BTItem btitem;
626607
BTPageOpaque opaque;
627-
IndexTuple itup;
628-
ItemPointer htup;
608+
OffsetNumber offnum,
609+
minoff,
610+
maxoff;
611+
BlockNumber nextpage;
629612

630613
CHECK_FOR_INTERRUPTS();
631614

632-
/* current is the next index tuple */
615+
ndeletable = 0;
633616
page = BufferGetPage(buf);
634-
blkno = ItemPointerGetBlockNumber(current);
635-
636-
/*
637-
* Make sure we have a super-exclusive write lock on this page.
638-
*
639-
* We assume that only concurrent insertions, not deletions,
640-
* can occur while we're not holding the page lock (the
641-
* caller should hold a suitable relation lock to ensure
642-
* this). Therefore, no items can escape being scanned because
643-
* of this temporary lock release.
644-
*/
645-
if (blkno != lockedBlock)
617+
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
618+
minoff = P_FIRSTDATAKEY(opaque);
619+
maxoff = PageGetMaxOffsetNumber(page);
620+
/* We probably cannot see deleted pages, but skip 'em if so */
621+
if (minoff <= maxoff && !P_ISDELETED(opaque))
646622
{
623+
/*
624+
* Trade in the initial read lock for a super-exclusive
625+
* write lock on this page.
626+
*/
647627
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
648628
LockBufferForCleanup(buf);
649-
lockedBlock = blkno;
650629
/*
651-
* If the page was formerly rightmost but was split while we
652-
* didn't hold the lock, and ip_posid is pointing to item
653-
* 1, then ip_posid now points at the high key not a valid
654-
* data item. In this case we need to step forward.
630+
* Recompute minoff/maxoff, both of which could have changed
631+
* while we weren't holding the lock.
655632
*/
656-
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
657-
if (current->ip_posid < P_FIRSTDATAKEY(opaque))
658-
current->ip_posid = P_FIRSTDATAKEY(opaque);
659-
}
660-
661-
offnum = ItemPointerGetOffsetNumber(current);
662-
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
663-
itup = &btitem->bti_itup;
664-
htup = &(itup->t_tid);
665-
666-
if (callback(htup, callback_state))
667-
{
668-
/* Okay to delete the item from the page */
669-
_bt_itemdel(rel, buf, current);
670-
671-
/* Mark buffer dirty, but keep the lock and pin */
672-
WriteNoReleaseBuffer(buf);
673-
674-
tuples_removed += 1;
675-
633+
minoff = P_FIRSTDATAKEY(opaque);
634+
maxoff = PageGetMaxOffsetNumber(page);
676635
/*
677-
* We now need to back up the scan one item, so that the next
678-
* cycle will re-examine the same offnum on this page (which
679-
* now holds the next item).
636+
* Scan over all items to see which ones need to be deleted
637+
* according to the callback function.
680638
*/
681-
current->ip_posid--;
639+
for (offnum = minoff;
640+
offnum <= maxoff;
641+
offnum = OffsetNumberNext(offnum))
642+
{
643+
BTItem btitem;
644+
ItemPointer htup;
645+
646+
btitem = (BTItem) PageGetItem(page,
647+
PageGetItemId(page, offnum));
648+
htup = &(btitem->bti_itup.t_tid);
649+
if (callback(htup, callback_state))
650+
{
651+
deletable[ndeletable++] = offnum;
652+
tuples_removed += 1;
653+
}
654+
else
655+
num_index_tuples += 1;
656+
}
657+
}
658+
/*
659+
* If we need to delete anything, do it and write the buffer;
660+
* else just release the buffer.
661+
*/
662+
nextpage = opaque->btpo_next;
663+
if (ndeletable > 0)
664+
{
665+
_bt_delitems(rel, buf, deletable, ndeletable);
666+
_bt_wrtbuf(rel, buf);
682667
}
683668
else
684-
num_index_tuples += 1;
685-
} while (_bt_step(scan, &buf, ForwardScanDirection));
669+
{
670+
_bt_relbuf(rel, buf);
671+
}
672+
/* And advance to next page, if any */
673+
if (nextpage == P_NONE)
674+
break;
675+
buf = _bt_getbuf(rel, nextpage, BT_READ);
676+
}
686677
}
687678

688-
index_endscan(scan);
689-
690679
/* return statistics */
691680
num_pages = RelationGetNumberOfBlocks(rel);
692681

@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
765754
}
766755
}
767756
else if ((opaque->btpo_flags & BTP_HALF_DEAD) ||
768-
P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
757+
P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
769758
{
770759
/* Empty, try to delete */
771760
int ndel;

src/backend/access/nbtree/nbtxlog.c

+20-7
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/23 06:17:13 tgl Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/23 22:43:08 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
379379
return;
380380

381381
xlrec = (xl_btree_delete *) XLogRecGetData(record);
382-
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
382+
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
383383
if (!RelationIsValid(reln))
384384
return;
385-
buffer = XLogReadBuffer(false, reln,
386-
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
385+
buffer = XLogReadBuffer(false, reln, xlrec->block);
387386
if (!BufferIsValid(buffer))
388387
elog(PANIC, "btree_delete_redo: block unfound");
389388
page = (Page) BufferGetPage(buffer);
@@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
396395
return;
397396
}
398397

399-
PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
398+
if (record->xl_len > SizeOfBtreeDelete)
399+
{
400+
OffsetNumber *unused;
401+
OffsetNumber *unend;
402+
403+
unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
404+
unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
405+
406+
/* be careful to delete from back to front */
407+
while (unused < unend)
408+
{
409+
unend--;
410+
PageIndexTupleDelete(page, *unend);
411+
}
412+
}
400413

401414
PageSetLSN(page, lsn);
402415
PageSetSUI(page, ThisStartUpID);
@@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec)
853866
{
854867
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
855868

856-
strcat(buf, "delete: ");
857-
out_target(buf, &(xlrec->target));
869+
sprintf(buf + strlen(buf), "delete: node %u/%u; blk %u",
870+
xlrec->node.tblNode, xlrec->node.relNode, xlrec->block);
858871
break;
859872
}
860873
case XLOG_BTREE_DELETE_PAGE:

src/include/access/nbtree.h

+10-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $Id: nbtree.h,v 1.66 2003/02/23 06:17:13 tgl Exp $
10+
* $Id: nbtree.h,v 1.67 2003/02/23 22:43:09 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -263,14 +263,18 @@ typedef struct xl_btree_split
263263
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
264264

265265
/*
266-
* This is what we need to know about delete of an individual leaf btitem
266+
* This is what we need to know about delete of individual leaf btitems.
267+
* The WAL record can represent deletion of any number of btitems on a
268+
* single index page.
267269
*/
268270
typedef struct xl_btree_delete
269271
{
270-
xl_btreetid target; /* deleted tuple id */
272+
RelFileNode node;
273+
BlockNumber block;
274+
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
271275
} xl_btree_delete;
272276

273-
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
277+
#define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
274278

275279
/*
276280
* This is what we need to know about deletion of a btree page. The target
@@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
453457
extern void _bt_pageinit(Page page, Size size);
454458
extern bool _bt_page_recyclable(Page page);
455459
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
456-
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
460+
extern void _bt_delitems(Relation rel, Buffer buf,
461+
OffsetNumber *itemnos, int nitems);
457462
extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
458463

459464
/*

0 commit comments

Comments
 (0)