Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit b79575c

Browse files
committed
Reduce WAL activity for page splits:
> Currently, an index split writes all the data on the split page to WAL. That's a lot of WAL traffic. The tuples that are copied to the right page need to be WAL logged, but the tuples that stay on the original page don't.
Heikki Linnakangas
1 parent fe03a5f commit b79575c

File tree

3 files changed

+260
-164
lines changed

3 files changed

+260
-164
lines changed

src/backend/access/nbtree/nbtinsert.c

Lines changed: 91 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.149 2007/02/06 14:55:11 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.150 2007/02/08 05:05:53 momjian Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -733,6 +733,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
733733
rightoff;
734734
OffsetNumber maxoff;
735735
OffsetNumber i;
736+
bool isroot;
736737

737738
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
738739
origpage = BufferGetPage(buf);
@@ -747,6 +748,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
747748
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
748749
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
749750

751+
isroot = P_ISROOT(oopaque);
752+
750753
/* if we're splitting this page, it won't be the root when we're done */
751754
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
752755
lopaque->btpo_flags = oopaque->btpo_flags;
@@ -921,61 +924,116 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
921924
MarkBufferDirty(sbuf);
922925
}
923926

927+
/*
928+
* By here, the original data page has been split into two new halves, and
929+
* these are correct. The algorithm requires that the left page never
930+
* move during a split, so we copy the new left page back on top of the
931+
* original. Note that this is not a waste of time, since we also require
932+
* (in the page management code) that the center of a page always be
933+
* clean, and the most efficient way to guarantee this is just to compact
934+
* the data by reinserting it into a new left page. (XXX the latter
935+
* comment is probably obsolete.)
936+
*
937+
* We need to do this before writing the WAL record, so that XLogInsert can
938+
* WAL log an image of the page if necessary.
939+
*/
940+
PageRestoreTempPage(leftpage, origpage);
941+
924942
/* XLOG stuff */
925943
if (!rel->rd_istemp)
926944
{
927945
xl_btree_split xlrec;
928946
uint8 xlinfo;
929947
XLogRecPtr recptr;
930-
XLogRecData rdata[4];
948+
XLogRecData rdata[6];
949+
XLogRecData *lastrdata;
931950

932-
xlrec.target.node = rel->rd_node;
933-
ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
951+
xlrec.node = rel->rd_node;
952+
xlrec.leftsib = BufferGetBlockNumber(buf);
953+
xlrec.rightsib = BufferGetBlockNumber(rbuf);
954+
xlrec.firstright = firstright;
955+
xlrec.rnext = ropaque->btpo_next;
956+
xlrec.level = lopaque->btpo.level;
957+
958+
rdata[0].data = (char *) &xlrec;
959+
rdata[0].len = SizeOfBtreeSplit;
960+
rdata[0].buffer = InvalidBuffer;
961+
962+
lastrdata = &rdata[0];
963+
964+
/* Log downlink on non-leaf pages. */
965+
if (lopaque->btpo.level > 0)
966+
{
967+
lastrdata->next = lastrdata + 1;
968+
lastrdata++;
969+
970+
lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
971+
lastrdata->len = sizeof(BlockIdData);
972+
lastrdata->buffer = InvalidBuffer;
973+
}
974+
975+
/* Log the new item, if it was inserted on the left page. If it was
976+
* put on the right page, we don't need to explicitly WAL log it
977+
* because it's included with all the other items on the right page.
978+
*/
979+
lastrdata->next = lastrdata + 1;
980+
lastrdata++;
934981
if (newitemonleft)
935-
xlrec.otherblk = BufferGetBlockNumber(rbuf);
982+
{
983+
lastrdata->data = (char *) &newitemoff;
984+
lastrdata->len = sizeof(OffsetNumber);
985+
lastrdata->buffer = buf; /* backup block 1 */
986+
lastrdata->buffer_std = true;
987+
988+
lastrdata->next = lastrdata + 1;
989+
lastrdata++;
990+
lastrdata->data = (char *)newitem;
991+
lastrdata->len = newitemsz;
992+
lastrdata->buffer = buf; /* backup block 1 */
993+
lastrdata->buffer_std = true;
994+
}
936995
else
937-
xlrec.otherblk = BufferGetBlockNumber(buf);
938-
xlrec.leftblk = lopaque->btpo_prev;
939-
xlrec.rightblk = ropaque->btpo_next;
940-
xlrec.level = lopaque->btpo.level;
996+
{
997+
lastrdata->data = NULL;
998+
lastrdata->len = 0;
999+
lastrdata->buffer = buf; /* backup block 1 */
1000+
lastrdata->buffer_std = true;
1001+
}
9411002

942-
/*
1003+
/* Log the contents of the right page in the format understood by
1004+
* _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1005+
* because we're going to recreate the whole page anyway.
1006+
*
9431007
* Direct access to page is not good but faster - we should implement
9441008
* some new func in page API. Note we only store the tuples
9451009
* themselves, knowing that the item pointers are in the same order
9461010
* and can be reconstructed by scanning the tuples. See comments for
9471011
* _bt_restore_page().
9481012
*/
949-
xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
950-
((PageHeader) leftpage)->pd_upper;
1013+
lastrdata->next = lastrdata + 1;
1014+
lastrdata++;
9511015

952-
rdata[0].data = (char *) &xlrec;
953-
rdata[0].len = SizeOfBtreeSplit;
954-
rdata[0].buffer = InvalidBuffer;
955-
rdata[0].next = &(rdata[1]);
956-
957-
rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper;
958-
rdata[1].len = xlrec.leftlen;
959-
rdata[1].buffer = InvalidBuffer;
960-
rdata[1].next = &(rdata[2]);
961-
962-
rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
963-
rdata[2].len = ((PageHeader) rightpage)->pd_special -
1016+
lastrdata->data = (char *) rightpage +
9641017
((PageHeader) rightpage)->pd_upper;
965-
rdata[2].buffer = InvalidBuffer;
966-
rdata[2].next = NULL;
1018+
lastrdata->len = ((PageHeader) rightpage)->pd_special -
1019+
((PageHeader) rightpage)->pd_upper;
1020+
lastrdata->buffer = InvalidBuffer;
9671021

1022+
/* Log the right sibling, because we've changed its prev-pointer. */
9681023
if (!P_RIGHTMOST(ropaque))
9691024
{
970-
rdata[2].next = &(rdata[3]);
971-
rdata[3].data = NULL;
972-
rdata[3].len = 0;
973-
rdata[3].buffer = sbuf;
974-
rdata[3].buffer_std = true;
975-
rdata[3].next = NULL;
1025+
lastrdata->next = lastrdata + 1;
1026+
lastrdata++;
1027+
1028+
lastrdata->data = NULL;
1029+
lastrdata->len = 0;
1030+
lastrdata->buffer = sbuf; /* backup block 2 */
1031+
lastrdata->buffer_std = true;
9761032
}
9771033

978-
if (P_ISROOT(oopaque))
1034+
lastrdata->next = NULL;
1035+
1036+
if (isroot)
9791037
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
9801038
else
9811039
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
@@ -993,24 +1051,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
9931051
}
9941052
}
9951053

996-
/*
997-
* By here, the original data page has been split into two new halves, and
998-
* these are correct. The algorithm requires that the left page never
999-
* move during a split, so we copy the new left page back on top of the
1000-
* original. Note that this is not a waste of time, since we also require
1001-
* (in the page management code) that the center of a page always be
1002-
* clean, and the most efficient way to guarantee this is just to compact
1003-
* the data by reinserting it into a new left page. (XXX the latter
1004-
* comment is probably obsolete.)
1005-
*
1006-
* It's a bit weird that we don't fill in the left page till after writing
1007-
* the XLOG entry, but not really worth changing. Note that we use the
1008-
* origpage data (specifically its BTP_ROOT bit) while preparing the XLOG
1009-
* entry, so simply reshuffling the code won't do.
1010-
*/
1011-
1012-
PageRestoreTempPage(leftpage, origpage);
1013-
10141054
END_CRIT_SECTION();
10151055

10161056
/* release the old right sibling */

0 commit comments

Comments
 (0)