Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 93190c3

Browse files
committed
Repair yet another bug in the btree page split WAL reduction patch:
it failed for splits of non-leaf pages, because on such pages the first data key is suppressed, so we can't simply copy the first key from the right page to reconstitute the left page's high key. Problem found by Koichi Suzuki; patch by Heikki.
1 parent bb4a78c commit 93190c3

File tree

3 files changed

+69
-31
lines changed

3 files changed

+69
-31
lines changed

src/backend/access/nbtree/nbtinsert.c

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.161 2007/11/15 21:14:32 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.162 2007/11/16 19:53:50 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -371,13 +371,13 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
371371
* removing any LP_DEAD tuples.
372372
*
373373
* On entry, *buf and *offsetptr point to the first legal position
374-
* where the new tuple could be inserted. The caller should hold an
375-
* exclusive lock on *buf. *offsetptr can also be set to
376-
* InvalidOffsetNumber, in which case the function will search the right
377-
* location within the page if needed. On exit, they point to the chosen
378-
* insert location. If findinsertloc decided to move right, the lock and
379-
* pin on the original page will be released and the new page returned to
380-
* the caller is exclusively locked instead.
374+
* where the new tuple could be inserted. The caller should hold an
375+
* exclusive lock on *buf. *offsetptr can also be set to
376+
* InvalidOffsetNumber, in which case the function will search for the
377+
* right location within the page if needed. On exit, they point to the
378+
* chosen insert location. If _bt_findinsertloc decides to move right,
379+
* the lock and pin on the original page will be released and the new
380+
* page returned to the caller is exclusively locked instead.
381381
*
382382
* newtup is the new tuple we're inserting, and scankey is an insertion
383383
* type scan key for it.
@@ -422,8 +422,6 @@ _bt_findinsertloc(Relation rel,
422422
"Consider a function index of an MD5 hash of the value, "
423423
"or use full text indexing.")));
424424

425-
426-
427425
/*----------
428426
* If we will need to split the page to put the item on this page,
429427
* check whether we can put the tuple somewhere to the right,
@@ -1004,7 +1002,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10041002
xl_btree_split xlrec;
10051003
uint8 xlinfo;
10061004
XLogRecPtr recptr;
1007-
XLogRecData rdata[6];
1005+
XLogRecData rdata[7];
10081006
XLogRecData *lastrdata;
10091007

10101008
xlrec.node = rel->rd_node;
@@ -1020,15 +1018,32 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10201018

10211019
lastrdata = &rdata[0];
10221020

1023-
/* Log downlink on non-leaf pages. */
10241021
if (ropaque->btpo.level > 0)
10251022
{
1023+
/* Log downlink on non-leaf pages */
10261024
lastrdata->next = lastrdata + 1;
10271025
lastrdata++;
10281026

10291027
lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
10301028
lastrdata->len = sizeof(BlockIdData);
10311029
lastrdata->buffer = InvalidBuffer;
1030+
1031+
/*
1032+
* We must also log the left page's high key, because the right
1033+
* page's leftmost key is suppressed on non-leaf levels. Show it
1034+
* as belonging to the left page buffer, so that it is not stored
1035+
* if XLogInsert decides it needs a full-page image of the left
1036+
* page.
1037+
*/
1038+
lastrdata->next = lastrdata + 1;
1039+
lastrdata++;
1040+
1041+
itemid = PageGetItemId(origpage, P_HIKEY);
1042+
item = (IndexTuple) PageGetItem(origpage, itemid);
1043+
lastrdata->data = (char *) item;
1044+
lastrdata->len = MAXALIGN(IndexTupleSize(item));
1045+
lastrdata->buffer = buf; /* backup block 1 */
1046+
lastrdata->buffer_std = true;
10321047
}
10331048

10341049
/*
@@ -1057,7 +1072,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
10571072
lastrdata->buffer = buf; /* backup block 1 */
10581073
lastrdata->buffer_std = true;
10591074
}
1060-
else
1075+
else if (ropaque->btpo.level == 0)
10611076
{
10621077
/*
10631078
* Although we don't need to WAL-log the new item, we still need

src/backend/access/nbtree/nbtxlog.c

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.48 2007/11/15 22:25:15 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.49 2007/11/16 19:53:50 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -273,6 +273,8 @@ btree_xlog_split(bool onleft, bool isroot,
273273
OffsetNumber newitemoff = 0;
274274
Item newitem = NULL;
275275
Size newitemsz = 0;
276+
Item left_hikey = NULL;
277+
Size left_hikeysz = 0;
276278

277279
reln = XLogOpenRelation(xlrec->node);
278280

@@ -289,6 +291,17 @@ btree_xlog_split(bool onleft, bool isroot,
289291
datalen -= sizeof(BlockIdData);
290292

291293
forget_matching_split(xlrec->node, downlink, false);
294+
295+
/* Extract left hikey and its size (still assuming 16-bit alignment) */
296+
if (!(record->xl_info & XLR_BKP_BLOCK_1))
297+
{
298+
/* We assume 16-bit alignment is enough for IndexTupleSize */
299+
left_hikey = (Item) datapos;
300+
left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
301+
302+
datapos += left_hikeysz;
303+
datalen -= left_hikeysz;
304+
}
292305
}
293306

294307
/* Extract newitem and newitemoff, if present */
@@ -302,17 +315,13 @@ btree_xlog_split(bool onleft, bool isroot,
302315

303316
if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
304317
{
305-
IndexTupleData itupdata;
306-
307318
/*
308-
* We need to copy the tuple header to apply IndexTupleDSize, because
309-
* of alignment considerations. However, we assume that PageAddItem
310-
* doesn't care about the alignment of the newitem pointer it's given.
319+
* We assume that 16-bit alignment is enough to apply IndexTupleSize
320+
* (since it's fetching from a uint16 field) and also enough for
321+
* PageAddItem to insert the tuple.
311322
*/
312-
newitem = datapos;
313-
memcpy(&itupdata, datapos, sizeof(IndexTupleData));
314-
newitemsz = IndexTupleDSize(itupdata);
315-
newitemsz = MAXALIGN(newitemsz);
323+
newitem = (Item) datapos;
324+
newitemsz = MAXALIGN(IndexTupleSize(newitem));
316325
datapos += newitemsz;
317326
datalen -= newitemsz;
318327
}
@@ -333,6 +342,18 @@ btree_xlog_split(bool onleft, bool isroot,
333342

334343
_bt_restore_page(rpage, datapos, datalen);
335344

345+
/*
346+
* On leaf level, the high key of the left page is equal to the
347+
* first key on the right page.
348+
*/
349+
if (xlrec->level == 0)
350+
{
351+
ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
352+
353+
left_hikey = PageGetItem(rpage, hiItemId);
354+
left_hikeysz = ItemIdGetLength(hiItemId);
355+
}
356+
336357
PageSetLSN(rpage, lsn);
337358
PageSetTLI(rpage, ThisTimeLineID);
338359
MarkBufferDirty(rbuf);
@@ -360,8 +381,6 @@ btree_xlog_split(bool onleft, bool isroot,
360381
OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
361382
OffsetNumber deletable[MaxOffsetNumber];
362383
int ndeletable = 0;
363-
ItemId hiItemId;
364-
Item hiItem;
365384

366385
/*
367386
* Remove the items from the left page that were copied to the
@@ -394,11 +413,8 @@ btree_xlog_split(bool onleft, bool isroot,
394413
elog(PANIC, "failed to add new item to left page after split");
395414
}
396415

397-
/* Set high key equal to the first key on the right page */
398-
hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
399-
hiItem = PageGetItem(rpage, hiItemId);
400-
401-
if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
416+
/* Set high key */
417+
if (PageAddItem(lpage, left_hikey, left_hikeysz,
402418
P_HIKEY, false, false) == InvalidOffsetNumber)
403419
elog(PANIC, "failed to add high key to left page after split");
404420

src/include/access/nbtree.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.114 2007/11/15 21:14:42 momjian Exp $
10+
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.115 2007/11/16 19:53:50 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -289,8 +289,15 @@ typedef struct xl_btree_split
289289
* than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
290290
* aligned.)
291291
*
292+
* If level > 0, an IndexTuple representing the HIKEY of the left page
293+
* follows. We don't need this on leaf pages, because it's the same
294+
* as the leftmost key in the new right page. Also, it's suppressed if
295+
* XLogInsert chooses to store the left page's whole page image.
296+
*
292297
* In the _L variants, next are OffsetNumber newitemoff and the new item.
293298
* (In the _R variants, the new item is one of the right page's tuples.)
299+
* The new item, but not newitemoff, is suppressed if XLogInsert chooses
300+
* to store the left page's whole page image.
294301
*
295302
* Last are the right page's tuples in the form used by _bt_restore_page.
296303
*/

0 commit comments

Comments
 (0)