Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 631118f

Browse files
committed
Get rid of the post-recovery cleanup step of GIN page splits.
Replace it with an approach similar to what GiST uses: when a page is split, the left sibling is marked with a flag indicating that the parent hasn't been updated yet. When the parent is updated, the flag is cleared. If an insertion steps on a page with the flag set, it will finish the split before proceeding with the insertion. The post-recovery cleanup mechanism was never totally reliable, as insertion to the parent could fail, e.g. because of running out of memory or disk space, leaving the tree in an inconsistent state. This also divides the responsibility of WAL-logging more clearly between the generic ginbtree.c code and the parts specific to entry and posting trees. There is now a common WAL record format for insertions and deletions, which is written by ginbtree.c and followed by a tree-specific payload, which is returned by the placetopage- and split- callbacks.
1 parent ce5326e commit 631118f

File tree

9 files changed

+666
-556
lines changed

9 files changed

+666
-556
lines changed

src/backend/access/gin/ginbtree.c

Lines changed: 315 additions & 132 deletions
Large diffs are not rendered by default.

src/backend/access/gin/gindatapage.c

Lines changed: 28 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ GinDataPageAddItemPointer(Page page, ItemPointer data, OffsetNumber offset)
227227
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
228228
char *ptr;
229229

230+
Assert(ItemPointerIsValid(data));
230231
Assert(GinPageIsLeaf(page));
231232

232233
if (offset == InvalidOffsetNumber)
@@ -255,6 +256,7 @@ GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
255256
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
256257
char *ptr;
257258

259+
Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber);
258260
Assert(!GinPageIsLeaf(page));
259261

260262
if (offset == InvalidOffsetNumber)
@@ -338,11 +340,8 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
338340
XLogRecData **prdata)
339341
{
340342
Page page = BufferGetPage(buf);
341-
int cnt = 0;
342-
343343
/* these must be static so they can be returned to caller */
344-
static XLogRecData rdata[3];
345-
static ginxlogInsert data;
344+
static XLogRecData rdata[2];
346345

347346
/* quick exit if it doesn't fit */
348347
if (!dataIsEnoughSpace(btree, buf, off, insertdata))
@@ -359,45 +358,10 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
359358
PostingItemSetBlockNumber(pitem, updateblkno);
360359
}
361360

362-
data.updateBlkno = updateblkno;
363-
data.node = btree->index->rd_node;
364-
data.blkno = BufferGetBlockNumber(buf);
365-
data.offset = off;
366-
data.nitem = 1;
367-
data.isDelete = FALSE;
368-
data.isData = TRUE;
369-
data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
370-
371-
/*
372-
* Prevent full page write if child's split occurs. That is needed to
373-
* remove incomplete splits while replaying WAL
374-
*
375-
* data.updateBlkno contains new block number (of newly created right
376-
* page) for recently splited page.
377-
*/
378-
if (data.updateBlkno == InvalidBlockNumber)
379-
{
380-
rdata[0].buffer = buf;
381-
rdata[0].buffer_std = FALSE;
382-
rdata[0].data = NULL;
383-
rdata[0].len = 0;
384-
rdata[0].next = &rdata[1];
385-
cnt++;
386-
}
387-
388-
rdata[cnt].buffer = InvalidBuffer;
389-
rdata[cnt].data = (char *) &data;
390-
rdata[cnt].len = sizeof(ginxlogInsert);
391-
rdata[cnt].next = &rdata[cnt + 1];
392-
cnt++;
393-
394-
rdata[cnt].buffer = InvalidBuffer;
395-
/* data and len filled in below */
396-
rdata[cnt].next = NULL;
397-
398361
if (GinPageIsLeaf(page))
399362
{
400363
GinBtreeDataLeafInsertData *items = insertdata;
364+
static ginxlogInsertDataLeaf data;
401365
uint32 savedPos = items->curitem;
402366

403367
if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
@@ -415,19 +379,29 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
415379
{
416380
GinDataPageAddItemPointer(page, items->items + items->curitem, off);
417381
items->curitem++;
382+
data.nitem = 1;
418383
}
419384

420-
rdata[cnt].data = (char *) &items->items[savedPos];
421-
rdata[cnt].len = sizeof(ItemPointerData) * data.nitem;
385+
rdata[0].buffer = InvalidBuffer;
386+
rdata[0].data = (char *) &data;
387+
rdata[0].len = offsetof(ginxlogInsertDataLeaf, items);
388+
rdata[0].next = &rdata[1];
389+
390+
rdata[1].buffer = InvalidBuffer;
391+
rdata[1].data = (char *) &items->items[savedPos];
392+
rdata[1].len = sizeof(ItemPointerData) * data.nitem;
393+
rdata[1].next = NULL;
422394
}
423395
else
424396
{
425397
PostingItem *pitem = insertdata;
426398

427399
GinDataPageAddPostingItem(page, pitem, off);
428400

429-
rdata[cnt].data = (char *) pitem;
430-
rdata[cnt].len = sizeof(PostingItem);
401+
rdata[0].buffer = InvalidBuffer;
402+
rdata[0].data = (char *) pitem;
403+
rdata[0].len = sizeof(PostingItem);
404+
rdata[0].next = NULL;
431405
}
432406

433407
return true;
@@ -456,8 +430,8 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
456430
Size freeSpace;
457431

458432
/* these must be static so they can be returned to caller */
459-
static ginxlogSplit data;
460-
static XLogRecData rdata[4];
433+
static ginxlogSplitData data;
434+
static XLogRecData rdata[2];
461435
static char vector[2 * BLCKSZ];
462436

463437
GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
@@ -488,6 +462,7 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
488462

489463
if (isleaf && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff)
490464
{
465+
/* append new items to the end */
491466
GinBtreeDataLeafInsertData *items = insertdata;
492467

493468
while (items->curitem < items->nitem &&
@@ -566,25 +541,18 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
566541
bound = GinDataPageGetRightBound(rpage);
567542
*bound = oldbound;
568543

569-
data.node = btree->index->rd_node;
570-
data.rootBlkno = InvalidBlockNumber;
571-
data.lblkno = BufferGetBlockNumber(lbuf);
572-
data.rblkno = BufferGetBlockNumber(rbuf);
573544
data.separator = separator;
574545
data.nitem = maxoff;
575-
data.isData = TRUE;
576-
data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
577-
data.isRootSplit = FALSE;
578546
data.rightbound = oldbound;
579547

580548
rdata[0].buffer = InvalidBuffer;
581549
rdata[0].data = (char *) &data;
582-
rdata[0].len = sizeof(ginxlogSplit);
550+
rdata[0].len = sizeof(ginxlogSplitData);
583551
rdata[0].next = &rdata[1];
584552

585553
rdata[1].buffer = InvalidBuffer;
586554
rdata[1].data = vector;
587-
rdata[1].len = MAXALIGN(maxoff * sizeofitem);
555+
rdata[1].len = maxoff * sizeofitem;
588556
rdata[1].next = NULL;
589557

590558
return lpage;
@@ -610,21 +578,18 @@ dataPrepareDownlink(GinBtree btree, Buffer lbuf)
610578
* Also called from ginxlog, should not use btree
611579
*/
612580
void
613-
ginDataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
581+
ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage)
614582
{
615-
Page page = BufferGetPage(root),
616-
lpage = BufferGetPage(lbuf),
617-
rpage = BufferGetPage(rbuf);
618583
PostingItem li,
619584
ri;
620585

621586
li.key = *GinDataPageGetRightBound(lpage);
622-
PostingItemSetBlockNumber(&li, BufferGetBlockNumber(lbuf));
623-
GinDataPageAddPostingItem(page, &li, InvalidOffsetNumber);
587+
PostingItemSetBlockNumber(&li, lblkno);
588+
GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber);
624589

625590
ri.key = *GinDataPageGetRightBound(rpage);
626-
PostingItemSetBlockNumber(&ri, BufferGetBlockNumber(rbuf));
627-
GinDataPageAddPostingItem(page, &ri, InvalidOffsetNumber);
591+
PostingItemSetBlockNumber(&ri, rblkno);
592+
GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
628593
}
629594

630595
/*

src/backend/access/gin/ginentrypage.c

Lines changed: 22 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -504,15 +504,14 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
504504

505505
/* these must be static so they can be returned to caller */
506506
static XLogRecData rdata[3];
507-
static ginxlogInsert data;
507+
static ginxlogInsertEntry data;
508508

509509
/* quick exit if it doesn't fit */
510510
if (!entryIsEnoughSpace(btree, buf, off, insertData))
511511
return false;
512512

513513
*prdata = rdata;
514514
entryPreparePage(btree, page, off, insertData, updateblkno);
515-
data.updateBlkno = updateblkno;
516515

517516
placed = PageAddItem(page,
518517
(Item) insertData->entry,
@@ -522,34 +521,11 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
522521
elog(ERROR, "failed to add item to index page in \"%s\"",
523522
RelationGetRelationName(btree->index));
524523

525-
data.node = btree->index->rd_node;
526-
data.blkno = BufferGetBlockNumber(buf);
527-
data.offset = off;
528-
data.nitem = 1;
529524
data.isDelete = insertData->isDelete;
530-
data.isData = false;
531-
data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
532-
533-
/*
534-
* Prevent full page write if child's split occurs. That is needed to
535-
* remove incomplete splits while replaying WAL
536-
*
537-
* data.updateBlkno contains new block number (of newly created right
538-
* page) for recently splited page.
539-
*/
540-
if (data.updateBlkno == InvalidBlockNumber)
541-
{
542-
rdata[0].buffer = buf;
543-
rdata[0].buffer_std = TRUE;
544-
rdata[0].data = NULL;
545-
rdata[0].len = 0;
546-
rdata[0].next = &rdata[1];
547-
cnt++;
548-
}
549525

550526
rdata[cnt].buffer = InvalidBuffer;
551527
rdata[cnt].data = (char *) &data;
552-
rdata[cnt].len = sizeof(ginxlogInsert);
528+
rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple);
553529
rdata[cnt].next = &rdata[cnt + 1];
554530
cnt++;
555531

@@ -577,6 +553,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
577553
maxoff,
578554
separator = InvalidOffsetNumber;
579555
Size totalsize = 0;
556+
Size tupstoresize;
580557
Size lsize = 0,
581558
size;
582559
char *ptr;
@@ -588,18 +565,18 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
588565

589566
/* these must be static so they can be returned to caller */
590567
static XLogRecData rdata[2];
591-
static ginxlogSplit data;
568+
static ginxlogSplitEntry data;
592569
static char tupstore[2 * BLCKSZ];
593570

594571
*prdata = rdata;
595-
data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
596-
InvalidOffsetNumber : GinGetDownlink(insertData->entry);
597-
data.updateBlkno = updateblkno;
598572
entryPreparePage(btree, lpage, off, insertData, updateblkno);
599573

574+
/*
575+
* First, append all the existing tuples and the new tuple we're inserting
576+
* one after another in a temporary workspace.
577+
*/
600578
maxoff = PageGetMaxOffsetNumber(lpage);
601579
ptr = tupstore;
602-
603580
for (i = FirstOffsetNumber; i <= maxoff; i++)
604581
{
605582
if (i == off)
@@ -624,7 +601,12 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
624601
ptr += size;
625602
totalsize += size + sizeof(ItemIdData);
626603
}
604+
tupstoresize = ptr - tupstore;
627605

606+
/*
607+
* Initialize the left and right pages, and copy all the tuples back to
608+
* them.
609+
*/
628610
GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
629611
GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
630612

@@ -654,24 +636,17 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
654636
ptr += MAXALIGN(IndexTupleSize(itup));
655637
}
656638

657-
data.node = btree->index->rd_node;
658-
data.rootBlkno = InvalidBlockNumber;
659-
data.lblkno = BufferGetBlockNumber(lbuf);
660-
data.rblkno = BufferGetBlockNumber(rbuf);
661639
data.separator = separator;
662640
data.nitem = maxoff;
663-
data.isData = FALSE;
664-
data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
665-
data.isRootSplit = FALSE;
666641

667642
rdata[0].buffer = InvalidBuffer;
668643
rdata[0].data = (char *) &data;
669-
rdata[0].len = sizeof(ginxlogSplit);
644+
rdata[0].len = sizeof(ginxlogSplitEntry);
670645
rdata[0].next = &rdata[1];
671646

672647
rdata[1].buffer = InvalidBuffer;
673648
rdata[1].data = tupstore;
674-
rdata[1].len = MAXALIGN(totalsize);
649+
rdata[1].len = tupstoresize;
675650
rdata[1].next = NULL;
676651

677652
return lpage;
@@ -702,24 +677,19 @@ entryPrepareDownlink(GinBtree btree, Buffer lbuf)
702677
* Also called from ginxlog, should not use btree
703678
*/
704679
void
705-
ginEntryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
680+
ginEntryFillRoot(GinBtree btree, Page root,
681+
BlockNumber lblkno, Page lpage,
682+
BlockNumber rblkno, Page rpage)
706683
{
707-
Page page = BufferGetPage(root);
708-
Page lpage = BufferGetPage(lbuf);
709-
Page rpage = BufferGetPage(rbuf);
710684
IndexTuple itup;
711685

712-
itup = GinFormInteriorTuple(getRightMostTuple(lpage),
713-
lpage,
714-
BufferGetBlockNumber(lbuf));
715-
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
686+
itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno);
687+
if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
716688
elog(ERROR, "failed to add item to index root page");
717689
pfree(itup);
718690

719-
itup = GinFormInteriorTuple(getRightMostTuple(rpage),
720-
rpage,
721-
BufferGetBlockNumber(rbuf));
722-
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
691+
itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno);
692+
if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
723693
elog(ERROR, "failed to add item to index root page");
724694
pfree(itup);
725695
}

0 commit comments

Comments
 (0)