Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 8af2565

Browse files
committed
Introduce a new smgr bulk loading facility.
The new facility makes it easier to optimize bulk loading, as the logic for buffering, WAL-logging, and syncing the relation only needs to be implemented once. It's also less error-prone: We have had a number of bugs in how a relation is fsync'd -- or not -- at the end of a bulk loading operation. By centralizing that logic in one place, we only need to write it correctly once. The new facility is faster for small relations: Instead of calling smgrimmedsync(), we register the fsync to happen at the next checkpoint, which avoids the fsync latency. That can make a big difference if you are e.g. restoring a schema-only dump with lots of relations. It is also slightly more efficient with large relations, as the WAL logging is performed multiple pages at a time. That avoids some WAL header overhead. The sorted GiST index build did that already; this moves the buffering to the new facility. The changes to the pageinspect GiST test need an explanation: Before this patch, the sorted GiST index build set the LSN on every page to the special GistBuildLSN value, not the LSN of the WAL record, even though they were WAL-logged. There was no particular need for it; it just happened naturally when we wrote out the pages before WAL-logging them. Now we WAL-log the pages first, like in B-tree build, so the pages are stamped with the record's real LSN. When the build is not WAL-logged, we still use GistBuildLSN. To make the test output predictable, use an unlogged index. Reviewed-by: Andres Freund Discussion: https://www.postgresql.org/message-id/30e8f366-58b3-b239-c521-422122dd5150%40iki.fi
1 parent e612384 commit 8af2565

File tree

17 files changed

+552
-355
lines changed

17 files changed

+552
-355
lines changed

contrib/pageinspect/expected/gist.out

+3-11
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,6 @@
1-
-- The gist_page_opaque_info() function prints the page's LSN. Normally,
2-
-- that's constant 1 (GistBuildLSN) on every page of a freshly built GiST
3-
-- index. But with wal_level=minimal, the whole relation is dumped to WAL at
4-
-- the end of the transaction if it's smaller than wal_skip_threshold, which
5-
-- updates the LSNs. Wrap the tests on gist_page_opaque_info() in the
6-
-- same transaction with the CREATE INDEX so that we see the LSNs before
7-
-- they are possibly overwritten at end of transaction.
8-
BEGIN;
9-
-- Create a test table and GiST index.
10-
CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
1+
-- The gist_page_opaque_info() function prints the page's LSN.
2+
-- Use an unlogged index, so that the LSN is predictable.
3+
CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
114
generate_series(1,1000) i;
125
CREATE INDEX test_gist_idx ON test_gist USING gist (p);
136
-- Page 0 is the root, the rest are leaf pages
@@ -29,7 +22,6 @@ SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
2922
0/1 | 0/0 | 1 | {leaf}
3023
(1 row)
3124

32-
COMMIT;
3325
SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
3426
itemoffset | ctid | itemlen | dead | keys
3527
------------+-----------+---------+------+-------------------------------

contrib/pageinspect/sql/gist.sql

+3-13
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,6 @@
1-
-- The gist_page_opaque_info() function prints the page's LSN. Normally,
2-
-- that's constant 1 (GistBuildLSN) on every page of a freshly built GiST
3-
-- index. But with wal_level=minimal, the whole relation is dumped to WAL at
4-
-- the end of the transaction if it's smaller than wal_skip_threshold, which
5-
-- updates the LSNs. Wrap the tests on gist_page_opaque_info() in the
6-
-- same transaction with the CREATE INDEX so that we see the LSNs before
7-
-- they are possibly overwritten at end of transaction.
8-
BEGIN;
9-
10-
-- Create a test table and GiST index.
11-
CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
1+
-- The gist_page_opaque_info() function prints the page's LSN.
2+
-- Use an unlogged index, so that the LSN is predictable.
3+
CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
124
generate_series(1,1000) i;
135
CREATE INDEX test_gist_idx ON test_gist USING gist (p);
146

@@ -17,8 +9,6 @@ SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0));
179
SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1));
1810
SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
1911

20-
COMMIT;
21-
2212
SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
2313
SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 1), 'test_gist_idx') LIMIT 5;
2414

src/backend/access/gist/gistbuild.c

+28-93
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
#include "miscadmin.h"
4444
#include "optimizer/optimizer.h"
4545
#include "storage/bufmgr.h"
46-
#include "storage/smgr.h"
46+
#include "storage/bulk_write.h"
47+
4748
#include "utils/memutils.h"
4849
#include "utils/rel.h"
4950
#include "utils/tuplesort.h"
@@ -106,11 +107,8 @@ typedef struct
106107
Tuplesortstate *sortstate; /* state data for tuplesort.c */
107108

108109
BlockNumber pages_allocated;
109-
BlockNumber pages_written;
110110

111-
int ready_num_pages;
112-
BlockNumber ready_blknos[XLR_MAX_BLOCK_ID];
113-
Page ready_pages[XLR_MAX_BLOCK_ID];
111+
BulkWriteState *bulkstate;
114112
} GISTBuildState;
115113

116114
#define GIST_SORTED_BUILD_PAGE_NUM 4
@@ -142,7 +140,6 @@ static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
142140
IndexTuple itup);
143141
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
144142
GistSortedBuildLevelState *levelstate);
145-
static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
146143

147144
static void gistInitBuffering(GISTBuildState *buildstate);
148145
static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
@@ -405,27 +402,18 @@ gist_indexsortbuild(GISTBuildState *state)
405402
{
406403
IndexTuple itup;
407404
GistSortedBuildLevelState *levelstate;
408-
Page page;
405+
BulkWriteBuffer rootbuf;
409406

410-
state->pages_allocated = 0;
411-
state->pages_written = 0;
412-
state->ready_num_pages = 0;
407+
/* Reserve block 0 for the root page */
408+
state->pages_allocated = 1;
413409

414-
/*
415-
* Write an empty page as a placeholder for the root page. It will be
416-
* replaced with the real root page at the end.
417-
*/
418-
page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
419-
smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
420-
page, true);
421-
state->pages_allocated++;
422-
state->pages_written++;
410+
state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);
423411

424412
/* Allocate a temporary buffer for the first leaf page batch. */
425413
levelstate = palloc0(sizeof(GistSortedBuildLevelState));
426-
levelstate->pages[0] = page;
414+
levelstate->pages[0] = palloc(BLCKSZ);
427415
levelstate->parent = NULL;
428-
gistinitpage(page, F_LEAF);
416+
gistinitpage(levelstate->pages[0], F_LEAF);
429417

430418
/*
431419
* Fill index pages with tuples in the sorted order.
@@ -455,31 +443,15 @@ gist_indexsortbuild(GISTBuildState *state)
455443
levelstate = parent;
456444
}
457445

458-
gist_indexsortbuild_flush_ready_pages(state);
459-
460446
/* Write out the root */
461447
PageSetLSN(levelstate->pages[0], GistBuildLSN);
462-
PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO);
463-
smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
464-
levelstate->pages[0], true);
465-
if (RelationNeedsWAL(state->indexrel))
466-
log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO,
467-
levelstate->pages[0], true);
468-
469-
pfree(levelstate->pages[0]);
448+
rootbuf = smgr_bulk_get_buf(state->bulkstate);
449+
memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
450+
smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);
451+
470452
pfree(levelstate);
471453

472-
/*
473-
* When we WAL-logged index pages, we must nonetheless fsync index files.
474-
* Since we're building outside shared buffers, a CHECKPOINT occurring
475-
* during the build has no way to flush the previously written data to
476-
* disk (indeed it won't know the index even exists). A crash later on
477-
* would replay WAL from the checkpoint, therefore it wouldn't replay our
478-
* earlier WAL entries. If we do not fsync those pages here, they might
479-
* still not be on disk when the crash occurs.
480-
*/
481-
if (RelationNeedsWAL(state->indexrel))
482-
smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM);
454+
smgr_bulk_finish(state->bulkstate);
483455
}
484456

485457
/*
@@ -509,8 +481,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
509481
levelstate->current_page++;
510482

511483
if (levelstate->pages[levelstate->current_page] == NULL)
512-
levelstate->pages[levelstate->current_page] =
513-
palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
484+
levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);
514485

515486
newPage = levelstate->pages[levelstate->current_page];
516487
gistinitpage(newPage, old_page_flags);
@@ -573,14 +544,16 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
573544
for (; dist != NULL; dist = dist->next)
574545
{
575546
char *data;
547+
BulkWriteBuffer buf;
576548
Page target;
577549

578550
/* check once per page */
579551
CHECK_FOR_INTERRUPTS();
580552

581553
/* Create page and copy data */
582554
data = (char *) (dist->list);
583-
target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
555+
buf = smgr_bulk_get_buf(state->bulkstate);
556+
target = (Page) buf;
584557
gistinitpage(target, isleaf ? F_LEAF : 0);
585558
for (int i = 0; i < dist->block.num; i++)
586559
{
@@ -593,20 +566,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
593566
}
594567
union_tuple = dist->itup;
595568

596-
if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
597-
gist_indexsortbuild_flush_ready_pages(state);
598-
599-
/*
600-
* The page is now complete. Assign a block number to it, and add it
601-
* to the list of finished pages. (We don't write it out immediately,
602-
* because we want to WAL-log the pages in batches.)
603-
*/
604-
blkno = state->pages_allocated++;
605-
state->ready_blknos[state->ready_num_pages] = blkno;
606-
state->ready_pages[state->ready_num_pages] = target;
607-
state->ready_num_pages++;
608-
ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
609-
610569
/*
611570
* Set the right link to point to the previous page. This is just for
612571
* debugging purposes: GiST only follows the right link if a page is
@@ -621,6 +580,15 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
621580
*/
622581
if (levelstate->last_blkno)
623582
GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
583+
584+
/*
585+
* The page is now complete. Assign a block number to it, and pass it
586+
* to the bulk writer.
587+
*/
588+
blkno = state->pages_allocated++;
589+
PageSetLSN(target, GistBuildLSN);
590+
smgr_bulk_write(state->bulkstate, blkno, buf, true);
591+
ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
624592
levelstate->last_blkno = blkno;
625593

626594
/*
@@ -631,7 +599,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
631599
if (parent == NULL)
632600
{
633601
parent = palloc0(sizeof(GistSortedBuildLevelState));
634-
parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
602+
parent->pages[0] = palloc(BLCKSZ);
635603
parent->parent = NULL;
636604
gistinitpage(parent->pages[0], 0);
637605

@@ -641,39 +609,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
641609
}
642610
}
643611

644-
static void
645-
gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
646-
{
647-
if (state->ready_num_pages == 0)
648-
return;
649-
650-
for (int i = 0; i < state->ready_num_pages; i++)
651-
{
652-
Page page = state->ready_pages[i];
653-
BlockNumber blkno = state->ready_blknos[i];
654-
655-
/* Currently, the blocks must be buffered in order. */
656-
if (blkno != state->pages_written)
657-
elog(ERROR, "unexpected block number to flush GiST sorting build");
658-
659-
PageSetLSN(page, GistBuildLSN);
660-
PageSetChecksumInplace(page, blkno);
661-
smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
662-
true);
663-
664-
state->pages_written++;
665-
}
666-
667-
if (RelationNeedsWAL(state->indexrel))
668-
log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages,
669-
state->ready_blknos, state->ready_pages, true);
670-
671-
for (int i = 0; i < state->ready_num_pages; i++)
672-
pfree(state->ready_pages[i]);
673-
674-
state->ready_num_pages = 0;
675-
}
676-
677612

678613
/*-------------------------------------------------------------------------
679614
* Routines for non-sorted build

0 commit comments

Comments
 (0)