diff options
author | Heikki Linnakangas | 2024-02-23 14:10:51 +0000 |
---|---|---|
committer | Heikki Linnakangas | 2024-02-23 14:10:51 +0000 |
commit | 8af256524893987a3e534c6578dd60edfb782a77 (patch) | |
tree | 76761046e9f1fd4e18abc502e208c76132b7b1c7 /src/backend/access/gist | |
parent | e612384fc78d35c3d3a8b3d27cef5181dca8430b (diff) |
Introduce a new smgr bulk loading facility.
The new facility makes it easier to optimize bulk loading, as the
logic for buffering, WAL-logging, and syncing the relation only needs
to be implemented once. It's also less error-prone: We have had a
number of bugs in how a relation is fsync'd - or not - at the end of a
bulk loading operation. By centralizing that logic to one place, we
only need to write it correctly once.
The new facility is faster for small relations: Instead of calling
smgrimmedsync(), we register the fsync to happen at next checkpoint,
which avoids the fsync latency. That can make a big difference if you
are e.g. restoring a schema-only dump with lots of relations.
It is also slightly more efficient with large relations, as the WAL
logging is performed multiple pages at a time. That avoids some WAL
header overhead. The sorted GiST index build did that already; this
moves the buffering to the new facility.
The changes to the pageinspect GiST test need an explanation: Before this
patch, the sorted GiST index build set the LSN on every page to the
special GistBuildLSN value, not the LSN of the WAL record, even though
they were WAL-logged. There was no particular need for it; it just
happened naturally when we wrote out the pages before WAL-logging
them. Now we WAL-log the pages first, like in B-tree build, so the
pages are stamped with the record's real LSN. When the build is not
WAL-logged, we still use GistBuildLSN. To make the test output
predictable, use an unlogged index.
Reviewed-by: Andres Freund
Discussion: https://www.postgresql.org/message-id/30e8f366-58b3-b239-c521-422122dd5150%40iki.fi
Diffstat (limited to 'src/backend/access/gist')
-rw-r--r-- | src/backend/access/gist/gistbuild.c | 121 |
1 files changed, 28 insertions, 93 deletions
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 08555b97f92..465246173ba 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -43,7 +43,8 @@ #include "miscadmin.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/tuplesort.h" @@ -106,11 +107,8 @@ typedef struct Tuplesortstate *sortstate; /* state data for tuplesort.c */ BlockNumber pages_allocated; - BlockNumber pages_written; - int ready_num_pages; - BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; - Page ready_pages[XLR_MAX_BLOCK_ID]; + BulkWriteState *bulkstate; } GISTBuildState; #define GIST_SORTED_BUILD_PAGE_NUM 4 @@ -142,7 +140,6 @@ static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, IndexTuple itup); static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, GistSortedBuildLevelState *levelstate); -static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); static void gistInitBuffering(GISTBuildState *buildstate); static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); @@ -405,27 +402,18 @@ gist_indexsortbuild(GISTBuildState *state) { IndexTuple itup; GistSortedBuildLevelState *levelstate; - Page page; + BulkWriteBuffer rootbuf; - state->pages_allocated = 0; - state->pages_written = 0; - state->ready_num_pages = 0; + /* Reserve block 0 for the root page */ + state->pages_allocated = 1; - /* - * Write an empty page as a placeholder for the root page. It will be - * replaced with the real root page at the end. 
- */ - page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - page, true); - state->pages_allocated++; - state->pages_written++; + state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM); /* Allocate a temporary buffer for the first leaf page batch. */ levelstate = palloc0(sizeof(GistSortedBuildLevelState)); - levelstate->pages[0] = page; + levelstate->pages[0] = palloc(BLCKSZ); levelstate->parent = NULL; - gistinitpage(page, F_LEAF); + gistinitpage(levelstate->pages[0], F_LEAF); /* * Fill index pages with tuples in the sorted order. @@ -455,31 +443,15 @@ gist_indexsortbuild(GISTBuildState *state) levelstate = parent; } - gist_indexsortbuild_flush_ready_pages(state); - /* Write out the root */ PageSetLSN(levelstate->pages[0], GistBuildLSN); - PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - - pfree(levelstate->pages[0]); + rootbuf = smgr_bulk_get_buf(state->bulkstate); + memcpy(rootbuf, levelstate->pages[0], BLCKSZ); + smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true); + pfree(levelstate); - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. 
- */ - if (RelationNeedsWAL(state->indexrel)) - smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); + smgr_bulk_finish(state->bulkstate); } /* @@ -509,8 +481,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state, levelstate->current_page++; if (levelstate->pages[levelstate->current_page] == NULL) - levelstate->pages[levelstate->current_page] = - palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ); newPage = levelstate->pages[levelstate->current_page]; gistinitpage(newPage, old_page_flags); @@ -573,6 +544,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, for (; dist != NULL; dist = dist->next) { char *data; + BulkWriteBuffer buf; Page target; /* check once per page */ @@ -580,7 +552,8 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, /* Create page and copy data */ data = (char *) (dist->list); - target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); + buf = smgr_bulk_get_buf(state->bulkstate); + target = (Page) buf; gistinitpage(target, isleaf ? F_LEAF : 0); for (int i = 0; i < dist->block.num; i++) { @@ -593,20 +566,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } union_tuple = dist->itup; - if (state->ready_num_pages == XLR_MAX_BLOCK_ID) - gist_indexsortbuild_flush_ready_pages(state); - - /* - * The page is now complete. Assign a block number to it, and add it - * to the list of finished pages. (We don't write it out immediately, - * because we want to WAL-log the pages in batches.) - */ - blkno = state->pages_allocated++; - state->ready_blknos[state->ready_num_pages] = blkno; - state->ready_pages[state->ready_num_pages] = target; - state->ready_num_pages++; - ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); - /* * Set the right link to point to the previous page. 
This is just for * debugging purposes: GiST only follows the right link if a page is @@ -621,6 +580,15 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, */ if (levelstate->last_blkno) GistPageGetOpaque(target)->rightlink = levelstate->last_blkno; + + /* + * The page is now complete. Assign a block number to it, and pass it + * to the bulk writer. + */ + blkno = state->pages_allocated++; + PageSetLSN(target, GistBuildLSN); + smgr_bulk_write(state->bulkstate, blkno, buf, true); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); levelstate->last_blkno = blkno; /* @@ -631,7 +599,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, if (parent == NULL) { parent = palloc0(sizeof(GistSortedBuildLevelState)); - parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + parent->pages[0] = palloc(BLCKSZ); parent->parent = NULL; gistinitpage(parent->pages[0], 0); @@ -641,39 +609,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } } -static void -gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) -{ - if (state->ready_num_pages == 0) - return; - - for (int i = 0; i < state->ready_num_pages; i++) - { - Page page = state->ready_pages[i]; - BlockNumber blkno = state->ready_blknos[i]; - - /* Currently, the blocks must be buffered in order. 
*/ - if (blkno != state->pages_written) - elog(ERROR, "unexpected block number to flush GiST sorting build"); - - PageSetLSN(page, GistBuildLSN); - PageSetChecksumInplace(page, blkno); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page, - true); - - state->pages_written++; - } - - if (RelationNeedsWAL(state->indexrel)) - log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages, - state->ready_blknos, state->ready_pages, true); - - for (int i = 0; i < state->ready_num_pages; i++) - pfree(state->ready_pages[i]); - - state->ready_num_pages = 0; -} - /*------------------------------------------------------------------------- * Routines for non-sorted build |