Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 2095206

Browse files
committed
Adjust btree index build to not use shared buffers, thereby avoiding the locking conflict against concurrent CHECKPOINT that was discussed a few weeks ago. Also, if not using WAL archiving (which is always true at the moment, but won't be if PITR makes it into this release), there's no need to WAL-log the index build process; it's sufficient to force-fsync the completed index before commit. This seems to gain about a factor of 2 in my tests, which is consistent with writing half as much data. I did not try it with WAL on a separate drive, though --- the gain would probably be a lot less in that scenario.
1 parent 4d0e47d commit 2095206

File tree

8 files changed

+304
-214
lines changed

8 files changed

+304
-214
lines changed

src/backend/access/nbtree/nbtpage.c

Lines changed: 32 additions & 89 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.75 2004/04/21 18:24:25 tgl Exp $
12+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.76 2004/06/02 17:28:17 tgl Exp $
1313
*
1414
* NOTES
1515
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -31,21 +31,21 @@
3131
/*
3232
* _bt_metapinit() -- Initialize the metadata page of a new btree.
3333
*
34-
* If markvalid is true, the index is immediately marked valid, else it
35-
* will be invalid until _bt_metaproot() is called.
34+
* Note: this is actually not used for standard btree index building;
35+
* nbtsort.c prefers not to make the metadata page valid until completion
36+
* of build.
3637
*
3738
* Note: there's no real need for any locking here. Since the transaction
3839
* creating the index hasn't committed yet, no one else can even see the index
3940
* much less be trying to use it. (In a REINDEX-in-place scenario, that's
4041
* not true, but we assume the caller holds sufficient locks on the index.)
4142
*/
4243
void
43-
_bt_metapinit(Relation rel, bool markvalid)
44+
_bt_metapinit(Relation rel)
4445
{
4546
Buffer buf;
4647
Page pg;
4748
BTMetaPageData *metad;
48-
BTPageOpaque op;
4949

5050
if (RelationGetNumberOfBlocks(rel) != 0)
5151
elog(ERROR, "cannot initialize non-empty btree index \"%s\"",
@@ -55,21 +55,11 @@ _bt_metapinit(Relation rel, bool markvalid)
5555
Assert(BufferGetBlockNumber(buf) == BTREE_METAPAGE);
5656
pg = BufferGetPage(buf);
5757

58-
/* NO ELOG(ERROR) from here till newmeta op is logged */
59-
START_CRIT_SECTION();
60-
61-
_bt_pageinit(pg, BufferGetPageSize(buf));
62-
58+
_bt_initmetapage(pg, P_NONE, 0);
6359
metad = BTPageGetMeta(pg);
64-
metad->btm_magic = markvalid ? BTREE_MAGIC : 0;
65-
metad->btm_version = BTREE_VERSION;
66-
metad->btm_root = P_NONE;
67-
metad->btm_level = 0;
68-
metad->btm_fastroot = P_NONE;
69-
metad->btm_fastlevel = 0;
7060

71-
op = (BTPageOpaque) PageGetSpecialPointer(pg);
72-
op->btpo_flags = BTP_META;
61+
/* NO ELOG(ERROR) from here till newmeta op is logged */
62+
START_CRIT_SECTION();
7363

7464
/* XLOG stuff */
7565
if (!rel->rd_istemp)
@@ -90,7 +80,7 @@ _bt_metapinit(Relation rel, bool markvalid)
9080
rdata[0].next = NULL;
9181

9282
recptr = XLogInsert(RM_BTREE_ID,
93-
markvalid ? XLOG_BTREE_NEWMETA : XLOG_BTREE_INVALIDMETA,
83+
XLOG_BTREE_NEWMETA,
9484
rdata);
9585

9686
PageSetLSN(pg, recptr);
@@ -102,6 +92,29 @@ _bt_metapinit(Relation rel, bool markvalid)
10292
WriteBuffer(buf);
10393
}
10494

95+
/*
96+
* _bt_initmetapage() -- Fill a page buffer with a correct metapage image
97+
*/
98+
void
99+
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
100+
{
101+
BTMetaPageData *metad;
102+
BTPageOpaque metaopaque;
103+
104+
_bt_pageinit(page, BLCKSZ);
105+
106+
metad = BTPageGetMeta(page);
107+
metad->btm_magic = BTREE_MAGIC;
108+
metad->btm_version = BTREE_VERSION;
109+
metad->btm_root = rootbknum;
110+
metad->btm_level = level;
111+
metad->btm_fastroot = rootbknum;
112+
metad->btm_fastlevel = level;
113+
114+
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
115+
metaopaque->btpo_flags = BTP_META;
116+
}
117+
105118
/*
106119
* _bt_getroot() -- Get the root page of the btree.
107120
*
@@ -609,76 +622,6 @@ _bt_page_recyclable(Page page)
609622
return false;
610623
}
611624

612-
/*
613-
* _bt_metaproot() -- Change the root page of the btree.
614-
*
615-
* Lehman and Yao require that the root page move around in order to
616-
* guarantee deadlock-free short-term, fine-granularity locking. When
617-
* we split the root page, we record the new parent in the metadata page
618-
* for the relation. This routine does the work.
619-
*
620-
* No direct preconditions, but if you don't have the write lock on
621-
* at least the old root page when you call this, you're making a big
622-
* mistake. On exit, metapage data is correct and we no longer have
623-
* a pin or lock on the metapage.
624-
*
625-
* Actually this is not used for splitting on-the-fly anymore. It's only used
626-
* in nbtsort.c at the completion of btree building, where we know we have
627-
* sole access to the index anyway.
628-
*/
629-
void
630-
_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
631-
{
632-
Buffer metabuf;
633-
Page metap;
634-
BTPageOpaque metaopaque;
635-
BTMetaPageData *metad;
636-
637-
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
638-
metap = BufferGetPage(metabuf);
639-
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
640-
Assert(metaopaque->btpo_flags & BTP_META);
641-
642-
/* NO ELOG(ERROR) from here till newmeta op is logged */
643-
START_CRIT_SECTION();
644-
645-
metad = BTPageGetMeta(metap);
646-
Assert(metad->btm_magic == BTREE_MAGIC || metad->btm_magic == 0);
647-
metad->btm_magic = BTREE_MAGIC; /* it's valid now for sure */
648-
metad->btm_root = rootbknum;
649-
metad->btm_level = level;
650-
metad->btm_fastroot = rootbknum;
651-
metad->btm_fastlevel = level;
652-
653-
/* XLOG stuff */
654-
if (!rel->rd_istemp)
655-
{
656-
xl_btree_newmeta xlrec;
657-
XLogRecPtr recptr;
658-
XLogRecData rdata[1];
659-
660-
xlrec.node = rel->rd_node;
661-
xlrec.meta.root = metad->btm_root;
662-
xlrec.meta.level = metad->btm_level;
663-
xlrec.meta.fastroot = metad->btm_fastroot;
664-
xlrec.meta.fastlevel = metad->btm_fastlevel;
665-
666-
rdata[0].buffer = InvalidBuffer;
667-
rdata[0].data = (char *) &xlrec;
668-
rdata[0].len = SizeOfBtreeNewmeta;
669-
rdata[0].next = NULL;
670-
671-
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
672-
673-
PageSetLSN(metap, recptr);
674-
PageSetSUI(metap, ThisStartUpID);
675-
}
676-
677-
END_CRIT_SECTION();
678-
679-
_bt_wrtbuf(rel, metabuf);
680-
}
681-
682625
/*
683626
* Delete item(s) from a btree page.
684627
*

src/backend/access/nbtree/nbtree.c

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
1414
* IDENTIFICATION
15-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.116 2004/05/31 19:24:04 tgl Exp $
15+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.117 2004/06/02 17:28:17 tgl Exp $
1616
*
1717
*-------------------------------------------------------------------------
1818
*/
@@ -112,10 +112,6 @@ btbuild(PG_FUNCTION_ARGS)
112112
elog(ERROR, "index \"%s\" already contains data",
113113
RelationGetRelationName(index));
114114

115-
/* initialize the btree index metadata page */
116-
/* mark it valid right away only if using slow build */
117-
_bt_metapinit(index, !buildstate.usefast);
118-
119115
if (buildstate.usefast)
120116
{
121117
buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);
@@ -127,6 +123,11 @@ btbuild(PG_FUNCTION_ARGS)
127123
if (indexInfo->ii_Unique)
128124
buildstate.spool2 = _bt_spoolinit(index, false, true);
129125
}
126+
else
127+
{
128+
/* if using slow build, initialize the btree index metadata page */
129+
_bt_metapinit(index);
130+
}
130131

131132
/* do the heap scan */
132133
reltuples = IndexBuildHeapScan(heap, index, indexInfo,

0 commit comments

Comments
 (0)