Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d2896a9

Browse files
committed
Arrange to cache btree metapage data in the relcache entry for the index,
thereby saving a visit to the metapage in most index searches/updates. This wouldn't actually save any I/O (since in the old regime the metapage generally stayed in cache anyway), but it does provide a useful decrease in bufmgr traffic in high-contention scenarios. Per my recent proposal.
1 parent 8908387 commit d2896a9

File tree

6 files changed

+122
-9
lines changed

6 files changed

+122
-9
lines changed

src/backend/access/nbtree/README

+12-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $
1+
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
22

33
This directory contains a correct implementation of Lehman and Yao's
44
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -316,7 +316,17 @@ Other things that are handy to know
316316

317317
Page zero of every btree is a meta-data page. This page stores the
318318
location of the root page --- both the true root and the current effective
319-
root ("fast" root).
319+
root ("fast" root). To avoid fetching the metapage for every single index
320+
search, we cache a copy of the meta-data information in the index's
321+
relcache entry (rd_amcache). This is a bit ticklish since using the cache
322+
implies following a root page pointer that could be stale. We require
323+
every metapage update to send out an SI "relcache inval" message on the
324+
index relation. That ensures that each backend will flush its cached copy
325+
not later than the start of its next transaction. Therefore, stale
326+
pointers cannot be used for longer than the current transaction, which
327+
reduces the problem to the same one already dealt with for concurrent
328+
VACUUM --- we can just imagine that each open transaction is potentially
329+
"already in flight" to the old root.
320330

321331
The algorithm assumes we can fit at least three items per page
322332
(a "high key" and two real data items). Therefore it's unsafe

src/backend/access/nbtree/nbtinsert.c

+9-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.135 2006/04/13 03:53:05 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.136 2006/04/25 22:46:05 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -18,6 +18,7 @@
1818
#include "access/heapam.h"
1919
#include "access/nbtree.h"
2020
#include "miscadmin.h"
21+
#include "utils/inval.h"
2122

2223

2324
typedef struct
@@ -638,9 +639,12 @@ _bt_insertonpg(Relation rel,
638639

639640
END_CRIT_SECTION();
640641

641-
/* release pin/lock */
642+
/* release buffers; send out relcache inval if metapage changed */
642643
if (BufferIsValid(metabuf))
644+
{
645+
CacheInvalidateRelcache(rel);
643646
_bt_relbuf(rel, metabuf);
647+
}
644648

645649
_bt_relbuf(rel, buf);
646650
}
@@ -1526,6 +1530,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
15261530

15271531
END_CRIT_SECTION();
15281532

1533+
/* send out relcache inval for metapage change */
1534+
CacheInvalidateRelcache(rel);
1535+
15291536
/* done with metapage */
15301537
_bt_relbuf(rel, metabuf);
15311538

src/backend/access/nbtree/nbtpage.c

+72-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.95 2006/04/01 03:03:36 tgl Exp $
12+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.96 2006/04/25 22:46:05 tgl Exp $
1313
*
1414
* NOTES
1515
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -26,6 +26,7 @@
2626
#include "miscadmin.h"
2727
#include "storage/freespace.h"
2828
#include "storage/lmgr.h"
29+
#include "utils/inval.h"
2930

3031

3132
/*
@@ -99,6 +100,49 @@ _bt_getroot(Relation rel, int access)
99100
uint32 rootlevel;
100101
BTMetaPageData *metad;
101102

103+
/*
104+
* Try to use previously-cached metapage data to find the root. This
105+
* normally saves one buffer access per index search, which is a very
106+
helpful saving in bufmgr traffic and hence contention.
107+
*/
108+
if (rel->rd_amcache != NULL)
109+
{
110+
metad = (BTMetaPageData *) rel->rd_amcache;
111+
/* We shouldn't have cached it if any of these fail */
112+
Assert(metad->btm_magic == BTREE_MAGIC);
113+
Assert(metad->btm_version == BTREE_VERSION);
114+
Assert(metad->btm_root != P_NONE);
115+
116+
rootblkno = metad->btm_fastroot;
117+
Assert(rootblkno != P_NONE);
118+
rootlevel = metad->btm_fastlevel;
119+
120+
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
121+
rootpage = BufferGetPage(rootbuf);
122+
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
123+
124+
/*
125+
* Since the cache might be stale, we check the page more carefully
126+
* here than normal. We *must* check that it's not deleted.
127+
* If it's not alone on its level, then we reject too --- this
128+
* may be overly paranoid but better safe than sorry. Note we
129+
* don't check P_ISROOT, because that's not set in a "fast root".
130+
*/
131+
if (!P_IGNORE(rootopaque) &&
132+
rootopaque->btpo.level == rootlevel &&
133+
P_LEFTMOST(rootopaque) &&
134+
P_RIGHTMOST(rootopaque))
135+
{
136+
/* OK, accept cached page as the root */
137+
return rootbuf;
138+
}
139+
_bt_relbuf(rel, rootbuf);
140+
/* Cache is stale, throw it away */
141+
if (rel->rd_amcache)
142+
pfree(rel->rd_amcache);
143+
rel->rd_amcache = NULL;
144+
}
145+
102146
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
103147
metapg = BufferGetPage(metabuf);
104148
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
@@ -200,6 +244,12 @@ _bt_getroot(Relation rel, int access)
200244

201245
END_CRIT_SECTION();
202246

247+
/*
248+
* Send out relcache inval for metapage change (probably unnecessary
249+
* here, but let's be safe).
250+
*/
251+
CacheInvalidateRelcache(rel);
252+
203253
/*
204254
* swap root write lock for read lock. There is no danger of anyone
205255
* else accessing the new root page while it's unlocked, since no one
@@ -217,6 +267,13 @@ _bt_getroot(Relation rel, int access)
217267
Assert(rootblkno != P_NONE);
218268
rootlevel = metad->btm_fastlevel;
219269

270+
/*
271+
* Cache the metapage data for next time
272+
*/
273+
rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
274+
sizeof(BTMetaPageData));
275+
memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
276+
220277
/*
221278
* We are done with the metapage; arrange to release it via first
222279
* _bt_relandgetbuf call
@@ -280,6 +337,16 @@ _bt_gettrueroot(Relation rel)
280337
uint32 rootlevel;
281338
BTMetaPageData *metad;
282339

340+
/*
341+
* We don't try to use cached metapage data here, since (a) this path is
342+
* not performance-critical, and (b) if we are here it suggests our cache
343+
* is out-of-date anyway. In light of point (b), it's probably safest to
344+
* actively flush any cached metapage info.
345+
*/
346+
if (rel->rd_amcache)
347+
pfree(rel->rd_amcache);
348+
rel->rd_amcache = NULL;
349+
283350
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
284351
metapg = BufferGetPage(metabuf);
285352
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
@@ -1052,9 +1119,12 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
10521119

10531120
END_CRIT_SECTION();
10541121

1055-
/* release buffers */
1122+
/* release buffers; send out relcache inval if metapage changed */
10561123
if (BufferIsValid(metabuf))
1124+
{
1125+
CacheInvalidateRelcache(rel);
10571126
_bt_relbuf(rel, metabuf);
1127+
}
10581128
_bt_relbuf(rel, pbuf);
10591129
_bt_relbuf(rel, rbuf);
10601130
_bt_relbuf(rel, buf);

src/backend/access/nbtree/nbtree.c

+13-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
1414
* IDENTIFICATION
15-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.144 2006/04/01 03:03:37 tgl Exp $
15+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.145 2006/04/25 22:46:05 tgl Exp $
1616
*
1717
*-------------------------------------------------------------------------
1818
*/
@@ -26,6 +26,7 @@
2626
#include "miscadmin.h"
2727
#include "storage/freespace.h"
2828
#include "storage/smgr.h"
29+
#include "utils/inval.h"
2930
#include "utils/memutils.h"
3031

3132

@@ -127,6 +128,17 @@ btbuild(PG_FUNCTION_ARGS)
127128
}
128129
#endif /* BTREE_BUILD_STATS */
129130

131+
/*
132+
* If we are reindexing a pre-existing index, it is critical to send out
133+
* a relcache invalidation SI message to ensure all backends re-read the
134+
* index metapage. In most circumstances the update-stats operation will
135+
* cause that to happen, but at the moment there are corner cases where
136+
* no pg_class update will occur, so force an inval here. XXX FIXME:
137+
* the upper levels of CREATE INDEX should handle the stats update as
138+
* well as guaranteeing relcache inval.
139+
*/
140+
CacheInvalidateRelcache(index);
141+
130142
/* since we just counted the # of tuples, may as well update stats */
131143
IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);
132144

src/backend/utils/cache/relcache.c

+7-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.238 2006/03/05 15:58:45 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.239 2006/04/25 22:46:05 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -948,6 +948,7 @@ RelationInitIndexAccessInfo(Relation relation)
948948
*/
949949
relation->rd_indexprs = NIL;
950950
relation->rd_indpred = NIL;
951+
relation->rd_amcache = NULL;
951952
}
952953

953954
/*
@@ -1481,6 +1482,10 @@ RelationReloadClassinfo(Relation relation)
14811482
RelationInitPhysicalAddr(relation);
14821483
/* Make sure targblock is reset in case rel was truncated */
14831484
relation->rd_targblock = InvalidBlockNumber;
1485+
/* Must free any AM cached data, too */
1486+
if (relation->rd_amcache)
1487+
pfree(relation->rd_amcache);
1488+
relation->rd_amcache = NULL;
14841489
/* Okay, now it's valid again */
14851490
relation->rd_isvalid = true;
14861491
}
@@ -3141,6 +3146,7 @@ load_relcache_init_file(void)
31413146
rel->rd_indexlist = NIL;
31423147
rel->rd_oidindex = InvalidOid;
31433148
rel->rd_createSubid = InvalidSubTransactionId;
3149+
rel->rd_amcache = NULL;
31443150
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
31453151

31463152
/*

src/include/utils/rel.h

+9-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.88 2006/03/05 15:59:07 momjian Exp $
10+
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.89 2006/04/25 22:46:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -167,6 +167,13 @@ typedef struct RelationData
167167
* cached, namely those with subtype zero. The arrays are indexed by
168168
* strategy or support number, which is a sufficient identifier given that
169169
* restriction.
170+
*
171+
* Note: rd_amcache is available for index AMs to cache private data about
172+
* an index. This must be just a cache since it may get reset at any time
173+
* (in particular, it will get reset by a relcache inval message for the
174+
* index). If used, it must point to a single memory chunk palloc'd in
175+
* rd_indexcxt. A relcache reset will include freeing that chunk and
176+
* setting rd_amcache = NULL.
170177
*/
171178
MemoryContext rd_indexcxt; /* private memory cxt for this stuff */
172179
RelationAmInfo *rd_aminfo; /* lookup info for funcs found in pg_am */
@@ -175,6 +182,7 @@ typedef struct RelationData
175182
FmgrInfo *rd_supportinfo; /* lookup info for support procedures */
176183
List *rd_indexprs; /* index expression trees, if any */
177184
List *rd_indpred; /* index predicate tree, if any */
185+
void *rd_amcache; /* available for use by index AM */
178186

179187
/* statistics collection area */
180188
PgStat_Info pgstat_info;

0 commit comments

Comments
 (0)