Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d6ad34f

Browse files
author
Amit Kapila
committed
Optimize DropRelFileNodeBuffers() for recovery.
The recovery path of DropRelFileNodeBuffers() is optimized so that scanning of the whole buffer pool can be avoided when the number of blocks to be truncated in a relation is below a certain threshold. For such cases, we find the buffers by doing lookups in the BufMapping table. This improves the performance by more than 100 times in many cases when several small tables (tested with 1000 relations) are truncated and where the server is configured with a large value of shared buffers (greater than or equal to 100GB). This optimization helps cases (a) when vacuum or autovacuum truncates off any of the empty pages at the end of a relation, or (b) when the relation is truncated in the same transaction in which it was created. This commit introduces a new API smgrnblocks_cached which returns a cached value for the number of blocks in a relation fork. This helps us to determine the exact size of the relation, which is required to apply this optimization. The exact size is required to ensure that we don't leave any buffer for the relation being dropped as otherwise the background writer or checkpointer can lead to a PANIC error while flushing buffers corresponding to files that don't exist. Author: Kirk Jamison based on ideas by Amit Kapila Reviewed-by: Kyotaro Horiguchi, Takayuki Tsunakawa, and Amit Kapila Tested-By: Haiying Tang Discussion: https://postgr.es/m/OSBPR01MB3207DCA7EC725FDD661B3EDAEF660@OSBPR01MB3207.jpnprd01.prod.outlook.com
1 parent 9a4c0e3 commit d6ad34f

File tree

4 files changed

+154
-15
lines changed

4 files changed

+154
-15
lines changed

src/backend/storage/buffer/bufmgr.c

+128-8
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@
7070

7171
#define RELS_BSEARCH_THRESHOLD 20
7272

73+
/*
74+
* This is the size (in the number of blocks) above which we scan the
75+
* entire buffer pool to remove the buffers for all the pages of relation
76+
* being dropped. For the relations with size below this threshold, we find
77+
* the buffers by doing lookups in BufMapping table.
78+
*/
79+
#define BUF_DROP_FULL_SCAN_THRESHOLD (uint32) (NBuffers / 32)
80+
7381
typedef struct PrivateRefCountEntry
7482
{
7583
Buffer buffer;
@@ -473,6 +481,10 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
473481
BufferAccessStrategy strategy,
474482
bool *foundPtr);
475483
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
484+
static void FindAndDropRelFileNodeBuffers(RelFileNode rnode,
485+
ForkNumber forkNum,
486+
BlockNumber nForkBlock,
487+
BlockNumber firstDelBlock);
476488
static void AtProcExit_Buffers(int code, Datum arg);
477489
static void CheckForBufferLeaks(void);
478490
static int rnode_comparator(const void *p1, const void *p2);
@@ -2965,19 +2977,19 @@ BufferGetLSNAtomic(Buffer buffer)
29652977
* later. It is also the responsibility of higher-level code to ensure
29662978
* that no other process could be trying to load more pages of the
29672979
* relation into buffers.
2968-
*
2969-
* XXX currently it sequentially searches the buffer pool, should be
2970-
* changed to more clever ways of searching. However, this routine
2971-
* is used only in code paths that aren't very performance-critical,
2972-
* and we shouldn't slow down the hot paths to make it faster ...
29732980
* --------------------------------------------------------------------
29742981
*/
29752982
void
2976-
DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
2983+
DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
29772984
int nforks, BlockNumber *firstDelBlock)
29782985
{
29792986
int i;
29802987
int j;
2988+
RelFileNodeBackend rnode;
2989+
BlockNumber nForkBlock[MAX_FORKNUM];
2990+
BlockNumber nBlocksToInvalidate = 0;
2991+
2992+
rnode = smgr_reln->smgr_rnode;
29812993

29822994
/* If it's a local relation, it's localbuf.c's problem. */
29832995
if (RelFileNodeBackendIsTemp(rnode))
@@ -2991,6 +3003,56 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
29913003
return;
29923004
}
29933005

3006+
/*
3007+
* To remove all the pages of the specified relation forks from the buffer
3008+
* pool, we need to scan the entire buffer pool but we can optimize it by
3009+
* finding the buffers from BufMapping table provided we know the exact
3010+
* size of each fork of the relation. The exact size is required to ensure
3011+
* that we don't leave any buffer for the relation being dropped as
3012+
* otherwise the background writer or checkpointer can lead to a PANIC
3013+
* error while flushing buffers corresponding to files that don't exist.
3014+
*
3015+
* To know the exact size, we rely on the size cached for each fork by us
3016+
* during recovery which limits the optimization to recovery and on
3017+
* standbys but we can easily extend it once we have shared cache for
3018+
* relation size.
3019+
*
3020+
* In recovery, we cache the value returned by the first lseek(SEEK_END)
3021+
* and future writes keep the cached value up-to-date. See
3022+
* smgrextend. It is possible that the value of the first lseek is smaller
3023+
* than the actual number of existing blocks in the file due to buggy
3024+
* Linux kernels that might not have accounted for the recent write. But
3025+
* that should be fine because there must not be any buffers after that
3026+
* file size.
3027+
*/
3028+
for (i = 0; i < nforks; i++)
3029+
{
3030+
/* Get the number of blocks for a relation's fork */
3031+
nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3032+
3033+
if (nForkBlock[i] == InvalidBlockNumber)
3034+
{
3035+
nBlocksToInvalidate = InvalidBlockNumber;
3036+
break;
3037+
}
3038+
3039+
/* calculate the number of blocks to be invalidated */
3040+
nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3041+
}
3042+
3043+
/*
3044+
* We apply the optimization iff the total number of blocks to invalidate
3045+
* is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3046+
*/
3047+
if (BlockNumberIsValid(nBlocksToInvalidate) &&
3048+
nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3049+
{
3050+
for (j = 0; j < nforks; j++)
3051+
FindAndDropRelFileNodeBuffers(rnode.node, forkNum[j],
3052+
nForkBlock[j], firstDelBlock[j]);
3053+
return;
3054+
}
3055+
29943056
for (i = 0; i < NBuffers; i++)
29953057
{
29963058
BufferDesc *bufHdr = GetBufferDescriptor(i);
@@ -3133,6 +3195,65 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
31333195
pfree(nodes);
31343196
}
31353197

3198+
/* ---------------------------------------------------------------------
3199+
* FindAndDropRelFileNodeBuffers
3200+
*
3201+
* This function performs lookups in the BufMapping table and removes from
3202+
* the buffer pool all the pages of the specified relation fork that have
3203+
* block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0,
3204+
* all pages are removed.)
*
* rnode and forkNum identify the relation fork. Block numbers from
* firstDelBlock up to, but not including, nForkBlock are probed one at a
* time, so a buffer for a block at or beyond nForkBlock is never looked up
* here.
3205+
* --------------------------------------------------------------------
3206+
*/
3207+
static void
3208+
FindAndDropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum,
3209+
BlockNumber nForkBlock,
3210+
BlockNumber firstDelBlock)
3211+
{
3212+
BlockNumber curBlock;
3213+
3214+
for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3215+
{
3216+
uint32 bufHash; /* hash value for tag */
3217+
BufferTag bufTag; /* identity of requested block */
3218+
LWLock *bufPartitionLock; /* buffer partition lock for it */
3219+
int buf_id;
3220+
BufferDesc *bufHdr;
3221+
uint32 buf_state;
3222+
3223+
/* create a tag so we can lookup the buffer */
3224+
INIT_BUFFERTAG(bufTag, rnode, forkNum, curBlock);
3225+
3226+
/* determine its hash code and partition lock ID */
3227+
bufHash = BufTableHashCode(&bufTag);
3228+
bufPartitionLock = BufMappingPartitionLock(bufHash);
3229+
3230+
/* Check that it is in the buffer pool. If not, do nothing. */
/*
* The partition lock is held in shared mode only for the lookup itself
* and is released before the buffer header is touched, so buf_id is
* merely a hint from here on; that is why the tag is rechecked below.
*/
3231+
LWLockAcquire(bufPartitionLock, LW_SHARED);
3232+
buf_id = BufTableLookup(&bufTag, bufHash);
3233+
LWLockRelease(bufPartitionLock);
3234+
3235+
if (buf_id < 0)
3236+
continue;
3237+
3238+
bufHdr = GetBufferDescriptor(buf_id);
3239+
3240+
/*
3241+
* We need to lock the buffer header and recheck if the buffer is
3242+
* still associated with the same block because the buffer could be
3243+
* evicted by some other backend loading blocks for a different
3244+
* relation after we release lock on the BufMapping table.
3245+
*/
3246+
buf_state = LockBufHdr(bufHdr);
3247+
/*
* Note that the recheck compares the block number against firstDelBlock
* rather than curBlock: any block of this fork at or past firstDelBlock
* is invalidated. Exactly one of the two branches below runs, and each
* ends with the header lock taken above no longer held.
*/
3248+
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
3249+
bufHdr->tag.forkNum == forkNum &&
3250+
bufHdr->tag.blockNum >= firstDelBlock)
3251+
InvalidateBuffer(bufHdr); /* releases spinlock */
3252+
else
3253+
UnlockBufHdr(bufHdr, buf_state);
3254+
}
3255+
}
3256+
31363257
/* ---------------------------------------------------------------------
31373258
* DropDatabaseBuffers
31383259
*
@@ -3245,8 +3366,7 @@ PrintPinnedBufs(void)
32453366
* XXX currently it sequentially searches the buffer pool, should be
32463367
* changed to more clever ways of searching. This routine is not
32473368
* used in any performance-critical code paths, so it's not worth
3248-
* adding additional overhead to normal paths to make it go faster;
3249-
* but see also DropRelFileNodeBuffers.
3369+
* adding additional overhead to normal paths to make it go faster.
32503370
* --------------------------------------------------------------------
32513371
*/
32523372
void

src/backend/storage/smgr/smgr.c

+24-6
Original file line numberDiff line numberDiff line change
@@ -549,18 +549,36 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
549549
{
550550
BlockNumber result;
551551

552+
/* Check and return if we get the cached value for the number of blocks. */
553+
result = smgrnblocks_cached(reln, forknum);
554+
if (result != InvalidBlockNumber)
555+
return result;
556+
557+
result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
558+
559+
reln->smgr_cached_nblocks[forknum] = result;
560+
561+
return result;
562+
}
563+
564+
/*
565+
* smgrnblocks_cached() -- Get the cached number of blocks in the supplied
566+
* relation.
567+
*
568+
* Returns InvalidBlockNumber when not in recovery or when the relation
569+
* fork size is not cached.
570+
*/
571+
BlockNumber
572+
smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
573+
{
552574
/*
553575
* For now, we only use cached values in recovery due to lack of a shared
554576
* invalidation mechanism for changes in file size.
555577
*/
556578
if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
557579
return reln->smgr_cached_nblocks[forknum];
558580

559-
result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
560-
561-
reln->smgr_cached_nblocks[forknum] = result;
562-
563-
return result;
581+
return InvalidBlockNumber;
564582
}
565583

566584
/*
@@ -582,7 +600,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
582600
* Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
583601
* just drop them without bothering to write the contents.
584602
*/
585-
DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
603+
DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
586604

587605
/*
588606
* Send a shared-inval message to force other backends to close any smgr

src/include/storage/bufmgr.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ extern void FlushOneBuffer(Buffer buffer);
203203
extern void FlushRelationBuffers(Relation rel);
204204
extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
205205
extern void FlushDatabaseBuffers(Oid dbid);
206-
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
206+
extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum,
207207
int nforks, BlockNumber *firstDelBlock);
208208
extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
209209
extern void DropDatabaseBuffers(Oid dbid);

src/include/storage/smgr.h

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
9999
extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
100100
BlockNumber blocknum, BlockNumber nblocks);
101101
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
102+
extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
102103
extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
103104
int nforks, BlockNumber *nblocks);
104105
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);

0 commit comments

Comments
 (0)