@@ -70,6 +70,14 @@
 
 #define RELS_BSEARCH_THRESHOLD		20
 
+/*
+ * This is the size (in number of blocks) above which we scan the entire
+ * buffer pool to remove the buffers for all the pages of the relation being
+ * dropped. For relations with size below this threshold, we find the buffers
+ * by doing lookups in the BufMapping table.
+ */
+#define BUF_DROP_FULL_SCAN_THRESHOLD	(uint32) (NBuffers / 32)
+
 typedef struct PrivateRefCountEntry
 {
 	Buffer		buffer;
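To get a feel for the new threshold, here is a minimal standalone sketch (not part of the patch): with the default 8 kB block size and an assumed 1 GB buffer pool, NBuffers is 131072, so the cutoff works out to 4096 blocks (32 MB) of the relation's pages.

```c
/* Hypothetical illustration of BUF_DROP_FULL_SCAN_THRESHOLD; the 1 GB pool
 * size and 8 kB block size are assumptions, not values from the patch. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint32_t nbuffers = (1024u * 1024 * 1024) / 8192;   /* assumed 1 GB pool */
    uint32_t threshold = nbuffers / 32;                 /* mirrors NBuffers / 32 */

    printf("NBuffers = %u, threshold = %u blocks (%u MB)\n",
           nbuffers, threshold, threshold * 8192 / (1024 * 1024));
    return 0;
}
```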
@@ -473,6 +481,10 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
 								  BufferAccessStrategy strategy,
 								  bool *foundPtr);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static void FindAndDropRelFileNodeBuffers(RelFileNode rnode,
+										  ForkNumber forkNum,
+										  BlockNumber nForkBlock,
+										  BlockNumber firstDelBlock);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int	rnode_comparator(const void *p1, const void *p2);
@@ -2965,19 +2977,19 @@ BufferGetLSNAtomic(Buffer buffer)
  *		later. It is also the responsibility of higher-level code to ensure
  *		that no other process could be trying to load more pages of the
  *		relation into buffers.
- *
- *		XXX currently it sequentially searches the buffer pool, should be
- *		changed to more clever ways of searching. However, this routine
- *		is used only in code paths that aren't very performance-critical,
- *		and we shouldn't slow down the hot paths to make it faster ...
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 					   int nforks, BlockNumber *firstDelBlock)
 {
 	int			i;
 	int			j;
+	RelFileNodeBackend rnode;
+	BlockNumber nForkBlock[MAX_FORKNUM];
+	BlockNumber nBlocksToInvalidate = 0;
+
+	rnode = smgr_reln->smgr_rnode;
 
 	/* If it's a local relation, it's localbuf.c's problem. */
 	if (RelFileNodeBackendIsTemp(rnode))
@@ -2991,6 +3003,56 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
 		return;
 	}
 
+	/*
+	 * To remove all the pages of the specified relation forks from the
+	 * buffer pool, we need to scan the entire buffer pool, but we can
+	 * optimize this by finding the buffers from the BufMapping table,
+	 * provided we know the exact size of each fork of the relation. The
+	 * exact size is required to ensure that we don't leave any buffer for
+	 * the relation being dropped; otherwise the background writer or
+	 * checkpointer can hit a PANIC while flushing buffers corresponding to
+	 * files that don't exist.
+	 *
+	 * To know the exact size, we rely on the size we cache for each fork
+	 * during recovery, which limits the optimization to recovery and to
+	 * standbys, but we can easily extend it once we have a shared cache for
+	 * relation sizes.
+	 *
+	 * In recovery, we cache the value returned by the first lseek(SEEK_END),
+	 * and subsequent writes keep the cached value up-to-date. See
+	 * smgrextend. It is possible that the value of the first lseek is
+	 * smaller than the actual number of existing blocks in the file due to
+	 * buggy Linux kernels that might not have accounted for a recent write.
+	 * But that should be fine because there must not be any buffers beyond
+	 * that file size.
+	 */
+	for (i = 0; i < nforks; i++)
+	{
+		/* Get the number of blocks for a relation's fork */
+		nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
+
+		if (nForkBlock[i] == InvalidBlockNumber)
+		{
+			nBlocksToInvalidate = InvalidBlockNumber;
+			break;
+		}
+
+		/* calculate the number of blocks to be invalidated */
+		nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
+	}
+
+	/*
+	 * We apply the optimization only if the total number of blocks to
+	 * invalidate is below BUF_DROP_FULL_SCAN_THRESHOLD.
+	 */
+	if (BlockNumberIsValid(nBlocksToInvalidate) &&
+		nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
+	{
+		for (j = 0; j < nforks; j++)
+			FindAndDropRelFileNodeBuffers(rnode.node, forkNum[j],
+										  nForkBlock[j], firstDelBlock[j]);
+		return;
+	}
+
 	for (i = 0; i < NBuffers; i++)
 	{
 		BufferDesc *bufHdr = GetBufferDescriptor(i);
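To make the gating logic above easier to follow in isolation, here is a hedged standalone sketch of the same accumulate-then-compare scheme. All names in it (cached_fork_size, use_lookup_path, SIZE_UNKNOWN) are illustrative stand-ins, not the PostgreSQL API; as in the patch, a sentinel for "cached size unknown" forces the full-pool scan path.

```c
#include <stdint.h>
#include <stdio.h>

#define SIZE_UNKNOWN UINT32_MAX		/* plays the role of InvalidBlockNumber */

/* hypothetical stand-in for smgrnblocks_cached(): returns the cached fork
 * size, or SIZE_UNKNOWN when no cached value is available */
static uint32_t
cached_fork_size(const uint32_t *cache, int fork)
{
    return cache[fork];
}

static int
use_lookup_path(const uint32_t *cache, const uint32_t *first_del,
                int nforks, uint32_t threshold)
{
    uint32_t to_invalidate = 0;

    for (int i = 0; i < nforks; i++)
    {
        uint32_t nblocks = cached_fork_size(cache, i);

        if (nblocks == SIZE_UNKNOWN)
            return 0;           /* any unknown fork size disables the optimization */
        to_invalidate += nblocks - first_del[i];
    }
    return to_invalidate < threshold;
}

int
main(void)
{
    uint32_t cache[] = {100, 10, 1};    /* per-fork sizes, e.g. main/FSM/VM */
    uint32_t first_del[] = {0, 0, 0};   /* drop everything from block 0 */

    printf("lookup path? %d\n", use_lookup_path(cache, first_del, 3, 512));
    return 0;
}
```

The early return on SIZE_UNKNOWN mirrors the patch's break-out with nBlocksToInvalidate = InvalidBlockNumber: one fork with an unknown size is enough to fall back to the full scan.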
@@ -3133,6 +3195,65 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 	pfree(nodes);
 }
 
+/* ---------------------------------------------------------------------
+ *		FindAndDropRelFileNodeBuffers
+ *
+ *		This function performs a lookup in the BufMapping table and removes
+ *		from the buffer pool all the pages of the specified relation fork
+ *		that have block numbers >= firstDelBlock. (In particular, with
+ *		firstDelBlock = 0, all pages are removed.)
+ * --------------------------------------------------------------------
+ */
+static void
+FindAndDropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum,
+							  BlockNumber nForkBlock,
+							  BlockNumber firstDelBlock)
+{
+	BlockNumber curBlock;
+
+	for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
+	{
+		uint32		bufHash;	/* hash value for tag */
+		BufferTag	bufTag;		/* identity of requested block */
+		LWLock	   *bufPartitionLock;	/* buffer partition lock for it */
+		int			buf_id;
+		BufferDesc *bufHdr;
+		uint32		buf_state;
+
+		/* create a tag so we can lookup the buffer */
+		INIT_BUFFERTAG(bufTag, rnode, forkNum, curBlock);
+
+		/* determine its hash code and partition lock ID */
+		bufHash = BufTableHashCode(&bufTag);
+		bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+		/* Check that it is in the buffer pool. If not, do nothing. */
+		LWLockAcquire(bufPartitionLock, LW_SHARED);
+		buf_id = BufTableLookup(&bufTag, bufHash);
+		LWLockRelease(bufPartitionLock);
+
+		if (buf_id < 0)
+			continue;
+
+		bufHdr = GetBufferDescriptor(buf_id);
+
+		/*
+		 * We need to lock the buffer header and recheck if the buffer is
+		 * still associated with the same block because the buffer could be
+		 * evicted by some other backend loading blocks for a different
+		 * relation after we release the lock on the BufMapping table.
+		 */
+		buf_state = LockBufHdr(bufHdr);
+
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+			bufHdr->tag.forkNum == forkNum &&
+			bufHdr->tag.blockNum >= firstDelBlock)
+			InvalidateBuffer(bufHdr);	/* releases spinlock */
+		else
+			UnlockBufHdr(bufHdr, buf_state);
+	}
+}
+
 /* ---------------------------------------------------------------------
  *		DropDatabaseBuffers
  *
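The recheck under the buffer header lock is the crux of the function's correctness. As a generic illustration of that lookup-then-recheck pattern, here is a hedged pthreads sketch; every name in it is an illustrative stand-in, not PostgreSQL code:

```c
/*
 * Standalone sketch of the pattern used by FindAndDropRelFileNodeBuffers:
 * find an entry under a shared table lock, drop that lock, then take the
 * entry's own lock and re-verify its identity before invalidating it.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct
{
    pthread_mutex_t lock;      /* per-entry lock, like the buffer header lock */
    int             rel_id;    /* which relation the entry caches */
    int             block_num; /* which block it caches */
    bool            valid;
} Entry;

/* toy 4-slot table standing in for the buffer pool; initialization of the
 * entries and their mutexes is omitted for brevity */
static Entry table[4];
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

static Entry *
table_lookup(int rel_id, int block_num)
{
    for (int i = 0; i < 4; i++)
        if (table[i].valid && table[i].rel_id == rel_id &&
            table[i].block_num == block_num)
            return &table[i];
    return NULL;
}

void
drop_cached_block(int rel_id, int block_num)
{
    Entry *e;

    pthread_rwlock_rdlock(&table_lock);
    e = table_lookup(rel_id, block_num);
    pthread_rwlock_unlock(&table_lock);

    if (e == NULL)
        return;                 /* not cached, nothing to do */

    pthread_mutex_lock(&e->lock);
    /* recheck: the entry may have been recycled for a different block after
     * we released table_lock, exactly the hazard the patch comment describes */
    if (e->rel_id == rel_id && e->block_num == block_num)
        e->valid = false;       /* invalidate while holding the entry lock */
    pthread_mutex_unlock(&e->lock);
}
```

Releasing the table lock before taking the per-entry lock keeps the critical section on the shared table short; the identity recheck is what makes that early release safe.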
@@ -3245,8 +3366,7 @@ PrintPinnedBufs(void)
  *		XXX currently it sequentially searches the buffer pool, should be
  *		changed to more clever ways of searching. This routine is not
  *		used in any performance-critical code paths, so it's not worth
- *		adding additional overhead to normal paths to make it go faster;
- *		but see also DropRelFileNodeBuffers.
+ *		adding additional overhead to normal paths to make it go faster.
  * --------------------------------------------------------------------
  */
 void