@@ -21,5 +21,5 @@
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM	5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM	4
 #define NUM_BUFFERCACHE_EVICT_ELEM	2
@@ -54,6 +54,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int64		page_num;
 } BufferCachePagesRec;
 
 
@@ -63,6 +64,9 @@ typedef struct
 typedef struct
 {
 	TupleDesc	tupdesc;
+	int			buffers_per_page;
+	int			pages_per_buffer;
+	int			os_page_size;
 	BufferCachePagesRec *record;
 } BufferCachePagesContext;
 
@@ -119,8 +123,25 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 
 	if (SRF_IS_FIRSTCALL())
 	{
-		int			i;
+		int			i,
+					idx;
+		Size		os_page_size;
+		char	   *startptr;
+		int			pages_per_buffer;
+		int			max_entries;
 
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+		 * while the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to: 1. Determine the OS
+		 * memory page size 2. Calculate how many OS pages are used by all
+		 * buffer blocks 3. Calculate how many OS pages are contained within
+		 * each database block.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
 		funcctx = SRF_FIRSTCALL_INIT();
 
 		/* Switch context when allocating stuff to be used in later calls */
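A note on step 1 of the comment above: the patch obtains the page size with pg_get_shmem_pagesize() rather than a plain OS call, presumably because shared buffers may be backed by huge pages whose size differs from the regular page size. Purely as a standalone illustration (this is not what the patch calls), a regular process can query the ordinary OS page size with POSIX sysconf():

/*
 * Minimal sketch: print the regular OS memory page size. Shared memory
 * backed by huge pages would report a larger value inside the server.
 */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long		os_page_size = sysconf(_SC_PAGESIZE);

	printf("OS page size: %ld bytes\n", os_page_size);
	return 0;
}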
@@ -163,24 +184,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
 						   INT2OID, -1, 0);
 
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1))
 			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 							   INT4OID, -1, 0);
 
+		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+			TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num",
+							   INT8OID, -1, 0);
+
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
 
-		/* Allocate NBuffers worth of BufferCachePagesRec records. */
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one pass,
+		 * without reallocating memory.
+		 */
+		pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
+		max_entries = NBuffers * pages_per_buffer;
+
+		/* Allocate entries for BufferCachePagesRec records. */
 		fctx->record = (BufferCachePagesRec *)
 			MemoryContextAllocHuge(CurrentMemoryContext,
-								   sizeof(BufferCachePagesRec) * NBuffers);
-
-		/* Set max calls and remember the user function context. */
-		funcctx->max_calls = NBuffers;
-		funcctx->user_fctx = fctx;
+								   sizeof(BufferCachePagesRec) * max_entries);
 
 		/* Return to original context when allocating transient memory */
 		MemoryContextSwitchTo(oldcontext);
 
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
 		 * fctx->record structure.
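To make the sizing logic above concrete: with an 8 kB block and a 4 kB OS page, Max(1, BLCKSZ / os_page_size) + 1 = 3, i.e. a misaligned buffer can straddle up to three OS pages, while with 2 MB huge pages the same formula yields 2. A minimal standalone sketch of that arithmetic follows; the block size, buffer count and page sizes are assumed values, not read from any server:

/*
 * Sketch of the worst-case entry estimate: one extra OS page per buffer
 * to allow for misalignment. Constants are illustrative only.
 */
#include <stdio.h>

#define Max(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
	long		blcksz = 8192;	/* assumed BLCKSZ */
	long		nbuffers = 16384;	/* assumed NBuffers (shared_buffers = 128MB) */
	long		page_sizes[] = {4096, 2 * 1024 * 1024};	/* 4kB page, 2MB huge page */

	for (int i = 0; i < 2; i++)
	{
		long		pages_per_buffer = Max(1, blcksz / page_sizes[i]) + 1;

		printf("os_page_size=%ld: pages_per_buffer=%ld, max_entries=%ld\n",
			   page_sizes[i], pages_per_buffer, nbuffers * pages_per_buffer);
	}
	return 0;
}

Either way max_entries only over-estimates; the exact count accumulated in idx during the scan becomes max_calls below.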
@@ -191,35 +224,65 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < NBuffers; i++)
 		{
+			char	   *buffptr = (char *) BufferGetBlock(i + 1);
 			BufferDesc *bufHdr;
 			uint32		buf_state;
+			int32		page_num;
+			char	   *startptr_buff,
+					   *endptr_buff;
 
 			bufHdr = GetBufferDescriptor(i);
 			/* Lock each buffer header before inspecting. */
 			buf_state = LockBufHdr(bufHdr);
 
-			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+			/* start of the first page of this buffer */
+			startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
 
-			if (buf_state & BM_DIRTY)
-				fctx->record[i].isdirty = true;
-			else
-				fctx->record[i].isdirty = false;
+			/* end of the buffer (no need to align to memory page) */
+			endptr_buff = buffptr + BLCKSZ;
 
-			/* Note if the buffer is valid, and has storage created */
-			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-				fctx->record[i].isvalid = true;
-			else
-				fctx->record[i].isvalid = false;
+			Assert(startptr_buff < endptr_buff);
+
+			/* calculate ID of the first page for this buffer */
+			page_num = (startptr_buff - startptr) / os_page_size;
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr);
+				fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+				fctx->record[idx].reltablespace = bufHdr->tag.spcOid;
+				fctx->record[idx].reldatabase = bufHdr->tag.dbOid;
+				fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag);
+				fctx->record[idx].blocknum = bufHdr->tag.blockNum;
+				fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+				fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+				if (buf_state & BM_DIRTY)
+					fctx->record[idx].isdirty = true;
+				else
+					fctx->record[idx].isdirty = false;
+
+				/* Note if the buffer is valid, and has storage created */
+				if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+					fctx->record[idx].isvalid = true;
+				else
+					fctx->record[idx].isvalid = false;
+
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
 
 			UnlockBufHdr(bufHdr, buf_state);
 		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
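The per-buffer mapping in the loop above can be mirrored outside the server with plain integer arithmetic. A minimal sketch follows; the addresses, block size and page size are made up, and the modulo-based rounding stands in for the server's TYPEALIGN_DOWN() and BufferGetBlock():

/*
 * Sketch of mapping one buffer to the OS pages it overlaps, emitting one
 * record (os_page_num) per page, as the loop above does.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long long os_page_size = 4096;
	unsigned long long blcksz = 8192;

	/* assumed addresses: shared-buffers start and one buffer inside it */
	unsigned long long startptr = 0x7f0000001000ULL;	/* OS-page aligned */
	unsigned long long buffptr = 0x7f0000003800ULL;		/* not page aligned */

	/* stand-in for TYPEALIGN_DOWN(os_page_size, buffptr) */
	unsigned long long startptr_buff = buffptr - (buffptr % os_page_size);
	unsigned long long endptr_buff = buffptr + blcksz;
	unsigned long long page_num = (startptr_buff - startptr) / os_page_size;

	/* one record per OS page the buffer overlaps: prints 2, 3 and 4 */
	for (unsigned long long ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
		printf("os_page_num = %llu\n", page_num++);

	return 0;
}

With those assumed values the buffer starts halfway into OS page 2 and ends in page 4, so three records are emitted for a single 8 kB buffer, which is exactly the worst case the pages_per_buffer estimate allows for.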
@@ -252,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			nulls[7] = true;
 			/* unused for v1.0 callers, but the array is always long enough */
 			nulls[8] = true;
+			nulls[9] = true;
 		}
 		else
 		{
@@ -272,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			/* unused for v1.0 callers, but the array is always long enough */
 			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
+			values[9] = Int64GetDatum(fctx->record[i].page_num);
+			nulls[9] = false;
 		}
 
 		/* Build and return the tuple. */