@@ -19,7 +19,7 @@
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM	5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM	4
 #define NUM_BUFFERCACHE_EVICT_ELEM	2
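The bump from 9 to 10 elements is what makes index 9 valid in the tuple-building hunks below: the per-tuple values/nulls arrays are sized with this constant. A minimal sketch of the assumed declarations (they live in pg_buffercache_pages(), outside this diff):

```c
/* assumed declarations, sized by the constant as in the existing code */
Datum		values[NUM_BUFFERCACHE_PAGES_ELEM]; /* now 10 slots, indexes 0..9 */
bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];	/* so nulls[9] stays in bounds */
```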
@@ -54,6 +54,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int64		page_num;
 } BufferCachePagesRec;
 
 
@@ -63,6 +64,9 @@ typedef struct
 typedef struct
 {
 	TupleDesc	tupdesc;
+	int			buffers_per_page;
+	int			pages_per_buffer;
+	int			os_page_size;
 	BufferCachePagesRec *record;
 } BufferCachePagesContext;
 
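The three new context fields are not populated in any hunk shown here. A hypothetical sketch of the initialization presumably done elsewhere in the patch; the exact semantics of buffers_per_page and pages_per_buffer are assumptions on this reading:

```c
/* hypothetical initialization, not part of the visible hunks */
fctx->os_page_size = os_page_size;	/* OS page size probed at first call */
fctx->pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;	/* block spans pages */
fctx->buffers_per_page = Max(1, os_page_size / BLCKSZ);	/* huge page holds blocks */
```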
@@ -119,8 +123,25 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 
 	if (SRF_IS_FIRSTCALL())
 	{
-		int			i;
+		int			i,
+					idx;
+		Size		os_page_size;
+		char	   *startptr;
+		int			pages_per_buffer;
+		int			max_entries;
 
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+		 * while the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to:
+		 * 1. determine the OS memory page size,
+		 * 2. calculate how many OS pages are used by all buffer blocks, and
+		 * 3. calculate how many OS pages fall within each database block.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
 		funcctx = SRF_FIRSTCALL_INIT();
 
 		/* Switch context when allocating stuff to be used in later calls */
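To make the mapping arithmetic concrete, here is a small standalone sketch using assumed sizes (8kB blocks on 4kB OS pages, then on 2MB huge pages); it is an illustration, not code from the patch:

```c
#include <stdio.h>

#define BLCKSZ 8192				/* assumed database block size */
#define Max(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
	long		page_sizes[] = {4096, 2 * 1024 * 1024};

	for (int i = 0; i < 2; i++)
	{
		long		os_page_size = page_sizes[i];

		/* worst case: a buffer straddles one extra OS page */
		long		pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;

		printf("os_page_size=%ld -> up to %ld entries per buffer\n",
			   os_page_size, pages_per_buffer);
	}
	return 0;					/* prints 3 for 4kB pages, 2 for 2MB pages */
}
```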
@@ -163,24 +184,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
 						   INT2OID, -1, 0);
 
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1))
 			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 							   INT4OID, -1, 0);
 
+		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+			TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num",
+							   INT8OID, -1, 0);
+
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
 
-		/* Allocate NBuffers worth of BufferCachePagesRec records. */
+		/*
+		 * Each buffer needs at least one entry, but a buffer that does not
+		 * start exactly on an OS page boundary overlaps one extra page. So
+		 * we allocate space for the maximum number of entries we might
+		 * need, and count the exact number while walking the buffers. That
+		 * way a single pass suffices, without reallocating memory.
+		 */
+		pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
+		max_entries = NBuffers * pages_per_buffer;
+
+		/* Allocate entries for BufferCachePagesRec records. */
 		fctx->record = (BufferCachePagesRec *)
 			MemoryContextAllocHuge(CurrentMemoryContext,
-								   sizeof(BufferCachePagesRec) * NBuffers);
-
-		/* Set max calls and remember the user function context. */
-		funcctx->max_calls = NBuffers;
-		funcctx->user_fctx = fctx;
+								   sizeof(BufferCachePagesRec) * max_entries);
 
 		/* Return to original context when allocating transient memory */
 		MemoryContextSwitchTo(oldcontext);
 
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
 		 * fctx->record structure.
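A worked example of why this bound is conservative, under assumed sizes (16384 buffers of 8kB on 4kB OS pages): the allocation reserves three entries per buffer, but a buffer whose start happens to be 4kB-aligned overlaps only two pages, so idx ends up well below max_entries:

```c
#include <stdio.h>

#define Max(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
	long		nbuffers = 16384;	/* assumed: 128MB of 8kB buffers */
	long		blcksz = 8192;
	long		os_page_size = 4096;

	long		pages_per_buffer = Max(1, blcksz / os_page_size) + 1;
	long		max_entries = nbuffers * pages_per_buffer;

	/* if the pool is 4kB-aligned, each 8kB buffer covers exactly 2 pages */
	long		used_entries = nbuffers * (blcksz / os_page_size);

	printf("allocated %ld entries, used %ld\n", max_entries, used_entries);
	return 0;					/* allocated 49152 entries, used 32768 */
}
```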
@@ -191,35 +224,65 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < NBuffers; i++)
 		{
+			char	   *buffptr = (char *) BufferGetBlock(i + 1);
 			BufferDesc *bufHdr;
 			uint32		buf_state;
+			int64		page_num;
+			char	   *startptr_buff,
+					   *endptr_buff;
 
 			bufHdr = GetBufferDescriptor(i);
 			/* Lock each buffer header before inspecting. */
 			buf_state = LockBufHdr(bufHdr);
 
-			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+			/* start of the first OS page of this buffer */
+			startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
 
-			if (buf_state & BM_DIRTY)
-				fctx->record[i].isdirty = true;
-			else
-				fctx->record[i].isdirty = false;
+			/* end of the buffer (no need to align this to an OS page) */
+			endptr_buff = buffptr + BLCKSZ;
 
-			/* Note if the buffer is valid, and has storage created */
-			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-				fctx->record[i].isvalid = true;
-			else
-				fctx->record[i].isvalid = false;
+			Assert(startptr_buff < endptr_buff);
+
+			/* calculate the ID of the first OS page of this buffer */
+			page_num = (startptr_buff - startptr) / os_page_size;
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr);
+				fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+				fctx->record[idx].reltablespace = bufHdr->tag.spcOid;
+				fctx->record[idx].reldatabase = bufHdr->tag.dbOid;
+				fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag);
+				fctx->record[idx].blocknum = bufHdr->tag.blockNum;
+				fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+				fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+				if (buf_state & BM_DIRTY)
+					fctx->record[idx].isdirty = true;
+				else
+					fctx->record[idx].isdirty = false;
+
+				/* Note if the buffer is valid, and has storage created */
+				if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+					fctx->record[idx].isvalid = true;
+				else
+					fctx->record[idx].isvalid = false;
+
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
 
 			UnlockBufHdr(bufHdr, buf_state);
 		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
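For readers verifying the loop, here is a standalone simulation of the alignment and page-number arithmetic; TYPEALIGN_DOWN is replaced with equivalent mask arithmetic, and the addresses are made-up values rather than real buffer pool pointers. A buffer starting mid-page yields three entries with consecutive page numbers:

```c
#include <stdio.h>
#include <stdint.h>

#define BLCKSZ 8192
#define OS_PAGE_SIZE 4096

/* equivalent of PostgreSQL's TYPEALIGN_DOWN for power-of-2 alignments */
static uintptr_t
align_down(uintptr_t addr, uintptr_t align)
{
	return addr & ~(align - 1);
}

int
main(void)
{
	uintptr_t	startptr = 0x100000;	/* assumed page-aligned pool start */
	uintptr_t	buffptr = 0x103800;		/* assumed buffer start, mid-page */

	uintptr_t	startptr_buff = align_down(buffptr, OS_PAGE_SIZE);
	uintptr_t	endptr_buff = buffptr + BLCKSZ;
	int64_t		page_num = (startptr_buff - startptr) / OS_PAGE_SIZE;

	/* one entry per OS page overlapping the buffer, as in the patch */
	for (uintptr_t ptr = startptr_buff; ptr < endptr_buff; ptr += OS_PAGE_SIZE)
		printf("entry: page_num=%lld\n", (long long) page_num++);

	return 0;					/* prints page_num 3, 4 and 5 */
}
```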
@@ -252,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		nulls[7] = true;
 		/* unused for v1.0 callers, but the array is always long enough */
 		nulls[8] = true;
+		nulls[9] = true;
 	}
 	else
 	{
@@ -272,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		/* unused for v1.0 callers, but the array is always long enough */
 		values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
 		nulls[8] = false;
+		values[9] = Int64GetDatum(fctx->record[i].page_num);
+		nulls[9] = false;
 	}
 
 	/* Build and return the tuple. */