From 3febabb9c39cf278e7850f011daaea9425744a3d Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot Date: Thu, 10 Apr 2025 05:49:45 +0000 Subject: [PATCH] Add os_page_num to pg_buffercache ba2a3c2302f added a way to check if a buffer is spread across multiple pages. Adding the same information in pg_buffercache so that one does not need NUMA support enabled to get this information. --- .../expected/pg_buffercache.out | 2 +- .../pg_buffercache--1.5--1.6.sql | 7 ++ contrib/pg_buffercache/pg_buffercache_pages.c | 118 ++++++++++++++---- contrib/pg_buffercache/sql/pg_buffercache.sql | 2 +- doc/src/sgml/pgbuffercache.sgml | 34 +++++ 5 files changed, 135 insertions(+), 28 deletions(-) diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 9a9216dc7b1b..52e38dcc027a 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -1,5 +1,5 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint +select count(*) >= (select setting::bigint from pg_settings where name = 'shared_buffers') from pg_buffercache; diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql index 458f054a6917..8aa81d156885 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -44,3 +44,10 @@ CREATE FUNCTION pg_buffercache_evict_all( OUT buffers_skipped int4) AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all' LANGUAGE C PARALLEL SAFE VOLATILE; + +-- Upgrade view to 1.6. format +CREATE OR REPLACE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4, os_page_num bigint); diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 4b007f6e1b06..efbce17b54fa 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -19,7 +19,7 @@ #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 -#define NUM_BUFFERCACHE_PAGES_ELEM 9 +#define NUM_BUFFERCACHE_PAGES_ELEM 10 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 #define NUM_BUFFERCACHE_EVICT_ELEM 2 @@ -54,6 +54,7 @@ typedef struct * because of bufmgr.c's PrivateRefCount infrastructure. */ int32 pinning_backends; + int64 page_num; } BufferCachePagesRec; @@ -63,6 +64,9 @@ typedef struct typedef struct { TupleDesc tupdesc; + int buffers_per_page; + int pages_per_buffer; + int os_page_size; BufferCachePagesRec *record; } BufferCachePagesContext; @@ -119,8 +123,25 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { - int i; + int i, + idx; + Size os_page_size; + char *startptr; + int pages_per_buffer; + int max_entries; + /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while + * the OS may have different memory page sizes. + * + * To correctly map between them, we need to: 1. Determine the OS memory + * page size 2. Calculate how many OS pages are used by all buffer blocks + * 3. Calculate how many OS pages are contained within each database + * block. + */ + os_page_size = pg_get_shmem_pagesize(); + + /* Initialize the multi-call context, load entries about buffers */ funcctx = SRF_FIRSTCALL_INIT(); /* Switch context when allocating stuff to be used in later calls */ @@ -163,24 +184,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", INT2OID, -1, 0); - if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1)) TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", INT4OID, -1, 0); + if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num", + INT8OID, -1, 0); + fctx->tupdesc = BlessTupleDesc(tupledesc); - /* Allocate NBuffers worth of BufferCachePagesRec records. */ + /* + * Each buffer needs at least one entry, but it might be offset in + * some way, and use one extra entry. So we allocate space for the + * maximum number of entries we might need, and then count the exact + * number as we're walking buffers. That way we can do it in one pass, + * without reallocating memory. + */ + pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1; + max_entries = NBuffers * pages_per_buffer; + + /* Allocate entries for BufferCachePagesRec records. */ fctx->record = (BufferCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCachePagesRec) * NBuffers); - - /* Set max calls and remember the user function context. */ - funcctx->max_calls = NBuffers; - funcctx->user_fctx = fctx; + sizeof(BufferCachePagesRec) * max_entries); /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); + startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1)); + idx = 0; /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. @@ -191,35 +224,65 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) */ for (i = 0; i < NBuffers; i++) { + char *buffptr = (char *) BufferGetBlock(i + 1); BufferDesc *bufHdr; uint32 buf_state; + int32 page_num; + char *startptr_buff, + *endptr_buff; bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); - fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); - fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); - fctx->record[i].reltablespace = bufHdr->tag.spcOid; - fctx->record[i].reldatabase = bufHdr->tag.dbOid; - fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag); - fctx->record[i].blocknum = bufHdr->tag.blockNum; - fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); - fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); + /* start of the first page of this buffer */ + startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); - if (buf_state & BM_DIRTY) - fctx->record[i].isdirty = true; - else - fctx->record[i].isdirty = false; + /* end of the buffer (no need to align to memory page) */ + endptr_buff = buffptr + BLCKSZ; - /* Note if the buffer is valid, and has storage created */ - if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) - fctx->record[i].isvalid = true; - else - fctx->record[i].isvalid = false; + Assert(startptr_buff < endptr_buff); + + /* calculate ID of the first page for this buffer */ + page_num = (startptr_buff - startptr) / os_page_size; + + /* Add an entry for each OS page overlapping with this buffer. */ + for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size) + { + fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); + fctx->record[idx].reltablespace = bufHdr->tag.spcOid; + fctx->record[idx].reldatabase = bufHdr->tag.dbOid; + fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag); + fctx->record[idx].blocknum = bufHdr->tag.blockNum; + fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); + fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); + + if (buf_state & BM_DIRTY) + fctx->record[idx].isdirty = true; + else + fctx->record[idx].isdirty = false; + + /* Note if the buffer is valid, and has storage created */ + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + fctx->record[idx].isvalid = true; + else + fctx->record[idx].isvalid = false; + + fctx->record[idx].page_num = page_num; + /* advance to the next entry/page */ + ++idx; + ++page_num; + } UnlockBufHdr(bufHdr, buf_state); } + + Assert(idx <= max_entries); + + /* Set max calls and remember the user function context. */ + funcctx->max_calls = idx; + funcctx->user_fctx = fctx; } funcctx = SRF_PERCALL_SETUP(); @@ -252,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[7] = true; /* unused for v1.0 callers, but the array is always long enough */ nulls[8] = true; + nulls[9] = true; } else { @@ -272,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) /* unused for v1.0 callers, but the array is always long enough */ values[8] = Int32GetDatum(fctx->record[i].pinning_backends); nulls[8] = false; + values[9] = Int64GetDatum(fctx->record[i].page_num); + nulls[9] = false; } /* Build and return the tuple. */ diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 47cca1907c74..78ec7a88ea02 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -1,6 +1,6 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint +select count(*) >= (select setting::bigint from pg_settings where name = 'shared_buffers') from pg_buffercache; diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 537d60149424..6183adf73db0 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -205,6 +205,15 @@ Number of backends pinning this buffer + + + + os_page_num bigint + + + number of OS memory page for this buffer + + @@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers public | spgist_text_tbl | 182 (10 rows) +regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages + FROM pg_buffercache b JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase IN (0, (SELECT oid FROM pg_database + WHERE datname = current_database())) + JOIN pg_namespace n ON n.oid = c.relnamespace + JOIN (SELECT bufferid FROM pg_buffercache + GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10; + + nspname | relname | buffers_on_multiple_pages +------------+---------------------------------+--------------------------- + public | gin_test_tbl | 4 + public | delete_test_table | 4 + public | tenk1 | 4 + pg_catalog | pg_attribute_relid_attnum_index | 4 + pg_catalog | pg_class | 2 + pg_catalog | pg_depend_depender_index | 2 + pg_catalog | pg_attribute | 2 + pg_catalog | pg_opfamily | 2 + pg_catalog | pg_opclass_oid_index | 2 + pg_catalog | pg_description | 2 +(10 rows) regression=# SELECT * FROM pg_buffercache_summary(); buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg