diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml
index 86a1fa9bbdba..6f4f5c674a1b 100644
--- a/.cirrus.tasks.yml
+++ b/.cirrus.tasks.yml
@@ -471,6 +471,7 @@ task:
--enable-cassert --enable-injection-points --enable-debug \
--enable-tap-tests --enable-nls \
--with-segsize-blocks=6 \
+ --with-libnuma \
--with-liburing \
\
${LINUX_CONFIGURE_FEATURES} \
@@ -523,6 +524,7 @@ task:
-Dllvm=disabled \
--pkg-config-path /usr/lib/i386-linux-gnu/pkgconfig/ \
-DPERL=perl5.36-i386-linux-gnu \
+ -Dlibnuma=disabled \
build-32
EOF
diff --git a/configure b/configure
index 11615d1122de..e27badd83c34 100755
--- a/configure
+++ b/configure
@@ -708,6 +708,9 @@ XML2_LIBS
XML2_CFLAGS
XML2_CONFIG
with_libxml
+LIBNUMA_LIBS
+LIBNUMA_CFLAGS
+with_libnuma
LIBCURL_LIBS
LIBCURL_CFLAGS
with_libcurl
@@ -872,6 +875,7 @@ with_liburing
with_uuid
with_ossp_uuid
with_libcurl
+with_libnuma
with_libxml
with_libxslt
with_system_tzdata
@@ -906,6 +910,8 @@ LIBURING_CFLAGS
LIBURING_LIBS
LIBCURL_CFLAGS
LIBCURL_LIBS
+LIBNUMA_CFLAGS
+LIBNUMA_LIBS
XML2_CONFIG
XML2_CFLAGS
XML2_LIBS
@@ -1588,6 +1594,7 @@ Optional Packages:
--with-uuid=LIB build contrib/uuid-ossp using LIB (bsd,e2fs,ossp)
--with-ossp-uuid obsolete spelling of --with-uuid=ossp
--with-libcurl build with libcurl support
+ --with-libnuma build with libnuma support
--with-libxml build with XML support
--with-libxslt use XSLT support when building contrib/xml2
--with-system-tzdata=DIR
@@ -1629,6 +1636,10 @@ Some influential environment variables:
C compiler flags for LIBCURL, overriding pkg-config
LIBCURL_LIBS
linker flags for LIBCURL, overriding pkg-config
+ LIBNUMA_CFLAGS
+ C compiler flags for LIBNUMA, overriding pkg-config
+ LIBNUMA_LIBS
+ linker flags for LIBNUMA, overriding pkg-config
XML2_CONFIG path to xml2-config utility
XML2_CFLAGS C compiler flags for XML2, overriding pkg-config
XML2_LIBS linker flags for XML2, overriding pkg-config
@@ -9063,6 +9074,182 @@ $as_echo "$as_me: WARNING: *** OAuth support tests require --with-python to run"
fi
+#
+# libnuma
+#
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libnuma support" >&5
+$as_echo_n "checking whether to build with libnuma support... " >&6; }
+
+
+
+# Check whether --with-libnuma was given.
+if test "${with_libnuma+set}" = set; then :
+ withval=$with_libnuma;
+ case $withval in
+ yes)
+
+$as_echo "#define USE_LIBNUMA 1" >>confdefs.h
+
+ ;;
+ no)
+ :
+ ;;
+ *)
+ as_fn_error $? "no argument expected for --with-libnuma option" "$LINENO" 5
+ ;;
+ esac
+
+else
+ with_libnuma=no
+
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libnuma" >&5
+$as_echo "$with_libnuma" >&6; }
+
+
+if test "$with_libnuma" = yes ; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for numa_available in -lnuma" >&5
+$as_echo_n "checking for numa_available in -lnuma... " >&6; }
+if ${ac_cv_lib_numa_numa_available+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-lnuma $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char numa_available ();
+int
+main ()
+{
+return numa_available ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ ac_cv_lib_numa_numa_available=yes
+else
+ ac_cv_lib_numa_numa_available=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_numa_numa_available" >&5
+$as_echo "$ac_cv_lib_numa_numa_available" >&6; }
+if test "x$ac_cv_lib_numa_numa_available" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBNUMA 1
+_ACEOF
+
+ LIBS="-lnuma $LIBS"
+
+else
+ as_fn_error $? "library 'libnuma' is required for NUMA support" "$LINENO" 5
+fi
+
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for numa" >&5
+$as_echo_n "checking for numa... " >&6; }
+
+if test -n "$LIBNUMA_CFLAGS"; then
+ pkg_cv_LIBNUMA_CFLAGS="$LIBNUMA_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"numa\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "numa") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_LIBNUMA_CFLAGS=`$PKG_CONFIG --cflags "numa" 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+if test -n "$LIBNUMA_LIBS"; then
+ pkg_cv_LIBNUMA_LIBS="$LIBNUMA_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"numa\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "numa") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_LIBNUMA_LIBS=`$PKG_CONFIG --libs "numa" 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+ _pkg_short_errors_supported=yes
+else
+ _pkg_short_errors_supported=no
+fi
+ if test $_pkg_short_errors_supported = yes; then
+ LIBNUMA_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "numa" 2>&1`
+ else
+ LIBNUMA_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "numa" 2>&1`
+ fi
+ # Put the nasty error message in config.log where it belongs
+ echo "$LIBNUMA_PKG_ERRORS" >&5
+
+ as_fn_error $? "Package requirements (numa) were not met:
+
+$LIBNUMA_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables LIBNUMA_CFLAGS
+and LIBNUMA_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables LIBNUMA_CFLAGS
+and LIBNUMA_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see .
+See \`config.log' for more details" "$LINENO" 5; }
+else
+ LIBNUMA_CFLAGS=$pkg_cv_LIBNUMA_CFLAGS
+ LIBNUMA_LIBS=$pkg_cv_LIBNUMA_LIBS
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+fi
+
#
# XML
#
diff --git a/configure.ac b/configure.ac
index debdf1650441..d365a486d3d8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1053,6 +1053,20 @@ if test "$with_libcurl" = yes ; then
fi
+#
+# libnuma
+#
+AC_MSG_CHECKING([whether to build with libnuma support])
+PGAC_ARG_BOOL(with, libnuma, no, [build with libnuma support],
+ [AC_DEFINE([USE_LIBNUMA], 1, [Define to build with NUMA support. (--with-libnuma)])])
+AC_MSG_RESULT([$with_libnuma])
+AC_SUBST(with_libnuma)
+
+if test "$with_libnuma" = yes ; then
+ AC_CHECK_LIB(numa, numa_available, [], [AC_MSG_ERROR([library 'libnuma' is required for NUMA support])])
+ PKG_CHECK_MODULES(LIBNUMA, numa)
+fi
+
#
# XML
#
diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index eae65ead9e50..5f748543e2ea 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -8,10 +8,11 @@ OBJS = \
EXTENSION = pg_buffercache
DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
- pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql
+ pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
+ pg_buffercache--1.5--1.6.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
-REGRESS = pg_buffercache
+REGRESS = pg_buffercache pg_buffercache_numa
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa.out b/contrib/pg_buffercache/expected/pg_buffercache_numa.out
new file mode 100644
index 000000000000..d4de5ea52fc2
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_numa.out
@@ -0,0 +1,28 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+select count(*) >= (select setting::bigint
+                    from pg_settings
+                    where name = 'shared_buffers')
+from pg_buffercache_numa;
+ ?column?
+----------
+ t
+(1 row)
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ERROR: permission denied for view pg_buffercache_numa
+RESET role;
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ ?column?
+----------
+ t
+(1 row)
+
+RESET role;
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
new file mode 100644
index 000000000000..6dd6824b4e4f
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
@@ -0,0 +1,3 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 12d1fe487172..7cd039a1df9c 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -23,6 +23,7 @@ install_data(
'pg_buffercache--1.2.sql',
'pg_buffercache--1.3--1.4.sql',
'pg_buffercache--1.4--1.5.sql',
+ 'pg_buffercache--1.5--1.6.sql',
'pg_buffercache.control',
kwargs: contrib_data_args,
)
@@ -34,6 +35,7 @@ tests += {
'regress': {
'sql': [
'pg_buffercache',
+ 'pg_buffercache_numa',
],
},
}
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
new file mode 100644
index 000000000000..1230e244a5f1
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -0,0 +1,22 @@
+/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit
+
+-- Register the new functions.
+CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE OR REPLACE VIEW pg_buffercache_numa AS
+ SELECT P.* FROM pg_buffercache_numa_pages() AS P
+ (bufferid integer, page_num int4, node_id int4);
+
+-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 5ee875f77dd9..b030ba3a6fab 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.5'
+default_version = '1.6'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 62602af1775f..65ade9d81354 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -11,6 +11,7 @@
#include "access/htup_details.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
+#include "port/pg_numa.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
@@ -20,6 +21,8 @@
#define NUM_BUFFERCACHE_SUMMARY_ELEM 5
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
+#define NUM_BUFFERCACHE_NUMA_ELEM 3
+
PG_MODULE_MAGIC_EXT(
.name = "pg_buffercache",
.version = PG_VERSION
@@ -58,16 +61,44 @@ typedef struct
BufferCachePagesRec *record;
} BufferCachePagesContext;
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+ uint32 bufferid;
+ int32 numa_page;
+ int32 numa_node;
+} BufferCacheNumaRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+ TupleDesc tupdesc;
+ int buffers_per_page;
+ int pages_per_buffer;
+ int os_page_size;
+ BufferCacheNumaRec *record;
+} BufferCacheNumaContext;
+
/*
* Function returning data from the shared buffer cache - buffer number,
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
PG_FUNCTION_INFO_V1(pg_buffercache_evict);
+
+/* Only need to touch memory once per backend process lifetime */
+static bool firstNumaTouch = true;
+
+
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
@@ -246,6 +277,263 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
SRF_RETURN_DONE(funcctx);
}
+/*
+ * Inquire about NUMA memory mappings for shared buffers.
+ *
+ * Returns NUMA node ID for each memory page used by the buffer. Buffers may
+ * be smaller or larger than OS memory pages. For each buffer we return one
+ * entry for each memory page used by the buffer (if the buffer is smaller,
+ * it only uses a part of one memory page).
+ *
+ * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
+ * one is always a multiple of the other.
+ *
+ * In order to get reliable results we also need to touch memory pages, so
+ * that the inquiry about NUMA memory node doesn't return -2 (which indicates
+ * unmapped/unallocated pages).
+ */
+Datum
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	MemoryContext oldcontext;
+	BufferCacheNumaContext *fctx;	/* User function context. */
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i,
+					j,
+					idx;
+		Size		os_page_size = 0;
+		void	  **os_page_ptrs = NULL;
+		int		   *os_page_status;
+		uint64		os_page_count;
+		int			pages_per_buffer;
+		int			buffers_per_page;
+		volatile uint64 touch pg_attribute_unused();
+
+		if (pg_numa_init() == -1)
+			elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+		 * while the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to: 1. Determine the OS
+		 * memory page size 2. Calculate how many OS pages are used by all
+		 * buffer blocks 3. Calculate how many OS pages are contained within
+		 * each database block.
+		 *
+		 * This information is needed before calling move_pages() for NUMA
+		 * node id inquiry.
+		 */
+		os_page_size = pg_numa_get_pagesize();
+		buffers_per_page = os_page_size / BLCKSZ;
+		pages_per_buffer = BLCKSZ / os_page_size;
+
+		/*
+		 * The pages and block size is expected to be 2^k, so one divides the
+		 * other (we don't know in which direction).
+		 */
+		Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
+
+		/*
+		 * Either both counts are 1 (when the pages have the same size), or
+		 * exactly one of them is zero. Both can't be zero at the same time.
+		 */
+		Assert((buffers_per_page > 0) || (pages_per_buffer > 0));
+		Assert(((buffers_per_page == 1) && (pages_per_buffer == 1)) ||
+			   ((buffers_per_page == 0) || (pages_per_buffer == 0)));
+
+		/*
+		 * How many addresses we are going to query (store) depends on the
+		 * relation between BLCKSZ : PAGESIZE. We need at least one status per
+		 * buffer - if the memory page is larger than buffer, we still query
+		 * it for each buffer. With multiple memory pages per buffer, we need
+		 * that many entries. Widen before multiplying to avoid int overflow.
+		 */
+		os_page_count = (uint64) NBuffers * Max(1, pages_per_buffer);
+
+		elog(DEBUG1, "NUMA: NBuffers=%d os_page_query_count=" UINT64_FORMAT " "
+			 "os_page_size=%zu buffers_per_page=%d pages_per_buffer=%d",
+			 NBuffers, os_page_count, os_page_size,
+			 buffers_per_page, pages_per_buffer);
+
+
+		/* initialize the multi-call context, load entries about buffers */
+
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Create a user function context for cross-call persistence */
+		fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
+
+		/*
+		 * Validate the result type the caller expects. The function is
+		 * declared as returning SETOF RECORD, so the column list comes from
+		 * the calling query (normally the pg_buffercache_numa view). We
+		 * can't rely on the definition matching this code - somebody might
+		 * call it with an old (or even wrong) column definition list - so
+		 * check the number of columns before building tuples.
+		 */
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "page_num",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "node_id",
+						   INT4OID, -1, 0);
+
+		fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+		/* Allocate os_page_count worth of BufferCacheNumaRec records. */
+		fctx->record = (BufferCacheNumaRec *)
+			MemoryContextAllocHuge(CurrentMemoryContext,
+								   sizeof(BufferCacheNumaRec) * os_page_count);
+
+		/* One row per OS page entry (not per buffer); remember the context. */
+		funcctx->max_calls = os_page_count;
+		funcctx->user_fctx = fctx;
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+
+		/* used to determine the NUMA node for all OS pages at once */
+		os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
+		os_page_status = palloc(sizeof(int) * os_page_count);
+
+		/*
+		 * If we ever get 0xff back from kernel inquiry, then we probably have
+		 * bug in our buffers to OS page mapping code here.
+		 */
+		memset(os_page_status, 0xff, sizeof(int) * os_page_count);
+
+		if (firstNumaTouch)
+			elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
+
+		/*
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
+		 *
+		 * This loop touches and stores addresses into os_page_ptrs[] as input
+		 * to one big move_pages(2) inquiry system call. Basically we ask
+		 * for all memory pages for NBuffers.
+		 */
+		idx = 0;
+		for (i = 0; i < NBuffers; i++)
+		{
+			BufferDesc *bufHdr;
+			uint32		buf_state;
+			uint32		bufferid;
+
+			CHECK_FOR_INTERRUPTS();
+
+			bufHdr = GetBufferDescriptor(i);
+
+			/* Lock each buffer header before inspecting. */
+			buf_state = LockBufHdr(bufHdr);
+			bufferid = BufferDescriptorGetBuffer(bufHdr);
+
+			UnlockBufHdr(bufHdr, buf_state);
+
+			/*
+			 * If we have multiple OS pages per buffer, fill those in too. We
+			 * always want at least one OS page, even if there are multiple
+			 * buffers per page.
+			 *
+			 * Although we could query just once per each OS page, we do it
+			 * repeatedly for each Buffer and hit the same (rounded down)
+			 * address, as move_pages(2) requires page alignment. This also
+			 * simplifies retrieval code later on. Also buffer IDs start from 1.
+			 */
+			for (j = 0; j < Max(1, pages_per_buffer); j++)
+			{
+				char	   *buffptr = (char *) BufferGetBlock(i + 1);
+
+				fctx->record[idx].bufferid = bufferid;
+				fctx->record[idx].numa_page = j;
+
+				os_page_ptrs[idx]
+					= (char *) TYPEALIGN_DOWN(os_page_size,
+											  buffptr + (os_page_size * j));
+
+				/* Only need to touch memory once per backend process lifetime */
+				if (firstNumaTouch)
+					pg_numa_touch_mem_if_required(touch, os_page_ptrs[idx]);
+
+				++idx;
+			}
+
+		}
+
+		/* we should get exactly the expected number of entries */
+		Assert(idx == os_page_count);
+
+		/* query NUMA status for all the pointers */
+		if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry: %m");
+
+		/*
+		 * Update the entries with NUMA node ID. The status array is indexed
+		 * the same way as the entry index.
+		 */
+		for (i = 0; i < os_page_count; i++)
+		{
+			fctx->record[i].numa_node = os_page_status[i];
+		}
+
+		/* remember this backend touched the pages */
+		firstNumaTouch = false;
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint32		i = funcctx->call_cntr;
+		Datum		values[NUM_BUFFERCACHE_NUMA_ELEM];
+		bool		nulls[NUM_BUFFERCACHE_NUMA_ELEM];
+
+		values[0] = Int32GetDatum(fctx->record[i].bufferid);
+		nulls[0] = false;
+
+		values[1] = Int32GetDatum(fctx->record[i].numa_page);
+		nulls[1] = false;
+
+		values[2] = Int32GetDatum(fctx->record[i].numa_node);
+		nulls[2] = false;
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
+
Datum
pg_buffercache_summary(PG_FUNCTION_ARGS)
{
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_numa.sql b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
new file mode 100644
index 000000000000..2225b879f58b
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
@@ -0,0 +1,20 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+select count(*) >= (select setting::bigint
+                    from pg_settings
+                    where name = 'shared_buffers')
+from pg_buffercache_numa;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 0224f93733dc..9ab070adffba 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -25143,6 +25143,19 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
+
+
+
+ pg_numa_available
+
+ pg_numa_available ()
+ boolean
+
+
+ Returns true if the server has been compiled with NUMA support.
+
+
+
diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml
index cc28f0413307..8ebf0b03ec0c 100644
--- a/doc/src/sgml/installation.sgml
+++ b/doc/src/sgml/installation.sgml
@@ -1156,6 +1156,16 @@ build-postgresql:
+
+
+
+
+ Build with libnuma support for basic NUMA support.
+ Only supported on platforms for which the libnuma library is implemented.
+
+
+
+
@@ -2645,6 +2655,17 @@ ninja install
+
+
+
+
+ Build with libnuma support for basic NUMA support.
+ Only supported on platforms for which the libnuma library is implemented.
+ The default for this option is auto.
+
+
+
+
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 802a5112d77d..b01f8e713576 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -30,7 +30,9 @@
This module provides the pg_buffercache_pages()
function (wrapped in the pg_buffercache view),
- the pg_buffercache_summary() function, the
+ the pg_buffercache_numa_pages() function (wrapped in the
+ pg_buffercache_numa view), the
+ pg_buffercache_summary() function, the
pg_buffercache_usage_counts() function and
the pg_buffercache_evict() function.
@@ -42,6 +44,15 @@
convenient use.
+
+ The pg_buffercache_numa_pages() function provides
+ NUMA node mappings for shared buffer entries. This
+ information is not part of pg_buffercache_pages()
+ itself, as it is much slower to retrieve.
+ The pg_buffercache_numa view wraps the function for
+ convenient use.
+
+
The pg_buffercache_summary() function returns a single
row summarizing the state of the shared buffer cache.
@@ -200,6 +211,68 @@
+
+ The pg_buffercache_numa View
+
+
+ The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-numa-columns"/>.
+
+
+
+ pg_buffercache_numa Columns
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ bufferidinteger
+
+
+ ID, in the range 1..shared_buffers
+
+
+
+
+
+ page_numint
+
+
+ number of the OS memory page within this buffer (starting at 0)
+
+
+
+
+
+ node_idint
+
+
+ ID of NUMA node
+
+
+
+
+
+
+
+
+ As NUMA node ID inquiry for each page requires memory pages
+ to be paged-in, the first execution of this function can take a noticeable
+ amount of time. In all the cases (first execution or not), retrieving this
+ information is costly and querying the view at a high frequency is not recommended.
+
+
+
+
The pg_buffercache_summary() Function
diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index 4f336ee0adfa..a83365ae24ae 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -181,6 +181,11 @@
shared memory allocations
+
+ pg_shmem_allocations_numa
+ NUMA node mappings for shared memory allocations
+
+
pg_statsplanner statistics
@@ -4051,6 +4056,80 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
+
+ pg_shmem_allocations_numa
+
+
+ pg_shmem_allocations_numa
+
+
+
+ The pg_shmem_allocations_numa view shows how shared
+ memory allocations in the server's main shared memory segment are distributed
+ across NUMA nodes. This includes both memory allocated by
+ PostgreSQL itself and memory allocated
+ by extensions using the mechanisms detailed in
+ <xref linkend="xfunc-shared-addin"/>.
+
+
+
+ Note that this view does not include memory allocated using the dynamic
+ shared memory infrastructure.
+
+
+
+ pg_shmem_allocations_numa Columns
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ nametext
+
+
+ The name of the shared memory allocation.
+
+
+
+
+
+ node_idint4
+
+
+ ID of NUMA node
+
+
+
+
+
+ sizeint4
+
+
+ Size of the allocation on this particular NUMA memory node in bytes
+
+
+
+
+
+
+
+
+ By default, the pg_shmem_allocations_numa view can be
+ read only by superusers or roles with privileges of the
+ pg_read_all_stats role.
+
+
+
pg_stats
diff --git a/meson.build b/meson.build
index 454ed81f5ead..46e92daeb62b 100644
--- a/meson.build
+++ b/meson.build
@@ -943,6 +943,27 @@ else
endif
+###############################################################
+# Library: libnuma
+###############################################################
+
+libnumaopt = get_option('libnuma')
+if not libnumaopt.disabled()
+  # via pkg-config; not required here, so the find_library fallback can run
+  libnuma = dependency('numa', required: false)
+  if not libnuma.found()
+    libnuma = cc.find_library('numa', required: libnumaopt)
+  endif
+  if not cc.has_header('numa.h', dependencies: libnuma, required: libnumaopt)
+    libnuma = not_found_dep
+  endif
+  if libnuma.found()
+    cdata.set('USE_LIBNUMA', 1)
+  endif
+else
+  libnuma = not_found_dep
+endif
+
+
###############################################################
# Library: liburing
@@ -3243,6 +3264,7 @@ backend_both_deps += [
icu_i18n,
ldap,
libintl,
+ libnuma,
liburing,
libxml,
lz4,
@@ -3899,6 +3921,7 @@ if meson.version().version_compare('>=0.57')
'icu': icu,
'ldap': ldap,
'libcurl': libcurl,
+ 'libnuma': libnuma,
'liburing': liburing,
'libxml': libxml,
'libxslt': libxslt,
diff --git a/meson_options.txt b/meson_options.txt
index dd7126da3a73..06bf5627d3c0 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -106,6 +106,9 @@ option('libcurl', type : 'feature', value: 'auto',
option('libedit_preferred', type: 'boolean', value: false,
description: 'Prefer BSD Libedit over GNU Readline')
+option('libnuma', type: 'feature', value: 'auto',
+ description: 'NUMA support')
+
option('liburing', type : 'feature', value: 'auto',
description: 'io_uring support, for asynchronous I/O')
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 737b2dd18691..6722fbdf365f 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -196,6 +196,7 @@ with_gssapi = @with_gssapi@
with_krb_srvnam = @with_krb_srvnam@
with_ldap = @with_ldap@
with_libcurl = @with_libcurl@
+with_libnuma = @with_libnuma@
with_liburing = @with_liburing@
with_libxml = @with_libxml@
with_libxslt = @with_libxslt@
@@ -223,6 +224,9 @@ krb_srvtab = @krb_srvtab@
ICU_CFLAGS = @ICU_CFLAGS@
ICU_LIBS = @ICU_LIBS@
+LIBNUMA_CFLAGS = @LIBNUMA_CFLAGS@
+LIBNUMA_LIBS = @LIBNUMA_LIBS@
+
LIBURING_CFLAGS = @LIBURING_CFLAGS@
LIBURING_LIBS = @LIBURING_LIBS@
@@ -250,7 +254,7 @@ CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
PG_SYSROOT = @PG_SYSROOT@
-override CPPFLAGS := $(ICU_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS)
+override CPPFLAGS := $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS)
ifdef PGXS
override CPPFLAGS := -I$(includedir_server) -I$(includedir_internal) $(CPPFLAGS)
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 273008db37fc..08f780a2e638 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
+CREATE VIEW pg_shmem_allocations_numa AS
+ SELECT * FROM pg_get_shmem_allocations_numa();
+
+REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC;
+GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats;
+
CREATE VIEW pg_backend_memory_contexts AS
SELECT * FROM pg_get_backend_memory_contexts();
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39e5..5d979423bd95 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -68,6 +68,7 @@
#include "fmgr.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "port/pg_numa.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock
static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+/* To get reliable results for NUMA inquiry we need to "touch pages" once */
+static bool firstNumaTouch = true;
/*
* InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,152 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
return (Datum) 0;
}
+
+/*
+ * SQL SRF showing NUMA memory nodes for allocated shared memory.
+ *
+ * For every entry in the shmem index this emits one row per NUMA node
+ * (0 .. highest node), with columns (name text, node_id int4, size int8),
+ * where size is the number of bytes of the entry's OS pages found on that
+ * node.  Partially used pages are counted in full.
+ *
+ * Raises an ERROR if NUMA support is not available, if the page inquiry
+ * fails, or if a page reports a status outside the valid node range.
+ */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	uint64		shm_total_page_count,
+				shm_ent_page_count,
+				max_nodes;
+	Size	   *nodes;
+
+	if (pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	max_nodes = pg_numa_get_max_node();
+	nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+	/*
+	 * The NUMA inquiry works on OS memory pages, so all page arithmetic
+	 * below is done in units of the OS page size (the huge page size when
+	 * huge pages are in use).
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Allocate page pointer and status arrays big enough for any single
+	 * entry.  Assuming the segment itself starts on a page boundary, no
+	 * entry can span more OS pages than the whole shared memory segment;
+	 * the "+ 1" covers the partial page dropped by the integer division.
+	 *
+	 * Sizing for the whole segment is somewhat wasteful, but there probably
+	 * is one entry much larger than the rest anyway.
+	 */
+	shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+	page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+	if (firstNumaTouch)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		uint64		i;
+		char	   *startptr,
+				   *endptr;
+
+		/*
+		 * Calculate the range of OS pages used by this entry.  The entry may
+		 * start and/or end part-way through a page, and those partial pages
+		 * must be counted too, so align the start pointer down and the end
+		 * pointer up to page boundaries before counting.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+		endptr = (char *) TYPEALIGN(os_page_size,
+									(char *) ent->location + ent->allocated_size);
+		shm_ent_page_count = (endptr - startptr) / os_page_size;
+
+		Assert(shm_ent_page_count <= shm_total_page_count);
+
+		/*
+		 * Pre-fill the status array with 0xff bytes (i.e. -1).  If any of
+		 * those survive the kernel inquiry, we probably have a bug in the
+		 * page mapping code here; the range check below reports it.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		/*
+		 * Setup page_ptrs[] with pointers to all OS pages for this entry,
+		 * and get the NUMA status using pg_numa_query_pages.
+		 *
+		 * In order to get reliable results we also need to touch memory
+		 * pages, so that inquiry about NUMA memory node doesn't return -2
+		 * (ENOENT, which indicates unmapped/unallocated pages).
+		 */
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[i] = startptr + (i * os_page_size);
+
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+		/* Count the pages of this entry found on each NUMA node */
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			int			s = pages_status[i];
+
+			/* Ensure we are adding only valid index to the array */
+			if (s < 0 || s > max_nodes)
+			{
+				elog(ERROR, "invalid NUMA node id outside of allowed range "
+					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+			}
+
+			nodes[s]++;
+		}
+
+		/*
+		 * Add one row for each NUMA node, including those without any
+		 * memory of this entry.
+		 */
+		for (i = 0; i <= max_nodes; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			values[1] = Int32GetDatum((int32) i);
+			values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	/*
+	 * Unlike pg_get_shmem_allocations(), we do not report shared memory that
+	 * is allocated but not tracked in the shmem index, nor the as-of-yet
+	 * unused shared memory.
+	 */
+
+	LWLockRelease(ShmemIndexLock);
+
+	/* Pages were faulted in above; later calls in this backend can skip it */
+	firstNumaTouch = false;
+
+	return (Datum) 0;
+}
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 4eaeca89f2c7..ea8d796e7c45 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -566,7 +566,7 @@ static int ssl_renegotiation_limit;
*/
int huge_pages = HUGE_PAGES_TRY;
int huge_page_size;
-static int huge_pages_status = HUGE_PAGES_UNKNOWN;
+int huge_pages_status = HUGE_PAGES_UNKNOWN;
/*
* These variables are all dummies that don't do anything, except in some
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 5d5be8ba4e16..a93075c675cb 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8542,6 +8542,18 @@
proargnames => '{name,off,size,allocated_size}',
prosrc => 'pg_get_shmem_allocations' },
+# reports whether NUMA support is usable, i.e. whether pg_numa_init() succeeds
+{ oid => '9685', descr => 'Is NUMA support available?',
+  proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool',
+  proargtypes => '', prosrc => 'pg_numa_available' },
+
+# shared memory usage with NUMA info: set-returning function yielding
+# (name, node_id, size) rows; backs the pg_shmem_allocations_numa view
+{ oid => '9686', descr => 'NUMA mappings for the main shared memory segment',
+ proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't',
+ provolatile => 'v', prorettype => 'record', proargtypes => '',
+ proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+ proargnames => '{name,node_id,size}',
+ prosrc => 'pg_get_shmem_allocations_numa' },
+
# memory context of local backend
{ oid => '2282',
descr => 'information about all memory contexts of local backend',
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c2f1241b2342..b3166ec8f428 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -686,6 +686,9 @@
/* Define to 1 to build with libcurl support. (--with-libcurl) */
#undef USE_LIBCURL
+/* Define to 1 to build with NUMA support. (--with-libnuma) */
+#undef USE_LIBNUMA
+
/* Define to build with io_uring support. (--with-liburing) */
#undef USE_LIBURING
diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h
new file mode 100644
index 000000000000..3c1b50c14286
--- /dev/null
+++ b/src/include/port/pg_numa.h
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_numa.h
+ * Basic NUMA portability routines
+ *
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/port/pg_numa.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_NUMA_H
+#define PG_NUMA_H
+
+#include "fmgr.h"
+
+extern PGDLLIMPORT int pg_numa_init(void);
+extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status);
+extern PGDLLIMPORT int pg_numa_get_max_node(void);
+extern PGDLLIMPORT Size pg_numa_get_pagesize(void);
+
+#ifdef USE_LIBNUMA
+
+/*
+ * Touch the memory at "ptr" by reading one uint64 from it into the caller's
+ * volatile variable "ro_volatile_var".
+ *
+ * This is required on Linux, before pg_numa_query_pages() as we
+ * need to page-fault before move_pages(2) syscall returns valid results.
+ *
+ * The read goes through a volatile-qualified pointer so that it cannot be
+ * optimized away, and both arguments are parenthesized so that arbitrary
+ * expressions (e.g. "base + offset") can be passed safely.
+ */
+#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \
+	((ro_volatile_var) = *(volatile uint64 *) (ptr))
+
+#else
+
+/* Without libnuma there is no page inquiry, hence nothing to touch */
+#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \
+	do {} while(0)
+
+#endif
+
+#endif /* PG_NUMA_H */
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index b99ebc9e86f5..5f7d4b83a60e 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -45,6 +45,7 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */
extern PGDLLIMPORT int shared_memory_type;
extern PGDLLIMPORT int huge_pages;
extern PGDLLIMPORT int huge_page_size;
+extern PGDLLIMPORT int huge_pages_status;
/* Possible values for huge_pages and huge_pages_status */
typedef enum
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index 46d8da070e82..55da678ec278 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -200,6 +200,8 @@ pgxs_empty = [
'ICU_LIBS',
+ 'LIBNUMA_CFLAGS', 'LIBNUMA_LIBS',
+
'LIBURING_CFLAGS', 'LIBURING_LIBS',
]
@@ -232,6 +234,7 @@ pgxs_deps = {
'icu': icu,
'ldap': ldap,
'libcurl': libcurl,
+ 'libnuma': libnuma,
'liburing': liburing,
'libxml': libxml,
'libxslt': libxslt,
diff --git a/src/port/Makefile b/src/port/Makefile
index f11896440d56..4274949dfa4c 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -45,6 +45,7 @@ OBJS = \
path.o \
pg_bitutils.o \
pg_localeconv_r.o \
+ pg_numa.o \
pg_popcount_aarch64.o \
pg_popcount_avx512.o \
pg_strong_random.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 51041e756099..228888b2f663 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -8,6 +8,7 @@ pgport_sources = [
'path.c',
'pg_bitutils.c',
'pg_localeconv_r.c',
+ 'pg_numa.c',
'pg_popcount_aarch64.c',
'pg_popcount_avx512.c',
'pg_strong_random.c',
diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
new file mode 100644
index 000000000000..5e2523cf798c
--- /dev/null
+++ b/src/port/pg_numa.c
@@ -0,0 +1,120 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_numa.c
+ * Basic NUMA portability routines
+ *
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_numa.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include <unistd.h>
+
+#ifdef WIN32
+#include <windows.h>
+#endif
+
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "port/pg_numa.h"
+#include "storage/pg_shmem.h"
+
+/*
+ * At this point we provide support only for Linux thanks to libnuma, but in
+ * future support for other platforms e.g. Win32 or FreeBSD might be possible
+ * too. For Win32 NUMA APIs see
+ * https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
+ */
+#ifdef USE_LIBNUMA
+
+#include <numa.h>
+#include <numaif.h>
+
+Datum pg_numa_available(PG_FUNCTION_ARGS);
+
+/*
+ * Initialize libnuma, as numa(3) requires on Linux before other libnuma
+ * calls are made.  The result of numa_available() is handed back unchanged;
+ * callers treat -1 as "NUMA is not available".
+ */
+int
+pg_numa_init(void)
+{
+	return numa_available();
+}
+
+/*
+ * Look up the NUMA node of each of the "count" addresses in "pages" for
+ * process "pid", storing the per-page result in "status".
+ *
+ * This goes through move_pages(2) rather than get_mempolicy(2): the former
+ * accepts a whole batch of pages, so a single large system call replaces
+ * many small ones and is far faster.
+ */
+int
+pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
+{
+	const int  *target_nodes = NULL;	/* no destination nodes are passed */
+	int			flags = 0;
+
+	return numa_move_pages(pid, count, pages, target_nodes, status, flags);
+}
+
+/* Report the highest NUMA node number, as given by numa_max_node() */
+int
+pg_numa_get_max_node(void)
+{
+	int			highest_node = numa_max_node();
+
+	return highest_node;
+}
+
+#else
+
+Datum pg_numa_available(PG_FUNCTION_ARGS);
+
+/*
+ * Empty wrappers, compiled when USE_LIBNUMA is not defined.  They keep the
+ * same signatures as the libnuma-backed versions so callers need no #ifdefs.
+ */
+
+/* Always reports failure (-1), i.e. NUMA is unavailable in this build */
+int
+pg_numa_init(void)
+{
+ /* We state that NUMA is not available */
+ return -1;
+}
+
+/* No-op: returns 0 without reading or writing any of its arguments */
+int
+pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
+{
+ return 0;
+}
+
+/* Without NUMA information, node 0 is reported as the highest node */
+int
+pg_numa_get_max_node(void)
+{
+ return 0;
+}
+
+#endif
+
+/*
+ * SQL-callable pg_numa_available(): true unless pg_numa_init() reports
+ * failure (-1).
+ */
+Datum
+pg_numa_available(PG_FUNCTION_ARGS)
+{
+	bool		numa_ok = (pg_numa_init() != -1);
+
+	PG_RETURN_BOOL(numa_ok);
+}
+
+/*
+ * Return the memory page size to use for NUMA page arithmetic.
+ *
+ * The starting point is the regular OS page size; when huge_pages_status is
+ * HUGE_PAGES_ON, GetHugePageSize() is asked for the size instead.
+ *
+ * This should be used only after the server is started: it asserts that we
+ * run under the postmaster and that huge_pages_status has been determined.
+ */
+Size
+pg_numa_get_pagesize(void)
+{
+	Size		pagesz;
+
+	Assert(IsUnderPostmaster);
+	Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
+
+#ifdef WIN32
+	{
+		SYSTEM_INFO si;
+
+		GetSystemInfo(&si);
+		pagesz = si.dwPageSize;
+	}
+#else
+	pagesz = sysconf(_SC_PAGESIZE);
+#endif
+
+	/* With huge pages in use, let GetHugePageSize() supply the size */
+	if (huge_pages_status == HUGE_PAGES_ON)
+		GetHugePageSize(&pagesz, NULL);
+
+	return pagesz;
+}
diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out
new file mode 100644
index 000000000000..668172f7d79a
--- /dev/null
+++ b/src/test/regress/expected/numa.out
@@ -0,0 +1,12 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
+ ok
+----
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out
new file mode 100644
index 000000000000..6dd6824b4e4f
--- /dev/null
+++ b/src/test/regress/expected/numa_1.out
@@ -0,0 +1,3 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
index 1fddb13b6aef..c25062c288f3 100644
--- a/src/test/regress/expected/privileges.out
+++ b/src/test/regress/expected/privileges.out
@@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
-- clean up
DROP TABLE lock_table;
DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
-- switch to superuser
\c -
CREATE ROLE regress_readallstats;
@@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
f
(1 row)
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
+ has_table_privilege
+---------------------
+ f
+(1 row)
+
GRANT pg_read_all_stats TO regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
has_table_privilege
@@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
t
(1 row)
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
+ has_table_privilege
+---------------------
+ t
+(1 row)
+
-- run query to ensure that functions within views can be executed
SET ROLE regress_readallstats;
SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 673c63b8d1b6..abfdc97abc5a 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name,
size,
allocated_size
FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_allocations_numa| SELECT name,
+ node_id,
+ size
+ FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, node_id, size);
pg_stat_activity| SELECT s.datid,
d.datname,
s.pid,
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 0a35f2f8f6a9..0f38caa0d240 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr
# The stats test resets stats, so nothing else needing stats access can be in
# this group.
# ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa
# event_trigger depends on create_am and cannot run concurrently with
# any test that runs DDL
diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql
new file mode 100644
index 000000000000..034098783fb4
--- /dev/null
+++ b/src/test/regress/sql/numa.sql
@@ -0,0 +1,9 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
index 85d7280f35fc..f337aa67c13f 100644
--- a/src/test/regress/sql/privileges.sql
+++ b/src/test/regress/sql/privileges.sql
@@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
DROP TABLE lock_table;
DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
-- switch to superuser
\c -
@@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
GRANT pg_read_all_stats TO regress_readallstats;
SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
-- run query to ensure that functions within views can be executed
SET ROLE regress_readallstats;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index b69b3b1520cb..a5fe6c4a0893 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -341,6 +341,8 @@ BufFile
Buffer
BufferAccessStrategy
BufferAccessStrategyType
+BufferCacheNumaRec
+BufferCacheNumaContext
BufferCachePagesContext
BufferCachePagesRec
BufferDesc