From d5a437177450ac674b56229fd7690b3c7cdfa3ff Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Wed, 2 Apr 2025 12:29:22 +0200 Subject: [PATCH 1/5] Add support for basic NUMA awareness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add basic NUMA awareness routines, using a minimal src/port/pg_numa.c portability wrapper and an optional build dependency, enabled by --with-libnuma configure option. For now this is Linux-only, other platforms may be supported later. A built-in SQL function pg_numa_available() allows checking NUMA support, i.e. that the server was built/linked with NUMA library. The libnuma library is not available on 32-bit builds (there's no shared object for i386), so we disable it in that case. The i386 is very memory limited anyway, even with PAE, so NUMA is mostly irrelevant. On Linux we use move_pages(2) syscall for speed instead of get_mempolicy(2). Author: Jakub Wartak Co-authored-by: Bertrand Drouvot Reviewed-by: Andres Freund Reviewed-by: Álvaro Herrera Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- .cirrus.tasks.yml | 2 + configure | 187 ++++++++++++++++++++++++++++ configure.ac | 14 +++ doc/src/sgml/func.sgml | 13 ++ doc/src/sgml/installation.sgml | 21 ++++ meson.build | 23 ++++ meson_options.txt | 3 + src/Makefile.global.in | 6 +- src/backend/utils/misc/guc_tables.c | 2 +- src/include/catalog/pg_proc.dat | 4 + src/include/pg_config.h.in | 3 + src/include/port/pg_numa.h | 40 ++++++ src/include/storage/pg_shmem.h | 1 + src/makefiles/meson.build | 3 + src/port/Makefile | 1 + src/port/meson.build | 1 + src/port/pg_numa.c | 120 ++++++++++++++++++ 17 files changed, 442 insertions(+), 2 deletions(-) create mode 100644 src/include/port/pg_numa.h create mode 100644 src/port/pg_numa.c diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml index 86a1fa9bbdba..6f4f5c674a1b 100644 --- a/.cirrus.tasks.yml +++ 
b/.cirrus.tasks.yml @@ -471,6 +471,7 @@ task: --enable-cassert --enable-injection-points --enable-debug \ --enable-tap-tests --enable-nls \ --with-segsize-blocks=6 \ + --with-libnuma \ --with-liburing \ \ ${LINUX_CONFIGURE_FEATURES} \ @@ -523,6 +524,7 @@ task: -Dllvm=disabled \ --pkg-config-path /usr/lib/i386-linux-gnu/pkgconfig/ \ -DPERL=perl5.36-i386-linux-gnu \ + -Dlibnuma=disabled \ build-32 EOF diff --git a/configure b/configure index 11615d1122de..e27badd83c34 100755 --- a/configure +++ b/configure @@ -708,6 +708,9 @@ XML2_LIBS XML2_CFLAGS XML2_CONFIG with_libxml +LIBNUMA_LIBS +LIBNUMA_CFLAGS +with_libnuma LIBCURL_LIBS LIBCURL_CFLAGS with_libcurl @@ -872,6 +875,7 @@ with_liburing with_uuid with_ossp_uuid with_libcurl +with_libnuma with_libxml with_libxslt with_system_tzdata @@ -906,6 +910,8 @@ LIBURING_CFLAGS LIBURING_LIBS LIBCURL_CFLAGS LIBCURL_LIBS +LIBNUMA_CFLAGS +LIBNUMA_LIBS XML2_CONFIG XML2_CFLAGS XML2_LIBS @@ -1588,6 +1594,7 @@ Optional Packages: --with-uuid=LIB build contrib/uuid-ossp using LIB (bsd,e2fs,ossp) --with-ossp-uuid obsolete spelling of --with-uuid=ossp --with-libcurl build with libcurl support + --with-libnuma build with libnuma support --with-libxml build with XML support --with-libxslt use XSLT support when building contrib/xml2 --with-system-tzdata=DIR @@ -1629,6 +1636,10 @@ Some influential environment variables: C compiler flags for LIBCURL, overriding pkg-config LIBCURL_LIBS linker flags for LIBCURL, overriding pkg-config + LIBNUMA_CFLAGS + C compiler flags for LIBNUMA, overriding pkg-config + LIBNUMA_LIBS + linker flags for LIBNUMA, overriding pkg-config XML2_CONFIG path to xml2-config utility XML2_CFLAGS C compiler flags for XML2, overriding pkg-config XML2_LIBS linker flags for XML2, overriding pkg-config @@ -9063,6 +9074,182 @@ $as_echo "$as_me: WARNING: *** OAuth support tests require --with-python to run" fi +# +# libnuma +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libnuma support" >&5 
+$as_echo_n "checking whether to build with libnuma support... " >&6; } + + + +# Check whether --with-libnuma was given. +if test "${with_libnuma+set}" = set; then : + withval=$with_libnuma; + case $withval in + yes) + +$as_echo "#define USE_LIBNUMA 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libnuma option" "$LINENO" 5 + ;; + esac + +else + with_libnuma=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libnuma" >&5 +$as_echo "$with_libnuma" >&6; } + + +if test "$with_libnuma" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for numa_available in -lnuma" >&5 +$as_echo_n "checking for numa_available in -lnuma... " >&6; } +if ${ac_cv_lib_numa_numa_available+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lnuma $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char numa_available (); +int +main () +{ +return numa_available (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_numa_numa_available=yes +else + ac_cv_lib_numa_numa_available=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_numa_numa_available" >&5 +$as_echo "$ac_cv_lib_numa_numa_available" >&6; } +if test "x$ac_cv_lib_numa_numa_available" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBNUMA 1 +_ACEOF + + LIBS="-lnuma $LIBS" + +else + as_fn_error $? "library 'libnuma' is required for NUMA support" "$LINENO" 5 +fi + + +pkg_failed=no +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for numa" >&5 +$as_echo_n "checking for numa... 
" >&6; } + +if test -n "$LIBNUMA_CFLAGS"; then + pkg_cv_LIBNUMA_CFLAGS="$LIBNUMA_CFLAGS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"numa\""; } >&5 + ($PKG_CONFIG --exists --print-errors "numa") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_LIBNUMA_CFLAGS=`$PKG_CONFIG --cflags "numa" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi +if test -n "$LIBNUMA_LIBS"; then + pkg_cv_LIBNUMA_LIBS="$LIBNUMA_LIBS" + elif test -n "$PKG_CONFIG"; then + if test -n "$PKG_CONFIG" && \ + { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"numa\""; } >&5 + ($PKG_CONFIG --exists --print-errors "numa") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then + pkg_cv_LIBNUMA_LIBS=`$PKG_CONFIG --libs "numa" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes +else + pkg_failed=yes +fi + else + pkg_failed=untried +fi + + + +if test $pkg_failed = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi + if test $_pkg_short_errors_supported = yes; then + LIBNUMA_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "numa" 2>&1` + else + LIBNUMA_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "numa" 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$LIBNUMA_PKG_ERRORS" >&5 + + as_fn_error $? "Package requirements (numa) were not met: + +$LIBNUMA_PKG_ERRORS + +Consider adjusting the PKG_CONFIG_PATH environment variable if you +installed software in a non-standard prefix. 
+ +Alternatively, you may set the environment variables LIBNUMA_CFLAGS +and LIBNUMA_LIBS to avoid the need to call pkg-config. +See the pkg-config man page for more details." "$LINENO" 5 +elif test $pkg_failed = untried; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it +is in your PATH or set the PKG_CONFIG environment variable to the full +path to pkg-config. + +Alternatively, you may set the environment variables LIBNUMA_CFLAGS +and LIBNUMA_LIBS to avoid the need to call pkg-config. +See the pkg-config man page for more details. + +To get pkg-config, see . +See \`config.log' for more details" "$LINENO" 5; } +else + LIBNUMA_CFLAGS=$pkg_cv_LIBNUMA_CFLAGS + LIBNUMA_LIBS=$pkg_cv_LIBNUMA_LIBS + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +fi +fi + # # XML # diff --git a/configure.ac b/configure.ac index debdf1650441..d365a486d3d8 100644 --- a/configure.ac +++ b/configure.ac @@ -1053,6 +1053,20 @@ if test "$with_libcurl" = yes ; then fi +# +# libnuma +# +AC_MSG_CHECKING([whether to build with libnuma support]) +PGAC_ARG_BOOL(with, libnuma, no, [build with libnuma support], + [AC_DEFINE([USE_LIBNUMA], 1, [Define to build with NUMA support. 
(--with-libnuma)])]) +AC_MSG_RESULT([$with_libnuma]) +AC_SUBST(with_libnuma) + +if test "$with_libnuma" = yes ; then + AC_CHECK_LIB(numa, numa_available, [], [AC_MSG_ERROR([library 'libnuma' is required for NUMA support])]) + PKG_CHECK_MODULES(LIBNUMA, numa) +fi + # # XML # diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 0224f93733dc..9ab070adffba 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -25143,6 +25143,19 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n); + + + + pg_numa_available + + pg_numa_available () + boolean + + + Returns true if the server has been compiled with NUMA support. + + + diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index cc28f0413307..8ebf0b03ec0c 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -1156,6 +1156,16 @@ build-postgresql: + + + + + Build with libnuma support for basic NUMA support. + Only supported on platforms for which the libnuma library is implemented. + + + + @@ -2645,6 +2655,17 @@ ninja install + + + + + Build with libnuma support for basic NUMA support. + Only supported on platforms for which the libnuma library is implemented. + The default for this option is auto. 
+ + + + diff --git a/meson.build b/meson.build index 454ed81f5ead..46e92daeb62b 100644 --- a/meson.build +++ b/meson.build @@ -943,6 +943,27 @@ else endif +############################################################### +# Library: libnuma +############################################################### + +libnumaopt = get_option('libnuma') +if not libnumaopt.disabled() + # via pkg-config + libnuma = dependency('numa', required: libnumaopt) + if not libnuma.found() + libnuma = cc.find_library('numa', required: libnumaopt) + endif + if not cc.has_header('numa.h', dependencies: libnuma, required: libnumaopt) + libnuma = not_found_dep + endif + if libnuma.found() + cdata.set('USE_LIBNUMA', 1) + endif +else + libnuma = not_found_dep +endif + ############################################################### # Library: liburing @@ -3243,6 +3264,7 @@ backend_both_deps += [ icu_i18n, ldap, libintl, + libnuma, liburing, libxml, lz4, @@ -3899,6 +3921,7 @@ if meson.version().version_compare('>=0.57') 'icu': icu, 'ldap': ldap, 'libcurl': libcurl, + 'libnuma': libnuma, 'liburing': liburing, 'libxml': libxml, 'libxslt': libxslt, diff --git a/meson_options.txt b/meson_options.txt index dd7126da3a73..06bf5627d3c0 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -106,6 +106,9 @@ option('libcurl', type : 'feature', value: 'auto', option('libedit_preferred', type: 'boolean', value: false, description: 'Prefer BSD Libedit over GNU Readline') +option('libnuma', type: 'feature', value: 'auto', + description: 'NUMA support') + option('liburing', type : 'feature', value: 'auto', description: 'io_uring support, for asynchronous I/O') diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 737b2dd18691..6722fbdf365f 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -196,6 +196,7 @@ with_gssapi = @with_gssapi@ with_krb_srvnam = @with_krb_srvnam@ with_ldap = @with_ldap@ with_libcurl = @with_libcurl@ +with_libnuma = @with_libnuma@ with_liburing = 
@with_liburing@ with_libxml = @with_libxml@ with_libxslt = @with_libxslt@ @@ -223,6 +224,9 @@ krb_srvtab = @krb_srvtab@ ICU_CFLAGS = @ICU_CFLAGS@ ICU_LIBS = @ICU_LIBS@ +LIBNUMA_CFLAGS = @LIBNUMA_CFLAGS@ +LIBNUMA_LIBS = @LIBNUMA_LIBS@ + LIBURING_CFLAGS = @LIBURING_CFLAGS@ LIBURING_LIBS = @LIBURING_LIBS@ @@ -250,7 +254,7 @@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ PG_SYSROOT = @PG_SYSROOT@ -override CPPFLAGS := $(ICU_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS) +override CPPFLAGS := $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS) ifdef PGXS override CPPFLAGS := -I$(includedir_server) -I$(includedir_internal) $(CPPFLAGS) diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 4eaeca89f2c7..ea8d796e7c45 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -566,7 +566,7 @@ static int ssl_renegotiation_limit; */ int huge_pages = HUGE_PAGES_TRY; int huge_page_size; -static int huge_pages_status = HUGE_PAGES_UNKNOWN; +int huge_pages_status = HUGE_PAGES_UNKNOWN; /* * These variables are all dummies that don't do anything, except in some diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 5d5be8ba4e16..dfc59ea0cc8c 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8542,6 +8542,10 @@ proargnames => '{name,off,size,allocated_size}', prosrc => 'pg_get_shmem_allocations' }, +{ oid => '9685', descr => 'Is NUMA compilation available?', + proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool', + proargtypes => '', prosrc => 'pg_numa_available' }, + # memory context of local backend { oid => '2282', descr => 'information about all memory contexts of local backend', diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c2f1241b2342..b3166ec8f428 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -686,6 +686,9 @@ /* Define to 1 to build with libcurl support. 
(--with-libcurl) */ #undef USE_LIBCURL +/* Define to 1 to build with NUMA support. (--with-libnuma) */ +#undef USE_LIBNUMA + /* Define to build with io_uring support. (--with-liburing) */ #undef USE_LIBURING diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h new file mode 100644 index 000000000000..3c1b50c14286 --- /dev/null +++ b/src/include/port/pg_numa.h @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * pg_numa.h + * Basic NUMA portability routines + * + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/port/pg_numa.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_NUMA_H +#define PG_NUMA_H + +#include "fmgr.h" + +extern PGDLLIMPORT int pg_numa_init(void); +extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status); +extern PGDLLIMPORT int pg_numa_get_max_node(void); +extern PGDLLIMPORT Size pg_numa_get_pagesize(void); + +#ifdef USE_LIBNUMA + +/* + * This is required on Linux, before pg_numa_query_pages() as we + * need to page-fault before move_pages(2) syscall returns valid results. 
+ */ +#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ + ro_volatile_var = *(uint64 *) ptr + +#else + +#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ + do {} while(0) + +#endif + +#endif /* PG_NUMA_H */ diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index b99ebc9e86f5..5f7d4b83a60e 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -45,6 +45,7 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; +extern PGDLLIMPORT int huge_pages_status; /* Possible values for huge_pages and huge_pages_status */ typedef enum diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 46d8da070e82..55da678ec278 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -200,6 +200,8 @@ pgxs_empty = [ 'ICU_LIBS', + 'LIBNUMA_CFLAGS', 'LIBNUMA_LIBS', + 'LIBURING_CFLAGS', 'LIBURING_LIBS', ] @@ -232,6 +234,7 @@ pgxs_deps = { 'icu': icu, 'ldap': ldap, 'libcurl': libcurl, + 'libnuma': libnuma, 'liburing': liburing, 'libxml': libxml, 'libxslt': libxslt, diff --git a/src/port/Makefile b/src/port/Makefile index f11896440d56..4274949dfa4c 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -45,6 +45,7 @@ OBJS = \ path.o \ pg_bitutils.o \ pg_localeconv_r.o \ + pg_numa.o \ pg_popcount_aarch64.o \ pg_popcount_avx512.o \ pg_strong_random.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 51041e756099..228888b2f663 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -8,6 +8,7 @@ pgport_sources = [ 'path.c', 'pg_bitutils.c', 'pg_localeconv_r.c', + 'pg_numa.c', 'pg_popcount_aarch64.c', 'pg_popcount_avx512.c', 'pg_strong_random.c', diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c new file mode 100644 index 000000000000..5e2523cf798c --- /dev/null +++ b/src/port/pg_numa.c @@ -0,0 +1,120 @@ 
+/*------------------------------------------------------------------------- + * + * pg_numa.c + * Basic NUMA portability routines + * + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/port/pg_numa.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include + +#ifdef WIN32 +#include +#endif + +#include "fmgr.h" +#include "miscadmin.h" +#include "port/pg_numa.h" +#include "storage/pg_shmem.h" + +/* + * At this point we provide support only for Linux thanks to libnuma, but in + * future support for other platforms e.g. Win32 or FreeBSD might be possible + * too. For Win32 NUMA APIs see + * https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + */ +#ifdef USE_LIBNUMA + +#include +#include + +Datum pg_numa_available(PG_FUNCTION_ARGS); + +/* libnuma requires initialization as per numa(3) on Linux */ +int +pg_numa_init(void) +{ + int r = numa_available(); + + return r; +} + +/* + * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the + * first one allows us to batch and query about many memory pages in one single + * giant system call that is way faster. 
+ */ +int +pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) +{ + return numa_move_pages(pid, count, pages, NULL, status, 0); +} + +int +pg_numa_get_max_node(void) +{ + return numa_max_node(); +} + +#else + +Datum pg_numa_available(PG_FUNCTION_ARGS); + +/* Empty wrappers */ +int +pg_numa_init(void) +{ + /* We state that NUMA is not available */ + return -1; +} + +int +pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) +{ + return 0; +} + +int +pg_numa_get_max_node(void) +{ + return 0; +} + +#endif + +Datum +pg_numa_available(PG_FUNCTION_ARGS) +{ + PG_RETURN_BOOL(pg_numa_init() != -1); +} + +/* This should be used only after the server is started */ +Size +pg_numa_get_pagesize(void) +{ + Size os_page_size; +#ifdef WIN32 + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + os_page_size = sysinfo.dwPageSize; +#else + os_page_size = sysconf(_SC_PAGESIZE); +#endif + + Assert(IsUnderPostmaster); + Assert(huge_pages_status != HUGE_PAGES_UNKNOWN); + + if (huge_pages_status == HUGE_PAGES_ON) + GetHugePageSize(&os_page_size, NULL); + + return os_page_size; +} From d38ba4f0453065737a8fa021f0bfc36a00701d8a Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 3 Apr 2025 20:21:25 +0200 Subject: [PATCH 2/5] Add pg_buffercache_numa view with NUMA node info Introduces a new view pg_buffercache_numa, showing a NUMA memory node for each individual buffer. To determine the NUMA node for a buffer, we first need to touch the memory pages using pg_numa_touch_mem_if_required, otherwise we might get status -2 (ENOENT = The page is not present), indicating the page is either unmapped or unallocated. The size of a database block and OS memory page may differ. For example the default block size (BLCKSZ) is 8KB, while the memory page is 4KB, but it's also possible to make the block size smaller (e.g. 1KB). 
Author: Jakub Wartak Reviewed-by: Andres Freund Reviewed-by: Bertrand Drouvot Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- contrib/pg_buffercache/Makefile | 5 +- .../expected/pg_buffercache_numa.out | 28 ++ .../expected/pg_buffercache_numa_1.out | 3 + contrib/pg_buffercache/meson.build | 2 + .../pg_buffercache--1.5--1.6.sql | 22 ++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 276 ++++++++++++++++++ .../sql/pg_buffercache_numa.sql | 20 ++ doc/src/sgml/pgbuffercache.sgml | 75 ++++- 9 files changed, 429 insertions(+), 4 deletions(-) create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa.out create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa_1.out create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_numa.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index eae65ead9e50..5f748543e2ea 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -8,10 +8,11 @@ OBJS = \ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ - pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql + pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ + pg_buffercache--1.5--1.6.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" -REGRESS = pg_buffercache +REGRESS = pg_buffercache pg_buffercache_numa ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa.out b/contrib/pg_buffercache/expected/pg_buffercache_numa.out new file mode 100644 index 000000000000..d4de5ea52fc2 --- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa.out @@ -0,0 +1,28 @@ +SELECT NOT(pg_numa_available()) AS 
skip_test \gset +\if :skip_test +\quit +\endif +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_numa; +ERROR: permission denied for view pg_buffercache_numa +RESET role; +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +RESET role; diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out new file mode 100644 index 000000000000..6dd6824b4e4f --- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out @@ -0,0 +1,3 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 12d1fe487172..7cd039a1df9c 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -23,6 +23,7 @@ install_data( 'pg_buffercache--1.2.sql', 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', + 'pg_buffercache--1.5--1.6.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) @@ -34,6 +35,7 @@ tests += { 'regress': { 'sql': [ 'pg_buffercache', + 'pg_buffercache_numa', ], }, } diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql new file mode 100644 index 000000000000..1230e244a5f1 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -0,0 +1,22 @@ +/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION 
pg_buffercache UPDATE TO '1.6'" to load this file. \quit + +-- Register the new functions. +CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE OR REPLACE VIEW pg_buffercache_numa AS + SELECT P.* FROM pg_buffercache_numa_pages() AS P + (bufferid integer, page_num int4, node_id int4); + +-- Don't want these to be available to public. +REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC; +REVOKE ALL ON pg_buffercache_numa FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor; +GRANT SELECT ON pg_buffercache_numa TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 5ee875f77dd9..b030ba3a6fab 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.5' +default_version = '1.6' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 62602af1775f..a0e4cd69aeed 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -11,6 +11,7 @@ #include "access/htup_details.h" #include "catalog/pg_type.h" #include "funcapi.h" +#include "port/pg_numa.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" @@ -20,6 +21,8 @@ #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 +#define NUM_BUFFERCACHE_NUMA_ELEM 3 + PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", .version = PG_VERSION @@ -59,15 +62,41 @@ typedef struct } BufferCachePagesContext; +typedef struct +{ + uint32 bufferid; + int32 numa_page; + int32 numa_node; +} BufferCacheNumaRec; + +/* + * 
Function context for data persisting over repeated calls. + */ +typedef struct +{ + TupleDesc tupdesc; + int buffers_per_page; + int pages_per_buffer; + int os_page_size; + BufferCacheNumaRec *record; +} BufferCacheNumaContext; + + /* * Function returning data from the shared buffer cache - buffer number, * relation node/tablespace/database/blocknum and dirty indicator. */ PG_FUNCTION_INFO_V1(pg_buffercache_pages); +PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages); PG_FUNCTION_INFO_V1(pg_buffercache_summary); PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); + +/* Only need to touch memory once per backend process lifetime */ +static bool firstNumaTouch = true; + + Datum pg_buffercache_pages(PG_FUNCTION_ARGS) { @@ -246,6 +275,253 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* + * Inquire about NUMA memory mappings, especially the NUMA node. + * + * In order to get reliable results we also need to touch memory pages, so + * that the inquiry about NUMA memory node doesn't return -2 (which indicates + * unmapped/unallocated pages). + */ +Datum +pg_buffercache_numa_pages(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + MemoryContext oldcontext; + BufferCacheNumaContext *fctx; /* User function context. */ + TupleDesc tupledesc; + TupleDesc expected_tupledesc; + HeapTuple tuple; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + int i, + j, + idx; + Size os_page_size = 0; + void **os_page_ptrs = NULL; + int *os_page_status; + uint64 os_page_count; + int pages_per_buffer; + int buffers_per_page; + volatile uint64 touch pg_attribute_unused(); + + if (pg_numa_init() == -1) + elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); + + /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, + * while the OS may have different memory page sizes. + * + * To correctly map between them, we need to: 1. Determine the OS + * memory page size 2. 
Calculate how many OS pages are used by all + * buffer blocks 3. Calculate how many OS pages are contained within + * each database block. + * + * This information is needed before calling move_pages() for NUMA + * node id inquiry. + */ + os_page_size = pg_numa_get_pagesize(); + buffers_per_page = os_page_size / BLCKSZ; + pages_per_buffer = BLCKSZ / os_page_size; + + /* + * The page size and block size are expected to be 2^k, so one divides + * the other (we don't know in which direction). + */ + Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0)); + + /* + * Either both counts are 1 (when the pages have the same size), or + * exactly one of them is zero. Both can't be zero at the same time. + */ + Assert((buffers_per_page > 0) || (pages_per_buffer > 0)); + Assert(((buffers_per_page == 1) && (pages_per_buffer == 1)) || + ((buffers_per_page == 0) || (pages_per_buffer == 0))); + + /* + * How many addresses we are going to query (store) depends on the + * relation between BLCKSZ : PAGESIZE. We need at least one status per + * buffer - if the memory page is larger than buffer, we still query + * it for each buffer. With multiple memory pages per buffer, we need + * that many entries. 
+ */ + os_page_count = NBuffers * Max(1, pages_per_buffer); + + elog(DEBUG1, "NUMA: NBuffers=%d os_page_query_count=" UINT64_FORMAT " os_page_size=%zu buffers_per_page=%d pages_per_buffer=%d", + NBuffers, os_page_count, os_page_size, buffers_per_page, pages_per_buffer); + + + /* initialize the multi-call context */ + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* Create a user function context for cross-call persistence */ + fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext)); + + /* + * Look up the row type the caller expects and check that it has the + * number of columns this function produces. We unfortunately have to + * get the result type for that... - we can't use the result type + * determined by the function definition without potentially crashing + * when somebody uses the old (or even wrong) function definition + * though. + */ + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. */ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "page_num", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "node_id", + INT4OID, -1, 0); + + fctx->tupdesc = BlessTupleDesc(tupledesc); + + /* Allocate os_page_count worth of BufferCacheNumaRec records. */ + fctx->record = (BufferCacheNumaRec *) + MemoryContextAllocHuge(CurrentMemoryContext, + sizeof(BufferCacheNumaRec) * os_page_count); + + /* Set max calls and remember the user function context. NOTE(review): fctx->record holds os_page_count entries but max_calls is set to NBuffers, so with pages_per_buffer > 1 the entries past NBuffers are never returned - confirm whether os_page_count was intended. 
*/ + funcctx->max_calls = NBuffers; + funcctx->user_fctx = fctx; + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + + /* determine the NUMA node for OS pages; NOTE(review): os_page_status is an int array, so sizeof(uint64) below over-allocates (the memset uses sizeof(int)) - confirm */ + os_page_ptrs = palloc0(sizeof(void *) * os_page_count); + os_page_status = palloc(sizeof(uint64) * os_page_count); + + /* + * If we ever get -1 (all bytes 0xff) back from kernel inquiry, then we + * probably have a bug in our buffers to OS page mapping code here. + */ + memset(os_page_status, 0xff, sizeof(int) * os_page_count); + + if (firstNumaTouch) + elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); + + /* + * Scan through all the buffers, saving the relevant fields in the + * fctx->record structure. + * + * We don't hold the partition locks, so we don't get a consistent + * snapshot across all buffers, but we do grab the buffer header + * locks, so the information of each buffer is self-consistent. + * + * This loop touches and stores addresses into os_page_ptrs[] as input + * to one big move_pages(2) inquiry system call. Basically we ask + * for all memory pages for NBuffers. + */ + idx = 0; + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr; + uint32 buf_state; + uint32 bufferid; + + CHECK_FOR_INTERRUPTS(); + + bufHdr = GetBufferDescriptor(i); + + /* Lock each buffer header before inspecting. */ + buf_state = LockBufHdr(bufHdr); + bufferid = BufferDescriptorGetBuffer(bufHdr); + + UnlockBufHdr(bufHdr, buf_state); + + /* + * If we have multiple OS pages per buffer, fill those in too. We + * always want at least one OS page, even if there are multiple + * buffers per page. + * + * Although we could query just once per each OS page, we do it + * repeatedly for each Buffer and hit the same address as + * move_pages(2) requires page alignment. This also simplifies + * retrieval code later on. Also buffer numbers start from 1, hence BufferGetBlock(i + 1). NOTE(review): TYPEALIGN presumably rounds up, so an unaligned buffptr would map to the following OS page - confirm TYPEALIGN_DOWN is not what is wanted. 
+ */ + for (j = 0; j < Max(1, pages_per_buffer); j++) + { + char *buffptr = (char *) BufferGetBlock(i + 1); + + fctx->record[idx].bufferid = bufferid; + fctx->record[idx].numa_page = j; + + os_page_ptrs[idx] + = (char *) TYPEALIGN(os_page_size, + buffptr + (os_page_size * j)); + + /* Only need to touch memory once per backend process lifetime */ + if (firstNumaTouch) + pg_numa_touch_mem_if_required(touch, os_page_ptrs[idx]); + + ++idx; + } + + } + + /* we should get exactly the expected number of entries */ + Assert(idx == os_page_count); + + /* query NUMA status for all the pointers */ + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); + + /* + * Update the entries with NUMA node ID. The status array is indexed + * the same way as the entry index. + */ + for (i = 0; i < os_page_count; i++) + { + fctx->record[i].numa_node = os_page_status[i]; + } + + /* remember this backend touched the pages */ + firstNumaTouch = false; + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + Datum values[NUM_BUFFERCACHE_NUMA_ELEM]; + bool nulls[NUM_BUFFERCACHE_NUMA_ELEM]; + + values[0] = Int32GetDatum(fctx->record[i].bufferid); + nulls[0] = false; + + values[1] = Int32GetDatum(fctx->record[i].numa_page); + nulls[1] = false; + + values[2] = Int32GetDatum(fctx->record[i].numa_node); + nulls[2] = false; + + /* Build and return the tuple. 
*/ + tuple = heap_form_tuple(fctx->tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} + Datum pg_buffercache_summary(PG_FUNCTION_ARGS) { diff --git a/contrib/pg_buffercache/sql/pg_buffercache_numa.sql b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql new file mode 100644 index 000000000000..2225b879f58b --- /dev/null +++ b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql @@ -0,0 +1,20 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif + +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + +-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET role; + +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET role; diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 802a5112d77d..3d9032efafb5 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -30,7 +30,9 @@ This module provides the pg_buffercache_pages() function (wrapped in the pg_buffercache view), - the pg_buffercache_summary() function, the + pg_buffercache_numa_pages() function (wrapped in the + pg_buffercache_numa view), the + pg_buffercache_summary() function, the pg_buffercache_usage_counts() function and the pg_buffercache_evict() function. @@ -42,6 +44,15 @@ convenient use. + + The pg_buffercache_numa_pages() provides + NUMA node mappings for shared buffer entries. This + information is not part of pg_buffercache_pages() + itself, as it is much slower to retrieve. + The pg_buffercache_numa view wraps the function for + convenient use. 
+ + The pg_buffercache_summary() function returns a single row summarizing the state of the shared buffer cache. @@ -200,6 +211,68 @@ + + The <structname>pg_buffercache_numa</structname> View + + + The definitions of the columns exposed by the view are shown in . + + + + <structname>pg_buffercache_numa</structname> Columns + + + + + Column Type + + + Description + + + + + + + + bufferid integer + + + ID, in the range 1..shared_buffers + + + + + + page_num int + + + number of OS memory page for this buffer + + + + + + node_id int + + + ID of NUMA node + + + + + +
+ + + As NUMA node ID inquiry for each page requires memory pages + to be paged-in, the first execution of this function can take a noticeable + amount of time. In all the cases (first execution or not), retrieving this + information is costly and querying the view at a high frequency is not recommended. + + +
+ The <function>pg_buffercache_summary()</function> Function From be175a9fa3d431a69b61a530846f06d526e7e3fe Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Fri, 4 Apr 2025 21:10:11 +0200 Subject: [PATCH 3/5] review --- contrib/pg_buffercache/pg_buffercache_pages.c | 28 +++++++++++++------ doc/src/sgml/pgbuffercache.sgml | 2 +- src/tools/pgindent/typedefs.list | 2 ++ 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index a0e4cd69aeed..65ade9d81354 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -61,13 +61,15 @@ typedef struct BufferCachePagesRec *record; } BufferCachePagesContext; - +/* + * Record structure holding the to be exposed cache data. + */ typedef struct { uint32 bufferid; int32 numa_page; int32 numa_node; -} BufferCacheNumaRec; +} BufferCacheNumaRec; /* * Function context for data persisting over repeated calls. @@ -79,7 +81,7 @@ typedef struct int pages_per_buffer; int os_page_size; BufferCacheNumaRec *record; -} BufferCacheNumaContext; +} BufferCacheNumaContext; /* @@ -276,7 +278,15 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } /* - * Inquire about NUMA memory mappings, especially the NUMA node. + * Inquire about NUMA memory mappings for shared buffers. + * + * Returns NUMA node ID for each memory page used by the buffer. Buffers may + * be smaller or larger than OS memory pages. For each buffer we return one + * entry for each memory page used by the buffer (if the buffer is smaller, + * it only uses a part of one memory page). + * + * We expect both sizes (for buffers and memory pages) to be a power-of-2, so + * one is always a multiple of the other. 
* * In order to get reliable results we also need to touch memory pages, so * that the inquiry about NUMA memory node doesn't return -2 (which indicates @@ -348,11 +358,13 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) */ os_page_count = NBuffers * Max(1, pages_per_buffer); - elog(DEBUG1, "NUMA: NBuffers=%d os_page_query_count=" UINT64_FORMAT " os_page_size=%zu buffers_per_page=%d pages_per_buffer=%d", - NBuffers, os_page_count, os_page_size, buffers_per_page, pages_per_buffer); + elog(DEBUG1, "NUMA: NBuffers=%d os_page_query_count=" UINT64_FORMAT " " + "os_page_size=%zu buffers_per_page=%d pages_per_buffer=%d", + NBuffers, os_page_count, os_page_size, + buffers_per_page, pages_per_buffer); - /* initialize the multi-call context */ + /* initialize the multi-call context, load entries about buffers */ funcctx = SRF_FIRSTCALL_INIT(); @@ -400,7 +412,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) MemoryContextSwitchTo(oldcontext); - /* determine the NUMA node for OS pages */ + /* used to determine the NUMA node for all OS pages at once */ os_page_ptrs = palloc0(sizeof(void *) * os_page_count); os_page_status = palloc(sizeof(uint64) * os_page_count); diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 3d9032efafb5..b01f8e713576 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -256,7 +256,7 @@ node_id int - ID of NUMA node + ID of NUMA node
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b69b3b1520cb..a5fe6c4a0893 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -341,6 +341,8 @@ BufFile Buffer BufferAccessStrategy BufferAccessStrategyType +BufferCacheNumaRec +BufferCacheNumaContext BufferCachePagesContext BufferCachePagesRec BufferDesc From 4df22a2e08a3b39c92c18ecc1df07476a44bb318 Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Fri, 21 Feb 2025 14:20:18 +0100 Subject: [PATCH 4/5] Introduce pg_shmem_allocations_numa view Introduce new pg_shmem_allocations_numa view with information about how shared memory is distributed across NUMA nodes. Author: Jakub Wartak Reviewed-by: Andres Freund Reviewed-by: Bertrand Drouvot Reviewed-by: Tomas Vondra Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- doc/src/sgml/system-views.sgml | 79 ++++++++++++++ src/backend/catalog/system_views.sql | 8 ++ src/backend/storage/ipc/shmem.c | 132 +++++++++++++++++++++++ src/include/catalog/pg_proc.dat | 8 ++ src/test/regress/expected/numa.out | 12 +++ src/test/regress/expected/numa_1.out | 3 + src/test/regress/expected/privileges.out | 16 ++- src/test/regress/expected/rules.out | 4 + src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/numa.sql | 9 ++ src/test/regress/sql/privileges.sql | 6 +- 11 files changed, 274 insertions(+), 5 deletions(-) create mode 100644 src/test/regress/expected/numa.out create mode 100644 src/test/regress/expected/numa_1.out create mode 100644 src/test/regress/sql/numa.sql diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 4f336ee0adfa..a83365ae24ae 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -181,6 +181,11 @@ shared memory allocations + + pg_shmem_allocations_numa + NUMA node mappings for shared memory allocations + + pg_stats planner statistics @@ -4051,6 +4056,80 @@ SELECT * FROM 
pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + <structname>pg_shmem_allocations_numa</structname> + + + pg_shmem_allocations_numa + + + + The pg_shmem_allocations_numa view shows how shared + memory allocations in the server's main shared memory segment are distributed + across NUMA nodes. This includes both memory allocated by + PostgreSQL itself and memory allocated + by extensions using the mechanisms detailed in + . + + + + Note that this view does not include memory allocated using the dynamic + shared memory infrastructure. + + + + <structname>pg_shmem_allocations_numa</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + The name of the shared memory allocation. + + + + + + node_id int4 + + + ID of NUMA node + + + + + + size int8 + + + Size of the allocation on this particular NUMA memory node in bytes + + + + + +
+ + + By default, the pg_shmem_allocations_numa view can be + read only by superusers or roles with privileges of the + pg_read_all_stats role. + +
+ <structname>pg_stats</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 273008db37fc..08f780a2e638 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats; +CREATE VIEW pg_shmem_allocations_numa AS + SELECT * FROM pg_get_shmem_allocations_numa(); + +REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC; +GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats; + CREATE VIEW pg_backend_memory_contexts AS SELECT * FROM pg_get_backend_memory_contexts(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 895a43fb39e5..852f2c7c4535 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -68,6 +68,7 @@ #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" +#include "port/pg_numa.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" #include "storage/shmem.h" @@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ +/* To get reliable results for NUMA inquiry we need to "touch pages" once */ +static bool firstNumaTouch = true; /* * InitShmemAccess() --- set up basic pointers to shared memory. 
@@ -568,3 +571,132 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) return (Datum) 0; } + +/* SQL SRF showing NUMA memory nodes for allocated shared memory */ +Datum +pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_NUMA_SIZES_COLS 3 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + HASH_SEQ_STATUS hstat; + ShmemIndexEnt *ent; + Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS]; + bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS]; + Size os_page_size; + void **page_ptrs; + int *pages_status; + uint64 shm_total_page_count, + shm_ent_page_count, + max_nodes; + Size *nodes; + + InitMaterializedSRF(fcinfo, 0); + + if (pg_numa_init() == -1) + { + elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); + return (Datum) 0; + } + max_nodes = pg_numa_get_max_node(); + nodes = palloc(sizeof(Size) * (max_nodes + 1)); + + /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while + * the OS may have different memory page sizes. + * + * To correctly map between them, we need to: 1. Determine the OS memory + * page size 2. Calculate how many OS pages are used by all buffer blocks + * 3. Calculate how many OS pages are contained within each database + * block. + * + * This information is needed before calling move_pages() for NUMA memory + * node inquiry. + */ + os_page_size = pg_numa_get_pagesize(); + + /* + * Allocate memory for page pointers and status based on total shared + * memory size. This simplified approach allocates enough space for all + * pages in shared memory rather than calculating the exact requirements + * for each segment. 
+ */ + shm_total_page_count = ShmemSegHdr->totalsize / os_page_size; + page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); + pages_status = palloc(sizeof(int) * shm_total_page_count); + + if (firstNumaTouch) + elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts"); + + LWLockAcquire(ShmemIndexLock, LW_SHARED); + + hash_seq_init(&hstat, ShmemIndex); + + /* output all allocated entries */ + memset(nulls, 0, sizeof(nulls)); + while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) + { + int i; + + /* Get number of OS aliged pages */ + shm_ent_page_count = TYPEALIGN(os_page_size, ent->allocated_size) / os_page_size; + + /* + * If we get ever 0xff back from kernel inquiry, then we probably have + * bug in our buffers to OS page mapping code here. + */ + memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count); + + for (i = 0; i < shm_ent_page_count; i++) + { + /* + * In order to get reliable results we also need to touch memory + * pages, so that inquiry about NUMA memory node doesn't return -2 + * (which indicates unmapped/unallocated pages). 
+ */ + volatile uint64 touch pg_attribute_unused(); + + page_ptrs[i] = (char *) ent->location + (i * os_page_size); + if (firstNumaTouch) + pg_numa_touch_mem_if_required(touch, page_ptrs[i]); + + CHECK_FOR_INTERRUPTS(); + } + + if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1) + elog(ERROR, "failed NUMA pages inquiry status: %m"); + + memset(nodes, 0, sizeof(Size) * (max_nodes + 1)); + /* Count number of NUMA nodes used for this shared memory entry */ + for (i = 0; i < shm_ent_page_count; i++) + { + int s = pages_status[i]; + + /* Ensure we are adding only valid index to the array */ + if (s >= 0 && s <= max_nodes) + nodes[s]++; + else + elog(ERROR, "invalid NUMA node id outside of allowed range [0, " UINT64_FORMAT "]: %d", max_nodes, s); + } + + for (i = 0; i <= max_nodes; i++) + { + values[0] = CStringGetTextDatum(ent->key); + values[1] = i; + values[2] = Int64GetDatum(nodes[i] * os_page_size); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + } + + /* + * We are ignoring the following memory regions (as compared to + * pg_get_shmem_allocations()): 1. output shared memory allocated but not + * counted via the shmem index 2. output as-of-yet unused shared memory. 
+ */ + + LWLockRelease(ShmemIndexLock); + firstNumaTouch = false; + + return (Datum) 0; +} diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index dfc59ea0cc8c..a93075c675cb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8546,6 +8546,14 @@ proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool', proargtypes => '', prosrc => 'pg_numa_available' }, +# shared memory usage with NUMA info +{ oid => '9686', descr => 'NUMA mappings for the main shared memory segment', + proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}', + proargnames => '{name,node_id,size}', + prosrc => 'pg_get_shmem_allocations_numa' }, + # memory context of local backend { oid => '2282', descr => 'information about all memory contexts of local backend', diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out new file mode 100644 index 000000000000..668172f7d79a --- /dev/null +++ b/src/test/regress/expected/numa.out @@ -0,0 +1,12 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif +-- switch to superuser +\c - +SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; + ok +---- + t +(1 row) + diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out new file mode 100644 index 000000000000..6dd6824b4e4f --- /dev/null +++ b/src/test/regress/expected/numa_1.out @@ -0,0 +1,3 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 1fddb13b6aef..c25062c288f3 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; -- clean up 
DROP TABLE lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_allocations_numa and pg_backend_memory_contexts. -- switch to superuser \c - CREATE ROLE regress_readallstats; @@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT f (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes has_table_privilege @@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT t (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; SELECT COUNT(*) >= 0 AS ok FROM pg_aios; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 673c63b8d1b6..abfdc97abc5a 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name, size, allocated_size FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size); +pg_shmem_allocations_numa| SELECT name, + node_id, + size + FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, node_id, size); pg_stat_activity| SELECT s.datid, d.datname, s.pid, diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 0a35f2f8f6a9..0f38caa0d240 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -119,7 +119,7 @@ test: 
plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr # The stats test resets stats, so nothing else needing stats access can be in # this group. # ---------- -test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate +test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql new file mode 100644 index 000000000000..034098783fb4 --- /dev/null +++ b/src/test/regress/sql/numa.sql @@ -0,0 +1,9 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif + +-- switch to superuser +\c - + +SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 85d7280f35fc..f337aa67c13f 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; DROP TABLE lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_allocations_numa and pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes +SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; From 826d4926b83eaf304b34ede5cfafe43d85bf3263 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Fri, 4 Apr 2025 20:45:21 +0200 Subject: [PATCH 5/5] review --- src/backend/storage/ipc/shmem.c | 54 ++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 852f2c7c4535..5d979423bd95 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -590,13 +590,11 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) max_nodes; Size *nodes; - InitMaterializedSRF(fcinfo, 0); - if (pg_numa_init() == -1) - { elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); - return (Datum) 0; - } + + InitMaterializedSRF(fcinfo, 0); + max_nodes = pg_numa_get_max_node(); nodes = palloc(sizeof(Size) * (max_nodes + 1)); @@ -619,6 +617,9 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) * memory size. 
This simplified approach allocates enough space for all * pages in shared memory rather than calculating the exact requirements * for each segment. + * + * XXX Isn't this wasteful? But there probably is one large segment of + * shared memory, much larger than the rest anyway. */ shm_total_page_count = ShmemSegHdr->totalsize / os_page_size; page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); @@ -637,8 +638,12 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) { int i; - /* Get number of OS aliged pages */ - shm_ent_page_count = TYPEALIGN(os_page_size, ent->allocated_size) / os_page_size; + /* XXX I assume we use TYPEALIGN as a way to round to whole pages. + * It's a bit misleading to call that "aligned", no? */ + + /* Get number of OS aligned pages */ + shm_ent_page_count + = TYPEALIGN(os_page_size, ent->allocated_size) / os_page_size; /* * If we get ever 0xff back from kernel inquiry, then we probably have @@ -646,16 +651,20 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) */ memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count); + /* + * Setup page_ptrs[] with pointers to all OS pages for this segment, + * and get the NUMA status using pg_numa_query_pages. + * + * In order to get reliable results we also need to touch memory + * pages, so that inquiry about NUMA memory node doesn't return -2 + * (ENOENT, which indicates unmapped/unallocated pages). + */ for (i = 0; i < shm_ent_page_count; i++) { - /* - * In order to get reliable results we also need to touch memory - * pages, so that inquiry about NUMA memory node doesn't return -2 - * (which indicates unmapped/unallocated pages). 
- */ volatile uint64 touch pg_attribute_unused(); page_ptrs[i] = (char *) ent->location + (i * os_page_size); + if (firstNumaTouch) pg_numa_touch_mem_if_required(touch, page_ptrs[i]); @@ -665,19 +674,27 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1) elog(ERROR, "failed NUMA pages inquiry status: %m"); - memset(nodes, 0, sizeof(Size) * (max_nodes + 1)); /* Count number of NUMA nodes used for this shared memory entry */ + memset(nodes, 0, sizeof(Size) * (max_nodes + 1)); + for (i = 0; i < shm_ent_page_count; i++) { int s = pages_status[i]; /* Ensure we are adding only valid index to the array */ - if (s >= 0 && s <= max_nodes) - nodes[s]++; - else - elog(ERROR, "invalid NUMA node id outside of allowed range [0, " UINT64_FORMAT "]: %d", max_nodes, s); + if (s < 0 || s > max_nodes) + { + elog(ERROR, "invalid NUMA node id outside of allowed range " + "[0, " UINT64_FORMAT "]: %d", max_nodes, s); + } + + nodes[s]++; } + /* + * Add one entry for each NUMA node, including those without allocated + * memory for this segment. + */ for (i = 0; i <= max_nodes; i++) { values[0] = CStringGetTextDatum(ent->key); @@ -693,6 +710,9 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) * We are ignoring the following memory regions (as compared to * pg_get_shmem_allocations()): 1. output shared memory allocated but not * counted via the shmem index 2. output as-of-yet unused shared memory. + * + * XXX Not quite sure why this is at the end, and what "output memory" + * refers to. */ LWLockRelease(ShmemIndexLock);