From e2dee063a0d230dd4765e41e4086faa051dfaaa7 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 21 Mar 2025 15:06:35 -0400 Subject: [PATCH 01/16] docs: Add acronym and glossary entries for I/O and AIO These are fairly basic, but better than nothing. While there are several opportunities to link to these entries, this patch does not add any. They will however be referenced by future patches. Reviewed-by: Noah Misch Discussion: https://postgr.es/m/20250326183102.92.nmisch@google.com --- doc/src/sgml/acronyms.sgml | 18 ++++++++++++++++++ doc/src/sgml/glossary.sgml | 39 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/doc/src/sgml/acronyms.sgml b/doc/src/sgml/acronyms.sgml index 58d0d90fece0..2f906e9f018d 100644 --- a/doc/src/sgml/acronyms.sgml +++ b/doc/src/sgml/acronyms.sgml @@ -9,6 +9,15 @@ + + AIO + + + Asynchronous I/O + + + + ACL @@ -354,6 +363,15 @@ + + I/O + + + Input/Output + + + + ISO diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index c0f812e3f5e8..b88cac598e90 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -81,6 +81,31 @@ + + Asynchronous I/O + AIO + + Asynchronous I/O + + + + Asynchronous I/O (AIO) describes + performing I/O in a non-blocking way (asynchronously), + in contrast to synchronous I/O, which blocks for the + entire duration of the I/O. + + + With AIO, starting an I/O operation + is separated from waiting for the result of the operation, allowing + multiple I/O operations to be initiated concurrently, + as well as performing CPU heavy operations + concurrently with I/O. The price for that increased + concurrency is increased complexity. + + + + + Atomic @@ -938,6 +963,20 @@ + + Input/Output + I/O + + + Input/Output (I/O) describes the communication between + a program and peripheral devices. In the context of database systems, + I/O commonly, but not exclusively, refers to + interaction with storage devices or the network. + + + + + Insert From 69e7e6b168b6a837a566babbb1c6ca12e014cedb Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 18 Mar 2025 14:40:06 -0400 Subject: [PATCH 02/16] aio: Add pg_aios view The new view lists all IO handles that are currently in use and is mainly useful for PG developers, but may also be useful when tuning PG. FIXME: - catversion bump before commit Reviewed-by: Noah Misch Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt --- doc/src/sgml/system-views.sgml | 294 +++++++++++++++++++++++ src/backend/catalog/system_views.sql | 7 + src/backend/storage/aio/Makefile | 1 + src/backend/storage/aio/aio_funcs.c | 230 ++++++++++++++++++ src/backend/storage/aio/meson.build | 1 + src/include/catalog/pg_proc.dat | 9 + src/test/regress/expected/privileges.out | 18 ++ src/test/regress/expected/rules.out | 16 ++ src/test/regress/sql/privileges.sql | 3 + 9 files changed, 579 insertions(+) create mode 100644 src/backend/storage/aio/aio_funcs.c diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 3f5a306247e6..e9a59af8c34b 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -51,6 +51,11 @@ + + pg_aios + In-use asynchronous IO handles + + pg_available_extensions available extensions @@ -231,6 +236,295 @@ + + <structname>pg_aios</structname> + + + pg_aios + + + + The pg_aios view lists all handles that are currently in-use. An I/O handle + is used to reference an I/O operation that is being prepared, executed or + is in the process of completing. 
pg_aios contains
one row for each I/O handle.


This view is mainly useful for developers of
PostgreSQL, but may also be useful when tuning
PostgreSQL.


<structname>pg_aios</structname> Columns


Column Type

Description


pid int4

Process ID of the server process that is issuing this I/O.


io_id int4

Identifier of the I/O handle. Handles are reused once the I/O has
completed (or if the handle is released before the I/O is started). On
reuse, pg_aios.io_generation is incremented.


io_generation int8

Generation of the I/O handle.


state text

State of the I/O handle:

HANDED_OUT, referenced by code but not yet used

DEFINED, information necessary for execution is known

STAGED, ready for execution

SUBMITTED, submitted for execution

COMPLETED_IO, finished, but result has not yet been processed

COMPLETED_SHARED, shared completion processing completed

COMPLETED_LOCAL, backend local completion processing completed


operation text

Operation performed using the I/O handle:

invalid, not yet known

readv, a vectored read

writev, a vectored write


off int8

Offset of the I/O operation, in bytes.


length int8

Length of the I/O operation, in bytes.


target text

What kind of object is the I/O targeting:

smgr, I/O on relations


handle_data_len int2

Length of the data associated with the I/O operation. For I/O to/from
shared buffers and temporary buffers, this indicates the number of
buffers the I/O is operating on.


raw_result int4

Low-level result of the I/O operation, or NULL if the operation has not
yet completed.


result text

High-level result of the I/O operation:

UNKNOWN means that the result of the
operation is not yet known.

OK means the I/O completed successfully.

PARTIAL means that the I/O completed without
error, but did not process all data. Callers will commonly need to
retry and perform the remainder of the work in a separate I/O.

WARNING means that the I/O completed without
error, but that execution of the IO triggered a warning, e.g. when
encountering a corrupted buffer with zero_damaged_pages enabled.

ERROR means the I/O failed with an error.


target_desc text

Description of what the I/O operation is targeting.


f_sync bool

Flag indicating whether the I/O is executed synchronously.


f_localmem bool

Flag indicating whether the I/O references process local memory.


f_buffered bool

Flag indicating whether the I/O is buffered I/O.


The pg_aios view is read-only.

By default, the pg_aios view can be read only by
superusers or roles with privileges of the
pg_read_all_stats role.
+ <structname>pg_available_extensions</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 31d269b7ee0c..64a7240aa772 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1391,3 +1391,10 @@ CREATE VIEW pg_stat_subscription_stats AS CREATE VIEW pg_wait_events AS SELECT * FROM pg_get_wait_events(); + +CREATE VIEW pg_aios AS + SELECT * FROM pg_get_aios(); +REVOKE ALL ON pg_aios FROM PUBLIC; +GRANT SELECT ON pg_aios TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_aios() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_aios() TO pg_read_all_stats; diff --git a/src/backend/storage/aio/Makefile b/src/backend/storage/aio/Makefile index c06c50771e02..3f2469cc3994 100644 --- a/src/backend/storage/aio/Makefile +++ b/src/backend/storage/aio/Makefile @@ -11,6 +11,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ aio.o \ aio_callback.o \ + aio_funcs.o \ aio_init.o \ aio_io.o \ aio_target.o \ diff --git a/src/backend/storage/aio/aio_funcs.c b/src/backend/storage/aio/aio_funcs.c new file mode 100644 index 000000000000..584e683371a3 --- /dev/null +++ b/src/backend/storage/aio/aio_funcs.c @@ -0,0 +1,230 @@ +/*------------------------------------------------------------------------- + * + * aio_funcs.c + * AIO - SQL interface for AIO + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/aio/aio_funcs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "funcapi.h" +#include "nodes/execnodes.h" +#include "port/atomics.h" +#include "storage/aio_internal.h" +#include "storage/lock.h" +#include "storage/proc.h" +#include "storage/procnumber.h" +#include "utils/builtins.h" +#include "utils/fmgrprotos.h" +#include "utils/tuplestore.h" + + +/* + * Byte length of an iovec. + */ +static size_t +iov_byte_length(const struct iovec *iov, int cnt) +{ + size_t len = 0; + + for (int i = 0; i < cnt; i++) + { + len += iov[i].iov_len; + } + + return len; +} + +Datum +pg_get_aios(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + InitMaterializedSRF(fcinfo, 0); + +#define PG_GET_AIOS_COLS 15 + + for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++) + { + PgAioHandle *live_ioh = &pgaio_ctl->io_handles[i]; + uint32 ioh_id = pgaio_io_get_id(live_ioh); + Datum values[PG_GET_AIOS_COLS] = {0}; + bool nulls[PG_GET_AIOS_COLS] = {0}; + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandleState start_state; + uint64 start_generation; + PgAioHandle ioh_copy; + struct iovec iov_copy[PG_IOV_MAX]; + + + /* + * There is no lock that could prevent the state of the IO to advance + * concurrently - and we don't want to introduce one, as that would + * introduce atomics into a very common path. Instead we + * + * 1) Determine the state + generation of the IO. + * + * 2) Copy the IO to local memory. + * + * 3) Check if state or generation of the IO changed. If the state + * changed, retry, if the generation changed don't display the IO. + */ + + /* 1) from above */ + start_generation = live_ioh->generation; + + /* + * Retry at this point, so we can accept changing states, but not + * changing generations. 
+ */ +retry: + pg_read_barrier(); + start_state = live_ioh->state; + + if (start_state == PGAIO_HS_IDLE) + continue; + + /* 2) from above */ + memcpy(&ioh_copy, live_ioh, sizeof(PgAioHandle)); + + /* + * Safe to copy even if no iovec is used - we always reserve the + * required space. + */ + memcpy(&iov_copy, &pgaio_ctl->iovecs[ioh_copy.iovec_off], + PG_IOV_MAX * sizeof(struct iovec)); + + /* + * Copy information about owner before 3) below, if the process exited + * it'd have to wait for the IO to finish first, which we would detect + * in 3). + */ + owner = ioh_copy.owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + /* 3) from above */ + pg_read_barrier(); + + /* + * The IO completed and a new one was started with the same ID. Don't + * display it - it really started after this function was called. + * There be a risk of a livelock if we just retried endlessly, if IOs + * complete very quickly. + */ + if (live_ioh->generation != start_generation) + continue; + + /* + * The IO's state changed while we were "rendering" it. Just start + * from scratch. There's no risk of a livelock here, as an IO has a + * limited sets of states it can be in, and state changes go only in a + * single direction. + */ + if (live_ioh->state != start_state) + goto retry; + + /* + * Now that we have copied the IO into local memory and checked that + * it's still in the same state, we are not allowed to access "live" + * memory anymore. To make it slightly easier to catch such cases, set + * the "live" pointers to NULL. + */ + live_ioh = NULL; + owner_proc = NULL; + + + /* column: owning pid */ + if (owner_pid != 0) + values[0] = Int32GetDatum(owner_pid); + else + nulls[0] = false; + + /* column: IO's id */ + values[1] = ioh_id; + + /* column: IO's generation */ + values[2] = Int64GetDatum(start_generation); + + /* column: IO's state */ + values[3] = CStringGetTextDatum(pgaio_io_get_state_name(&ioh_copy)); + + /* + * If the IO is in PGAIO_HS_HANDED_OUT state, none of the following + * fields are valid yet (or are in the process of being set). + * Therefore we don't want to display any other columns. + */ + if (start_state == PGAIO_HS_HANDED_OUT) + { + memset(nulls + 4, 1, (lengthof(nulls) - 4) * sizeof(bool)); + goto display; + } + + /* column: IO's operation */ + values[4] = CStringGetTextDatum(pgaio_io_get_op_name(&ioh_copy)); + + /* columns: details about the IO's operation (offset, length) */ + switch (ioh_copy.op) + { + case PGAIO_OP_INVALID: + nulls[5] = true; + nulls[6] = true; + break; + case PGAIO_OP_READV: + values[5] = Int64GetDatum(ioh_copy.op_data.read.offset); + values[6] = + Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.read.iov_length)); + break; + case PGAIO_OP_WRITEV: + values[5] = Int64GetDatum(ioh_copy.op_data.write.offset); + values[6] = + Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.write.iov_length)); + break; + } + + /* column: IO's target */ + values[7] = CStringGetTextDatum(pgaio_io_get_target_name(&ioh_copy)); + + /* column: length of IO's data array */ + values[8] = Int16GetDatum(ioh_copy.handle_data_len); + + /* column: raw result (i.e. 
some form of syscall return value) */ + if (start_state == PGAIO_HS_COMPLETED_IO + || start_state == PGAIO_HS_COMPLETED_SHARED + || start_state == PGAIO_HS_COMPLETED_LOCAL) + values[9] = Int32GetDatum(ioh_copy.result); + else + nulls[9] = true; + + /* + * column: result in the higher level representation (unknown if not + * finished) + */ + values[10] = + CStringGetTextDatum(pgaio_result_status_string(ioh_copy.distilled_result.status)); + + /* column: target description */ + values[11] = CStringGetTextDatum(pgaio_io_get_target_description(&ioh_copy)); + + /* columns: one for each flag */ + values[12] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_SYNCHRONOUS); + values[13] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_REFERENCES_LOCAL); + values[14] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_BUFFERED); + +display: + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + return (Datum) 0; +} diff --git a/src/backend/storage/aio/meson.build b/src/backend/storage/aio/meson.build index 2f0f03d80712..da6df2d3654f 100644 --- a/src/backend/storage/aio/meson.build +++ b/src/backend/storage/aio/meson.build @@ -3,6 +3,7 @@ backend_sources += files( 'aio.c', 'aio_callback.c', + 'aio_funcs.c', 'aio_init.c', 'aio_io.c', 'aio_target.c', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 8b68b16d79da..d9c41fa426b3 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12493,4 +12493,13 @@ proargtypes => 'int4', prosrc => 'gist_stratnum_common' }, +# AIO related functions +{ oid => '9200', descr => 'information about in-progress asynchronous IOs', + proname => 'pg_get_aios', prorows => '100', proretset => 't', + provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{int4,int4,int8,text,text,int8,int8,text,int2,int4,text,text,bool,bool,bool}', + proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}', + prosrc => 'pg_get_aios' }, + ] diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 954f549555e2..5588d83e1bfb 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -3132,6 +3132,12 @@ DROP USER regress_locktable_user; -- switch to superuser \c - CREATE ROLE regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- @@ -3145,6 +3151,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT (1 row) GRANT pg_read_all_stats TO regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes has_table_privilege --------------------- @@ -3159,6 +3171,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; +SELECT COUNT(*) >= 0 AS ok FROM pg_aios; + ok +---- + t +(1 row) + SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts; ok ---- diff --git 
a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 474789691357..d9533deb04e8 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1286,6 +1286,22 @@ drop table cchild; SELECT viewname, definition FROM pg_views WHERE schemaname = 'pg_catalog' ORDER BY viewname; +pg_aios| SELECT pid, + io_id, + io_generation, + state, + operation, + off, + length, + target, + handle_data_len, + raw_result, + result, + target_desc, + f_sync, + f_localmem, + f_buffered + FROM pg_get_aios() pg_get_aios(pid, io_id, io_generation, state, operation, off, length, target, handle_data_len, raw_result, result, target_desc, f_sync, f_localmem, f_buffered); pg_available_extension_versions| SELECT e.name, e.version, (x.extname IS NOT NULL) AS installed, diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index b81694c24f28..286b1d037569 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1919,16 +1919,19 @@ DROP USER regress_locktable_user; CREATE ROLE regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; +SELECT COUNT(*) >= 0 AS ok FROM pg_aios; SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts; SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations; RESET ROLE; From b35b9c1c879e350293ea94a507b259e8c2742d11 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 18 Mar 2025 14:40:06 -0400 Subject: [PATCH 03/16] aio: Add README.md explaining higher level design Reviewed-by: Noah Misch Reviewed-by: Heikki Linnakangas Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt Discussion: https://postgr.es/m/20210223100344.llw5an2aklengrmn@alap3.anarazel.de Discussion: https://postgr.es/m/stj36ea6yyhoxtqkhpieia2z4krnam7qyetc57rfezgk4zgapf@gcnactj4z56m --- src/backend/storage/aio/README.md | 424 ++++++++++++++++++++++++++++++ src/backend/storage/aio/aio.c | 2 + 2 files changed, 426 insertions(+) create mode 100644 src/backend/storage/aio/README.md diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md new file mode 100644 index 000000000000..ddd59404a592 --- /dev/null +++ b/src/backend/storage/aio/README.md @@ -0,0 +1,424 @@ +# Asynchronous & Direct IO + +## Motivation + +### Why Asynchronous IO + +Until the introduction of asynchronous IO Postgres relied on the operating +system to hide the cost of synchronous IO from Postgres. While this worked +surprisingly well in a lot of workloads, it does not do as good a job on +prefetching and controlled writeback as we would like. + +There are important expensive operations like `fdatasync()` where the operating +system cannot hide the storage latency. This is particularly important for WAL +writes, where the ability to asynchronously issue `fdatasync()` or O_DSYNC +writes can yield significantly higher throughput. 
### Why Direct / unbuffered IO

The main reasons to want to use Direct IO are:

- Lower CPU usage / higher throughput. Particularly on modern storage,
  buffered writes are bottlenecked by the operating system having to copy data
  from the kernel's page cache to the postgres buffer pool using the CPU,
  whereas direct IO can often move the data directly between the storage
  devices and postgres' buffer cache, using DMA. While that transfer is
  ongoing, the CPU is free to perform other work.
- Reduced latency - Direct IO can have substantially lower latency than
  buffered IO, which can be impactful for OLTP workloads bottlenecked by WAL
  write latency.
- Avoiding double buffering between the operating system cache and postgres'
  shared_buffers.
- Better control over the timing and pace of dirty data writeback.


The main reasons *not* to use Direct IO are:

- Without AIO, Direct IO is unusably slow for most purposes.
- Even with AIO, many parts of postgres need to be modified to perform
  explicit prefetching.
- In situations where shared_buffers cannot be set appropriately large,
  e.g. because there are many different postgres instances hosted on shared
  hardware, performance will often be worse than when using buffered IO.


## AIO Usage Example

In many cases, code that can benefit from AIO does not have to interact with
the AIO interface directly, but can use AIO via higher-level
abstractions. See [Helpers](#helpers).

In this example, a buffer will be read into shared buffers.

```C
/*
 * Result of the operation, only to be accessed in this backend.
 */
PgAioReturn ioret;

/*
 * Acquire an AIO Handle; ioret will receive the result upon completion.
 *
 * Note that ioret needs to stay alive until the IO completes or
 * CurrentResourceOwner is released (i.e. an error is thrown).
 */
PgAioHandle *ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret);

/*
 * Reference that can be used to wait for the IO we initiate below. This
 * reference can reside in local or shared memory and be waited upon by any
 * process. An arbitrary number of references can be made for each IO.
 */
PgAioWaitRef iow;

pgaio_io_get_wref(ioh, &iow);

/*
 * Arrange for shared buffer completion callbacks to be called upon completion
 * of the IO. This callback will update the buffer descriptors associated with
 * the AioHandle, which e.g. allows other backends to access the buffer.
 *
 * A callback can be passed a small bit of data, e.g. to indicate whether to
 * zero a buffer if it is invalid.
 *
 * Multiple completion callbacks can be registered for each handle.
 */
pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0);

/*
 * The completion callback needs to know which buffers to update when the IO
 * completes. As the AIO subsystem does not know about buffers, we have to
 * associate this information with the AioHandle, for use by the completion
 * callback registered above.
 *
 * In this example we're reading only a single buffer, hence the 1.
 */
pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1);

/*
 * Pass the AIO handle to a lower-level function. When operating on the level
 * of buffers, we don't know how exactly the IO is performed; that is the
 * responsibility of the storage manager implementation.
 *
 * E.g. md.c needs to translate block numbers into offsets in segments.
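 *
 * (Illustration, assuming the default 8kB block size and 1GB segment files:
 * a read of block 150000 would be translated by md.c into a read on the
 * relation's second segment file, at byte offset
 * (150000 % RELSEG_SIZE) * BLCKSZ.)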
 *
 * Once the IO handle has been handed off to smgrstartreadv(), it may not be
 * used any further, as the IO may immediately get executed below
 * smgrstartreadv() and the handle reused for another IO.
 *
 * To issue multiple IOs in an efficient way, a caller can call
 * pgaio_enter_batchmode() before starting multiple IOs, and end that batch
 * with pgaio_exit_batchmode(). Note that one needs to be careful while there
 * may be unsubmitted IOs, as another backend may need to wait for one of
 * them. If this backend then had to wait for the other backend, the result
 * would be an undetected deadlock. See pgaio_enter_batchmode() for more
 * details.
 *
 * Note that even while in batchmode an IO might get submitted immediately,
 * e.g. due to reaching a limit on the number of unsubmitted IOs, and even
 * complete before smgrstartreadv() returns.
 */
smgrstartreadv(ioh, operation->smgr, forknum, blkno,
			   BufferGetBlock(buffer), 1);

/*
 * To benefit from AIO, other work, including the submission of further IOs,
 * should be performed before waiting for this IO to complete. Otherwise we
 * could just have used synchronous, blocking IO.
 */
perform_other_work();

/*
 * We did some other work and now need the IO operation to have completed to
 * continue.
 */
pgaio_wref_wait(&iow);

/*
 * At this point the IO has completed. We do not yet know whether it succeeded
 * or failed, however. The buffer's state has been updated, which allows other
 * backends to use the buffer (if the IO succeeded), or retry the IO (if it
 * failed).
 *
 * Note that in case the IO has failed, a LOG message may have been emitted,
 * but no ERROR has been raised. This is crucial, as another backend waiting
 * for this IO should not see an ERROR.
 *
 * To check whether the operation succeeded, and to raise an ERROR (or, where
 * more appropriate, a LOG message), the PgAioReturn we passed to
 * pgaio_io_acquire() is used.
 */
if (ioret.result.status == PGAIO_RS_ERROR)
	pgaio_result_report(ioret.result, &ioret.target_data, ERROR);

/*
 * Besides having succeeded completely, the IO could also have a) partially
 * completed or b) succeeded with a warning (e.g. due to zero_damaged_pages).
 * If we e.g. tried to read many blocks at once, the read might have only
 * succeeded for the first few blocks.
 *
 * If the IO partially succeeded and this backend needs all blocks to have
 * completed, this backend needs to reissue the IO for the remaining buffers.
 * The AIO subsystem cannot handle this retry transparently.
 *
 * As this example is already long, and we only read a single block, we'll
 * just error out if there's a partial read or a warning.
 */
if (ioret.result.status != PGAIO_RS_OK)
	pgaio_result_report(ioret.result, &ioret.target_data, ERROR);

/*
 * The IO succeeded, so we can use the buffer now.
 */
```


## Design Criteria & Motivation

### Deadlock and Starvation Dangers due to AIO

Using AIO in a naive way can easily lead to deadlocks in an environment where
the source/target of AIO are shared resources, like pages in postgres'
shared_buffers.

Consider one backend performing readahead on a table, initiating IO for a
number of buffers ahead of the current "scan position". If that backend then
performs some operation that blocks, or even just is slow, the IO completion
for the asynchronously initiated read may not be processed.
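A minimal sketch of the hazard, reusing the calls from the usage example
above; `wait_for_lock()` is a hypothetical stand-in for any blocking
operation:

```C
/* Backend A: issue a read, then block before processing its completion. */
ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret);
pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0);
pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1);
smgrstartreadv(ioh, operation->smgr, forknum, blkno,
			   BufferGetBlock(buffer), 1);
wait_for_lock();				/* blocks for a long time, or forever */

/*
 * Backend B: needs the same buffer, so it waits for the IO started by A.
 * If completions could only be processed by the issuing backend, B would
 * now wait for A; if the lock A waits for is in turn held by B, neither
 * could proceed. Hence the requirement described below that completions be
 * processable by any backend, or by IO workers.
 */
pgaio_wref_wait(&iow);
```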
This AIO implementation solves this problem by requiring that AIO methods
either allow AIO completions to be processed by any backend in the system
(e.g. io_uring), or guarantee that AIO processing will happen even when the
issuing backend is blocked (e.g. worker mode, which offloads completion
processing to the AIO workers).


### IO can be started in critical sections

Using AIO for WAL writes can reduce the overhead of WAL logging substantially:

- AIO allows WAL writes to be started eagerly, so they complete before they
  need to be waited on
- AIO allows multiple WAL flushes to be in progress at the same time
- AIO makes it more realistic to use O\_DIRECT + O\_DSYNC, which can reduce
  the number of roundtrips to storage on some OSs and storage HW (buffered IO
  and direct IO without O_DSYNC need to issue a write and, after the write's
  completion, a cache flush, whereas O\_DIRECT + O\_DSYNC can use a single
  Force Unit Access (FUA) write).

The need to be able to execute IO in critical sections has substantial design
implications for the AIO subsystem, mainly because completing IOs (see the
prior section) needs to be possible within a critical section, even if the
to-be-completed IO itself was not issued in a critical section. Consider
e.g. the case of a backend first starting a number of writes from shared
buffers and then starting to flush the WAL. Because only a limited amount of
IO can be in progress at the same time, initiating IO for flushing the WAL
may first require completing IO that was started earlier.


### State for AIO needs to live in shared memory

Because postgres uses a process model and because AIOs need to be
complete-able by any backend, much of the state of the AIO subsystem needs to
live in shared memory.

In an `EXEC_BACKEND` build, a backend's executable code and other process
local state is not necessarily mapped to the same addresses in each process
due to ASLR. This means that shared memory cannot contain pointers to
callbacks.


## Design of the AIO Subsystem


### AIO Methods

To achieve portability and performance, multiple methods of performing AIO
are implemented, and others are likely worth adding in the future.


#### Synchronous Mode

`io_method=sync` does not actually perform AIO, but allows the AIO API to be
used while performing synchronous IO. This can be useful for debugging. The
code for the synchronous mode is also used as a fallback, e.g. the [worker
mode](#worker) uses it to execute IO that cannot be executed by workers.


#### Worker

`io_method=worker` is available on every platform postgres runs on, and
implements asynchronous IO - from the view of the issuing process - by
dispatching the IO to one of several worker processes performing the IO in a
synchronous manner.


#### io_uring

`io_method=io_uring` is available on Linux 5.1+. In contrast to worker mode,
it dispatches all IO from within the issuing process, lowering context switch
rate / latency.


### AIO Handles

The central piece of postgres' AIO API is the AIO handle. To execute an IO
one first has to acquire an IO handle (`pgaio_io_acquire()`) and then
"define" it, i.e. associate an IO operation with the handle.

Often AIO handles are acquired on a higher level and then passed to a lower
level to be fully defined. E.g., for IO to/from shared buffers, bufmgr.c
routines acquire the handle, which is then passed through smgr.c and md.c to
finally be fully defined in fd.c.
The functions used at the lowest level to define the operation are
`pgaio_io_start_*()`.

Because acquisition of an IO handle
[must always succeed](#io-can-be-started-in-critical-sections)
and the number of AIO Handles
[has to be limited](#state-for-aio-needs-to-live-in-shared-memory),
AIO handles can be reused as soon as they have completed. Obviously code
needs to be able to react to IO completion. State can be updated using
[AIO Completion callbacks](#aio-callbacks)
and the issuing backend can provide a backend local variable to receive the
result of the IO, as described in
[AIO Results](#aio-results).
An IO can be waited for, by both the issuing and any other backend, using
[AIO Wait References](#aio-wait-references).


Because an AIO Handle is not executable just after calling
`pgaio_io_acquire()` and because `pgaio_io_acquire()` needs to always succeed
(absent a PANIC), only a single AIO Handle may be acquired (i.e. returned by
`pgaio_io_acquire()`) without causing the IO to be defined (by, potentially
indirectly, causing `pgaio_io_start_*()` to be called). Otherwise a backend
could trivially self-deadlock by using up all AIO Handles without the ability
to wait for some of the IOs to complete.

If it turns out that an AIO Handle is not needed, e.g. because the handle was
acquired before holding a contended lock, it can be released without being
defined using `pgaio_io_release()`.


### AIO Callbacks

Commonly several layers need to react to the completion of an IO. E.g. for a
read, md.c needs to check if the IO outright failed or was shorter than
needed, and bufmgr.c needs to verify the page looks valid and to update the
BufferDesc with the buffer's new state.

The fact that several layers / subsystems need to react to IO completion
poses a few challenges:

- Upper layers should not need to know details of lower layers. E.g. bufmgr.c
  should not assume the IO will pass through md.c. Therefore upper levels
  cannot know what lower layers would consider an error.

- Lower layers should not need to know about upper layers. E.g. smgr APIs are
  used going through shared buffers but are also used bypassing shared
  buffers. This means that e.g. md.c is not in a position to validate
  checksums.

- Having code in the AIO subsystem for every possible combination of layers
  would lead to a lot of duplication.

The "solution" to this is the ability to associate multiple completion
callbacks with a handle. E.g. bufmgr.c can have a callback to update the
BufferDesc state and to verify the page, and md.c can have another callback
to check if the IO operation was successful.

As [mentioned](#state-for-aio-needs-to-live-in-shared-memory), shared memory
currently cannot contain function pointers. Because of that, completion
callbacks are not directly identified by function pointers but by IDs
(`PgAioHandleCallbackID`). A substantial added benefit is that this allows
callbacks to be identified by a much smaller amount of memory (a single byte
currently).

In addition to completion, AIO callbacks are also called to "stage" an IO.
This is, e.g., used to increase buffer reference counts to account for the
AIO subsystem referencing the buffer, which is required to handle the case
where the issuing backend errors out and releases its own pins while the IO
is still ongoing.

As [explained earlier](#io-can-be-started-in-critical-sections), IO
completions need to be safe to execute in critical sections.
To allow the backend that issued the IO to error out in case of failure,
[AIO Results](#aio-results) can be used.


### AIO Targets

In addition to the completion callbacks described above, each AIO Handle has
exactly one "target". Each target has some space inside an AIO Handle with
information specific to the target, and can provide callbacks to reopen the
underlying file (required for worker mode) and to describe the IO operation
(used for debug logging and error messages).

I.e., if two different uses of AIO can describe the identity of the file
being operated on the same way, it likely makes sense to use the same
target. E.g. different smgr implementations can describe IO with
RelFileLocator, ForkNumber and BlockNumber and can thus share a target. In
contrast, IO for a WAL file would be described with TimeLineID and
XLogRecPtr, and it would not make sense to use the same target for smgr and
WAL.


### AIO Wait References

As [described above](#aio-handles), AIO Handles can be reused immediately
after completion and therefore cannot themselves be used to wait for the
completion of the IO. Instead, waiting uses AIO wait references, which do not
just identify an AIO Handle but also include the handle's "generation".

A reference to an AIO Handle can be acquired using `pgaio_io_get_wref()` and
then waited upon using `pgaio_wref_wait()`.


### AIO Results

As AIO completion callbacks
[are executed in critical sections](#io-can-be-started-in-critical-sections)
and [may be executed by any backend](#deadlock-and-starvation-dangers-due-to-aio),
completion callbacks cannot be used to, e.g., make the query that triggered
an IO ERROR out.

To allow reacting to failing IOs, the issuing backend can pass a pointer to a
`PgAioReturn` in backend local memory. Before an AIO Handle is reused, the
`PgAioReturn` is filled with information about the IO. This includes
information about whether the IO was successful (as a value of
`PgAioResultStatus`) and enough information to raise an error in case of a
failure (via `pgaio_result_report()`, with the error details encoded in
`PgAioResult`).


### AIO Errors

It would be very convenient to have shared completion callbacks encode the
details of errors as an `ErrorData` that could be raised at a later time.
Unfortunately doing so would require allocating memory. While elog.c can
guarantee (well, kinda) that logging a message will not run out of memory,
that only works because a very limited number of messages are in the process
of being logged at any time. With AIO a large number of concurrently issued
AIOs might fail.

To avoid the need to preallocate a potentially large amount of memory (in
shared memory no less!), completion callbacks instead have to encode errors
in a more compact format that can later be converted into an error message.


## Helpers

Using the low-level AIO API directly would introduce too much complexity
throughout the tree. Most uses of AIO should instead go through reusable,
higher-level helpers.


### Read Stream

A common and very beneficial use of AIO is reads where a substantial number
of to-be-read locations are known ahead of time. E.g., for a sequential scan,
the set of blocks that need to be read can be determined solely by knowing
the current position and checking the buffer mapping table.

The [Read Stream](../../../include/storage/read_stream.h) interface makes it
comparatively easy to use AIO for such use cases.
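As an illustrative sketch (not part of this patch series), reading a range of
blocks through a read stream could look as follows; it assumes that the
`block_range_read_stream_cb` callback, the `BlockRangeReadStreamPrivate`
struct and the `READ_STREAM_FULL` flag match their declarations in
read_stream.h:

```C
#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/read_stream.h"
#include "utils/rel.h"

/*
 * Read blocks [0, nblocks) of rel's main fork. The stream issues AIO for
 * blocks ahead of the consumption point, so by the time a buffer is
 * returned here its read has typically already been started, or finished.
 */
static void
read_block_range(Relation rel, BlockNumber nblocks)
{
	BlockRangeReadStreamPrivate p =
	{
		.current_blocknum = 0,
		.last_exclusive = nblocks,
	};
	ReadStream *stream;
	Buffer		buf;

	stream = read_stream_begin_relation(READ_STREAM_FULL,
										NULL,	/* no buffer access strategy */
										rel,
										MAIN_FORKNUM,
										block_range_read_stream_cb,
										&p,
										0); /* no per-buffer private data */

	while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		/* ... use the buffer ... */
		ReleaseBuffer(buf);
	}

	read_stream_end(stream);
}
```

The callback pulls the next block number from the private state, so the
stream, rather than the caller, controls how far ahead IO is issued.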
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index e3ed087e8a2b..86f7250b7a5f 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -24,6 +24,8 @@ * * - read_stream.c - helper for reading buffered relation data * + * - README.md - higher-level overview over AIO + * * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California From 927f90f4527f3528b6ed3afa81f81fcca91a2fa4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 18 Mar 2025 14:40:06 -0400 Subject: [PATCH 04/16] aio: Add test_aio module To make the tests possible, a few functions from bufmgr.c/localbuf.c had to be exported, via buf_internals.h. Reviewed-by: Noah Misch Co-authored-by: Andres Freund Co-authored-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt --- src/backend/storage/buffer/bufmgr.c | 8 +- src/backend/storage/buffer/localbuf.c | 3 +- src/include/storage/buf_internals.h | 7 + src/test/modules/Makefile | 1 + src/test/modules/meson.build | 1 + src/test/modules/test_aio/.gitignore | 2 + src/test/modules/test_aio/Makefile | 26 + src/test/modules/test_aio/meson.build | 37 + src/test/modules/test_aio/t/001_aio.pl | 1504 +++++++++++++++++ src/test/modules/test_aio/t/002_io_workers.pl | 125 ++ src/test/modules/test_aio/test_aio--1.0.sql | 108 ++ src/test/modules/test_aio/test_aio.c | 804 +++++++++ src/test/modules/test_aio/test_aio.control | 3 + 13 files changed, 2621 insertions(+), 8 deletions(-) create mode 100644 src/test/modules/test_aio/.gitignore create mode 100644 src/test/modules/test_aio/Makefile create mode 100644 src/test/modules/test_aio/meson.build create mode 100644 src/test/modules/test_aio/t/001_aio.pl create mode 100644 src/test/modules/test_aio/t/002_io_workers.pl create mode 100644 src/test/modules/test_aio/test_aio--1.0.sql create mode 100644 src/test/modules/test_aio/test_aio.c create mode 100644 src/test/modules/test_aio/test_aio.control diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f9681d09e1e7..1c37d7dfe2f9 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -518,10 +518,6 @@ static uint32 WaitBufHdrUnlocked(BufferDesc *buf); static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context); static void WaitIO(BufferDesc *buf); -static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); -static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, - uint32 set_flag_bits, bool forget_owner, - bool release_aio); static void AbortBufferIO(Buffer buffer); static void shared_buffer_write_error_callback(void *arg); static void local_buffer_write_error_callback(void *arg); @@ -5962,7 +5958,7 @@ WaitIO(BufferDesc *buf) * find out if they can perform the I/O as part of a larger operation, without * waiting for the answer or distinguishing the reasons why not. */ -static bool +bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) { uint32 buf_state; @@ -6019,7 +6015,7 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) * resource owner. 
(forget_owner=false is used when the resource owner itself * is being released) */ -static void +void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio) { diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index bf89076bb109..ed56202af14f 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -57,7 +57,6 @@ static int NLocalPinnedBuffers = 0; static void InitLocalBuffers(void); static Block GetLocalBufferStorage(void); static Buffer GetLocalVictimBuffer(void); -static void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); /* @@ -597,7 +596,7 @@ TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bit * * See also InvalidateBuffer(). */ -static void +void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) { Buffer buffer = BufferDescriptorGetBuffer(bufHdr); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 72b36a4af26f..0dec7d93b3b2 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -434,6 +434,12 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag); +/* solely to make it easier to write tests */ +extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); +extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, + bool forget_owner, bool release_aio); + + /* freelist.c */ extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, @@ -478,6 +484,7 @@ extern void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio); extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait); extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln); +extern void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); extern void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock); diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 4e4be3fa511a..aa1d27bbed31 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -14,6 +14,7 @@ SUBDIRS = \ oauth_validator \ plsample \ spgist_name_ops \ + test_aio \ test_bloomfilter \ test_copy_callbacks \ test_custom_rmgrs \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 2b0574514731..9de0057bd1d4 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -13,6 +13,7 @@ subdir('oauth_validator') subdir('plsample') subdir('spgist_name_ops') subdir('ssl_passphrase_callback') +subdir('test_aio') subdir('test_bloomfilter') subdir('test_copy_callbacks') subdir('test_custom_rmgrs') diff --git a/src/test/modules/test_aio/.gitignore b/src/test/modules/test_aio/.gitignore new file mode 100644 index 000000000000..716e17f5a2ad --- /dev/null +++ b/src/test/modules/test_aio/.gitignore @@ -0,0 +1,2 @@ +# Generated subdirectories +/tmp_check/ diff --git a/src/test/modules/test_aio/Makefile b/src/test/modules/test_aio/Makefile new file mode 100644 index 000000000000..f53cc64671a8 --- /dev/null +++ b/src/test/modules/test_aio/Makefile @@ -0,0 +1,26 @@ +# src/test/modules/test_aio/Makefile + +PGFILEDESC = "test_aio - test code for 
AIO" + +MODULE_big = test_aio +OBJS = \ + $(WIN32RES) \ + test_aio.o + +EXTENSION = test_aio +DATA = test_aio--1.0.sql + +TAP_TESTS = 1 + +export enable_injection_points + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_aio +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_aio/meson.build b/src/test/modules/test_aio/meson.build new file mode 100644 index 000000000000..73d2fd68eaa1 --- /dev/null +++ b/src/test/modules/test_aio/meson.build @@ -0,0 +1,37 @@ +# Copyright (c) 2024-2025, PostgreSQL Global Development Group + +test_aio_sources = files( + 'test_aio.c', +) + +if host_system == 'windows' + test_aio_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_aio', + '--FILEDESC', 'test_aio - test code for AIO',]) +endif + +test_aio = shared_module('test_aio', + test_aio_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_aio + +test_install_data += files( + 'test_aio.control', + 'test_aio--1.0.sql', +) + +tests += { + 'name': 'test_aio', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_aio.pl', + 't/002_io_workers.pl', + ], + }, +} diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl new file mode 100644 index 000000000000..58a56c8cec24 --- /dev/null +++ b/src/test/modules/test_aio/t/001_aio.pl @@ -0,0 +1,1504 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + + +### +# Test io_method=worker +### +my $node_worker = create_node('worker'); +$node_worker->start(); + +test_generic('worker', $node_worker); +SKIP: +{ + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject_worker('worker', $node_worker); +} + +$node_worker->stop(); + + +### +# Test io_method=io_uring +### + +if (have_io_uring()) +{ + my $node_uring = create_node('io_uring'); + $node_uring->start(); + test_generic('io_uring', $node_uring); + $node_uring->stop(); +} + + +### +# Test io_method=sync +### + +my $node_sync = create_node('sync'); + +# just to have one test not use the default auto-tuning + +$node_sync->append_conf( + 'postgresql.conf', qq( +io_max_concurrency=4 +)); + +$node_sync->start(); +test_generic('sync', $node_sync); +$node_sync->stop(); + +done_testing(); + + +### +# Test Helpers +### + +sub create_node +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my $io_method = shift; + + my $node = PostgreSQL::Test::Cluster->new($io_method); + + # Want to test initdb for each IO method, otherwise we could just reuse + # the cluster. + # + # Unfortunately Cluster::init() puts PG_TEST_INITDB_EXTRA_OPTS after the + # options specified by ->extra, if somebody puts -c io_method=xyz in + # PG_TEST_INITDB_EXTRA_OPTS it would break this test. Fix that up if we + # detect it. 
+ local $ENV{PG_TEST_INITDB_EXTRA_OPTS} = $ENV{PG_TEST_INITDB_EXTRA_OPTS}; + if (defined $ENV{PG_TEST_INITDB_EXTRA_OPTS} + && $ENV{PG_TEST_INITDB_EXTRA_OPTS} =~ m/io_method=/) + { + $ENV{PG_TEST_INITDB_EXTRA_OPTS} .= " -c io_method=$io_method"; + } + + $node->init(extra => [ '-c', "io_method=$io_method" ]); + + $node->append_conf( + 'postgresql.conf', qq( +shared_preload_libraries=test_aio +log_min_messages = 'DEBUG3' +log_statement=all +log_error_verbosity=default +restart_after_crash=false +temp_buffers=100 +)); + + ok(1, "$io_method: initdb"); + + return $node; +} + +sub have_io_uring +{ + # To detect if io_uring is supported, we look at the error message for + # assigning an invalid value to an enum GUC, which lists all the valid + # options. We need to use -C to deal with running as administrator on + # windows, the superuser check is omitted if -C is used. + my ($stdout, $stderr) = + run_command [qw(postgres -C invalid -c io_method=invalid)]; + die "can't determine supported io_method values" + unless $stderr =~ m/Available values: ([^\.]+)\./; + my $methods = $1; + note "supported io_method values are: $methods"; + + return ($methods =~ m/io_uring/) ? 1 : 0; +} + +sub psql_like +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $expected_stdout = shift; + my $expected_stderr = shift; + my ($cmdret, $output); + + ($output, $cmdret) = $psql->query($sql); + + like($output, $expected_stdout, "$io_method: $name: expected stdout"); + like($psql->{stderr}, $expected_stderr, + "$io_method: $name: expected stderr"); + $psql->{stderr} = ''; + + return $output; +} + +sub query_wait_block +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $node = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $waitfor = shift; + + my $pid = $psql->query_safe('SELECT pg_backend_pid()'); + + $psql->{stdin} .= qq($sql;\n); + $psql->{run}->pump_nb(); + ok(1, "$io_method: $name: issued sql"); + + $node->poll_query_until('postgres', + qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid), + $waitfor); + ok(1, "$io_method: $name: observed $waitfor wait event"); +} + +# Returns count of checksum failures for the specified database or for shared +# relations, if $datname is undefined. 
+sub checksum_failures +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $psql = shift; + my $datname = shift; + my $checksum_count; + my $checksum_last_failure; + + if (defined $datname) + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname = '$datname'; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname = '$datname'; +)); + } + else + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname IS NULL; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname IS NULL; +)); + } + + return $checksum_count, $checksum_last_failure; +} + +### +# Sub-tests +### + +# Sanity checks for the IO handle API +sub test_handle +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning: implicit xact + psql_like( + $io_method, + $psql, + "handle_get() leak in implicit xact", + qq(SELECT handle_get()), + qr/^$/, + qr/leaked AIO handle/, + "$io_method: leaky handle_get() warns"); + + # leak warning: explicit xact + psql_like( + $io_method, $psql, + "handle_get() leak in explicit xact", + qq(BEGIN; SELECT handle_get(); COMMIT), + qr/^$/, qr/leaked AIO handle/); + + + # leak warning: explicit xact, rollback + psql_like( + $io_method, + $psql, + "handle_get() leak in explicit xact, rollback", + qq(BEGIN; SELECT handle_get(); ROLLBACK;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning: subtrans + psql_like( + $io_method, + $psql, + "handle_get() leak in subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get(); COMMIT;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning + error: released in different command (thus resowner) + psql_like( + $io_method, + $psql, + "handle_release() in different command", + qq(BEGIN; SELECT handle_get(); SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/leaked AIO handle.*release in unexpected state/ms); + + # no leak, release in same command + psql_like( + $io_method, + $psql, + "handle_release() in same command", + qq(BEGIN; SELECT handle_get() UNION ALL SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/^$/); + + # normal handle use + psql_like($io_method, $psql, "handle_get_release()", + qq(SELECT handle_get_release()), + qr/^$/, qr/^$/); + + # should error out, API violation + psql_like( + $io_method, + $psql, + "handle_get_twice()", + qq(SELECT handle_get_twice()), + qr/^$/, + qr/ERROR: API violation: Only one IO can be handed out$/); + + # recover after error in implicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in implicit xact", + qq(SELECT handle_get_and_error(); SELECT 'ok', handle_get_release()), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in implicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in explicit xact", + qq(BEGIN; SELECT handle_get_and_error(); SELECT handle_get_release(), 'ok'; COMMIT;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in subtrans + psql_like( + $io_method, + $psql, + "handle error recovery in explicit subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get_and_error(); ROLLBACK TO SAVEPOINT foo; SELECT handle_get_release(); ROLLBACK;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + $psql->quit(); +} + +# Sanity checks for the batchmode API +sub test_batchmode +{ + my $io_method = shift; + my $node 
= shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning & recovery: implicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in implicit xact", + qq(SELECT batch_start()), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + # leak warning & recovery: explicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in explicit xact", + qq(BEGIN; SELECT batch_start(); COMMIT;), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + + # leak warning & recovery: explicit xact, rollback + # + # XXX: This doesn't fail right now, due to not getting a chance to do + # something at transaction command commit. That's not a correctness issue, + # it just means it's a bit harder to find buggy code. + #psql_like($io_method, $psql, + # "batch_start() leak & cleanup after abort", + # qq(BEGIN; SELECT batch_start(); ROLLBACK;), + # qr/^$/, + # qr/open AIO batch at end/, "$io_method: leaky batch_start() warns"); + + # no warning, batch closed in same command + psql_like( + $io_method, + $psql, + "batch_start(), batch_end() works", + qq(SELECT batch_start() UNION ALL SELECT batch_end()), + qr/^$/, + qr/^$/, + "$io_method: batch_start(), batch_end()"); + + $psql->quit(); +} + +# Test that simple cases of invalid pages are reported +sub test_io_error +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $psql->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_corr(data int not null); +INSERT INTO tmp_corr SELECT generate_series(1, 10000); +SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true); +)); + + foreach my $tblname (qw(tbl_corr tmp_corr)) + { + my $invalid_page_re = + $tblname eq 'tbl_corr' + ? 
qr/invalid page in block 1 of relation base\/\d+\/\d+/ + : qr/invalid page in block 1 of relation base\/\d+\/t\d+_\d+/; + + # verify the error is reported in custom C code + psql_like( + $io_method, + $psql, + "read_rel_block_ll() of $tblname page", + qq(SELECT read_rel_block_ll('$tblname', 1)), + qr/^$/, + $invalid_page_re); + + # verify the error is reported for bufmgr reads, seq scan + psql_like( + $io_method, $psql, + "sequential scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname), + qr/^$/, $invalid_page_re); + + # verify the error is reported for bufmgr reads, tid scan + psql_like( + $io_method, + $psql, + "tid scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname WHERE ctid = '(1, 1)'), + qr/^$/, + $invalid_page_re); + } + + $psql->quit(); +} + +# Test interplay between StartBufferIO and TerminateBufferIO +sub test_startwait_io +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + + ### Verify behavior for normal tables + + # create a buffer we can play around with + my $buf_id = psql_like( + $io_method, $psql_a, + "creation of toy buffer succeeds", + qq(SELECT buffer_create_toy('tbl_ok', 1)), + qr/^\d+$/, qr/^$/); + + # check that one backend can perform StartBufferIO + psql_like( + $io_method, + $psql_a, + "first StartBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # but not twice on the same buffer (non-waiting) + psql_like( + $io_method, + $psql_a, + "second StartBufferIO fails, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + psql_like( + $io_method, + $psql_b, + "second StartBufferIO fails, other session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO, without marking it as success, this should trigger the + # waiting session to be able to start the io + psql_like( + $io_method, + $psql_a, + "blocking start buffer io, terminating io, not valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + + # Because the IO was terminated, but not marked as valid, second session should get the right to start io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/t/); + ok(1, "$io_method: blocking start buffer io, can start io"); + + # terminate the IO again + $psql_b->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + + + # same as the above scenario, but mark IO as having succeeded + psql_like( + $io_method, + $psql_a, + "blocking buffer io w/ success: first start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO, marking it as success + psql_like( + $io_method, + $psql_a, + 
"blocking start buffer io, terminating io, valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + # Because the IO was terminated, and marked as valid, second session should complete but not need io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/f/); + ok(1, "$io_method: blocking start buffer io, no need to start io"); + + # buffer is valid now, make it invalid again + $psql_a->query_safe(qq(SELECT buffer_create_toy('tbl_ok', 1);)); + + + ### Verify behavior for temporary tables + + # Can't unfortunately share the code with the normal table case, there are + # too many behavioral differences. + + # create a buffer we can play around with + $psql_a->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_ok(data int not null); +INSERT INTO tmp_ok SELECT generate_series(1, 10000); +)); + $buf_id = $psql_a->query_safe(qq(SELECT buffer_create_toy('tmp_ok', 3);)); + + # check that one backend can perform StartLocalBufferIO + psql_like( + $io_method, + $psql_a, + "first StartLocalBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Because local buffers don't use IO_IN_PROGRESS, a second StartLocalBufer + # succeeds as well. This test mostly serves as a documentation of that + # fact. If we had actually started IO, it'd be different. + psql_like( + $io_method, + $psql_a, + "second StartLocalBufferIO succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, without marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after not marking valid succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false);) + ); + + # Now another StartLocalBufferIO should fail, this time because the buffer + # is already valid. + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after marking valid fails", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^f$/, + qr/^$/); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that if the backend issuing a read doesn't wait for the IO's +# completion, another backend can complete the IO +sub test_complete_foreign +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + # Issue IO without waiting for completion, then sleep + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql_b, + "completing read started by sleeping backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Issue IO without waiting for completion, then exit. + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + $psql_a->reconnect_and_clear(); + + # Check that another backend can read the relevant block. 
This verifies + # that the exiting backend left the AIO in a sane state. + psql_like( + $io_method, + $psql_b, + "read buffer started by exited backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Read a tbl_corr block, then sleep. The other session will retry the IO + # and also fail. The easiest thing to verify that seems to be to check + # that both are in the log. + my $log_location = -s $node->logfile; + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_corr', 1, wait_complete=>false);)); + + psql_like( + $io_method, + $psql_b, + "completing read of tbl_corr block started by other backend", + qq(SELECT count(*) FROM tbl_corr WHERE ctid = '(1,1)' LIMIT 1), + qr/^$/, + qr/invalid page in block/); + + # The log message issued for the read_rel_block_ll() should be logged as a LOG + $node->wait_for_log(qr/LOG[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: LOG message for background read" + ); + + # But for the SELECT, it should be an ERROR + $log_location = + $node->wait_for_log(qr/ERROR[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: ERROR message for foreground read" + ); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we deal correctly with FDs being closed while IO is in progress +sub test_close_fd +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + psql_like( + $io_method, + $psql, + "close all FDs after read, waiting for results", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>true, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>false, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting, query works", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + $psql->quit(); +} + +# Tests using injection points. Mostly to exercise hard IO errors that are +# hard to trigger without using injection points. 
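+#
+# For reference, the pattern used throughout these tests (sketched here with
+# statements that appear verbatim below) is: attach the injection point with
+# the desired result, invalidate the target buffer so the next access
+# performs real IO, run a query against the table, and detach again:
+#
+#   SELECT inj_io_short_read_attach(17);           -- shorten reads to 17 bytes
+#   SELECT invalidate_rel_block('tbl_ok', 2);      -- force a fresh read
+#   SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)';  -- now errors out
+#   SELECT inj_io_short_read_detach();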
+sub test_inject +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # injected what we'd expect + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, $psql, + "injection point not triggering failure", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^1$/, qr/^$/); + + # injected a read shorter than a single block, expecting error + $psql->query_safe(qq(SELECT inj_io_short_read_attach(17);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, + $psql, + "single block short read fails", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file "base\/.*": read only 0 of 8192 bytes/ + ); + + # shorten multi-block read to a single block, should retry + my $inval_query = qq(SELECT invalidate_rel_block('tbl_ok', 0); +SELECT invalidate_rel_block('tbl_ok', 1); +SELECT invalidate_rel_block('tbl_ok', 2); +SELECT invalidate_rel_block('tbl_ok', 3); +/* gap */ +SELECT invalidate_rel_block('tbl_ok', 5); +SELECT invalidate_rel_block('tbl_ok', 6); +SELECT invalidate_rel_block('tbl_ok', 7); +SELECT invalidate_rel_block('tbl_ok', 8);); + + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + psql_like( + $io_method, $psql, + "multi block short read (1 block) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # shorten multi-block read to two blocks, should retry + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192*2);)); + + psql_like( + $io_method, $psql, + "multi block short read (2 blocks) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # verify that page verification errors are detected even as part of a + # shortened multi-block read (tbl_corr, block 1 is corrupted) + $psql->query_safe( + qq( +SELECT invalidate_rel_block('tbl_corr', 0); +SELECT invalidate_rel_block('tbl_corr', 1); +SELECT invalidate_rel_block('tbl_corr', 2); +SELECT inj_io_short_read_attach(8192); + )); + + psql_like( + $io_method, + $psql, + "shortened multi-block read detects invalid page", + qq(SELECT count(*) FROM tbl_corr WHERE ctid < '(2, 1)'), + qr/^$/, + qr/ERROR:.*invalid page in block 1 of relation base\/.*/); + + # trigger a hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EIO')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + + psql_like( + $io_method, + $psql, + "first hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + psql_like( + $io_method, + $psql, + "second hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + # now the IO should be ok. 
+ psql_like( + $io_method, $psql, + "recovers after hard error", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # trigger a different hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EROFS')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + psql_like( + $io_method, + $psql, + "different hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Read-only file system/ + ); + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + $psql->quit(); +} + +# Tests using injection points, only for io_method=worker. +# +# io_method=worker has the special case of needing to reopen files. That can +# in theory fail, because the file could be gone. That's a hard path to test +# for real, so we use an injection point to trigger it. +sub test_inject_worker +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # trigger a failure to reopen, should error out, but should recover + $psql->query_safe( + qq( +SELECT inj_io_reopen_attach(); +SELECT invalidate_rel_block('tbl_ok', 1); + )); + + psql_like( + $io_method, + $psql, + "failure to open: detected", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 1\.\.1 in file "base\/.*": No such file or directory/ + ); + + $psql->query_safe(qq(SELECT inj_io_reopen_detach();)); + + # check that we indeed recover + psql_like( + $io_method, $psql, + "failure to open: recovers", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + $psql->quit(); +} + +# Verify that we handle a relation getting removed (due to a rollback or a +# DROP TABLE) while IO is ongoing for that table. +sub test_invalidate +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal unlogged temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + my $tblname = $persistency . '_transactional'; + + my $create_sql = qq( +CREATE $sql_persistency TABLE $tblname (id int not null, data text not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO $tblname(id, data) SELECT generate_series(1, 10000) as id, repeat('a', 200); +); + + # Verify that outstanding read IO does not cause problems with + # AbortTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. + $psql->query_safe("BEGIN; $create_sql;"); + $psql->query_safe( + qq( +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + psql_like( + $io_method, + $psql, + "rollback of newly created $persistency table with outstanding IO", + qq(ROLLBACK), + qr/^$/, + qr/^$/); + + # Verify that outstanding read IO does not cause problems with + # CommitTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. 
+ $psql->query_safe("BEGIN; $create_sql; COMMIT;"); + $psql->query_safe( + qq( +BEGIN; +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + + psql_like( + $io_method, $psql, + "drop $persistency table with outstanding IO", + qq(DROP TABLE $tblname), + qr/^$/, qr/^$/); + + psql_like($io_method, $psql, + "commit after drop $persistency table with outstanding IO", + qq(COMMIT), qr/^$/, qr/^$/); + } + + $psql->quit(); +} + +# Test behavior related to ZERO_ON_ERROR and zero_damaged_pages +sub test_zero +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + + $psql_a->query_safe( + qq( +CREATE $sql_persistency TABLE tbl_zero(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_zero SELECT generate_series(1, 10000); +)); + + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 0, corrupt_header=>true); +)); + + # Check that page validity errors are detected + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 0 of relation base\/.*\/.*$/ + ); + + # Check that page validity errors are zeroed + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>true)), + qr/^$/, + qr/^psql::\d+: WARNING: invalid page in block 0 of relation base\/.*\/.*; zeroing out page$/ + ); + + # And that once the corruption is fixed, we can read again + $psql_a->query( + qq( +SELECT modify_rel_block('tbl_zero', 0, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: test re-read of block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^$/); + + # Check a page validity error in another block, to ensure we report + # the correct block number + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 3", + qq(SELECT read_rel_block_ll('tbl_zero', 3, zero_on_error=>true);), + qr/^$/, + qr/^psql::\d+: WARNING: invalid page in block 3 of relation base\/.*\/.*; zeroing out page$/ + ); + + + # Check a page validity error in another block, to ensure we report + # the correct block number + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 2, corrupt_header=>true); +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + # First test error + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 2,3 in larger read", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)), + qr/^$/, + qr/^psql::\d+: ERROR: 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing via ZERO_ON_ERROR flag + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, ZERO_ON_ERROR", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>true)), + qr/^$/, + qr/^psql::\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first 
zeroed page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing vio zero_damaged_pages + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, zero_damaged_pages", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false) +COMMIT; +), + qr/^$/, + qr/^psql::\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first zeroed page\.\nHINT:[^\n]+$/ + ); + + $psql_a->query_safe(qq(COMMIT)); + + + # Verify that bufmgr.c IO detects page validity errors + $psql_a->query( + qq( +SELECT invalidate_rel_block('tbl_zero', g.i) +FROM generate_series(0, 15) g(i); +SELECT modify_rel_block('tbl_zero', 3, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: verify reading zero_damaged_pages=off", + qq( +SELECT count(*) FROM tbl_zero), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Verify that bufmgr.c IO zeroes out pages with page validity errors + psql_like( + $io_method, + $psql_a, + "$persistency: verify zero_damaged_pages=on", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT count(*) FROM tbl_zero; +COMMIT; +), + qr/^\d+$/, + qr/^psql::\d+: WARNING: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Check that warnings/errors about page validity in an IO started by + # session A that session B might complete aren't logged visibly to + # session B. + # + # This will only ever trigger for io_method's like io_uring, that can + # complete IO's in a client backend. But it doesn't seem worth + # restricting to that. + # + # This requires cross-session access to the same relation, hence the + # restriction to non-temporary table. + if ($sql_persistency ne 'temporary') + { + # Create a corruption and then read the block without waiting for + # completion. + $psql_a->query(qq( +SELECT modify_rel_block('tbl_zero', 1, corrupt_header=>true); +SELECT read_rel_block_ll('tbl_zero', 1, wait_complete=>false, zero_on_error=>true) +)); + + psql_like( + $io_method, + $psql_b, + "$persistency: test completing read by other session doesn't generate warning", + qq(SELECT count(*) > 0 FROM tbl_zero;), + qr/^t$/, qr/^$/); + } + + # Clean up + $psql_a->query_safe( + qq( +DROP TABLE tbl_zero; +)); + } + + $psql_a->{stderr} = ''; + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we detect checksum failures and report them +sub test_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + + $psql_a->query_safe( + qq( +CREATE TABLE tbl_normal(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_normal SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_normal', 3, corrupt_checksum=>true); + +CREATE TEMPORARY TABLE tbl_temp(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_temp SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_temp', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_temp', 4, corrupt_checksum=>true); +)); + + # To be able to test checksum failures on shared rels we need a shared rel + # with invalid pages - which is a bit scary. pg_shseclabel seems like a + # good bet, as it's not accessed in a default configuration. 
+ $psql_a->query_safe( + qq( +SELECT grow_rel('pg_shseclabel', 4); +SELECT modify_rel_block('pg_shseclabel', 2, corrupt_checksum=>true); +SELECT modify_rel_block('pg_shseclabel', 3, corrupt_checksum=>true); +)); + + + # Check that page validity errors are detected, checksums stats increase, normal rel + my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "normal rel: test reading of invalid block 3", + qq( +SELECT read_rel_block_ll('tbl_normal', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 3 of relation base\/\d+\/\d+$/ + ); + + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: normal rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: normal rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksums stats increase, temp rel + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "temp rel: test reading of invalid block 4, valid block 5", + qq( +SELECT read_rel_block_ll('tbl_temp', 4, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 4 of relation base\/\d+\/t\d+_\d+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: temp rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: temp rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksums stats increase, shared rel + ($cs_count_before, $cs_ts_after) = checksum_failures($psql_a); + psql_like( + $io_method, + $psql_a, + "shared rel: reading of invalid blocks 2+3", + qq( +SELECT read_rel_block_ll('pg_shseclabel', 2, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql::\d+: ERROR: 2 invalid pages among blocks 2..3 of relation global\/\d+\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: shared rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: shared rel: checksum timestamp is not null"); + + + # and restore sanity + $psql_a->query( + qq( +SELECT modify_rel_block('pg_shseclabel', 1, zero=>true); +DROP TABLE tbl_normal; +)); + $psql_a->{stderr} = ''; + + $psql_a->quit(); +} + +# Verify checksum handling when creating database from an invalid database. +# This also serves as a minimal check that cross-database IO is handled +# reasonably. +sub test_checksum_createdb +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $node->safe_psql('postgres', + 'CREATE DATABASE regression_createdb_source'); + + $node->safe_psql( + 'regression_createdb_source', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_cs_fail(data int not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 1000); +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +)); + + my $createdb_sql = qq( +CREATE DATABASE regression_createdb_target +TEMPLATE regression_createdb_source +STRATEGY wal_log; +); + + # Verify that CREATE DATABASE of an invalid database fails and is + # accounted for accurately. 
+ my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql, 'regression_createdb_source'); + psql_like( + $io_method, + $psql, + "create database w/ wal strategy, invalid source", + $createdb_sql, + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 1 of relation base\/\d+\/\d+$/ + ); + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql, 'regression_createdb_source'); + cmp_ok($cs_count_before + 1, '<=', $cs_count_after, + "$io_method: create database w/ wal strategy, invalid source: checksum count increased" + ); + + # Verify that CREATE DATABASE of the fixed database succeeds. + $node->safe_psql( + 'regression_createdb_source', qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +)); + psql_like($io_method, $psql, + "create database w/ wal strategy, valid source", + $createdb_sql, qr/^$/, qr/^$/); + + $psql->quit(); +} + +# Test that we detect checksum failures and report them +# +# In several places we make sure that the server log actually contains +# individual information for each block involved in the IO. +sub test_ignore_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # Test setup + $psql->query_safe( + qq( +CREATE TABLE tbl_cs_fail(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 10000); +)); + + my $count_sql = "SELECT count(*) FROM tbl_cs_fail"; + my $invalidate_sql = qq( +SELECT invalidate_rel_block('tbl_cs_fail', g.i) +FROM generate_series(0, 6) g(i); +); + + my $expect = $psql->query_safe($count_sql); + + + # Very basic tests for ignore_checksum_failure=off / on + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_checksum=>true); +)); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=off fails", + $count_sql, qr/^$/, qr/ERROR: invalid page in block/); + + $psql->query_safe("SET ignore_checksum_failure=on"); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=off succeeds", + $count_sql, + qr/^$expect$/, + qr/WARNING: ignoring (checksum failure|\d checksum failures)/); + + + # Verify that ignore_checksum_failure=off works in multi-block reads + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 2, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +)); + + my $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of checksum failed block 3, with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql::\d+: WARNING: ignoring checksum failure in block 3/ + ); + + # Check that the log contains a LOG message about the failure + $log_location = + $node->wait_for_log(qr/LOG: ignoring checksum failure/, $log_location); + + # check that we error + psql_like( + $io_method, + $psql, + "test reading of valid block 2, checksum failed 3, invalid 4, zero=false with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 2, nblocks=>3, zero_on_error=>false);), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 4 of relation base\/\d+\/\d+$/ + ); + + # Test multi-block read with different problems in 
different blocks + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 2, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of valid block 1, checksum failed 2, 3, invalid 3-5, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 1, nblocks=>5, zero_on_error=>true);), + qr/^$/, + qr/^psql::\d+: WARNING: zeroing 3 page\(s\) and ignoring 2 checksum failure\(s\) among blocks 1..5 of relation/ + ); + + + # Unfortunately have to scan the whole log since determining $log_location + # above in each of the tests, as wait_for_log() returns the size of the + # file. + + $node->wait_for_log(qr/LOG: ignoring checksum failure in block 2/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 2"); + + $node->wait_for_log(qr/LOG: invalid page in block 3 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 3"); + + $node->wait_for_log(qr/LOG: invalid page in block 4 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 4"); + + $node->wait_for_log(qr/LOG: invalid page in block 5 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 5"); + + + # Reading a page with both an invalid header and an invalid checksum + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + psql_like( + $io_method, + $psql, + "test reading of block with both invalid header and invalid checksum, zero=false", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 3 of relation/ + ); + + psql_like( + $io_method, + $psql, + "test reading of block 3 with both invalid header and invalid checksum, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>true);), + qr/^$/, + qr/^psql::\d+: WARNING: invalid page in block 3 of relation base\/.*; zeroing out page/ + ); + + + $psql->quit(); +} + + +# Run all tests that are supported for all io_methods +sub test_generic +{ + my $io_method = shift; + my $node = shift; + + is($node->safe_psql('postgres', 'SHOW io_method'), + $io_method, "$io_method: io_method set correctly"); + + $node->safe_psql( + 'postgres', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_corr(data int not null) WITH (AUTOVACUUM_ENABLED = false); +CREATE TABLE tbl_ok(data int not null) WITH (AUTOVACUUM_ENABLED = false); + +INSERT INTO tbl_corr SELECT generate_series(1, 10000); +INSERT INTO tbl_ok SELECT generate_series(1, 10000); +SELECT grow_rel('tbl_corr', 16); +SELECT grow_rel('tbl_ok', 16); + +SELECT modify_rel_block('tbl_corr', 1, corrupt_header=>true); +CHECKPOINT; +)); + + test_handle($io_method, $node); + test_io_error($io_method, $node); + test_batchmode($io_method, $node); + test_startwait_io($io_method, $node); + test_complete_foreign($io_method, $node); + test_close_fd($io_method, $node); + test_invalidate($io_method, $node); + test_zero($io_method, $node); + 
test_checksum($io_method, $node); + test_ignore_checksum($io_method, $node); + test_checksum_createdb($io_method, $node); + + SKIP: + { + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject($io_method, $node); + } +} diff --git a/src/test/modules/test_aio/t/002_io_workers.pl b/src/test/modules/test_aio/t/002_io_workers.pl new file mode 100644 index 000000000000..af5fae15ea78 --- /dev/null +++ b/src/test/modules/test_aio/t/002_io_workers.pl @@ -0,0 +1,125 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use List::Util qw(shuffle); + + +my $node = PostgreSQL::Test::Cluster->new('worker'); +$node->init(); +$node->append_conf( + 'postgresql.conf', qq( +io_method=worker +)); + +$node->start(); + +# Test changing the number of I/O worker processes while also evaluating the +# handling of their termination. +test_number_of_io_workers_dynamic($node); + +$node->stop(); + +done_testing(); + + +sub test_number_of_io_workers_dynamic +{ + my $node = shift; + + my $prev_worker_count = $node->safe_psql('postgres', 'SHOW io_workers'); + + # Verify that worker count can't be set to 0 + change_number_of_io_workers($node, 0, $prev_worker_count, 1); + + # Verify that worker count can't be set to 33 (above the max) + change_number_of_io_workers($node, 33, $prev_worker_count, 1); + + # Try changing IO workers to a random value and verify that the worker + # count ends up as expected. Always test the min/max of workers. + # + # Valid range for io_workers is [1, 32]. 8 tests in total seems + # reasonable. + my @io_workers_range = shuffle(1 ... 32); + foreach my $worker_count (1, 32, @io_workers_range[ 0, 6 ]) + { + $prev_worker_count = + change_number_of_io_workers($node, $worker_count, + $prev_worker_count, 0); + } +} + +sub change_number_of_io_workers +{ + my $node = shift; + my $worker_count = shift; + my $prev_worker_count = shift; + my $expect_failure = shift; + my ($result, $stdout, $stderr); + + ($result, $stdout, $stderr) = + $node->psql('postgres', "ALTER SYSTEM SET io_workers = $worker_count"); + $node->safe_psql('postgres', 'SELECT pg_reload_conf()'); + + if ($expect_failure) + { + ok( $stderr =~ + /$worker_count is outside the valid range for parameter "io_workers"/, + "updating number of io_workers to $worker_count failed, as expected" + ); + + return $prev_worker_count; + } + else + { + is( $node->safe_psql('postgres', 'SHOW io_workers'), + $worker_count, + "updating number of io_workers from $prev_worker_count to $worker_count" + ); + + check_io_worker_count($node, $worker_count); + terminate_io_worker($node, $worker_count); + check_io_worker_count($node, $worker_count); + + return $worker_count; + } +} + +sub terminate_io_worker +{ + my $node = shift; + my $worker_count = shift; + my ($pid, $ret); + + # Select a random io worker + $pid = $node->safe_psql( + 'postgres', + qq(SELECT pid FROM pg_stat_activity WHERE + backend_type = 'io worker' ORDER BY RANDOM() LIMIT 1)); + + # terminate IO worker with SIGINT + is(PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'INT', $pid), + 0, "random io worker process signalled with INT"); + + # Check that worker exits + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE pid = $pid), '0'), + "random io worker process exited after signal"); +} + +sub check_io_worker_count +{ + my $node = shift; + my $worker_count = 
shift; + + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'io worker'), + $worker_count), + "io worker count is $worker_count"); +} diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql new file mode 100644 index 000000000000..e495481c41e4 --- /dev/null +++ b/src/test/modules/test_aio/test_aio--1.0.sql @@ -0,0 +1,108 @@ +/* src/test/modules/test_aio/test_aio--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_aio" to load this file. \quit + + +CREATE FUNCTION errno_from_string(sym text) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION grow_rel(rel regclass, nblocks int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION modify_rel_block(rel regclass, blockno int, + zero bool DEFAULT false, + corrupt_header bool DEFAULT false, + corrupt_checksum bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION read_rel_block_ll( + rel regclass, + blockno int, + nblocks int DEFAULT 1, + wait_complete bool DEFAULT true, + batchmode_enter bool DEFAULT false, + smgrreleaseall bool DEFAULT false, + batchmode_exit bool DEFAULT false, + zero_on_error bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_create_toy(rel regclass, blockno int4) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_start_io(buffer int, for_input bool, nowait bool) +RETURNS pg_catalog.bool STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_terminate_io(buffer int, for_input bool, succeed bool, io_error bool, release_aio bool) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Handle related functions + */ +CREATE FUNCTION handle_get_and_error() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_twice() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_release() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_release_last() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +/* + * Batchmode related functions + */ +CREATE FUNCTION batch_start() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION batch_end() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Injection point related functions + */ +CREATE FUNCTION inj_io_short_read_attach(result int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_short_read_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_attach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c new file mode 100644 index 000000000000..bef3b0e3ad04 --- /dev/null +++ b/src/test/modules/test_aio/test_aio.c @@ -0,0 +1,804 
@@ +/*------------------------------------------------------------------------- + * + * test_aio.c + * Helpers to write tests for AIO + * + * This module provides interface functions for C functionality to SQL, to + * make it possible to test AIO related behavior in a targeted way from SQL. + * It'd not generally be safe to export these functions to SQL, but for a test + * that's fine. + * + * Copyright (c) 2020-2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_aio/test_aio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "fmgr.h" +#include "storage/aio.h" +#include "storage/aio_internal.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/builtins.h" +#include "utils/injection_point.h" +#include "utils/rel.h" + + +PG_MODULE_MAGIC; + + +typedef struct InjIoErrorState +{ + bool enabled_short_read; + bool enabled_reopen; + + bool short_read_result_set; + int short_read_result; +} InjIoErrorState; + +static InjIoErrorState * inj_io_error_state; + +/* Shared memory init callbacks */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + + +static PgAioHandle *last_handle; + + + +static void +test_aio_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(sizeof(InjIoErrorState)); +} + +static void +test_aio_shmem_startup(void) +{ + bool found; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + inj_io_error_state = ShmemInitStruct("injection_points", + sizeof(InjIoErrorState), + &found); + + if (!found) + { + /* First time through, initialize */ + inj_io_error_state->enabled_short_read = false; + inj_io_error_state->enabled_reopen = false; + +#ifdef USE_INJECTION_POINTS + InjectionPointAttach("AIO_PROCESS_COMPLETION_BEFORE_SHARED", + "test_aio", + "inj_io_short_read", + NULL, + 0); + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + + InjectionPointAttach("AIO_WORKER_AFTER_REOPEN", + "test_aio", + "inj_io_reopen", + NULL, + 0); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + +#endif + } + else + { + /* + * Pre-load the injection points now, so we can call them in a + * critical section. 
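+		 * Loading in advance caches what is needed to invoke the
+		 * points, so that nothing has to be looked up or allocated at
+		 * call time, which would not be safe inside a critical
+		 * section.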
+ */ +#ifdef USE_INJECTION_POINTS + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + elog(LOG, "injection point loaded"); +#endif + } + + LWLockRelease(AddinShmemInitLock); +} + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = test_aio_shmem_request; + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = test_aio_shmem_startup; +} + + +PG_FUNCTION_INFO_V1(errno_from_string); +Datum +errno_from_string(PG_FUNCTION_ARGS) +{ + const char *sym = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (strcmp(sym, "EIO") == 0) + PG_RETURN_INT32(EIO); + else if (strcmp(sym, "EAGAIN") == 0) + PG_RETURN_INT32(EAGAIN); + else if (strcmp(sym, "EINTR") == 0) + PG_RETURN_INT32(EINTR); + else if (strcmp(sym, "ENOSPC") == 0) + PG_RETURN_INT32(ENOSPC); + else if (strcmp(sym, "EROFS") == 0) + PG_RETURN_INT32(EROFS); + + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_internal("%s is not a supported errno value", sym)); + PG_RETURN_INT32(0); +} + +PG_FUNCTION_INFO_V1(grow_rel); +Datum +grow_rel(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + uint32 nblocks = PG_GETARG_UINT32(1); + Relation rel; +#define MAX_BUFFERS_TO_EXTEND_BY 64 + Buffer victim_buffers[MAX_BUFFERS_TO_EXTEND_BY]; + + rel = relation_open(relid, AccessExclusiveLock); + + while (nblocks > 0) + { + uint32 extend_by_pages; + + extend_by_pages = Min(nblocks, MAX_BUFFERS_TO_EXTEND_BY); + + ExtendBufferedRelBy(BMR_REL(rel), + MAIN_FORKNUM, + NULL, + 0, + extend_by_pages, + victim_buffers, + &extend_by_pages); + + nblocks -= extend_by_pages; + + for (uint32 i = 0; i < extend_by_pages; i++) + { + ReleaseBuffer(victim_buffers[i]); + } + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(modify_rel_block); +Datum +modify_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + bool zero = PG_GETARG_BOOL(2); + bool corrupt_header = PG_GETARG_BOOL(3); + bool corrupt_checksum = PG_GETARG_BOOL(4); + Page page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + Relation rel; + Buffer buf; + PageHeader ph; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * copy the page to local memory, seems nicer than to directly modify in + * the buffer pool. + */ + memcpy(page, BufferGetPage(buf), BLCKSZ); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + ReleaseBuffer(buf); + + /* + * Don't want to have a buffer in-memory that's marked valid where the + * on-disk contents are invalid. Particularly not if the in-memory buffer + * could be dirty... + * + * While we hold an AEL on the relation nobody else should be able to read + * the buffer in. + * + * NB: This is probably racy, better don't copy this to non-test code. + */ + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else + EvictUnpinnedBuffer(buf); + + /* + * Now modify the page as asked for by the caller. 
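+	 * A "corrupted" header is created by pointing pd_special past the
+	 * end of the block, which the page header sanity checks reject; a
+	 * corrupted checksum is created by incrementing pd_checksum until
+	 * it no longer matches the page contents.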
+ */ + if (zero) + memset(page, 0, BufferGetPageSize(buf)); + + if (PageIsEmpty(page) && (corrupt_header || corrupt_checksum)) + PageInit(page, BufferGetPageSize(buf), 0); + + ph = (PageHeader) page; + + if (corrupt_header) + ph->pd_special = BLCKSZ + 1; + + if (corrupt_checksum) + { + bool successfully_corrupted = 0; + + /* + * Any single modification of the checksum could just end up being + * valid again. To be sure + */ + for (int i = 0; i < 100; i++) + { + uint16 verify_checksum; + uint16 old_checksum; + + old_checksum = ph->pd_checksum; + ph->pd_checksum = old_checksum + 1; + + elog(LOG, "corrupting checksum of blk %u from %u to %u", + blkno, old_checksum, ph->pd_checksum); + + verify_checksum = pg_checksum_page(page, blkno); + if (verify_checksum != ph->pd_checksum) + { + successfully_corrupted = true; + break; + } + } + + if (!successfully_corrupted) + elog(ERROR, "could not corrupt checksum, what's going on?"); + } + else + { + PageSetChecksumInplace(page, blkno); + } + + smgrwrite(RelationGetSmgr(rel), + MAIN_FORKNUM, blkno, page, true); + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +/* + * Ensures a buffer for rel & blkno is in shared buffers, without actually + * caring about the buffer contents. Used to set up test scenarios. + */ +static Buffer +create_toy_buffer(Relation rel, BlockNumber blkno) +{ + Buffer buf; + BufferDesc *buf_hdr; + uint32 buf_state; + bool was_pinned = false; + + /* place buffer in shared buffers without erroring out */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (RelationUsesLocalBuffers(rel)) + { + buf_hdr = GetLocalBufferDescriptor(-buf - 1); + buf_state = pg_atomic_read_u32(&buf_hdr->state); + } + else + { + buf_hdr = GetBufferDescriptor(buf - 1); + buf_state = LockBufHdr(buf_hdr); + } + + /* + * We should be the only backend accessing this buffer. This is just a + * small bit of belt-and-suspenders defense, none of this code should ever + * run in a cluster with real data. + */ + if (BUF_STATE_GET_REFCOUNT(buf_state) > 1) + was_pinned = true; + else + buf_state &= ~(BM_VALID | BM_DIRTY); + + if (RelationUsesLocalBuffers(rel)) + pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); + else + UnlockBufHdr(buf_hdr, buf_state); + + if (was_pinned) + elog(ERROR, "toy buffer %d was already pinned", + buf); + + return buf; +} + +/* + * A "low level" read. This does similar things to what + * StartReadBuffers()/WaitReadBuffers() do, but provides more control (and + * less sanity). + */ +PG_FUNCTION_INFO_V1(read_rel_block_ll); +Datum +read_rel_block_ll(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + int nblocks = PG_GETARG_INT32(2); + bool wait_complete = PG_GETARG_BOOL(3); + bool batchmode_enter = PG_GETARG_BOOL(4); + bool call_smgrreleaseall = PG_GETARG_BOOL(5); + bool batchmode_exit = PG_GETARG_BOOL(6); + bool zero_on_error = PG_GETARG_BOOL(7); + Relation rel; + Buffer bufs[PG_IOV_MAX]; + BufferDesc *buf_hdrs[PG_IOV_MAX]; + Page pages[PG_IOV_MAX]; + uint8 srb_flags = 0; + PgAioReturn ior; + PgAioHandle *ioh; + PgAioWaitRef iow; + SMgrRelation smgr; + + if (nblocks <= 0 || nblocks > PG_IOV_MAX) + elog(ERROR, "nblocks is out of range"); + + rel = relation_open(relid, AccessExclusiveLock); + + for (int i = 0; i < nblocks; i++) + { + bufs[i] = create_toy_buffer(rel, blkno + i); + pages[i] = BufferGetBlock(bufs[i]); + buf_hdrs[i] = BufferIsLocal(bufs[i]) ? 
+ GetLocalBufferDescriptor(-bufs[i] - 1) : + GetBufferDescriptor(bufs[i] - 1); + } + + smgr = RelationGetSmgr(rel); + + pgstat_prepare_report_checksum_failure(smgr->smgr_rlocator.locator.dbOid); + + ioh = pgaio_io_acquire(CurrentResourceOwner, &ior); + pgaio_io_get_wref(ioh, &iow); + + if (RelationUsesLocalBuffers(rel)) + { + for (int i = 0; i < nblocks; i++) + StartLocalBufferIO(buf_hdrs[i], true, false); + pgaio_io_set_flag(ioh, PGAIO_HF_REFERENCES_LOCAL); + } + else + { + for (int i = 0; i < nblocks; i++) + StartBufferIO(buf_hdrs[i], true, false); + } + + pgaio_io_set_handle_data_32(ioh, (uint32 *) bufs, nblocks); + + if (zero_on_error | zero_damaged_pages) + srb_flags |= READ_BUFFERS_ZERO_ON_ERROR; + if (ignore_checksum_failure) + srb_flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES; + + pgaio_io_register_callbacks(ioh, + RelationUsesLocalBuffers(rel) ? + PGAIO_HCB_LOCAL_BUFFER_READV : + PGAIO_HCB_SHARED_BUFFER_READV, + srb_flags); + + if (batchmode_enter) + pgaio_enter_batchmode(); + + smgrstartreadv(ioh, smgr, MAIN_FORKNUM, blkno, + (void *) pages, nblocks); + + if (call_smgrreleaseall) + smgrreleaseall(); + + if (batchmode_exit) + pgaio_exit_batchmode(); + + for (int i = 0; i < nblocks; i++) + ReleaseBuffer(bufs[i]); + + if (wait_complete) + { + pgaio_wref_wait(&iow); + + if (ior.result.status != PGAIO_RS_OK) + pgaio_result_report(ior.result, + &ior.target_data, + ior.result.status == PGAIO_RS_ERROR ? + ERROR : WARNING); + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(invalidate_rel_block); +Datum +invalidate_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + PrefetchBufferResult pr; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + /* + * This is a gross hack, but there's no other API exposed that allows to + * get a buffer ID without actually reading the block in. + */ + pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno); + buf = pr.recent_buffer; + + if (BufferIsValid(buf)) + { + /* if the buffer contents aren't valid, this'll return false */ + if (ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf)) + { + BufferDesc *buf_hdr = BufferIsLocal(buf) ? 
+ GetLocalBufferDescriptor(-buf - 1) + : GetBufferDescriptor(buf - 1); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (pg_atomic_read_u32(&buf_hdr->state) & BM_DIRTY) + { + if (BufferIsLocal(buf)) + FlushLocalBuffer(buf_hdr, NULL); + else + FlushOneBuffer(buf); + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else if (!EvictUnpinnedBuffer(buf)) + elog(ERROR, "couldn't evict"); + } + } + + relation_close(rel, AccessExclusiveLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(buffer_create_toy); +Datum +buffer_create_toy(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = create_toy_buffer(rel, blkno); + ReleaseBuffer(buf); + + relation_close(rel, NoLock); + + PG_RETURN_INT32(buf); +} + +PG_FUNCTION_INFO_V1(buffer_call_start_io); +Datum +buffer_call_start_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool nowait = PG_GETARG_BOOL(2); + bool can_start; + + if (BufferIsLocal(buf)) + can_start = StartLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + for_input, nowait); + else + can_start = StartBufferIO(GetBufferDescriptor(buf - 1), + for_input, nowait); + + /* + * For tests we don't want the resowner release preventing us from + * orchestrating odd scenarios. + */ + if (can_start && !BufferIsLocal(buf)) + ResourceOwnerForgetBufferIO(CurrentResourceOwner, + buf); + + ereport(LOG, + errmsg("buffer %d after StartBufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_BOOL(can_start); +} + +PG_FUNCTION_INFO_V1(buffer_call_terminate_io); +Datum +buffer_call_terminate_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool succeed = PG_GETARG_BOOL(2); + bool io_error = PG_GETARG_BOOL(3); + bool release_aio = PG_GETARG_BOOL(4); + bool clear_dirty = false; + uint32 set_flag_bits = 0; + + if (io_error) + set_flag_bits |= BM_IO_ERROR; + + if (for_input) + { + clear_dirty = false; + + if (succeed) + set_flag_bits |= BM_VALID; + } + else + { + if (succeed) + clear_dirty = true; + } + + ereport(LOG, + errmsg("buffer %d before Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + if (BufferIsLocal(buf)) + TerminateLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + clear_dirty, set_flag_bits, release_aio); + else + TerminateBufferIO(GetBufferDescriptor(buf - 1), + clear_dirty, set_flag_bits, false, release_aio); + + ereport(LOG, + errmsg("buffer %d after Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get); +Datum +handle_get(PG_FUNCTION_ARGS) +{ + last_handle = pgaio_io_acquire(CurrentResourceOwner, NULL); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_release_last); +Datum +handle_release_last(PG_FUNCTION_ARGS) +{ + if (!last_handle) + elog(ERROR, "no handle"); + + pgaio_io_release(last_handle); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_and_error); +Datum +handle_get_and_error(PG_FUNCTION_ARGS) +{ + pgaio_io_acquire(CurrentResourceOwner, NULL); + + elog(ERROR, "as you command"); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_twice); +Datum +handle_get_twice(PG_FUNCTION_ARGS) +{ + 
pgaio_io_acquire(CurrentResourceOwner, NULL);
+	pgaio_io_acquire(CurrentResourceOwner, NULL);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get_release);
+Datum
+handle_get_release(PG_FUNCTION_ARGS)
+{
+	PgAioHandle *handle;
+
+	handle = pgaio_io_acquire(CurrentResourceOwner, NULL);
+	pgaio_io_release(handle);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(batch_start);
+Datum
+batch_start(PG_FUNCTION_ARGS)
+{
+	pgaio_enter_batchmode();
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(batch_end);
+Datum
+batch_end(PG_FUNCTION_ARGS)
+{
+	pgaio_exit_batchmode();
+	PG_RETURN_VOID();
+}
+
+#ifdef USE_INJECTION_POINTS
+extern PGDLLEXPORT void inj_io_short_read(const char *name, const void *private_data);
+extern PGDLLEXPORT void inj_io_reopen(const char *name, const void *private_data);
+
+void
+inj_io_short_read(const char *name, const void *private_data)
+{
+	PgAioHandle *ioh;
+
+	ereport(LOG,
+			errmsg("short read injection point called, is enabled: %d",
+				   inj_io_error_state->enabled_short_read),
+			errhidestmt(true), errhidecontext(true));
+
+	if (inj_io_error_state->enabled_short_read)
+	{
+		ioh = pgaio_inj_io_get();
+
+		/*
+		 * Only shorten reads that are actually longer than the target size,
+		 * otherwise we can trigger over-reads.
+		 */
+		if (inj_io_error_state->short_read_result_set
+			&& ioh->op == PGAIO_OP_READV
+			&& inj_io_error_state->short_read_result <= ioh->result)
+		{
+			struct iovec *iov = &pgaio_ctl->iovecs[ioh->iovec_off];
+			int32		old_result = ioh->result;
+			int32		new_result = inj_io_error_state->short_read_result;
+			int32		processed = 0;
+
+			ereport(LOG,
+					errmsg("short read injection point, changing result from %d to %d",
+						   old_result, new_result),
+					errhidestmt(true), errhidecontext(true));
+
+			/*
+			 * The underlying IO actually completed OK, and thus the "invalid"
+			 * portion of the IOV actually contains valid data. That can hide
+			 * a lot of problems, e.g. if we were to wrongly mark as valid a
+			 * buffer that, according to the shortened read, wasn't read, the
+			 * contents would look valid and we might miss a bug.
+			 *
+			 * To avoid that, iterate through the IOV and zero out the
+			 * "failed" portion of the IO.
+			 */
+			for (int i = 0; i < ioh->op_data.read.iov_length; i++)
+			{
+				if (processed + iov[i].iov_len <= new_result)
+					processed += iov[i].iov_len;
+				else if (processed <= new_result)
+				{
+					uint32		ok_part = new_result - processed;
+
+					memset((char *) iov[i].iov_base + ok_part, 0, iov[i].iov_len - ok_part);
+					processed += iov[i].iov_len;
+				}
+				else
+				{
+					memset((char *) iov[i].iov_base, 0, iov[i].iov_len);
+				}
+			}
+
+			ioh->result = new_result;
+		}
+	}
+}
+
+void
+inj_io_reopen(const char *name, const void *private_data)
+{
+	ereport(LOG,
+			errmsg("reopen injection point called, is enabled: %d",
+				   inj_io_error_state->enabled_reopen),
+			errhidestmt(true), errhidecontext(true));
+
+	if (inj_io_error_state->enabled_reopen)
+		elog(ERROR, "injection point triggering failure to reopen");
+}
+#endif
+
+PG_FUNCTION_INFO_V1(inj_io_short_read_attach);
+Datum
+inj_io_short_read_attach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_short_read = true;
+	inj_io_error_state->short_read_result_set = !PG_ARGISNULL(0);
+	if (inj_io_error_state->short_read_result_set)
+		inj_io_error_state->short_read_result = PG_GETARG_INT32(0);
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_short_read_detach);
+Datum
+inj_io_short_read_detach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_short_read = false;
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_reopen_attach);
+Datum
+inj_io_reopen_attach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_reopen = true;
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_reopen_detach);
+Datum
+inj_io_reopen_detach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_reopen = false;
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+	PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_aio/test_aio.control b/src/test/modules/test_aio/test_aio.control
new file mode 100644
index 000000000000..cd91c3ed16b9
--- /dev/null
+++ b/src/test/modules/test_aio/test_aio.control
@@ -0,0 +1,3 @@
+comment = 'Test code for AIO'
+default_version = '1.0'
+module_pathname = '$libdir/test_aio'

From d88688b864475124daad3e038c7efcf3e4dea8f4 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Mon, 31 Mar 2025 19:27:04 -0400
Subject: [PATCH 05/16] md: Add comment & assert to buffer-zeroing path in
 md[start]readv()

mdreadv() has a codepath to zero out buffers when a read returns zero bytes,
guarded by a check for zero_damaged_pages || InRecovery.

The InRecovery codepath to zero out buffers in mdreadv() appears to be
unreachable. The only known paths to reach mdreadv()/mdstartreadv() in
recovery are XLogReadBufferExtended(), vm_readbuf(), and fsm_readbuf(), each
of which takes care to extend the relation if necessary. This looks to either
have been the case for a long time, or the code was never reachable.

The zero_damaged_pages path is incomplete, as missing segments are not
created.

Putting blocks into the buffer-pool that do not exist on disk is rather
problematic, as such blocks will, at least initially, not be found by scans
that rely on smgrnblocks(), as they are beyond EOF. It also can cause weird
problems with relation extension, as relation extension does not expect
blocks beyond EOF to exist.

Therefore we would like to remove that path.
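For reference, the path in question has roughly the following shape
(simplified from the mdreadv() hunk below; not the verbatim code):

    if (nbytes == 0)
    {
        /*
         * Read past EOF: with zero_damaged_pages enabled, or in recovery,
         * the remaining blocks of this segment are zero-filled in memory
         * instead of raising an error.
         */
        if (zero_damaged_pages || InRecovery)
        {
            for (BlockNumber i = transferred_this_segment / BLCKSZ;
                 i < nblocks_this_segment;
                 ++i)
                memset(buffers[i], 0, BLCKSZ);
        }
        else
            ereport(ERROR, ...);    /* "could not read blocks ..." */
    }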
mdstartreadv(), which I added in e5fe570b51c, does not implement this zeroing
logic. I had started a discussion about that a while ago (linked below), but
forgot to act on the conclusion of the discussion, namely to disable the
in-memory-zeroing behavior.

We could certainly implement equivalent zeroing logic in mdstartreadv(), but
it would have to be more complicated due to potential differences in the
zero_damaged_pages setting between the definer and completor of IO. Given
that we want to remove the logic, implementing it there does not seem
worthwhile.

For now, put an Assert(false) into mdreadv() and add comments documenting the
deprecation of the path in mdreadv() and the non-implementation of it in
mdstartreadv(). If we, during testing, discover that we do need the path, we
can implement it at that time.

Discussion: https://postgr.es/m/20250330024513.ac.nmisch@google.com
Discussion: https://postgr.es/m/3qxxsnciyffyf3wyguiz4besdp5t5uxvv3utg75cbcszojlz7p@uibfzmnukkbd
---
 src/backend/storage/smgr/md.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index b55892b94051..ecc33713d8fa 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -910,9 +910,30 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			 * is ON or we are InRecovery, we should instead return zeroes
 			 * without complaining. This allows, for example, the case of
 			 * trying to update a block that was later truncated away.
+			 *
+			 * NB: We think that this codepath is unreachable in recovery
+			 * and incomplete with zero_damaged_pages, as missing segments
+			 * are not created. Putting blocks into the buffer-pool that
+			 * do not exist on disk is rather problematic, as they will not
+			 * be found by scans that rely on smgrnblocks(), as they are
+			 * beyond EOF. It also can cause weird problems with relation
+			 * extension, as relation extension does not expect blocks
+			 * beyond EOF to exist.
+			 *
+			 * Therefore we do not want to copy the logic into
+			 * mdstartreadv(), where it would have to be more complicated
+			 * due to potential differences in the zero_damaged_pages
+			 * setting between the definer and completor of IO.
+			 *
+			 * For PG 18, we are putting an Assert(false) into
+			 * mdreadv() (triggering failures in assertion-enabled builds,
+			 * but continuing to work in production builds). Afterwards we
+			 * plan to remove this code entirely.
 			 */
 			if (zero_damaged_pages || InRecovery)
 			{
+				Assert(false);	/* see comment above */
+
 				for (BlockNumber i = transferred_this_segment / BLCKSZ;
 					 i < nblocks_this_segment;
 					 ++i)
@@ -1007,6 +1028,13 @@ mdstartreadv(PgAioHandle *ioh,
 	/*
 	 * The error checks corresponding to the post-read checks in mdreadv() are
 	 * in md_readv_complete().
+	 *
+	 * However we chose, at least for now, to not implement the
+	 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
+	 * that logic is rather problematic, and we want to get rid of it. Here
+	 * equivalent logic would have to be more complicated due to potential
+	 * differences in the zero_damaged_pages setting between the definer and
+	 * completor of IO.
*/
 }

From 81b0f39ac885d6896f9920da337d9c66966bc28b Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Sun, 30 Mar 2025 15:38:32 -0400
Subject: [PATCH 06/16] aio: comment polishing

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/storage/smgr/md.c   | 2 +-
 src/backend/storage/smgr/smgr.c | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index ecc33713d8fa..bc7d94df870d 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1981,7 +1981,7 @@ md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 	 * might not process the query result immediately (because it is busy
 	 * doing another part of query processing) or at all (e.g. if it was
 	 * cancelled or errored out due to another IO also failing). The
-	 * issuer of the IO will emit an ERROR when processing the IO's
+	 * definer of the IO will emit an ERROR when processing the IO's
 	 * results
 	 */
 	pgaio_result_report(result, td, LOG_SERVER_ONLY);
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 320dc04d83ae..d532fea39d93 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -738,6 +738,14 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 * blocks not successfully read might bear unspecified modifications, up to
 * the full nblocks). This maintains the abstraction that smgr operates on the
 * level of blocks, rather than bytes.
+ *
+ * Compared to smgrreadv(), more responsibilities fall on the caller:
+ * - Partial reads need to be handled by the caller re-issuing IO for the
+ *   unread blocks
+ * - smgr will ereport(LOG_SERVER_ONLY) some problems, but higher layers are
+ *   responsible for calling pgaio_result_report() to mirror that news to the
+ *   user (if the IO results in PGAIO_RS_WARNING) or abort the
+ *   (sub)transaction (if PGAIO_RS_ERROR).
 */
void
smgrstartreadv(PgAioHandle *ioh,

From 527b9dd8b8286f4d1ff30a76605c343ff63ed2af Mon Sep 17 00:00:00 2001
From: Melanie Plageman
Date: Sun, 30 Mar 2025 14:03:55 -0400
Subject: [PATCH 07/16] aio: Add errcontext for processing I/Os for another
 backend

Push an ErrorContextCallback adding additional detail about the process
performing the I/O and the owner of the I/O when those are not the same.

For io_method worker, this adds context specifying which process owns the
I/O that the I/O worker is processing.

For io_method io_uring, this adds context only when a backend is
*completing* I/O for another backend. It specifies the pid of the owning
process.
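Both methods rely on PostgreSQL's standard error-context stack from elog.h.
Roughly, the pattern looks like this (a generic sketch of that mechanism, not
code from this patch; demo_* names are made up):

    static void
    demo_error_callback(void *arg)
    {
        int        *owner_pid = arg;

        /* appends a CONTEXT line to whatever error is being reported */
        if (owner_pid)
            errcontext("performing I/O on behalf of process %d", *owner_pid);
    }

    static void
    demo_do_work(int owner_pid)
    {
        ErrorContextCallback errcallback = {0};

        /* push: any ereport() below now gets our CONTEXT line */
        errcallback.callback = demo_error_callback;
        errcallback.arg = &owner_pid;
        errcallback.previous = error_context_stack;
        error_context_stack = &errcallback;

        /* ... work that may ereport() ... */

        /* pop */
        error_context_stack = errcallback.previous;
    }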
Author: Melanie Plageman Discussion: https://postgr.es/m/20250325141120.8e.nmisch%40google.com --- src/backend/storage/aio/method_io_uring.c | 31 +++++++++++++++++++++++ src/backend/storage/aio/method_worker.c | 29 +++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index 0bcdab14ae7e..c719ba2727a8 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -302,14 +302,41 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) return num_staged_ios; } +static void +pgaio_uring_completion_error_callback(void *arg) +{ + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandle *ioh = arg; + + if (!ioh) + return; + + /* No need for context if a backend is completing the IO for itself */ + if (ioh->owner_procno == MyProcNumber) + return; + + owner = ioh->owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + errcontext("completing I/O on behalf of process %d", owner_pid); +} + static void pgaio_uring_drain_locked(PgAioUringContext *context) { int ready; int orig_ready; + ErrorContextCallback errcallback = {0}; Assert(LWLockHeldByMeInMode(&context->completion_lock, LW_EXCLUSIVE)); + errcallback.callback = pgaio_uring_completion_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* * Don't drain more events than available right now. Otherwise it's * plausible that one backend could get stuck, for a while, receiving CQEs @@ -337,9 +364,11 @@ pgaio_uring_drain_locked(PgAioUringContext *context) PgAioHandle *ioh; ioh = io_uring_cqe_get_data(cqe); + errcallback.arg = ioh; io_uring_cqe_seen(&context->io_uring_ring, cqe); pgaio_io_process_completion(ioh, cqe->res); + errcallback.arg = NULL; } END_CRIT_SECTION(); @@ -348,6 +377,8 @@ pgaio_uring_drain_locked(PgAioUringContext *context) "drained %d/%d, now expecting %d", ncqes, orig_ready, io_uring_cq_ready(&context->io_uring_ring)); } + + error_context_stack = errcallback.previous; } static void diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 4a7853d13fac..31d94ac82c54 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -357,11 +357,33 @@ pgaio_worker_register(void) on_shmem_exit(pgaio_worker_die, 0); } +static void +pgaio_worker_error_callback(void *arg) +{ + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandle *ioh = arg; + + if (!ioh) + return; + + Assert(ioh->owner_procno != MyProcNumber); + Assert(MyBackendType == B_IO_WORKER); + + owner = ioh->owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + errcontext("I/O worker executing I/O on behalf of process %d", owner_pid); +} + void IoWorkerMain(const void *startup_data, size_t startup_data_len) { sigjmp_buf local_sigjmp_buf; PgAioHandle *volatile error_ioh = NULL; + ErrorContextCallback errcallback = {0}; volatile int error_errno = 0; char cmd[128]; @@ -388,6 +410,10 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) sprintf(cmd, "%d", MyIoWorkerId); set_ps_display(cmd); + errcallback.callback = pgaio_worker_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* see PostgresMain() */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { @@ -471,6 +497,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) ioh = 
&pgaio_ctl->io_handles[io_index]; error_ioh = ioh; + errcallback.arg = ioh; pgaio_debug_io(DEBUG4, ioh, "worker %d processing IO", @@ -511,6 +538,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) pgaio_io_perform_synchronously(ioh); RESUME_INTERRUPTS(); + errcallback.arg = NULL; } else { @@ -522,6 +550,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) CHECK_FOR_INTERRUPTS(); } + error_context_stack = errcallback.previous; proc_exit(0); } From 615ce93f0a353a742682c078a1a1690b9fdb4a25 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 18 Mar 2025 14:40:06 -0400 Subject: [PATCH 08/16] aio: Experimental heuristics to increase batching in read_stream.c Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch: --- src/backend/storage/aio/read_stream.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 36c54fb695b0..cec93129f582 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -404,6 +404,30 @@ read_stream_start_pending_read(ReadStream *stream) static void read_stream_look_ahead(ReadStream *stream) { + /* + * Batch-submitting multiple IOs is more efficient than doing so + * one-by-one. If we just ramp up to the max, we'll only be allowed to + * submit one io_combine_limit sized IO. Defer submitting IO in that case. + * + * FIXME: This needs better heuristics. + */ +#if 1 + if (!stream->sync_mode && stream->distance > (io_combine_limit * 8)) + { + if (stream->pinned_buffers + stream->pending_read_nblocks > ((stream->distance * 3) / 4)) + { +#if 0 + ereport(LOG, + errmsg("reduce reduce reduce: pinned: %d, pending: %d, distance: %d", + stream->pinned_buffers, + stream->pending_read_nblocks, + stream->distance)); +#endif + return; + } + } +#endif + /* * Allow amortizing the cost of submitting IO over multiple IOs. This * requires that we don't do any operations that could lead to a deadlock From a7d518a023a6f906cbf1b146fb271fe34a219aba Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sat, 29 Mar 2025 13:28:52 -0400 Subject: [PATCH 09/16] aio: Implement smgr/md/fd write support TODO: - Right now the sync.c integration with smgr.c/md.c isn't properly safe to use in a critical section The only reason it doesn't immediately fail is that it's reasonably rare that RegisterSyncRequest() fails *and* either: - smgropen()->hash_search(HASH_ENTER) decides to resize the hash table, even though the lookup is guaranteed to succeed for io_method=worker. - an io_method=uring completion is run in a different backend and smgropen() needs to build a new entry and thus needs to allocate memory For a bit I thought this could be worked around easily enough by not doing an smgropen() in mdsyncfiletag(), or adding a "fallible" smgropen() and instead just opening the file directly. 
That actually does kinda solve the problem, but only because the memory
allocation in PathNameOpenFile() uses malloc(), not palloc() and thus doesn't
trigger

- temp_file_limit implementation
---
 src/backend/storage/aio/aio_callback.c |   1 +
 src/backend/storage/file/fd.c          |  28 ++++
 src/backend/storage/smgr/md.c          | 199 +++++++++++++++++++++++++
 src/backend/storage/smgr/smgr.c        |  29 ++++
 src/include/storage/aio.h              |   1 +
 src/include/storage/fd.h               |   1 +
 src/include/storage/md.h               |   5 +
 src/include/storage/smgr.h             |   5 +
 8 files changed, 269 insertions(+)

diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c
index bf42778a48c7..abe5b191b427 100644
--- a/src/backend/storage/aio/aio_callback.c
+++ b/src/backend/storage/aio/aio_callback.c
@@ -41,6 +41,7 @@ static const PgAioHandleCallbacksEntry aio_handle_cbs[] = {
 	CALLBACK_ENTRY(PGAIO_HCB_INVALID, aio_invalid_cb),
 
 	CALLBACK_ENTRY(PGAIO_HCB_MD_READV, aio_md_readv_cb),
+	CALLBACK_ENTRY(PGAIO_HCB_MD_WRITEV, aio_md_writev_cb),
 
 	CALLBACK_ENTRY(PGAIO_HCB_SHARED_BUFFER_READV, aio_shared_buffer_readv_cb),
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 0e8299dd5564..2138d47dab93 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2348,6 +2348,34 @@ FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
 	return returnCode;
 }
 
+int
+FileStartWriteV(PgAioHandle *ioh, File file,
+				int iovcnt, off_t offset,
+				uint32 wait_event_info)
+{
+	int			returnCode;
+	Vfd		   *vfdP;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileStartWriteV: %d (%s) " INT64_FORMAT " %d",
+			   file, VfdCache[file].fileName,
+			   (int64) offset,
+			   iovcnt));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	vfdP = &VfdCache[file];
+
+	/* FIXME: think about / reimplement temp_file_limit */
+
+	pgaio_io_start_writev(ioh, vfdP->fd, iovcnt, offset);
+
+	return 0;
+}
+
 int
 FileSync(File file, uint32 wait_event_info)
 {
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index bc7d94df870d..bacfe5f121ba 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -155,12 +155,19 @@ static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
 static void md_readv_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
+static PgAioResult md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
+static void md_writev_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
 
 const PgAioHandleCallbacks aio_md_readv_cb = {
 	.complete_shared = md_readv_complete,
 	.report = md_readv_report,
 };
 
+const PgAioHandleCallbacks aio_md_writev_cb = {
+	.complete_shared = md_writev_complete,
+	.report = md_writev_report,
+};
+
 
 static inline int
 _mdfd_open_flags(void)
@@ -1143,6 +1150,64 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	}
 }
 
+/*
+ * mdstartwritev() -- Asynchronous version of mdwritev().
+ */ +void +mdstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + off_t seekpos; + MdfdVec *v; + BlockNumber nblocks_this_segment; + struct iovec *iov; + int iovcnt; + int ret; + + v = _mdfd_getseg(reln, forknum, blocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + nblocks_this_segment = + Min(nblocks, + RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE))); + + if (nblocks_this_segment != nblocks) + elog(ERROR, "write crossing segment boundary"); + + iovcnt = pgaio_io_get_iovec(ioh, &iov); + + Assert(nblocks <= iovcnt); + + iovcnt = buffers_to_iovec(iov, unconstify(void **, buffers), nblocks_this_segment); + + Assert(iovcnt <= nblocks_this_segment); + + if (!(io_direct_flags & IO_DIRECT_DATA)) + pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED); + + pgaio_io_set_target_smgr(ioh, + reln, + forknum, + blocknum, + nblocks, + skipFsync); + pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_WRITEV, 0); + + ret = FileStartWriteV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + if (ret != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not start writing blocks %u..%u in file \"%s\": %m", + blocknum, + blocknum + nblocks_this_segment - 1, + FilePathName(v->mdfd_vfd)))); +} + /* * mdwriteback() -- Tell the kernel to write pages back to storage. @@ -1531,6 +1596,40 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) } } +/* + * Like register_dirty_segment(), except for use by AIO. In the completion + * callback we don't have access to the MdfdVec (the completion callback might + * be executed in a different backend than the issuing backend), therefore we + * have to implement this slightly differently. + */ +static void +register_dirty_segment_aio(RelFileLocator locator, ForkNumber forknum, uint64 segno) +{ + FileTag tag; + + INIT_MD_FILETAG(tag, locator, forknum, segno); + + /* + * Can't block here waiting for checkpointer to accept our sync request, + * as checkpointer might be waiting for this AIO to finish if offloaded to + * a worker. + */ + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) + { + char path[MAXPGPATH]; + + ereport(DEBUG1, + (errmsg_internal("could not forward fsync request because request queue is full"))); + + /* reuse mdsyncfiletag() to avoid duplicating code */ + if (mdsyncfiletag(&tag, path)) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + } +} + /* * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint */ @@ -2065,3 +2164,103 @@ md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel) td->smgr.nblocks * (size_t) BLCKSZ)); } } + +/* + * AIO completion callback for mdstartwritev(). 
+ */ +static PgAioResult +md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data) +{ + PgAioTargetData *td = pgaio_io_get_target_data(ioh); + PgAioResult result = prior_result; + + if (prior_result.result < 0) + { + result.status = PGAIO_RS_ERROR; + result.id = PGAIO_HCB_MD_WRITEV; + /* For "hard" errors, track the error number in error_data */ + result.error_data = -prior_result.result; + result.result = 0; + + pgaio_result_report(result, td, LOG); + + return result; + } + + /* + * As explained above smgrstartwritev(), the smgr API operates on the + * level of blocks, rather than bytes. Convert. + */ + result.result /= BLCKSZ; + + Assert(result.result <= td->smgr.nblocks); + + if (result.result == 0) + { + /* consider 0 blocks written a failure */ + result.status = PGAIO_RS_ERROR; + result.id = PGAIO_HCB_MD_WRITEV; + result.error_data = 0; + + pgaio_result_report(result, td, LOG); + + return result; + } + + if (result.status != PGAIO_RS_ERROR && + result.result < td->smgr.nblocks) + { + /* partial writes should be retried at upper level */ + result.status = PGAIO_RS_PARTIAL; + result.id = PGAIO_HCB_MD_WRITEV; + } + + if (!td->smgr.skip_fsync) + register_dirty_segment_aio(td->smgr.rlocator, td->smgr.forkNum, + td->smgr.blockNum / ((BlockNumber) RELSEG_SIZE)); + + return result; +} + +/* + * AIO error reporting callback for mdstartwritev(). + */ +static void +md_writev_report(PgAioResult result, const PgAioTargetData *td, int elevel) +{ + RelPathStr path; + + path = relpathbackend(td->smgr.rlocator, + td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER, + td->smgr.forkNum); + + if (result.error_data != 0) + { + errno = result.error_data; /* for errcode_for_file_access() */ + + ereport(elevel, + errcode_for_file_access(), + errmsg("could not write blocks %u..%u in file \"%s\": %m", + td->smgr.blockNum, + td->smgr.blockNum + td->smgr.nblocks, + path.str) + ); + } + else + { + /* + * NB: This will typically only be output in debug messages, while + * retrying a partial IO. + */ + ereport(elevel, + errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not write blocks %u..%u in file \"%s\": wrote only %zu of %zu bytes", + td->smgr.blockNum, + td->smgr.blockNum + td->smgr.nblocks - 1, + path.str, + result.result * (size_t) BLCKSZ, + td->smgr.nblocks * (size_t) BLCKSZ + ) + ); + } +} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index d532fea39d93..85daf686e08a 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -115,6 +115,11 @@ typedef struct f_smgr BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); + void (*smgr_startwritev) (PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); @@ -142,6 +147,7 @@ static const f_smgr smgrsw[] = { .smgr_readv = mdreadv, .smgr_startreadv = mdstartreadv, .smgr_writev = mdwritev, + .smgr_startwritev = mdstartwritev, .smgr_writeback = mdwriteback, .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, @@ -795,6 +801,29 @@ smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, RESUME_INTERRUPTS(); } +/* + * smgrstartwritev() -- asynchronous version of smgrwritev() + * + * This starts an asynchronous writev IO using the IO handle `ioh`. 
Other than + * `ioh` all parameters are the same as smgrwritev(). + * + * Completion callbacks above smgr will be passed the result as the number of + * successfully written blocks if the write [partially] succeeds. This + * maintains the abstraction that smgr operates on the level of blocks, rather + * than bytes. + */ +void +smgrstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + HOLD_INTERRUPTS(); + smgrsw[reln->smgr_which].smgr_startwritev(ioh, + reln, forknum, blocknum, buffers, + nblocks, skipFsync); + RESUME_INTERRUPTS(); +} + /* * smgrwriteback() -- Trigger kernel writeback for the supplied range of * blocks. diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h index 9fe9d9ad9fa4..bfe0d93683b1 100644 --- a/src/include/storage/aio.h +++ b/src/include/storage/aio.h @@ -194,6 +194,7 @@ typedef enum PgAioHandleCallbackID PGAIO_HCB_INVALID = 0, PGAIO_HCB_MD_READV, + PGAIO_HCB_MD_WRITEV, PGAIO_HCB_SHARED_BUFFER_READV, diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index b77d8e5e30e3..2cc7c5a47617 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -112,6 +112,7 @@ extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event extern ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); extern ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); extern int FileStartReadV(struct PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info); +extern int FileStartWriteV(struct PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info); extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 9d7131eff438..47ae6c36c94f 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -21,6 +21,7 @@ #include "storage/sync.h" extern const PgAioHandleCallbacks aio_md_readv_cb; +extern const PgAioHandleCallbacks aio_md_writev_cb; /* md storage manager functionality */ extern void mdinit(void); @@ -45,6 +46,10 @@ extern void mdstartreadv(PgAioHandle *ioh, extern void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); +extern void mdstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync); extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 856ebcda350e..f00b3763ac99 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -108,6 +108,11 @@ extern void smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); +extern void smgrstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber 
nblocks);
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);

From eb254b3debcfe2dbf83a92e075b77f8928173a03 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Tue, 18 Mar 2025 14:40:06 -0400
Subject: [PATCH 10/16] aio: Add bounce buffers

---
 src/backend/storage/aio/README.md             |  27 +++
 src/backend/storage/aio/aio.c                 | 178 ++++++++++++++++++
 src/backend/storage/aio/aio_init.c            | 123 ++++++++++++
 src/backend/utils/misc/guc_tables.c           |  13 ++
 src/backend/utils/misc/postgresql.conf.sample |   2 +
 src/backend/utils/resowner/resowner.c         |  25 ++-
 src/include/storage/aio.h                     |  15 ++
 src/include/storage/aio_internal.h            |  34 ++++
 src/include/storage/aio_types.h               |   2 +
 src/include/utils/resowner.h                  |   2 +
 src/test/modules/test_aio/test_aio--1.0.sql   |  21 +++
 src/test/modules/test_aio/test_aio.c          |  55 ++++++
 src/tools/pgindent/typedefs.list              |   1 +
 13 files changed, 496 insertions(+), 2 deletions(-)

diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md
index ddd59404a592..534aaad22bc3 100644
--- a/src/backend/storage/aio/README.md
+++ b/src/backend/storage/aio/README.md
@@ -406,6 +406,33 @@ shared memory no less!), completion callbacks instead have to encode errors
 in a more compact format that can be converted into an error message.
 
 
+### AIO Bounce Buffers
+
+For some uses of AIO there is no convenient memory location as the source /
+destination of an AIO. E.g. when data checksums are enabled, writes cannot be
+done directly from shared buffers, as a shared buffer lock still allows some
+modification, e.g., for hint bits (see `FlushBuffer()`). If the write were
+done in-place, such modifications could cause the checksum to fail.
+
+For synchronous IO this is solved by copying the buffer to separate memory
+before computing the checksum and using that copy as the source buffer for the
+IO.
+
+However, for AIO that is not a workable solution:
+- Instead of a single buffer many buffers are required, as many IOs might be
+  in flight
+- When using the [worker method](#worker), the source/target of IO needs to be
+  in shared memory, otherwise the workers won't be able to access the memory.
+
+The AIO subsystem addresses this by providing a limited number of bounce
+buffers that can be used as the source / target for IO. A bounce buffer can be
+acquired with `pgaio_bounce_buffer_get()` and multiple bounce buffers can be
+associated with an AIO Handle with `pgaio_io_assoc_bounce_buffer()`.
+
+Bounce buffers are automatically released when the IO completes.
+
+
 ## Helpers
 
 Using the low-level AIO API introduces too much complexity to do so all over
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index 86f7250b7a5f..cff48964d076 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -62,6 +62,8 @@ static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
 static const char *pgaio_io_state_get_name(PgAioHandleState s);
 static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
 
+static void pgaio_bounce_buffer_wait_for_free(void);
+
 
 /* Options for io_method.
*/ const struct config_enum_entry io_method_options[] = { @@ -76,6 +78,7 @@ const struct config_enum_entry io_method_options[] = { /* GUCs */ int io_method = DEFAULT_IO_METHOD; int io_max_concurrency = -1; +int io_bounce_buffers = -1; /* global control for AIO */ PgAioCtl *pgaio_ctl; @@ -662,6 +665,21 @@ pgaio_io_reclaim(PgAioHandle *ioh) if (ioh->state != PGAIO_HS_HANDED_OUT) dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node); + /* reclaim all associated bounce buffers */ + if (!slist_is_empty(&ioh->bounce_buffers)) + { + slist_mutable_iter it; + + slist_foreach_modify(it, &ioh->bounce_buffers) + { + PgAioBounceBuffer *bb = slist_container(PgAioBounceBuffer, node, it.cur); + + slist_delete_current(&it); + + slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node); + } + } + if (ioh->resowner) { ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node); @@ -1046,6 +1064,166 @@ pgaio_submit_staged(void) +/* -------------------------------------------------------------------------------- + * Functions primarily related to PgAioBounceBuffer + * -------------------------------------------------------------------------------- + */ + +PgAioBounceBuffer * +pgaio_bounce_buffer_get(void) +{ + PgAioBounceBuffer *bb = NULL; + slist_node *node; + + if (pgaio_my_backend->handed_out_bb != NULL) + elog(ERROR, "can only hand out one BB"); + + /* + * XXX: It probably is not a good idea to have bounce buffers be per + * backend, that's a fair bit of memory. + */ + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + pgaio_bounce_buffer_wait_for_free(); + } + + node = slist_pop_head_node(&pgaio_my_backend->idle_bbs); + bb = slist_container(PgAioBounceBuffer, node, node); + + pgaio_my_backend->handed_out_bb = bb; + + bb->resowner = CurrentResourceOwner; + ResourceOwnerRememberAioBounceBuffer(bb->resowner, &bb->resowner_node); + + return bb; +} + +void +pgaio_io_assoc_bounce_buffer(PgAioHandle *ioh, PgAioBounceBuffer *bb) +{ + if (pgaio_my_backend->handed_out_bb != bb) + elog(ERROR, "can only assign handed out BB"); + pgaio_my_backend->handed_out_bb = NULL; + + /* + * There can be many bounce buffers assigned in case of vectorized IOs. 
+ */ + slist_push_head(&ioh->bounce_buffers, &bb->node); + + /* once associated with an IO, the IO has ownership */ + ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node); + bb->resowner = NULL; +} + +uint32 +pgaio_bounce_buffer_id(PgAioBounceBuffer *bb) +{ + return bb - pgaio_ctl->bounce_buffers; +} + +void +pgaio_bounce_buffer_release(PgAioBounceBuffer *bb) +{ + if (pgaio_my_backend->handed_out_bb != bb) + elog(ERROR, "can only release handed out BB"); + + slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node); + pgaio_my_backend->handed_out_bb = NULL; + + ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node); + bb->resowner = NULL; +} + +void +pgaio_bounce_buffer_release_resowner(dlist_node *bb_node, bool on_error) +{ + PgAioBounceBuffer *bb = dlist_container(PgAioBounceBuffer, resowner_node, bb_node); + + Assert(bb->resowner); + + if (!on_error) + elog(WARNING, "leaked AIO bounce buffer"); + + pgaio_bounce_buffer_release(bb); +} + +char * +pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb) +{ + return bb->buffer; +} + +static void +pgaio_bounce_buffer_wait_for_free(void) +{ + static uint32 lastpos = 0; + + if (pgaio_my_backend->num_staged_ios > 0) + { + pgaio_debug(DEBUG2, "submitting %d, while acquiring free bb", + pgaio_my_backend->num_staged_ios); + pgaio_submit_staged(); + } + + for (uint32 i = lastpos; i < lastpos + io_max_concurrency; i++) + { + uint32 thisoff = pgaio_my_backend->io_handle_off + (i % io_max_concurrency); + PgAioHandle *ioh = &pgaio_ctl->io_handles[thisoff]; + + switch (ioh->state) + { + case PGAIO_HS_IDLE: + case PGAIO_HS_HANDED_OUT: + continue; + case PGAIO_HS_DEFINED: /* should have been submitted above */ + case PGAIO_HS_STAGED: + elog(ERROR, "shouldn't get here with io:%d in state %d", + pgaio_io_get_id(ioh), ioh->state); + break; + case PGAIO_HS_COMPLETED_IO: + case PGAIO_HS_SUBMITTED: + if (!slist_is_empty(&ioh->bounce_buffers)) + { + pgaio_debug_io(DEBUG2, ioh, + "waiting for IO to reclaim BB with %d in flight", + dclist_count(&pgaio_my_backend->in_flight_ios)); + + /* see comment in pgaio_io_wait_for_free() about raciness */ + pgaio_io_wait(ioh, ioh->generation); + + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + elog(WARNING, "empty after wait"); + + if (!slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + lastpos = i; + return; + } + } + break; + case PGAIO_HS_COMPLETED_SHARED: + case PGAIO_HS_COMPLETED_LOCAL: + /* reclaim */ + pgaio_io_reclaim(ioh); + + if (!slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + lastpos = i; + return; + } + break; + } + } + + /* + * The submission above could have caused the IO to complete at any time. 
+	 */
+	if (slist_is_empty(&pgaio_my_backend->idle_bbs))
+		elog(PANIC, "no more bbs");
+}
+
+
+
 /* --------------------------------------------------------------------------------
  * Other
  * --------------------------------------------------------------------------------
diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c
index 885c3940c662..95b10933fedb 100644
--- a/src/backend/storage/aio/aio_init.c
+++ b/src/backend/storage/aio/aio_init.c
@@ -88,6 +88,32 @@ AioHandleDataShmemSize(void)
 							io_max_concurrency));
 }
 
+static Size
+AioBounceBufferDescShmemSize(void)
+{
+	Size		sz;
+
+	/* PgAioBounceBuffer itself */
+	sz = mul_size(sizeof(PgAioBounceBuffer),
+				  mul_size(AioProcs(), io_bounce_buffers));
+
+	return sz;
+}
+
+static Size
+AioBounceBufferDataShmemSize(void)
+{
+	Size		sz;
+
+	/* and the associated buffer */
+	sz = mul_size(BLCKSZ,
+				  mul_size(io_bounce_buffers, AioProcs()));
+	/* memory for alignment */
+	sz += BLCKSZ;
+
+	return sz;
+}
+
 /*
  * Choose a suitable value for io_max_concurrency.
  *
@@ -113,6 +139,33 @@ AioChooseMaxConcurrency(void)
 	return Min(max_proportional_pins, 64);
 }
 
+/*
+ * Choose a suitable value for io_bounce_buffers.
+ *
+ * It's very unlikely to be useful to allocate more bounce buffers for each
+ * backend than the backend is allowed to pin. Additionally, as bounce buffers
+ * currently are only used for writes, it seems very uncommon for more than
+ * 10% of shared_buffers to be written out concurrently.
+ *
+ * XXX: This quickly can take up significant amounts of memory, the logic
+ * should probably be fine-tuned.
+ */
+static int
+AioChooseBounceBuffers(void)
+{
+	uint32		max_backends;
+	int			max_proportional_pins;
+
+	/* Similar logic to LimitAdditionalPins() */
+	max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
+	max_proportional_pins = (NBuffers / 10) / max_backends;
+
+	max_proportional_pins = Max(max_proportional_pins, 1);
+
+	/* apply upper limit */
+	return Min(max_proportional_pins, 256);
+}
+
 Size
 AioShmemSize(void)
 {
@@ -136,11 +189,31 @@ AioShmemSize(void)
 						PGC_S_OVERRIDE);
 	}
 
+
+	/*
+	 * If io_bounce_buffers is -1, we automatically choose a suitable value.
+	 *
+	 * See also comment above.
+	 */
+	if (io_bounce_buffers == -1)
+	{
+		char		buf[32];
+
+		snprintf(buf, sizeof(buf), "%d", AioChooseBounceBuffers());
+		SetConfigOption("io_bounce_buffers", buf, PGC_POSTMASTER,
+						PGC_S_DYNAMIC_DEFAULT);
+		if (io_bounce_buffers == -1)	/* failed to apply it? */
+			SetConfigOption("io_bounce_buffers", buf, PGC_POSTMASTER,
+							PGC_S_OVERRIDE);
+	}
+
 	sz = add_size(sz, AioCtlShmemSize());
 	sz = add_size(sz, AioBackendShmemSize());
 	sz = add_size(sz, AioHandleShmemSize());
 	sz = add_size(sz, AioHandleIOVShmemSize());
 	sz = add_size(sz, AioHandleDataShmemSize());
+	sz = add_size(sz, AioBounceBufferDescShmemSize());
+	sz = add_size(sz, AioBounceBufferDataShmemSize());
 
 	/* Reserve space for method specific resources.
*/ if (pgaio_method_ops->shmem_size) @@ -156,6 +229,9 @@ AioShmemInit(void) uint32 io_handle_off = 0; uint32 iovec_off = 0; uint32 per_backend_iovecs = io_max_concurrency * io_max_combine_limit; + uint32 bounce_buffers_off = 0; + uint32 per_backend_bb = io_bounce_buffers; + char *bounce_buffers_data; pgaio_ctl = (PgAioCtl *) ShmemInitStruct("AioCtl", AioCtlShmemSize(), &found); @@ -167,6 +243,7 @@ AioShmemInit(void) pgaio_ctl->io_handle_count = AioProcs() * io_max_concurrency; pgaio_ctl->iovec_count = AioProcs() * per_backend_iovecs; + pgaio_ctl->bounce_buffers_count = AioProcs() * per_backend_bb; pgaio_ctl->backend_state = (PgAioBackend *) ShmemInitStruct("AioBackend", AioBackendShmemSize(), &found); @@ -179,6 +256,40 @@ AioShmemInit(void) pgaio_ctl->handle_data = (uint64 *) ShmemInitStruct("AioHandleData", AioHandleDataShmemSize(), &found); + pgaio_ctl->bounce_buffers = (PgAioBounceBuffer *) + ShmemInitStruct("AioBounceBufferDesc", AioBounceBufferDescShmemSize(), + &found); + + bounce_buffers_data = + ShmemInitStruct("AioBounceBufferData", AioBounceBufferDataShmemSize(), + &found); + bounce_buffers_data = + (char *) TYPEALIGN(BLCKSZ, (uintptr_t) bounce_buffers_data); + pgaio_ctl->bounce_buffers_data = bounce_buffers_data; + + + /* Initialize IO handles. */ + for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++) + { + PgAioHandle *ioh = &pgaio_ctl->io_handles[i]; + + ioh->op = PGAIO_OP_INVALID; + ioh->target = PGAIO_TID_INVALID; + ioh->state = PGAIO_HS_IDLE; + + slist_init(&ioh->bounce_buffers); + } + + /* Initialize Bounce Buffers. */ + for (uint64 i = 0; i < pgaio_ctl->bounce_buffers_count; i++) + { + PgAioBounceBuffer *bb = &pgaio_ctl->bounce_buffers[i]; + + bb->buffer = bounce_buffers_data; + bounce_buffers_data += BLCKSZ; + } + + for (int procno = 0; procno < AioProcs(); procno++) { PgAioBackend *bs = &pgaio_ctl->backend_state[procno]; @@ -186,9 +297,13 @@ AioShmemInit(void) bs->io_handle_off = io_handle_off; io_handle_off += io_max_concurrency; + bs->bounce_buffers_off = bounce_buffers_off; + bounce_buffers_off += per_backend_bb; + dclist_init(&bs->idle_ios); memset(bs->staged_ios, 0, sizeof(PgAioHandle *) * PGAIO_SUBMIT_BATCH_SIZE); dclist_init(&bs->in_flight_ios); + slist_init(&bs->idle_bbs); /* initialize per-backend IOs */ for (int i = 0; i < io_max_concurrency; i++) @@ -210,6 +325,14 @@ AioShmemInit(void) dclist_push_tail(&bs->idle_ios, &ioh->node); iovec_off += io_max_combine_limit; } + + /* initialize per-backend bounce buffers */ + for (int i = 0; i < per_backend_bb; i++) + { + PgAioBounceBuffer *bb = &pgaio_ctl->bounce_buffers[bs->bounce_buffers_off + i]; + + slist_push_head(&bs->idle_bbs, &bb->node); + } } out: diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 4eaeca89f2c7..bd49b302293c 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -3293,6 +3293,19 @@ struct config_int ConfigureNamesInt[] = check_io_max_concurrency, NULL, NULL }, + { + {"io_bounce_buffers", + PGC_POSTMASTER, + RESOURCES_IO, + gettext_noop("Number of IO Bounce Buffers reserved for each backend."), + NULL, + GUC_UNIT_BLOCKS + }, + &io_bounce_buffers, + -1, -1, 4096, + NULL, NULL, NULL + }, + { {"io_workers", PGC_SIGHUP, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ff56a1f0732c..2c6456e907f5 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -211,6 +211,8 @@ # -1 sets based on 
shared_buffers # (change requires restart) #io_workers = 3 # 1-32; +#io_bounce_buffers = -1 # -1 sets based on shared_buffers + # (change requires restart) # - Worker Processes - diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index d39f3e1b655c..81e7e27965a6 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -159,10 +159,11 @@ struct ResourceOwnerData LOCALLOCK *locks[MAX_RESOWNER_LOCKS]; /* list of owned locks */ /* - * AIO handles need be registered in critical sections and therefore - * cannot use the normal ResourceElem mechanism. + * AIO handles & bounce buffers need be registered in critical sections + * and therefore cannot use the normal ResourceElem mechanism. */ dlist_head aio_handles; + dlist_head aio_bounce_buffers; }; @@ -434,6 +435,7 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name) } dlist_init(&owner->aio_handles); + dlist_init(&owner->aio_bounce_buffers); return owner; } @@ -742,6 +744,13 @@ ResourceOwnerReleaseInternal(ResourceOwner owner, pgaio_io_release_resowner(node, !isCommit); } + + while (!dlist_is_empty(&owner->aio_bounce_buffers)) + { + dlist_node *node = dlist_head_node(&owner->aio_bounce_buffers); + + pgaio_bounce_buffer_release_resowner(node, !isCommit); + } } else if (phase == RESOURCE_RELEASE_LOCKS) { @@ -1111,3 +1120,15 @@ ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node) { dlist_delete_from(&owner->aio_handles, ioh_node); } + +void +ResourceOwnerRememberAioBounceBuffer(ResourceOwner owner, struct dlist_node *ioh_node) +{ + dlist_push_tail(&owner->aio_bounce_buffers, ioh_node); +} + +void +ResourceOwnerForgetAioBounceBuffer(ResourceOwner owner, struct dlist_node *ioh_node) +{ + dlist_delete_from(&owner->aio_bounce_buffers, ioh_node); +} diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h index bfe0d93683b1..f91f0afc5a58 100644 --- a/src/include/storage/aio.h +++ b/src/include/storage/aio.h @@ -353,6 +353,20 @@ extern bool pgaio_have_staged(void); +/* -------------------------------------------------------------------------------- + * Bounce Buffers + * -------------------------------------------------------------------------------- + */ + +extern PgAioBounceBuffer *pgaio_bounce_buffer_get(void); +extern void pgaio_io_assoc_bounce_buffer(PgAioHandle *ioh, PgAioBounceBuffer *bb); +extern uint32 pgaio_bounce_buffer_id(PgAioBounceBuffer *bb); +extern void pgaio_bounce_buffer_release(PgAioBounceBuffer *bb); +extern char *pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb); +extern void pgaio_bounce_buffer_release_resowner(struct dlist_node *bb_node, bool on_error); + + + /* -------------------------------------------------------------------------------- * Other * -------------------------------------------------------------------------------- @@ -365,6 +379,7 @@ extern void pgaio_closing_fd(int fd); /* GUCs */ extern PGDLLIMPORT int io_method; extern PGDLLIMPORT int io_max_concurrency; +extern PGDLLIMPORT int io_bounce_buffers; #endif /* AIO_H */ diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h index 7f18da2c8565..833f97361a1f 100644 --- a/src/include/storage/aio_internal.h +++ b/src/include/storage/aio_internal.h @@ -127,6 +127,12 @@ struct PgAioHandle /* raw result of the IO operation */ int32 result; + /* + * List of bounce_buffers owned by IO. It would suffice to use an index + * based linked list here. 
+	 */
+	slist_head	bounce_buffers;
+
 	/**
 	 * In which list the handle is registered, depends on the state:
 	 * - IDLE, in per-backend list
@@ -182,11 +188,24 @@ struct PgAioHandle
 };
 
 
+/* typedef is in aio_types.h */
+struct PgAioBounceBuffer
+{
+	slist_node	node;
+	struct ResourceOwnerData *resowner;
+	dlist_node	resowner_node;
+	char	   *buffer;
+};
+
+
 typedef struct PgAioBackend
 {
 	/* index into PgAioCtl->io_handles */
 	uint32		io_handle_off;
 
+	/* index into PgAioCtl->bounce_buffers */
+	uint32		bounce_buffers_off;
+
 	/* IO Handles that currently are not used */
 	dclist_head idle_ios;
 
@@ -217,6 +236,12 @@ typedef struct PgAioBackend
 	 * IOs being appended at the end.
 	 */
 	dclist_head in_flight_ios;
+
+	/* Bounce Buffers that currently are not used */
+	slist_head	idle_bbs;
+
+	/* see handed_out_io */
+	PgAioBounceBuffer *handed_out_bb;
 } PgAioBackend;
 
 
@@ -244,6 +269,15 @@ typedef struct PgAioCtl
 	uint32		io_handle_count;
 	PgAioHandle *io_handles;
+
+	/*
+	 * To perform AIO on data that cannot be used as the IO source / target
+	 * directly (either because it is not in shared memory or because we need
+	 * to operate on a copy, as is e.g. the case for writes when checksums are
+	 * in use)
+	 */
+	uint32		bounce_buffers_count;
+	PgAioBounceBuffer *bounce_buffers;
+	char	   *bounce_buffers_data;
 } PgAioCtl;
 
diff --git a/src/include/storage/aio_types.h b/src/include/storage/aio_types.h
index 181833660778..3c18dade49cb 100644
--- a/src/include/storage/aio_types.h
+++ b/src/include/storage/aio_types.h
@@ -134,4 +134,6 @@ typedef struct PgAioReturn
 } PgAioReturn;
 
 
+typedef struct PgAioBounceBuffer PgAioBounceBuffer;
+
 #endif							/* AIO_TYPES_H */
diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h
index aede4bfc820a..7e2ec2241693 100644
--- a/src/include/utils/resowner.h
+++ b/src/include/utils/resowner.h
@@ -168,5 +168,7 @@ extern void ResourceOwnerForgetLock(ResourceOwner owner, struct LOCALLOCK *local
 struct dlist_node;
 extern void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node);
 extern void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node);
+extern void ResourceOwnerRememberAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node);
+extern void ResourceOwnerForgetAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node);
 
 #endif							/* RESOWNER_H */
diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql
index e495481c41e4..e2a812351662 100644
--- a/src/test/modules/test_aio/test_aio--1.0.sql
+++ b/src/test/modules/test_aio/test_aio--1.0.sql
@@ -87,6 +87,27 @@ RETURNS pg_catalog.void STRICT
 AS 'MODULE_PATHNAME' LANGUAGE C;
 
 
+CREATE FUNCTION bb_get_and_error()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION bb_get_twice()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION bb_get()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION bb_get_release()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION bb_release_last()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+
 /*
  * Injection point related functions
diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c
index bef3b0e3ad04..2a7f4378ef37 100644
--- a/src/test/modules/test_aio/test_aio.c
+++ b/src/test/modules/test_aio/test_aio.c
@@ -52,6 +52,7 @@ static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
 
 static PgAioHandle *last_handle;
+static PgAioBounceBuffer
*last_bb;


@@ -669,6 +670,60 @@ batch_end(PG_FUNCTION_ARGS)
 	PG_RETURN_VOID();
 }
 
+PG_FUNCTION_INFO_V1(bb_get);
+Datum
+bb_get(PG_FUNCTION_ARGS)
+{
+	last_bb = pgaio_bounce_buffer_get();
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(bb_release_last);
+Datum
+bb_release_last(PG_FUNCTION_ARGS)
+{
+	if (!last_bb)
+		elog(ERROR, "no bb");
+
+	pgaio_bounce_buffer_release(last_bb);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(bb_get_and_error);
+Datum
+bb_get_and_error(PG_FUNCTION_ARGS)
+{
+	pgaio_bounce_buffer_get();
+
+	elog(ERROR, "as you command");
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(bb_get_twice);
+Datum
+bb_get_twice(PG_FUNCTION_ARGS)
+{
+	pgaio_bounce_buffer_get();
+	pgaio_bounce_buffer_get();
+
+	PG_RETURN_VOID();
+}
+
+
+PG_FUNCTION_INFO_V1(bb_get_release);
+Datum
+bb_get_release(PG_FUNCTION_ARGS)
+{
+	PgAioBounceBuffer *bb;
+
+	bb = pgaio_bounce_buffer_get();
+	pgaio_bounce_buffer_release(bb);
+
+	PG_RETURN_VOID();
+}
+
 #ifdef USE_INJECTION_POINTS
 extern PGDLLEXPORT void inj_io_short_read(const char *name, const void *private_data);
 extern PGDLLEXPORT void inj_io_reopen(const char *name, const void *private_data);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index b66cecd87991..3a67ee01b469 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2137,6 +2137,7 @@ PermutationStep
 PermutationStepBlocker
 PermutationStepBlockerType
 PgAioBackend
+PgAioBounceBuffer
 PgAioCtl
 PgAioHandle
 PgAioHandleCallbackID

From c779e9b07a9ade857db047788c792a0817d55007 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Sun, 30 Mar 2025 16:43:54 -0400
Subject: [PATCH 11/16] bufmgr: Implement AIO write support

As of this commit there are no users of these AIO facilities, that'll come in
later commits.

Problems with AIO writes:

- Write logic needs to be rebased on top of the patch series so that hint
  bits are not set on dirty buffers while IO is going on

  The performance impact of doing the memory copies is rather substantial, as
  on Intel memory bandwidth is *the* IO bottleneck even just for the checksum
  computation, without a copy. That makes the memory copy for something like
  bounce buffers hurt really badly.

  And the memory usage of bounce buffers is also really concerning.

  And even without checksums, several filesystems *really* don't like buffers
  getting modified during DIO writes. Which I think would mean we ought to
  use bounce buffers for *all* writes, which would impose a *very*
  substantial overhead (basically removing the benefit of DMA happening
  off-cpu).

- I think it requires new lwlock.c infrastructure (as v1 of aio had), to make
  LockBuffer(BUFFER_LOCK_EXCLUSIVE) etc wait in a concurrency safe manner for
  in-progress writes

  I can think of ways to solve this purely in bufmgr.c, but only in ways that
  would cause other problems (e.g. setting BM_IO_IN_PROGRESS before waiting
  for an exclusive lock) and/or would be expensive.
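To make the copy cost concrete, a write through a bounce buffer would look
roughly like this (a hypothetical sketch assuming the bounce-buffer API from
the previous patch and smgrstartwritev() from the smgr/md write patch;
stage_checksummed_write() is a made-up name, and locking/pinning of the
buffer is omitted):

    static void
    stage_checksummed_write(PgAioHandle *ioh, SMgrRelation reln,
                            ForkNumber forknum, BlockNumber blocknum,
                            Buffer buffer)
    {
        PgAioBounceBuffer *bb = pgaio_bounce_buffer_get();
        char       *copy = pgaio_bounce_buffer_buffer(bb);
        const void *pages[1];

        /* copy first, so concurrent hint-bit setters can't skew the checksum */
        memcpy(copy, BufferGetBlock(buffer), BLCKSZ);
        ((PageHeader) copy)->pd_checksum = pg_checksum_page(copy, blocknum);

        /* the IO takes over ownership; released when the IO completes */
        pgaio_io_assoc_bounce_buffer(ioh, bb);

        pages[0] = copy;
        smgrstartwritev(ioh, reln, forknum, blocknum, pages, 1, false);
    }

The memcpy() of each page in that sketch is exactly the memory-bandwidth cost
discussed above.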
---
 src/backend/storage/aio/aio_callback.c |   2 +
 src/backend/storage/buffer/bufmgr.c    | 189 +++++++++++++++++++++++-
 src/include/storage/aio.h              |   4 +-
 src/include/storage/bufmgr.h           |   2 +
 4 files changed, 190 insertions(+), 7 deletions(-)

diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c
index abe5b191b427..21665e7eccb3 100644
--- a/src/backend/storage/aio/aio_callback.c
+++ b/src/backend/storage/aio/aio_callback.c
@@ -44,8 +44,10 @@ static const PgAioHandleCallbacksEntry aio_handle_cbs[] = {
 	CALLBACK_ENTRY(PGAIO_HCB_MD_WRITEV, aio_md_writev_cb),
 
 	CALLBACK_ENTRY(PGAIO_HCB_SHARED_BUFFER_READV, aio_shared_buffer_readv_cb),
+	CALLBACK_ENTRY(PGAIO_HCB_SHARED_BUFFER_WRITEV, aio_shared_buffer_writev_cb),
 
 	CALLBACK_ENTRY(PGAIO_HCB_LOCAL_BUFFER_READV, aio_local_buffer_readv_cb),
+	CALLBACK_ENTRY(PGAIO_HCB_LOCAL_BUFFER_WRITEV, aio_local_buffer_writev_cb),
 
 #undef CALLBACK_ENTRY
 };
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1c37d7dfe2f9..9bf30c44af01 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -5536,7 +5536,15 @@ LockBuffer(Buffer buffer, int mode)
 	else if (mode == BUFFER_LOCK_SHARE)
 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
+	{
+		/*
+		 * FIXME: Wait for AIO writes, otherwise there would be a risk of
+		 * deadlock. This isn't entirely trivial to do in a race-free way; IO
+		 * could be started between us checking whether there is IO and
+		 * enqueueing ourselves for the lock.
+		 */
 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+	}
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -5551,6 +5559,19 @@ ConditionalLockBuffer(Buffer buffer)
 {
 	BufferDesc *buf;
 
+	/*
+	 * FIXME: Wait for AIO writes. Some code does not deal well with
+	 * ConditionalLockBuffer() continuously failing, e.g.
+	 * spginsert()->spgdoinsert() ends up busy-looping (easily reproducible by
+	 * just making this function always fail and running the regression
+	 * tests). While that code could be fixed, it'd be hard to find all
+	 * problematic places.
+	 *
+	 * It would be OK to wait for the IO as waiting for IO completion does not
+	 * need to wait for any locks that could lead to an undetected deadlock or
+	 * such.
+	 */
+
 	Assert(BufferIsPinned(buffer));
 	if (BufferIsLocal(buffer))
 		return true;			/* act as though we got it */
@@ -5614,10 +5635,8 @@ LockBufferForCleanup(Buffer buffer)
 	CheckBufferIsPinnedOnce(buffer);
 
 	/*
-	 * We do not yet need to be worried about in-progress AIOs holding a pin,
-	 * as we, so far, only support doing reads via AIO and this function can
-	 * only be called once the buffer is valid (i.e. no read can be in
-	 * flight).
+	 * FIXME: See AIO related comments in LockBuffer() and
+	 * ConditionalLockBuffer()
 	 */
 
 	/* Nobody else to wait for */
@@ -5630,6 +5649,11 @@ LockBufferForCleanup(Buffer buffer)
 	{
 		uint32		buf_state;
 
+		/*
+		 * FIXME: LockBuffer()'s handling of in-progress writes (once
+		 * implemented) should suffice to deal with deadlock risk.
+		 */
+
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		buf_state = LockBufHdr(bufHdr);
@@ -5777,7 +5801,13 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 
 	Assert(BufferIsValid(buffer));
 
-	/* see AIO related comment in LockBufferForCleanup() */
+	/*
+	 * FIXME: Should wait for IO for the same reason as in
+	 * ConditionalLockBuffer().
Needs to happen before the
+	 * ConditionalLockBuffer() call below, as we'd never reach the
+	 * ConditionalLockBuffer() call due to the buffer pin held for the
+	 * duration of the IO.
+	 */
 
 	if (BufferIsLocal(buffer))
 	{
@@ -5834,7 +5864,10 @@ IsBufferCleanupOK(Buffer buffer)
 
 	Assert(BufferIsValid(buffer));
 
-	/* see AIO related comment in LockBufferForCleanup() */
+	/*
+	 * FIXME: See AIO related comments in LockBuffer() and
+	 * ConditionalLockBuffer()
+	 */
 
 	if (BufferIsLocal(buffer))
 	{
@@ -7140,12 +7173,129 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
 			affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
 }
 
+/*
+ * Helper for AIO writev completion callbacks, supporting both shared and temp
+ * buffers. Gets called once for each buffer in a multi-page write.
+ */
+static pg_attribute_always_inline PgAioResult
+buffer_writev_complete_one(uint8 buf_off, Buffer buffer, uint8 flags,
+						   bool failed, bool is_temp)
+{
+	BufferDesc *buf_hdr = is_temp ?
+		GetLocalBufferDescriptor(-buffer - 1)
+		: GetBufferDescriptor(buffer - 1);
+	PgAioResult result = {.status = PGAIO_RS_OK};
+	bool		clear_dirty;
+	uint32		set_flag_bits;
+
+#ifdef USE_ASSERT_CHECKING
+	{
+		uint32		buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+		Assert(buf_state & BM_VALID);
+		Assert(buf_state & BM_TAG_VALID);
+		/* temp buffers don't use BM_IO_IN_PROGRESS */
+		if (!is_temp)
+			Assert(buf_state & BM_IO_IN_PROGRESS);
+		Assert(buf_state & BM_DIRTY);
+	}
+#endif
+
+	clear_dirty = failed ? false : true;
+	set_flag_bits = failed ? BM_IO_ERROR : 0;
+
+	if (is_temp)
+		TerminateLocalBufferIO(buf_hdr, clear_dirty, set_flag_bits, true);
+	else
+		TerminateBufferIO(buf_hdr, clear_dirty, set_flag_bits, false, true);
+
+	/*
+	 * The initiator of IO is not managing the lock (i.e. it called
+	 * LWLockDisown()), we are.
+	 */
+	if (!is_temp)
+		LWLockReleaseDisowned(BufferDescriptorGetContentLock(buf_hdr),
+							  LW_SHARED);
+
+	/* FIXME: tracepoint */
+
+	return result;
+}
+
+/*
+ * Perform completion handling of a single AIO write. This write may cover
+ * multiple blocks / buffers.
+ *
+ * Shared between shared and local buffers, to reduce code duplication.
+ */
+static pg_attribute_always_inline PgAioResult
+buffer_writev_complete(PgAioHandle *ioh, PgAioResult prior_result,
+					   uint8 cb_data, bool is_temp)
+{
+	PgAioResult result = prior_result;
+	PgAioTargetData *td = pgaio_io_get_target_data(ioh);
+	uint64	   *io_data;
+	uint8		handle_data_len;
+
+	if (is_temp)
+	{
+		Assert(td->smgr.is_temp);
+		Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
+	}
+	else
+		Assert(!td->smgr.is_temp);
+
+	/*
+	 * Iterate over all the buffers affected by this IO and call the
+	 * appropriate per-buffer completion function for each buffer.
+	 */
+	io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
+	for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
+	{
+		Buffer		buf = io_data[buf_off];
+		PgAioResult buf_result;
+		bool		failed;
+
+		Assert(BufferIsValid(buf));
+
+		/*
+		 * If the entire IO failed at a lower level, each buffer needs to be
+		 * marked as failed. In case of a partial write, some buffers may be
+		 * ok.
+		 */
+		failed =
+			prior_result.status == PGAIO_RS_ERROR
+			|| prior_result.result <= buf_off;
+
+		buf_result = buffer_writev_complete_one(buf_off, buf, cb_data, failed,
+												is_temp);
+
+		/*
+		 * If there wasn't any prior error and the IO for this page failed in
+		 * some form, set the whole IO's result to the page's result.
+ */ + if (result.status != PGAIO_RS_ERROR && buf_result.status != PGAIO_RS_OK) + { + result = buf_result; + pgaio_result_report(result, td, LOG); + } + } + + return result; +} + static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data) { buffer_stage_common(ioh, false, false); } +static void +shared_buffer_writev_stage(PgAioHandle *ioh, uint8 cb_data) +{ + buffer_stage_common(ioh, true, false); +} + static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data) @@ -7191,6 +7341,13 @@ shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, return prior_result; } +static PgAioResult +shared_buffer_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, + uint8 cb_data) +{ + return buffer_writev_complete(ioh, prior_result, cb_data, false); +} + static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data) { @@ -7204,6 +7361,17 @@ local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, return buffer_readv_complete(ioh, prior_result, cb_data, true); } +static void +local_buffer_writev_stage(PgAioHandle *ioh, uint8 cb_data) +{ + /* + * Currently this is unreachable as the only write support is for + * checkpointer / bgwriter, which don't deal with local buffers. + */ + elog(ERROR, "should be unreachable"); +} + + /* readv callback is passed READ_BUFFERS_* flags as callback data */ const PgAioHandleCallbacks aio_shared_buffer_readv_cb = { .stage = shared_buffer_readv_stage, @@ -7213,6 +7381,11 @@ const PgAioHandleCallbacks aio_shared_buffer_readv_cb = { .report = buffer_readv_report, }; +const PgAioHandleCallbacks aio_shared_buffer_writev_cb = { + .stage = shared_buffer_writev_stage, + .complete_shared = shared_buffer_writev_complete, +}; + /* readv callback is passed READ_BUFFERS_* flags as callback data */ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .stage = local_buffer_readv_stage, @@ -7226,3 +7399,7 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .complete_local = local_buffer_readv_complete, .report = buffer_readv_report, }; + +const PgAioHandleCallbacks aio_local_buffer_writev_cb = { + .stage = local_buffer_writev_stage, +}; diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h index f91f0afc5a58..72d5680e767e 100644 --- a/src/include/storage/aio.h +++ b/src/include/storage/aio.h @@ -197,11 +197,13 @@ typedef enum PgAioHandleCallbackID PGAIO_HCB_MD_WRITEV, PGAIO_HCB_SHARED_BUFFER_READV, + PGAIO_HCB_SHARED_BUFFER_WRITEV, PGAIO_HCB_LOCAL_BUFFER_READV, + PGAIO_HCB_LOCAL_BUFFER_WRITEV, } PgAioHandleCallbackID; -#define PGAIO_HCB_MAX PGAIO_HCB_LOCAL_BUFFER_READV +#define PGAIO_HCB_MAX PGAIO_HCB_LOCAL_BUFFER_WRITEV StaticAssertDecl(PGAIO_HCB_MAX <= (1 << PGAIO_RESULT_ID_BITS), "PGAIO_HCB_MAX is too big for PGAIO_RESULT_ID_BITS"); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index f2192ceb2719..492feab0cb56 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -174,7 +174,9 @@ extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; extern const PgAioHandleCallbacks aio_shared_buffer_readv_cb; +extern const PgAioHandleCallbacks aio_shared_buffer_writev_cb; extern const PgAioHandleCallbacks aio_local_buffer_readv_cb; +extern const PgAioHandleCallbacks aio_local_buffer_writev_cb; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; From 6074c30595e7bdb9a6ce19be257c96f253ab9830 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 18 Mar 2025 14:40:06 -0400 
Subject: [PATCH 12/16] aio: Add IO queue helper

This is likely never going anywhere - Thomas Munro is working on
something more complete. But I needed a way to exercise aio for
checkpointer / bgwriter.
---
 src/backend/storage/aio/Makefile    |   1 +
 src/backend/storage/aio/io_queue.c  | 204 ++++++++++++++++++++++++++++
 src/backend/storage/aio/meson.build |   1 +
 src/include/storage/io_queue.h      |  33 +++++
 src/tools/pgindent/typedefs.list    |   2 +
 5 files changed, 241 insertions(+)
 create mode 100644 src/backend/storage/aio/io_queue.c
 create mode 100644 src/include/storage/io_queue.h

diff --git a/src/backend/storage/aio/Makefile b/src/backend/storage/aio/Makefile
index 3f2469cc3994..86fa4276fda9 100644
--- a/src/backend/storage/aio/Makefile
+++ b/src/backend/storage/aio/Makefile
@@ -15,6 +15,7 @@ OBJS = \
 	aio_init.o \
 	aio_io.o \
 	aio_target.o \
+	io_queue.o \
 	method_io_uring.o \
 	method_sync.o \
 	method_worker.o \
diff --git a/src/backend/storage/aio/io_queue.c b/src/backend/storage/aio/io_queue.c
new file mode 100644
index 000000000000..526aa1d5e06d
--- /dev/null
+++ b/src/backend/storage/aio/io_queue.c
@@ -0,0 +1,204 @@
+/*-------------------------------------------------------------------------
+ *
+ * io_queue.c
+ *	  AIO - Mechanism for tracking many IOs
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/aio/io_queue.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/ilist.h"
+#include "storage/aio.h"
+#include "storage/io_queue.h"
+#include "utils/resowner.h"
+
+
+
+typedef struct TrackedIO
+{
+	PgAioWaitRef iow;
+	dlist_node	node;
+} TrackedIO;
+
+struct IOQueue
+{
+	int			depth;
+	int			unsubmitted;
+
+	bool		has_reserved;
+
+	dclist_head idle;
+	dclist_head in_progress;
+
+	TrackedIO	tracked_ios[FLEXIBLE_ARRAY_MEMBER];
+};
+
+
+IOQueue *
+io_queue_create(int depth, int flags)
+{
+	size_t		sz;
+	IOQueue    *ioq;
+
+	sz = offsetof(IOQueue, tracked_ios) +
+		sizeof(TrackedIO) * depth;
+
+	ioq = palloc0(sz);
+
+	ioq->depth = depth;
+
+	for (int i = 0; i < depth; i++)
+	{
+		TrackedIO  *tio = &ioq->tracked_ios[i];
+
+		pgaio_wref_clear(&tio->iow);
+		dclist_push_tail(&ioq->idle, &tio->node);
+	}
+
+	return ioq;
+}
+
+void
+io_queue_wait_one(IOQueue *ioq)
+{
+	/* submit all pending IO before waiting */
+	pgaio_submit_staged();
+
+	while (!dclist_is_empty(&ioq->in_progress))
+	{
+		/* FIXME: Should we really pop here already? */
+		dlist_node *node = dclist_pop_head_node(&ioq->in_progress);
+		TrackedIO  *tio = dclist_container(TrackedIO, node, node);
+
+		pgaio_wref_wait(&tio->iow);
+		dclist_push_head(&ioq->idle, &tio->node);
+	}
+}
+
+void
+io_queue_reserve(IOQueue *ioq)
+{
+	if (ioq->has_reserved)
+		return;
+
+	if (dclist_is_empty(&ioq->idle))
+		io_queue_wait_one(ioq);
+
+	Assert(!dclist_is_empty(&ioq->idle));
+
+	ioq->has_reserved = true;
+}
+
+PgAioHandle *
+io_queue_acquire_io(IOQueue *ioq)
+{
+	PgAioHandle *ioh;
+
+	io_queue_reserve(ioq);
+
+	Assert(!dclist_is_empty(&ioq->idle));
+
+	if (!io_queue_is_empty(ioq))
+	{
+		ioh = pgaio_io_acquire_nb(CurrentResourceOwner, NULL);
+		if (ioh == NULL)
+		{
+			/*
+			 * Need to wait for all IOs, blocking might not be legal in the
+			 * context.
+			 *
+			 * XXX: This doesn't make a whole lot of sense, we're also
+			 * blocking here. What was I smoking when I wrote the above?
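+			 *
+			 * Note that pgaio_io_acquire() below can also block; draining
+			 * this queue's IOs first merely makes it likely that a handle
+			 * is free by the time we get there.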
+ */ + io_queue_wait_all(ioq); + ioh = pgaio_io_acquire(CurrentResourceOwner, NULL); + } + } + else + { + ioh = pgaio_io_acquire(CurrentResourceOwner, NULL); + } + + return ioh; +} + +void +io_queue_track(IOQueue *ioq, const struct PgAioWaitRef *iow) +{ + dlist_node *node; + TrackedIO *tio; + + Assert(ioq->has_reserved); + ioq->has_reserved = false; + + Assert(!dclist_is_empty(&ioq->idle)); + + node = dclist_pop_head_node(&ioq->idle); + tio = dclist_container(TrackedIO, node, node); + + tio->iow = *iow; + + dclist_push_tail(&ioq->in_progress, &tio->node); + + ioq->unsubmitted++; + + /* + * XXX: Should have some smarter logic here. We don't want to wait too + * long to submit, that'll mean we're more likely to block. But we also + * don't want to have the overhead of submitting every IO individually. + */ + if (ioq->unsubmitted >= 4) + { + pgaio_submit_staged(); + ioq->unsubmitted = 0; + } +} + +void +io_queue_wait_all(IOQueue *ioq) +{ + /* submit all pending IO before waiting */ + pgaio_submit_staged(); + + while (!dclist_is_empty(&ioq->in_progress)) + { + /* wait for the last IO to minimize unnecessary wakeups */ + dlist_node *node = dclist_tail_node(&ioq->in_progress); + TrackedIO *tio = dclist_container(TrackedIO, node, node); + + if (!pgaio_wref_check_done(&tio->iow)) + { + ereport(DEBUG3, + errmsg("io_queue_wait_all for io:%d", + pgaio_wref_get_id(&tio->iow)), + errhidestmt(true), + errhidecontext(true)); + + pgaio_wref_wait(&tio->iow); + } + + dclist_delete_from(&ioq->in_progress, &tio->node); + dclist_push_head(&ioq->idle, &tio->node); + } +} + +bool +io_queue_is_empty(IOQueue *ioq) +{ + return dclist_is_empty(&ioq->in_progress); +} + +void +io_queue_free(IOQueue *ioq) +{ + io_queue_wait_all(ioq); + + pfree(ioq); +} diff --git a/src/backend/storage/aio/meson.build b/src/backend/storage/aio/meson.build index da6df2d3654f..270c4a64428b 100644 --- a/src/backend/storage/aio/meson.build +++ b/src/backend/storage/aio/meson.build @@ -7,6 +7,7 @@ backend_sources += files( 'aio_init.c', 'aio_io.c', 'aio_target.c', + 'io_queue.c', 'method_io_uring.c', 'method_sync.c', 'method_worker.c', diff --git a/src/include/storage/io_queue.h b/src/include/storage/io_queue.h new file mode 100644 index 000000000000..92b1e9afe6f5 --- /dev/null +++ b/src/include/storage/io_queue.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * io_queue.h + * Mechanism for tracking many IOs + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/io_queue.h + * + *------------------------------------------------------------------------- + */ +#ifndef IO_QUEUE_H +#define IO_QUEUE_H + +#include "storage/aio_types.h" + +struct IOQueue; +typedef struct IOQueue IOQueue; + +struct PgAioWaitRef; + +extern IOQueue *io_queue_create(int depth, int flags); +extern void io_queue_track(IOQueue *ioq, const PgAioWaitRef *iow); +extern void io_queue_wait_one(IOQueue *ioq); +extern void io_queue_wait_all(IOQueue *ioq); +extern bool io_queue_is_empty(IOQueue *ioq); +extern void io_queue_reserve(IOQueue *ioq); +extern PgAioHandle *io_queue_acquire_io(IOQueue *ioq); +extern void io_queue_free(IOQueue *ioq); + +#endif /* IO_QUEUE_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3a67ee01b469..0c6ddadc51da 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1196,6 +1196,7 @@ IOContext 
IOFuncSelector
 IOObject
 IOOp
+IOQueue
 IO_STATUS_BLOCK
 IPCompareMethod
 ITEM
@@ -3022,6 +3023,7 @@ TocEntry
 TokenAuxData
 TokenizedAuthLine
 TrackItem
+TrackedIO
 TransApplyAction
 TransInvalidationInfo
 TransState

From b9d5e77b3a273be284aed43cb9148cfc1aef32c7 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Tue, 18 Mar 2025 14:40:06 -0400
Subject: [PATCH 13/16] bufmgr: use AIO in checkpointer, bgwriter

This is far from ready - just included to be able to exercise AIO
writes and get some preliminary numbers. In all likelihood this will be
based on top of work by Thomas Munro rather than on the preceding
commit.

TODO:

- This doesn't implement bgwriter_flush_after and
  checkpointer_flush_after. I think that's not too hard to do; it's
  mainly round tuits.

- The queuing logic doesn't carefully respect pin limits. That might be
  ok for checkpointer and bgwriter, but the infrastructure should be
  usable outside of this as well.
---
 src/backend/postmaster/bgwriter.c     |  19 +-
 src/backend/postmaster/checkpointer.c |  11 +-
 src/backend/storage/buffer/bufmgr.c   | 594 +++++++++++++++++++++++---
 src/backend/storage/page/bufpage.c    |  10 +
 src/include/postmaster/bgwriter.h     |   3 +-
 src/include/storage/buf_internals.h   |   2 +
 src/include/storage/bufmgr.h          |   3 +-
 src/include/storage/bufpage.h         |   1 +
 src/tools/pgindent/typedefs.list      |   1 +
 9 files changed, 586 insertions(+), 58 deletions(-)

diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 72f5acceec78..6e8801a39e32 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -38,11 +38,13 @@
 #include "postmaster/auxprocess.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/interrupt.h"
+#include "storage/aio.h"
 #include "storage/aio_subsys.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
 #include "storage/fd.h"
+#include "storage/io_queue.h"
 #include "storage/lwlock.h"
 #include "storage/proc.h"
 #include "storage/procsignal.h"
@@ -90,6 +92,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
 	sigjmp_buf	local_sigjmp_buf;
 	MemoryContext bgwriter_context;
 	bool		prev_hibernate;
+	IOQueue    *ioq;
 	WritebackContext wb_context;
 
 	Assert(startup_data_len == 0);
@@ -131,6 +134,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
 										  ALLOCSET_DEFAULT_SIZES);
 	MemoryContextSwitchTo(bgwriter_context);
 
+	ioq = io_queue_create(128, 0);
 	WritebackContextInit(&wb_context, &bgwriter_flush_after);
 
 	/*
@@ -228,12 +232,22 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
 		/* Clear any already-pending wakeups */
 		ResetLatch(MyLatch);
 
+		/*
+		 * FIXME: this is theoretically racy, but I didn't want to copy
+		 * ProcessMainLoopInterrupts()'s remaining body here.
+		 */
+		if (ShutdownRequestPending)
+		{
+			io_queue_wait_all(ioq);
+			io_queue_free(ioq);
+		}
+
 		ProcessMainLoopInterrupts();
 
 		/*
 		 * Do one cycle of dirty-buffer writing.
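+		 * (The writes issued by BgBufferSync() are now batched into
+		 * vectored AIO writes and tracked by the IO queue created above;
+		 * see PrepareToWriteBuffer() and WriteBuffers() in bufmgr.c.)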
*/ - can_hibernate = BgBufferSync(&wb_context); + can_hibernate = BgBufferSync(ioq, &wb_context); /* Report pending statistics to the cumulative stats system */ pgstat_report_bgwriter(); @@ -250,6 +264,9 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) smgrdestroyall(); } + /* finish IO before sleeping, to avoid blocking other backends */ + io_queue_wait_all(ioq); + /* * Log a new xl_running_xacts every now and then so replication can * get into a consistent state faster (think of suboverflowed diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index fda91ffd1ce2..904fe167eb4d 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -49,10 +49,12 @@ #include "postmaster/bgwriter.h" #include "postmaster/interrupt.h" #include "replication/syncrep.h" +#include "storage/aio.h" #include "storage/aio_subsys.h" #include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pmsignal.h" @@ -766,7 +768,7 @@ ImmediateCheckpointRequested(void) * fraction between 0.0 meaning none, and 1.0 meaning all done. */ void -CheckpointWriteDelay(int flags, double progress) +CheckpointWriteDelay(IOQueue *ioq, int flags, double progress) { static int absorb_counter = WRITES_PER_ABSORB; @@ -800,6 +802,13 @@ CheckpointWriteDelay(int flags, double progress) /* Report interim statistics to the cumulative stats system */ pgstat_report_checkpointer(); + /* + * Ensure all pending IO is submitted to avoid unnecessary delays for + * other processes. + */ + io_queue_wait_all(ioq); + + /* * This sleep used to be connected to bgwriter_delay, typically 200ms. * That resulted in more frequent wakeups if not much work to do. 
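
Aside: the intended IOQueue usage pattern, as exercised by the bufmgr.c
changes below (a minimal sketch only; "have_work" and the queue depth are
illustrative, error handling is omitted):

    IOQueue    *ioq = io_queue_create(128, 0);

    while (have_work)
    {
        PgAioHandle *ioh = io_queue_acquire_io(ioq);	/* may wait for a free slot */
        PgAioWaitRef iow;

        pgaio_io_get_wref(ioh, &iow);
        /* ... define the IO, e.g. via smgrstartwritev(ioh, ...) ... */
        io_queue_track(ioq, &iow);		/* submission happens in batches */
    }

    io_queue_wait_all(ioq);				/* e.g. before sleeping or exiting */
    io_queue_free(ioq);
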
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9bf30c44af01..d9a362d75539 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -52,6 +52,7 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/proc.h" @@ -76,6 +77,7 @@ /* Bits in SyncOneBuffer's return value */ #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 +#define BUF_CANT_MERGE 0x04 #define RELS_BSEARCH_THRESHOLD 20 @@ -515,8 +517,6 @@ static void UnpinBuffer(BufferDesc *buf); static void UnpinBufferNoOwner(BufferDesc *buf); static void BufferSync(int flags); static uint32 WaitBufHdrUnlocked(BufferDesc *buf); -static int SyncOneBuffer(int buf_id, bool skip_recently_used, - WritebackContext *wb_context); static void WaitIO(BufferDesc *buf); static void AbortBufferIO(Buffer buffer); static void shared_buffer_write_error_callback(void *arg); @@ -532,6 +532,7 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context); + static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, @@ -3321,6 +3322,57 @@ UnpinBufferNoOwner(BufferDesc *buf) } } +typedef struct BuffersToWrite +{ + int nbuffers; + BufferTag start_at_tag; + uint32 max_combine; + + XLogRecPtr max_lsn; + + PgAioHandle *ioh; + PgAioWaitRef iow; + + uint64 total_writes; + + Buffer buffers[IOV_MAX]; + PgAioBounceBuffer *bounce_buffers[IOV_MAX]; + const void *data_ptrs[IOV_MAX]; +} BuffersToWrite; + +static int PrepareToWriteBuffer(BuffersToWrite *to_write, Buffer buf, + bool skip_recently_used, + IOQueue *ioq, WritebackContext *wb_context); + +static void WriteBuffers(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context); + +static void +BuffersToWriteInit(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context) +{ + to_write->total_writes = 0; + to_write->nbuffers = 0; + to_write->ioh = NULL; + pgaio_wref_clear(&to_write->iow); + to_write->max_lsn = InvalidXLogRecPtr; + + pgaio_enter_batchmode(); +} + +static void +BuffersToWriteEnd(BuffersToWrite *to_write) +{ + if (to_write->ioh != NULL) + { + pgaio_io_release(to_write->ioh); + to_write->ioh = NULL; + } + + pgaio_exit_batchmode(); +} + + #define ST_SORT sort_checkpoint_bufferids #define ST_ELEMENT_TYPE CkptSortItem #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b) @@ -3352,7 +3404,10 @@ BufferSync(int flags) binaryheap *ts_heap; int i; int mask = BM_DIRTY; + IOQueue *ioq; WritebackContext wb_context; + BuffersToWrite to_write; + int max_combine; /* * Unless this is a shutdown checkpoint or we have been explicitly told, @@ -3414,7 +3469,9 @@ BufferSync(int flags) if (num_to_scan == 0) return; /* nothing to do */ + ioq = io_queue_create(512, 0); WritebackContextInit(&wb_context, &checkpoint_flush_after); + max_combine = Min(io_bounce_buffers, io_combine_limit); TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan); @@ -3522,48 +3579,91 @@ BufferSync(int flags) */ num_processed = 0; num_written = 0; + + BuffersToWriteInit(&to_write, ioq, &wb_context); + while (!binaryheap_empty(ts_heap)) { BufferDesc *bufHdr = NULL; CkptTsStatus *ts_stat = (CkptTsStatus *) DatumGetPointer(binaryheap_first(ts_heap)); + bool 
batch_continue = true;
 
-		buf_id = CkptBufferIds[ts_stat->index].buf_id;
-		Assert(buf_id != -1);
-
-		bufHdr = GetBufferDescriptor(buf_id);
-
-		num_processed++;
+		Assert(ts_stat->num_scanned <= ts_stat->num_to_scan);
 
 		/*
-		 * We don't need to acquire the lock here, because we're only looking
-		 * at a single bit. It's possible that someone else writes the buffer
-		 * and clears the flag right after we check, but that doesn't matter
-		 * since SyncOneBuffer will then do nothing. However, there is a
-		 * further race condition: it's conceivable that between the time we
-		 * examine the bit here and the time SyncOneBuffer acquires the lock,
-		 * someone else not only wrote the buffer but replaced it with another
-		 * page and dirtied it.  In that improbable case, SyncOneBuffer will
-		 * write the buffer though we didn't need to.  It doesn't seem worth
-		 * guarding against this, though.
+		 * Collect a batch of buffers to write out from the current
+		 * tablespace. That causes some imbalance between the tablespaces, but
+		 * that's more than outweighed by the efficiency gain due to batching.
 		 */
-		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
+		while (batch_continue &&
+			   to_write.nbuffers < max_combine &&
+			   ts_stat->num_scanned < ts_stat->num_to_scan)
 		{
-			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+			buf_id = CkptBufferIds[ts_stat->index].buf_id;
+			Assert(buf_id != -1);
+
+			bufHdr = GetBufferDescriptor(buf_id);
+
+			num_processed++;
+
+			/*
+			 * We don't need to acquire the lock here, because we're only
+			 * looking at a single bit. It's possible that someone else writes
+			 * the buffer and clears the flag right after we check, but that
+			 * doesn't matter since PrepareToWriteBuffer will then do nothing.
+			 * However, there is a further race condition: it's conceivable
+			 * that between the time we examine the bit here and the time
+			 * PrepareToWriteBuffer acquires the lock, someone else not only
+			 * wrote the buffer but replaced it with another page and dirtied
+			 * it. In that improbable case, we will write the buffer though we
+			 * didn't need to. It doesn't seem worth guarding against this,
+			 * though.
+			 */
+			if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
+			{
+				int			result = PrepareToWriteBuffer(&to_write, buf_id + 1, false,
+														  ioq, &wb_context);
+
+				if (result & BUF_CANT_MERGE)
+				{
+					Assert(to_write.nbuffers > 0);
+					WriteBuffers(&to_write, ioq, &wb_context);
+
+					result = PrepareToWriteBuffer(&to_write, buf_id + 1, false,
+												  ioq, &wb_context);
+					Assert(result != BUF_CANT_MERGE);
+				}
+
+				if (result & BUF_WRITTEN)
+				{
+					TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
+					PendingCheckpointerStats.buffers_written++;
+					num_written++;
+				}
+				else
+				{
+					batch_continue = false;
+				}
+			}
+			else
 			{
-				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
-				PendingCheckpointerStats.buffers_written++;
-				num_written++;
+				if (to_write.nbuffers > 0)
+					WriteBuffers(&to_write, ioq, &wb_context);
 			}
+
+			/*
+			 * Measure progress independent of actually having to flush the
+			 * buffer - otherwise writing becomes unbalanced.
+			 */
+			ts_stat->progress += ts_stat->progress_slice;
+			ts_stat->num_scanned++;
+			ts_stat->index++;
 		}
 
-		/*
-		 * Measure progress independent of actually having to flush the buffer
-		 * - otherwise writing become unbalanced.
-		 */
-		ts_stat->progress += ts_stat->progress_slice;
-		ts_stat->num_scanned++;
-		ts_stat->index++;
+		if (to_write.nbuffers > 0)
+			WriteBuffers(&to_write, ioq, &wb_context);
 
 		/*
 		 * Have all the buffers from the tablespace been processed?
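+		 * (If so, it is removed from the heap below; otherwise its heap
+		 * position is updated to reflect the progress made above.)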
*/ if (ts_stat->num_scanned == ts_stat->num_to_scan) @@ -3581,15 +3681,23 @@ BufferSync(int flags) * * (This will check for barrier events even if it doesn't sleep.) */ - CheckpointWriteDelay(flags, (double) num_processed / num_to_scan); + CheckpointWriteDelay(ioq, flags, (double) num_processed / num_to_scan); } + Assert(to_write.nbuffers == 0); + io_queue_wait_all(ioq); + /* * Issue all pending flushes. Only checkpointer calls BufferSync(), so * IOContext will always be IOCONTEXT_NORMAL. */ IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL); + io_queue_wait_all(ioq); /* IssuePendingWritebacks might have added + * more */ + io_queue_free(ioq); + BuffersToWriteEnd(&to_write); + pfree(per_ts_stat); per_ts_stat = NULL; binaryheap_free(ts_heap); @@ -3615,7 +3723,7 @@ BufferSync(int flags) * bgwriter_lru_maxpages to 0.) */ bool -BgBufferSync(WritebackContext *wb_context) +BgBufferSync(IOQueue *ioq, WritebackContext *wb_context) { /* info obtained from freelist.c */ int strategy_buf_id; @@ -3658,6 +3766,9 @@ BgBufferSync(WritebackContext *wb_context) long new_strategy_delta; uint32 new_recent_alloc; + BuffersToWrite to_write; + int max_combine; + /* * Find out where the freelist clock sweep currently is, and how many * buffer allocations have happened since our last call. @@ -3678,6 +3789,8 @@ BgBufferSync(WritebackContext *wb_context) return true; } + max_combine = Min(io_bounce_buffers, io_combine_limit); + /* * Compute strategy_delta = how many buffers have been scanned by the * clock sweep since last time. If first time through, assume none. Then @@ -3834,11 +3947,25 @@ BgBufferSync(WritebackContext *wb_context) num_written = 0; reusable_buffers = reusable_buffers_est; + BuffersToWriteInit(&to_write, ioq, wb_context); + /* Execute the LRU scan */ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) { - int sync_state = SyncOneBuffer(next_to_clean, true, - wb_context); + int sync_state; + + sync_state = PrepareToWriteBuffer(&to_write, next_to_clean + 1, + true, ioq, wb_context); + if (sync_state & BUF_CANT_MERGE) + { + Assert(to_write.nbuffers > 0); + + WriteBuffers(&to_write, ioq, wb_context); + + sync_state = PrepareToWriteBuffer(&to_write, next_to_clean + 1, + true, ioq, wb_context); + Assert(sync_state != BUF_CANT_MERGE); + } if (++next_to_clean >= NBuffers) { @@ -3849,6 +3976,13 @@ BgBufferSync(WritebackContext *wb_context) if (sync_state & BUF_WRITTEN) { + Assert(sync_state & BUF_REUSABLE); + + if (to_write.nbuffers == max_combine) + { + WriteBuffers(&to_write, ioq, wb_context); + } + reusable_buffers++; if (++num_written >= bgwriter_lru_maxpages) { @@ -3860,6 +3994,11 @@ BgBufferSync(WritebackContext *wb_context) reusable_buffers++; } + if (to_write.nbuffers > 0) + WriteBuffers(&to_write, ioq, wb_context); + + BuffersToWriteEnd(&to_write); + PendingBgWriterStats.buf_written_clean += num_written; #ifdef BGW_DEBUG @@ -3898,8 +4037,66 @@ BgBufferSync(WritebackContext *wb_context) return (bufs_to_lap == 0 && recent_alloc == 0); } +static inline bool +BufferTagsSameRel(const BufferTag *tag1, const BufferTag *tag2) +{ + return (tag1->spcOid == tag2->spcOid) && + (tag1->dbOid == tag2->dbOid) && + (tag1->relNumber == tag2->relNumber) && + (tag1->forkNum == tag2->forkNum) + ; +} + +static bool +CanMergeWrite(BuffersToWrite *to_write, BufferDesc *cur_buf_hdr) +{ + BlockNumber cur_block = cur_buf_hdr->tag.blockNum; + + Assert(to_write->nbuffers > 0); /* can't merge with nothing */ + Assert(to_write->start_at_tag.relNumber != InvalidOid); + Assert(to_write->start_at_tag.blockNum != 
InvalidBlockNumber);
+
+	Assert(to_write->ioh != NULL);
+
+	/*
+	 * First check if the block number is one that we could actually merge;
+	 * that's cheaper than checking the tablespace/db/relnumber/fork match.
+	 */
+	if (to_write->start_at_tag.blockNum + to_write->nbuffers != cur_block)
+		return false;
+
+	if (!BufferTagsSameRel(&to_write->start_at_tag, &cur_buf_hdr->tag))
+		return false;
+
+	/*
+	 * Need to check with smgr how large a write we're allowed to make. To
+	 * reduce the overhead of the smgr check, only inquire once, when
+	 * processing the first to-be-merged buffer. That avoids the overhead in
+	 * the common case of writing out buffers that are definitely not
+	 * mergeable.
+	 */
+	if (to_write->nbuffers == 1)
+	{
+		SMgrRelation smgr;
+
+		smgr = smgropen(BufTagGetRelFileLocator(&to_write->start_at_tag), INVALID_PROC_NUMBER);
+
+		to_write->max_combine = smgrmaxcombine(smgr,
+											   to_write->start_at_tag.forkNum,
+											   to_write->start_at_tag.blockNum);
+	}
+	else
+	{
+		Assert(to_write->max_combine > 0);
+	}
+
+	if (to_write->start_at_tag.blockNum + to_write->max_combine <= cur_block)
+		return false;
+
+	return true;
+}
+
 /*
- * SyncOneBuffer -- process a single buffer during syncing.
+ * PrepareToWriteBuffer -- process a single buffer during syncing.
  *
  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
  * buffers marked recently used, as these are not replacement candidates.
@@ -3908,22 +4105,50 @@ BgBufferSync(WritebackContext *wb_context)
  *	BUF_WRITTEN: we wrote the buffer.
  *	BUF_REUSABLE: buffer is available for replacement, ie, it has
  *		pin count 0 and usage count 0.
+ *	BUF_CANT_MERGE: can't combine this write with prior writes, caller needs
+ *		to issue those first
  *
  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
  * after locking it, but we don't care all that much.)
  */
 static int
-SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
+PrepareToWriteBuffer(BuffersToWrite *to_write, Buffer buf,
+					 bool skip_recently_used,
+					 IOQueue *ioq, WritebackContext *wb_context)
 {
-	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
-	int			result = 0;
+	BufferDesc *cur_buf_hdr = GetBufferDescriptor(buf - 1);
 	uint32		buf_state;
-	BufferTag	tag;
+	int			result = 0;
+	XLogRecPtr	cur_buf_lsn;
+	LWLock	   *content_lock;
+	bool		may_block;
+
+	/*
+	 * Check if this buffer can be written out together with already prepared
+	 * writes. We check before we have pinned the buffer, so the buffer can be
+	 * written out and replaced between this check and us pinning the buffer -
+	 * we'll recheck below. The reason for the pre-check is that we don't want
+	 * to pin the buffer just to find out that we can't merge the IO.
+	 */
+	if (to_write->nbuffers != 0)
+	{
+		if (!CanMergeWrite(to_write, cur_buf_hdr))
+		{
+			result |= BUF_CANT_MERGE;
+			return result;
+		}
+	}
+	else
+	{
+		to_write->start_at_tag = cur_buf_hdr->tag;
+	}
 
 	/* Make sure we can handle the pin */
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
+	/* XXX: Should also check if we are allowed to pin one more buffer */
+
 	/*
 	 * Check whether buffer needs writing.
 	 *
 	 * We can make this check without taking the buffer content lock so long
 	 * as we mark pages dirty in access methods *before* logging changes with
 	 * XLogInsert(): if someone marks the buffer dirty just after our check we
 	 * don't worry because our checkpoint.redo points before log record for
 	 * upcoming changes and so we are not required to write such dirty buffer.
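+	 * (Such changes are covered by WAL at or after checkpoint.redo, so crash
+	 * recovery starting at this checkpoint replays them; the checkpoint need
+	 * not flush the buffer for them to be durable.)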
*/
-	buf_state = LockBufHdr(bufHdr);
+	buf_state = LockBufHdr(cur_buf_hdr);
 
 	if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
 		BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
 	{
@@ -3942,40 +4167,300 @@
 	}
 	else if (skip_recently_used)
 	{
+#if 0
+		elog(LOG, "at block %d: skip recent with nbuffers %d",
+			 cur_buf_hdr->tag.blockNum, to_write->nbuffers);
+#endif
 		/* Caller told us not to write recently-used buffers */
-		UnlockBufHdr(bufHdr, buf_state);
+		UnlockBufHdr(cur_buf_hdr, buf_state);
 		return result;
 	}
 
 	if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
 	{
 		/* It's clean, so nothing to do */
-		UnlockBufHdr(bufHdr, buf_state);
+		UnlockBufHdr(cur_buf_hdr, buf_state);
 		return result;
 	}
 
+	/* pin the buffer; from now on its identity can't change anymore */
+	PinBuffer_Locked(cur_buf_hdr);
+
 	/*
-	 * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
-	 * buffer is clean by the time we've locked it.)
+	 * Acquire IO, if needed, now that it's likely that we'll need to write.
 	 */
-	PinBuffer_Locked(bufHdr);
-	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+	if (to_write->ioh == NULL)
+	{
+		/* otherwise we should already have acquired a handle */
+		Assert(to_write->nbuffers == 0);
 
-	FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+		to_write->ioh = io_queue_acquire_io(ioq);
+		pgaio_io_get_wref(to_write->ioh, &to_write->iow);
+	}
 
-	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+	/*
+	 * If we are merging, check if the buffer's identity possibly changed
+	 * while we hadn't yet pinned it.
+	 *
+	 * XXX: It might be worth checking if we still want to write the buffer
+	 * out, e.g. it could have been replaced with a buffer that doesn't have
+	 * BM_CHECKPOINT_NEEDED set.
+	 */
+	if (to_write->nbuffers != 0)
+	{
+		if (!CanMergeWrite(to_write, cur_buf_hdr))
+		{
+			elog(LOG, "changed identity");
+			UnpinBuffer(cur_buf_hdr);
+
+			result |= BUF_CANT_MERGE;
+
+			return result;
+		}
+	}
+
+	may_block = to_write->nbuffers == 0
+		&& !pgaio_have_staged()
+		&& io_queue_is_empty(ioq)
+		;
+	content_lock = BufferDescriptorGetContentLock(cur_buf_hdr);
+
+	if (!may_block)
+	{
+		if (LWLockConditionalAcquire(content_lock, LW_SHARED))
+		{
+			/* done */
+		}
+		else if (to_write->nbuffers == 0)
+		{
+			/*
+			 * Need to wait for all prior IO to finish before blocking for
+			 * lock acquisition, to avoid the risk of a deadlock due to us
+			 * waiting for another backend that is waiting for our
+			 * unsubmitted IO to complete.
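+			 *
+			 * (E.g., the holder of this content lock may itself be blocked
+			 * in WaitIO() on a buffer whose write is part of our staged but
+			 * not yet submitted batch.)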
+ */ + pgaio_submit_staged(); + io_queue_wait_all(ioq); - tag = bufHdr->tag; + elog(DEBUG2, "at block %u: can't block, nbuffers = 0", + cur_buf_hdr->tag.blockNum + ); - UnpinBuffer(bufHdr); + may_block = to_write->nbuffers == 0 + && !pgaio_have_staged() + && io_queue_is_empty(ioq) + ; + Assert(may_block); + + LWLockAcquire(content_lock, LW_SHARED); + } + else + { + elog(DEBUG2, "at block %d: can't block nbuffers = %d", + cur_buf_hdr->tag.blockNum, + to_write->nbuffers); + + UnpinBuffer(cur_buf_hdr); + result |= BUF_CANT_MERGE; + Assert(to_write->nbuffers > 0); + + return result; + } + } + else + { + LWLockAcquire(content_lock, LW_SHARED); + } + + if (!may_block) + { + if (!StartBufferIO(cur_buf_hdr, false, !may_block)) + { + pgaio_submit_staged(); + io_queue_wait_all(ioq); + + may_block = io_queue_is_empty(ioq) && to_write->nbuffers == 0 && !pgaio_have_staged(); + + if (!StartBufferIO(cur_buf_hdr, false, !may_block)) + { + elog(DEBUG2, "at block %d: non-waitable StartBufferIO returns false, %d", + cur_buf_hdr->tag.blockNum, + may_block); + + /* + * FIXME: can't tell whether this is because the buffer has + * been cleaned + */ + if (!may_block) + { + result |= BUF_CANT_MERGE; + Assert(to_write->nbuffers > 0); + } + LWLockRelease(content_lock); + UnpinBuffer(cur_buf_hdr); + + return result; + } + } + } + else + { + if (!StartBufferIO(cur_buf_hdr, false, false)) + { + elog(DEBUG2, "waitable StartBufferIO returns false"); + LWLockRelease(content_lock); + UnpinBuffer(cur_buf_hdr); + + /* + * FIXME: Historically we returned BUF_WRITTEN in this case, which + * seems wrong + */ + return result; + } + } /* - * SyncOneBuffer() is only called by checkpointer and bgwriter, so - * IOContext will always be IOCONTEXT_NORMAL. + * Run PageGetLSN while holding header lock, since we don't have the + * buffer locked exclusively in all cases. + */ + buf_state = LockBufHdr(cur_buf_hdr); + + cur_buf_lsn = BufferGetLSN(cur_buf_hdr); + + /* To check if block content changes while flushing. - vadim 01/17/97 */ + buf_state &= ~BM_JUST_DIRTIED; + + UnlockBufHdr(cur_buf_hdr, buf_state); + + to_write->buffers[to_write->nbuffers] = buf; + to_write->nbuffers++; + + if (buf_state & BM_PERMANENT && + (to_write->max_lsn == InvalidXLogRecPtr || to_write->max_lsn < cur_buf_lsn)) + { + to_write->max_lsn = cur_buf_lsn; + } + + result |= BUF_WRITTEN; + + return result; +} + +static void +WriteBuffers(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context) +{ + SMgrRelation smgr; + Buffer first_buf; + BufferDesc *first_buf_hdr; + bool needs_checksum; + + Assert(to_write->nbuffers > 0 && to_write->nbuffers <= io_combine_limit); + + first_buf = to_write->buffers[0]; + first_buf_hdr = GetBufferDescriptor(first_buf - 1); + + smgr = smgropen(BufTagGetRelFileLocator(&first_buf_hdr->tag), INVALID_PROC_NUMBER); + + /* + * Force XLOG flush up to buffer's LSN. This implements the basic WAL + * rule that log updates must hit disk before any of the data-file changes + * they describe do. + * + * However, this rule does not apply to unlogged relations, which will be + * lost after a crash anyway. Most unlogged relation pages do not bear + * LSNs since we never emit WAL records for them, and therefore flushing + * up through the buffer LSN would be useless, but harmless. However, + * GiST indexes use LSNs internally to track page-splits, and therefore + * unlogged GiST pages bear "fake" LSNs generated by + * GetFakeLSNForUnloggedRel. 
It is unlikely but possible that the fake + * LSN counter could advance past the WAL insertion point; and if it did + * happen, attempting to flush WAL through that location would fail, with + * disastrous system-wide consequences. To make sure that can't happen, + * skip the flush if the buffer isn't permanent. */ - ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag); + if (to_write->max_lsn != InvalidXLogRecPtr) + XLogFlush(to_write->max_lsn); - return result | BUF_WRITTEN; + /* + * Now it's safe to write the buffer to disk. Note that no one else should + * have been able to write it, while we were busy with log flushing, + * because we got the exclusive right to perform I/O by setting the + * BM_IO_IN_PROGRESS bit. + */ + + for (int nbuf = 0; nbuf < to_write->nbuffers; nbuf++) + { + Buffer cur_buf = to_write->buffers[nbuf]; + BufferDesc *cur_buf_hdr = GetBufferDescriptor(cur_buf - 1); + Block bufBlock; + char *bufToWrite; + + bufBlock = BufHdrGetBlock(cur_buf_hdr); + needs_checksum = PageNeedsChecksumCopy((Page) bufBlock); + + /* + * Update page checksum if desired. Since we have only shared lock on + * the buffer, other processes might be updating hint bits in it, so + * we must copy the page to a bounce buffer if we do checksumming. + */ + if (needs_checksum) + { + PgAioBounceBuffer *bb = pgaio_bounce_buffer_get(); + + pgaio_io_assoc_bounce_buffer(to_write->ioh, bb); + + bufToWrite = pgaio_bounce_buffer_buffer(bb); + memcpy(bufToWrite, bufBlock, BLCKSZ); + PageSetChecksumInplace((Page) bufToWrite, cur_buf_hdr->tag.blockNum); + } + else + { + bufToWrite = bufBlock; + } + + to_write->data_ptrs[nbuf] = bufToWrite; + } + + pgaio_io_set_handle_data_32(to_write->ioh, + (uint32 *) to_write->buffers, + to_write->nbuffers); + pgaio_io_register_callbacks(to_write->ioh, PGAIO_HCB_SHARED_BUFFER_WRITEV, 0); + + smgrstartwritev(to_write->ioh, smgr, + BufTagGetForkNum(&first_buf_hdr->tag), + first_buf_hdr->tag.blockNum, + to_write->data_ptrs, + to_write->nbuffers, + false); + pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, + IOOP_WRITE, 1, BLCKSZ * to_write->nbuffers); + + + for (int nbuf = 0; nbuf < to_write->nbuffers; nbuf++) + { + Buffer cur_buf = to_write->buffers[nbuf]; + BufferDesc *cur_buf_hdr = GetBufferDescriptor(cur_buf - 1); + + UnpinBuffer(cur_buf_hdr); + } + + io_queue_track(ioq, &to_write->iow); + to_write->total_writes++; + + /* clear state for next write */ + to_write->nbuffers = 0; + to_write->start_at_tag.relNumber = InvalidOid; + to_write->start_at_tag.blockNum = InvalidBlockNumber; + to_write->max_combine = 0; + to_write->max_lsn = InvalidXLogRecPtr; + to_write->ioh = NULL; + pgaio_wref_clear(&to_write->iow); + + /* + * FIXME: Implement issuing writebacks (note wb_context isn't used here). + * Possibly needs to be integrated with io_queue.c. + */ } /* @@ -4349,6 +4834,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, error_context_stack = errcallback.previous; } + /* * RelationGetNumberOfBlocksInFork * Determines the current number of pages in the specified relation fork. 
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 82457bacc62f..d4bbd2cdf00e 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -1490,6 +1490,16 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 	return true;
 }
 
+bool
+PageNeedsChecksumCopy(Page page)
+{
+	if (PageIsNew(page))
+		return false;
+
+	/* a copy for checksumming is only needed if checksums are enabled */
+	return DataChecksumsEnabled();
+}
+
 
 /*
  * Set checksum for a page in shared buffers.
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 800ecbfd13b3..a8081d411b68 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -31,7 +31,8 @@ pg_noreturn extern void BackgroundWriterMain(const void *startup_data, size_t st
 pg_noreturn extern void CheckpointerMain(const void *startup_data, size_t startup_data_len);
 
 extern void RequestCheckpoint(int flags);
-extern void CheckpointWriteDelay(int flags, double progress);
+struct IOQueue;
+extern void CheckpointWriteDelay(struct IOQueue *ioq, int flags, double progress);
 
 extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0dec7d93b3b2..45c2b70b7360 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -21,6 +21,8 @@
 #include "storage/buf.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
+#include "storage/io_queue.h"
+#include "storage/latch.h"
 #include "storage/lwlock.h"
 #include "storage/procnumber.h"
 #include "storage/shmem.h"
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 492feab0cb56..89e0ca11288b 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -297,7 +297,8 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer);
 extern bool IsBufferCleanupOK(Buffer buffer);
 extern bool HoldingBufferPinThatDelaysRecovery(void);
 
-extern bool BgBufferSync(struct WritebackContext *wb_context);
+struct IOQueue;
+extern bool BgBufferSync(struct IOQueue *ioq, struct WritebackContext *wb_context);
 
 extern uint32 GetPinLimit(void);
 extern uint32 GetLocalPinLimit(void);
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index aeb67c498c59..000a3ab23f90 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -507,5 +507,6 @@ extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 									Item newtup, Size newsize);
 extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
 extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
+extern bool PageNeedsChecksumCopy(Page page);
 
 #endif							/* BUFPAGE_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 0c6ddadc51da..e89c501fa8af 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -349,6 +349,7 @@ BufferManagerRelation
 BufferStrategyControl
 BufferTag
 BufferUsage
+BuffersToWrite
 BuildAccumulator
 BuiltinScript
 BulkInsertState

From e93050c439e3ea216fff938065a47744a97b9680 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Tue, 18 Mar 2025 14:40:06 -0400
Subject: [PATCH 14/16] Ensure a resowner exists for all paths that may
 perform AIO

Reviewed-by: Noah Misch
Discussion: https://postgr.es/m/1f6b50a7-38ef-4d87-8246-786d39f46ab9@iki.fi
---
 src/backend/bootstrap/bootstrap.c         | 7 +++++++
 src/backend/replication/logical/logical.c | 6 ++++++
 src/backend/utils/init/postinit.c         | 6 +++++-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 6db864892d0d..e554504e1f0d 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -361,8 +361,15 @@ BootstrapModeMain(int argc, char *argv[], bool check_only)
 	BaseInit();
 
 	bootstrap_signals();
+
+	/* need a resowner for IO during BootStrapXLOG() */
+	CreateAuxProcessResourceOwner();
+
 	BootStrapXLOG(bootstrap_data_checksum_version);
 
+	ReleaseAuxProcessResources(true);
+	CurrentResourceOwner = NULL;
+
 	/*
 	 * To ensure that src/common/link-canary.c is linked into the backend, we
 	 * must call it from somewhere. Here is as good as anywhere.
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index a8d2e024d344..72c32f5c32e8 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -386,6 +386,12 @@ CreateInitDecodingContext(const char *plugin,
 	slot->data.plugin = plugin_name;
 	SpinLockRelease(&slot->mutex);
 
+	if (CurrentResourceOwner == NULL)
+	{
+		Assert(am_walsender);
+		CurrentResourceOwner = AuxProcessResourceOwner;
+	}
+
 	if (XLogRecPtrIsInvalid(restart_lsn))
 		ReplicationSlotReserveWal();
 	else
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 7958ea11b735..222e24bcb08b 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -785,8 +785,12 @@ InitPostgres(const char *in_dbname, Oid dboid,
 		 * We don't yet have an aux-process resource owner, but StartupXLOG
 		 * and ShutdownXLOG will need one.  Hence, create said resource owner
 		 * (and register a callback to clean it up after ShutdownXLOG runs).
+		 *
+		 * In bootstrap mode CreateAuxProcessResourceOwner() was already
+		 * called in BootstrapModeMain().
 		 */
-		CreateAuxProcessResourceOwner();
+		if (!bootstrap)
+			CreateAuxProcessResourceOwner();
 
 		StartupXLOG();
 		/* Release (and warn about) any buffer pins leaked in StartupXLOG */

From e1a943b854fcfcee6a34e45f5f983c04c2172648 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Tue, 18 Mar 2025 14:40:06 -0400
Subject: [PATCH 15/16] Temporary: Increase BAS_BULKREAD size

Without this we can only execute very little AIO for sequential scans,
as there are just not enough buffers in the ring.

This isn't the right fix, as just increasing the ring size can have
negative performance implications in workloads where the kernel has all
the data cached.

Author:
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/storage/buffer/freelist.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 336715b6c634..b72a5957a206 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -555,7 +555,12 @@ GetAccessStrategy(BufferAccessStrategyType btype)
 			return NULL;
 
 		case BAS_BULKREAD:
-			ring_size_kb = 256;
+
+			/*
+			 * FIXME: Temporary increase to allow large enough streaming reads
+			 * to actually benefit from AIO. This needs a better solution.
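+			 *
+			 * (With the default io_combine_limit of 16 pages - 128kB with
+			 * 8kB blocks - a 256kB ring allows at most two full-sized reads
+			 * in flight; 2MB at least gives the read stream room to keep a
+			 * useful number of IOs in flight.)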
+			 */
+			ring_size_kb = 2 * 1024;
 			break;
 		case BAS_BULKWRITE:
 			ring_size_kb = 16 * 1024;

From b33bd816c331d7cf30478f2a8d5503ac3589e5d1 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Tue, 18 Mar 2025 14:40:06 -0400
Subject: [PATCH 16/16] WIP: Use MAP_POPULATE

For benchmarking it's quite annoying that the first access to a page of
memory has completely different perf characteristics than subsequent
accesses. Using MAP_POPULATE reduces that substantially.
---
 src/backend/port/sysv_shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 197926d44f6b..a700b02d5a18 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -620,7 +620,7 @@ CreateAnonymousSegment(Size *size)
 			allocsize += hugepagesize - (allocsize % hugepagesize);
 
 		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
-				   PG_MMAP_FLAGS | mmap_flags, -1, 0);
+				   PG_MMAP_FLAGS | MAP_POPULATE | mmap_flags, -1, 0);
 		mmap_errno = errno;
 		if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
 			elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",