From 1243a7204a4a1eb0b59224f1eed42c67350bfb3e Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Mon, 30 Sep 2024 22:48:12 +0200
Subject: [PATCH 1/3] WIP: index prefetching

Allow the index AM to provide items (TIDs and tuples) in batches, which
is then used to implement prefetching of heap tuples in index scans
(including index-only scans). This is similar to the prefetching already
done in bitmap scans, and can result in significant speedups.

The index AM may implement an optional "amgetbatch" callback, returning
a batch of items. The indexam.c code then handles this transparently
through the existing "getnext" interface.

It is up to the index AM to return only batches that it can handle
internally. For example, most of the later patches adding batching
support to the relevant index AMs (btree, hash, gist, sp-gist) restrict
batches to a single leaf page. This makes the implementation much
simpler, with only minimal changes to the index AMs, but it's not a hard
requirement. The index AM can produce batches spanning an arbitrary
number of leaf pages. This is left as a possible future improvement.

Most of the batching/prefetching logic lives in indexam.c, so the
executor code can continue to call the interface just like before. The
only "violation" happens in index-only scans, which need to check the
visibility map both when prefetching pages (we don't want to prefetch
pages unnecessarily) and later when reading the data. For cached data
the visibility map checks can be fairly expensive, so it's desirable to
keep and reuse the result of the first check.

At the moment, prefetching does not handle mark/restore plans. This is
doable, but requires additional synchronization between the batching
and index AM code in the "opposite direction".

This patch does not actually add batching to any of the index AMs; it's
just the common infrastructure.

TODO

Add the new index AM callback to the sgml docs.

Re-introduce the callback to check the VM and remember the result.

It can happen that the first few batches (leaf pages) are returned from
the index while skipping the heap fetches, which means the read stream
does no reads until much later, after the first batches are already
freed (the reads only happen when first reading from the stream). In
that case we need to be careful about initializing the stream position,
because setting it to (0,0) would be wrong as the batch is already gone.
So just initialize it to readPos, which should be initialized already.
Could it happen later, or just on the first call? Probably only on the
first call, as the read stream always looks ahead for the block that
actually needs reading.
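For reference, a rough sketch of the two optional AM callbacks this
infrastructure relies on, as inferred from their call sites in the
indexam.c hunks below (the authoritative declarations are in the
amapi.h hunk of this patch, and take precedence over this sketch):

    /* load the next batch of items for the scan, or NULL if done */
    IndexScanBatch (*amgetbatch) (IndexScanDesc scan,
                                  ScanDirection direction);

    /* release a batch previously returned by amgetbatch */
    void (*amfreebatch) (IndexScanDesc scan, IndexScanBatch batch);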
--- src/backend/access/heap/heapam_handler.c | 81 +- src/backend/access/index/genam.c | 30 +- src/backend/access/index/indexam.c | 1381 ++++++++++++++++- src/backend/access/table/tableam.c | 2 +- src/backend/commands/constraint.c | 3 +- src/backend/executor/execIndexing.c | 12 +- src/backend/executor/execReplication.c | 9 +- src/backend/executor/nodeIndexonlyscan.c | 133 +- src/backend/executor/nodeIndexscan.c | 32 +- src/backend/utils/adt/selfuncs.c | 7 +- src/backend/utils/misc/guc_tables.c | 10 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/amapi.h | 10 + src/include/access/genam.h | 13 +- src/include/access/relscan.h | 160 ++ src/include/access/tableam.h | 12 +- src/include/nodes/execnodes.h | 7 + src/test/regress/expected/sysviews.out | 3 +- src/tools/pgindent/typedefs.list | 5 + 19 files changed, 1872 insertions(+), 39 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ac082fefa77a..f79d97a8c64e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -79,11 +79,12 @@ heapam_slot_callbacks(Relation relation) */ static IndexFetchTableData * -heapam_index_fetch_begin(Relation rel) +heapam_index_fetch_begin(Relation rel, ReadStream *rs) { IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData)); hscan->xs_base.rel = rel; + hscan->xs_base.rs = rs; hscan->xs_cbuf = InvalidBuffer; return &hscan->xs_base; @@ -94,6 +95,9 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) { IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + if (scan->rs) + read_stream_reset(scan->rs); + if (BufferIsValid(hscan->xs_cbuf)) { ReleaseBuffer(hscan->xs_cbuf); @@ -108,6 +112,9 @@ heapam_index_fetch_end(IndexFetchTableData *scan) heapam_index_fetch_reset(scan); + if (scan->rs) + read_stream_end(scan->rs); + pfree(hscan); } @@ -130,15 +137,72 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, /* Switch to correct buffer if we don't have it already */ Buffer prev_buf = hscan->xs_cbuf; - hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, - hscan->xs_base.rel, - ItemPointerGetBlockNumber(tid)); + /* + * Read the block for the requested TID. With a read stream, simply + * read the next block we queued earlier (from the callback). + * Otherwise just do the regular read using the TID. + * + * XXX It's a bit fragile to just read buffers, expecting the right + * block, which we queued from the callback sometime much earlier. If + * the two streams get out of sync in any way (which can happen + * easily, due to some optimization heuristics), it may misbehave in + * strange ways. + * + * XXX We need to support both the old ReadBuffer and ReadStream, as + * some places are unlikely to benefit from a read stream - e.g. + * because they only fetch a single tuple. So better to support this. + * + * XXX Another reason is that some index AMs may not support the + * batching interface, which is a prerequisite for using read_stream + * API. + */ + if (scan->rs) + hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + else + hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid)); + + /* We should always get a valid buffer for a valid TID. */ + Assert(BufferIsValid(hscan->xs_cbuf)); + + /* + * Did we read the expected block number (per the TID)? For the + * regular buffer reads this should always match, but with the read + * stream it might disagree due to a bug elsewhere (happened + * repeatedly). 
+ */ + Assert(BufferGetBlockNumber(hscan->xs_cbuf) == ItemPointerGetBlockNumber(tid)); /* * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + + /* + * When using the read stream, release the old buffer. + * + * XXX Not sure this is really needed, or maybe this is not the right + * place to do this, and buffers should be released elsewhere. The + * problem is that other place may not really know if the index scan + * uses read stream API. + * + * XXX We need to do this, because otherwise the caller would need to + * do different things depending on whether the read_stream was used + * or not. With the read_stream it'd have to also explicitly release + * the buffers, but doing that for every caller seems error prone + * (easy to forget). It's also not clear whether it would free the + * buffer before or after the index_fetch_tuple call (we don't know if + * the buffer changed until *after* the call, etc.). + * + * XXX Does this do the right thing when reading the same page? That + * should return the same buffer, so won't we release it prematurely? + */ + if (scan->rs && (prev_buf != InvalidBuffer)) + { + ReleaseBuffer(prev_buf); + } } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -753,7 +817,14 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tableScan = NULL; heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0); + + /* + * XXX Maybe enable batching/prefetch for clustering? Seems like it + * might be a pretty substantial win if the table is not yet well + * clustered by the index. + */ + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0, + false); index_rescan(indexScan, NULL, 0, NULL, 0); } else diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 8f532e14590e..8266d5e0e872 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -446,8 +446,21 @@ systable_beginscan(Relation heapRelation, elog(ERROR, "column is not in index"); } + /* + * No batching/prefetch for catalogs. We don't expect that to help + * very much, because we usually need just one row, and even if we + * need multiple rows, they tend to be colocated in heap. + * + * XXX Maybe we could do that, the prefetching only ramps up over time + * anyway? There was a problem with infinite recursion when looking up + * effective_io_concurrency for a tablespace (which may do an index + * scan internally), but the read_stream should care of that. Still, + * we don't expect this to help a lot. + * + * XXX This also means scans on catalogs won't use read_stream. + */ sysscan->iscan = index_beginscan(heapRelation, irel, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, false); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -707,8 +720,21 @@ systable_beginscan_ordered(Relation heapRelation, elog(ERROR, "column is not in index"); } + /* + * No batching/prefetch for catalogs. We don't expect that to help very + * much, because we usually need just one row, and even if we need + * multiple rows, they tend to be colocated in heap. + * + * XXX Maybe we could do that, the prefetching only ramps up over time + * anyway? There was a problem with infinite recursion when looking up + * effective_io_concurrency for a tablespace (which may do an index scan + * internally), but the read_stream should care of that. 
Still, we don't + * expect this to help a lot. + * + * XXX This also means scans on catalogs won't use read_stream. + */ sysscan->iscan = index_beginscan(heapRelation, indexRelation, - snapshot, NULL, nkeys, 0); + snapshot, NULL, nkeys, 0, false); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 219df1971da6..190a112e4571 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -44,6 +44,7 @@ #include "postgres.h" #include "access/amapi.h" +#include "access/nbtree.h" /* XXX for MaxTIDsPerBTreePage (should remove) */ #include "access/relation.h" #include "access/reloptions.h" #include "access/relscan.h" @@ -58,6 +59,8 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +/* enable batching / prefetching during index scans */ +bool enable_indexscan_batching = false; /* ---------------------------------------------------------------- * macros used in index_ routines @@ -109,6 +112,36 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation, ParallelIndexScanDesc pscan, bool temp_snap); static inline void validate_relation_kind(Relation r); +/* index batching */ +static void index_batch_init(IndexScanDesc scan); +static void index_batch_reset(IndexScanDesc scan, bool complete); +static void index_batch_end(IndexScanDesc scan); +static bool index_batch_getnext(IndexScanDesc scan); +static void index_batch_free(IndexScanDesc scan, IndexScanBatch batch); +static ItemPointer index_batch_getnext_tid(IndexScanDesc scan, + ScanDirection direction); + +static BlockNumber index_scan_stream_read_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data); + +static bool index_batch_pos_advance(IndexScanDesc scan, IndexScanBatchPos *pos); +static void index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos); +static void index_batch_kill_item(IndexScanDesc scan); + +static void AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos); +static void AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch); +static void AssertCheckBatches(IndexScanDesc scan); + + +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->xs_batches->batches[(idx) % (scan)->xs_batches->maxBatches]) + +#ifdef INDEXAM_DEBUG +#define DEBUG_LOG(...) elog(WARNING, __VA_ARGS__) +#else +#define DEBUG_LOG(...) +#endif /* ---------------------------------------------------------------- * index_ interface functions @@ -250,6 +283,10 @@ index_insert_cleanup(Relation indexRelation, /* * index_beginscan - start a scan of an index with amgettuple * + * enable_batching determines whether the scan should try using the batching + * interface (amgetbatch/amfreebatch), if supported by the index AM, or the + * regular amgettuple interface. + * * Caller must be holding suitable locks on the heap and the index. */ IndexScanDesc @@ -257,8 +294,10 @@ index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys) + int nkeys, int norderbys, + bool enable_batching) { + ReadStream *rs = NULL; IndexScanDesc scan; Assert(snapshot != InvalidSnapshot); @@ -273,8 +312,45 @@ index_beginscan(Relation heapRelation, scan->xs_snapshot = snapshot; scan->instrument = instrument; + /* + * If explicitly requested and supported by both the index AM and the + * plan, initialize batching info. We only use stream read API with + * batching enabled (so not with systable scans). 
But maybe we should + * change that, and just use different read_next callbacks (or something + * like that)? + * + * XXX Maybe we should have a separate "amcanbatch" call, to let the AM + * decide if batching is supported depending on the scan details. That + * might be needed for certain index AMs, that can do batching only for + * some scans (I'm thinking about GiST/SP-GiST indexes, with ORDER BY). + * + * XXX Do this before initializing xs_heapfetch, so that we can pass the + * read stream to it. + */ + if ((indexRelation->rd_indam->amgetbatch != NULL) && + enable_batching && + enable_indexscan_batching) + { + /* + * XXX We do this after index_beginscan_internal(), which means we + * can't init the batch state in there (it doesn't even know if + * batching will be used at that point). We can't init the read_stream + * there, because it needs the heapRelation. + */ + index_batch_init(scan); + + /* initialize stream */ + rs = read_stream_begin_relation(READ_STREAM_DEFAULT, + NULL, + heapRelation, + MAIN_FORKNUM, + index_scan_stream_read_next, + scan, + 0); + } + /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + scan->xs_heapfetch = table_index_fetch_begin(heapRelation, rs); return scan; } @@ -337,6 +413,12 @@ index_beginscan_internal(Relation indexRelation, scan->parallel_scan = pscan; scan->xs_temp_snap = temp_snap; + /* + * No batching by default, so set it to NULL. Will be initialized later if + * batching is requested and AM supports it. + */ + scan->xs_batches = NULL; + return scan; } @@ -370,6 +452,19 @@ index_rescan(IndexScanDesc scan, scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; + /* + * Reset the batching. This makes it look like there are no batches, + * discards reads already scheduled to the read stream, etc. + * + * XXX We do this before calling amrescan, so that it could reinitialize + * everything (this probably does not matter very much, now that we've + * moved all the batching logic to indexam.c, it was more important when + * the index AM was responsible for more of it). + * + * XXX Maybe this should also happen before table_index_fetch_reset? + */ + index_batch_reset(scan, true); + scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, orderbys, norderbys); } @@ -384,6 +479,9 @@ index_endscan(IndexScanDesc scan) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amendscan); + /* Cleanup batching, so that the AM can release pins and so on. */ + index_batch_end(scan); + /* Release resources (like buffer pins) from table accesses */ if (scan->xs_heapfetch) { @@ -414,7 +512,46 @@ index_markpos(IndexScanDesc scan) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(ammarkpos); - scan->indexRelation->rd_indam->ammarkpos(scan); + /* + * Without batching, just use the ammarkpos() callback. With batching + * everything is handled at this layer, without calling the AM. + */ + if (scan->xs_batches == NULL) + { + scan->indexRelation->rd_indam->ammarkpos(scan); + } + else + { + IndexScanBatches *batches = scan->xs_batches; + IndexScanBatchPos *pos = &batches->markPos; + IndexScanBatchData *batch = batches->markBatch; + + /* + * Free the previous mark batch (if any), but only if the batch is no + * longer valid (in the current first/next range). This means that if + * we're marking the same batch (different item), we don't really do + * anything. + * + * XXX Should have some macro for this check, I guess. 
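 *
 * A possible spelling of that macro (just a sketch, with an invented
 * name, not something this patch defines):
 *
 *     #define INDEX_SCAN_BATCH_IN_QUEUE(batches, idx) \
 *         ((idx) >= (batches)->firstBatch && (idx) < (batches)->nextBatch)
 *
 * with which the range test below becomes
 * !INDEX_SCAN_BATCH_IN_QUEUE(batches, pos->batch).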
+ */ + if ((batch != NULL) && + (pos->batch < batches->firstBatch || pos->batch >= batches->nextBatch)) + { + batches->markBatch = NULL; + index_batch_free(scan, batch); + } + + /* just copy the read position (which has to be valid) */ + batches->markPos = batches->readPos; + batches->markBatch = INDEX_SCAN_BATCH(scan, batches->markPos.batch); + + /* + * FIXME we need to make sure the batch does not get freed during the + * regular advances. + */ + + AssertCheckBatchPosValid(scan, &batches->markPos); + } } /* ---------------- @@ -447,7 +584,58 @@ index_restrpos(IndexScanDesc scan) scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; - scan->indexRelation->rd_indam->amrestrpos(scan); + /* + * Without batching, just use the amrestrpos() callback. With batching + * everything is handled at this layer, without calling the AM. + */ + if (scan->xs_batches == NULL) + scan->indexRelation->rd_indam->amrestrpos(scan); + else + { + IndexScanBatches *batches = scan->xs_batches; + IndexScanBatchPos *pos = &batches->markPos; + IndexScanBatchData *batch = scan->xs_batches->markBatch; + + Assert(batch != NULL); + + /* + * XXX The pos can be invalid, if we already advanced past the the + * marked batch (and stashed it in markBatch instead of freeing). So + * this assert would be incorrect. + */ + /* AssertCheckBatchPosValid(scan, &pos); */ + + /* FIXME we should still check the batch was not freed yet */ + + /* + * Reset the batching state, except for the marked batch, and make it + * look like we have a single batch - the marked one. + * + * XXX This seems a bit ugly / hacky, maybe there's a more elegant way + * to do this? + */ + index_batch_reset(scan, false); + + batches->markPos = *pos; + batches->readPos = *pos; + batches->firstBatch = pos->batch; + batches->nextBatch = (batches->firstBatch + 1); + + INDEX_SCAN_BATCH(scan, batches->markPos.batch) = batch; + + /* + * XXX I really dislike that we have so many definitions of "current" + * batch. We have readPos, streamPos, currentBatch, ... seems very ad + * hoc - I just added a new "current" field when I needed one. We + * should make that somewhat more consistent, or at least explain it + * clearly somewhere. + * + * XXX Do we even need currentBatch? It's not accessed anywhere, at + * least not in this patch. + */ + // batches->currentBatch = batch; + batches->markBatch = batch; /* also remember this */ + } } /* @@ -569,6 +757,18 @@ index_parallelrescan(IndexScanDesc scan) if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); + /* + * Reset the batching. This makes it look like there are no batches, + * discards reads already scheduled to the read stream, etc. We Do this + * before calling amrescan, so that it can reinitialize everything. + * + * XXX We do this before calling amparallelrescan, so that it could + * reinitialize everything (this probably does not matter very much, now + * that we've moved all the batching logic to indexam.c, it was more + * important when the index AM was responsible for more of it). 
+ */ + index_batch_reset(scan, true); + /* amparallelrescan is optional; assume no-op if not provided by AM */ if (scan->indexRelation->rd_indam->amparallelrescan != NULL) scan->indexRelation->rd_indam->amparallelrescan(scan); @@ -583,10 +783,12 @@ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, - ParallelIndexScanDesc pscan) + ParallelIndexScanDesc pscan, + bool enable_batching) { Snapshot snapshot; IndexScanDesc scan; + ReadStream *rs = NULL; Assert(RelFileLocatorEquals(heaprel->rd_locator, pscan->ps_locator)); Assert(RelFileLocatorEquals(indexrel->rd_locator, pscan->ps_indexlocator)); @@ -604,8 +806,48 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, scan->xs_snapshot = snapshot; scan->instrument = instrument; + /* + * If explicitly requested and supported by both the index AM and the + * plan, initialize batching info. We only use stream read API with + * batching enabled (so not with systable scans). But maybe we should + * change that, and just use different read_next callbacks (or something + * like that)? + * + * XXX Maybe we should have a separate "amcanbatch" call, to let the AM + * decide if batching is supported depending on the scan details. That + * might be needed for certain index AMs, that can do batching only for + * some scans (I'm thinking about GiST/SP-GiST indexes, with ORDER BY). + * + * XXX Do this before initializing xs_heapfetch, so that we can pass the + * read stream to it. + * + * XXX Pretty duplicate with the code in index_beginscan(), so maybe move + * into a shared function. + */ + if ((indexrel->rd_indam->amgetbatch != NULL) && + enable_batching && + enable_indexscan_batching) + { + /* + * XXX We do this after index_beginscan_internal(), which means we + * can't init the batch state in there (it doesn't even know if + * batching will be used at that point). We can't init the read_stream + * there, because it needs the heapRelation. + */ + index_batch_init(scan); + + /* initialize stream */ + rs = read_stream_begin_relation(READ_STREAM_DEFAULT, + NULL, + heaprel, + MAIN_FORKNUM, + index_scan_stream_read_next, + scan, + 0); + } + /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel); + scan->xs_heapfetch = table_index_fetch_begin(heaprel, rs); return scan; } @@ -628,6 +870,27 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* XXX: we should assert that a snapshot is pushed or registered */ Assert(TransactionIdIsValid(RecentXmin)); + /* + * When using batching (which may be disabled for various reasons - e.g. + * through a GUC, the index AM not supporting it), redirect the code to + * the "batch" variant. If needed (e.g. for the first call) the call may + * read the next batch (leaf page) from the index (but that's driven by + * the read stream). + * + * XXX Maybe we should enable batching based on the plan too, so that we + * don't do batching when it's probably useless (e.g. semijoins or queries + * with LIMIT 1 etc.). The amcanbatch() callback might consider things + * like that, or maybe that should be considered outside AM. However, the + * slow ramp-up (starting with small batches) in read_stream should handle + * this well enough. + * + * XXX Perhaps it'd be possible to do both in index_getnext_slot(), i.e. + * call either the original code without batching, or the new batching + * code if supported/enabled. It's not great to have duplicated code. 
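 *
 * For orientation, the calling loop looks roughly like this (a
 * simplified sketch of index_getnext_slot(), not its exact code):
 *
 *     for (;;)
 *     {
 *         if (!scan->xs_heap_continue)
 *         {
 *             if (index_getnext_tid(scan, direction) == NULL)
 *                 break;
 *         }
 *         if (index_fetch_heap(scan, slot))
 *             return true;
 *     }
 *     return false;
 *
 * so the batching dispatch below stays invisible to the executor.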
+ */ + if (scan->xs_batches != NULL) + return index_batch_getnext_tid(scan, direction); + /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_heaptid. It should also set @@ -694,9 +957,22 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) * amgettuple call, in index_getnext_tid). We do not do this when in * recovery because it may violate MVCC to do so. See comments in * RelationGetIndexScan(). + * + * XXX For scans using batching, record the flag in the batch (we will + * pass it to the AM later, when freeing it). Otherwise just pass it to + * the AM using the kill_prior_tuple field. */ if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; + { + if (scan->xs_batches == NULL) + { + scan->kill_prior_tuple = all_dead; + } + else if (all_dead) + { + index_batch_kill_item(scan); + } + } return found; } @@ -1084,3 +1360,1094 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, return build_local_reloptions(&relopts, attoptions, validate); } + +/* + * INDEX BATCHING (AND PREFETCHING) + * + * The traditional AM interface (amgettuple) is designed to walk the index one + * leaf page at a time, and the state (representing the leaf page) is managed + * by the AM implementation. Before advancing to the next leaf page, the index + * AM forgets the "current" leaf page. This makes it impossible to implement + * features that operate on multiple leaf pages - like for example prefetch. + * + * The batching relaxes this by extending the AM API with two new methods, + * amgetbatch and amfreebatch, that separate the "advance" to the next leaf + * page, and "forgetting" the previous one. This means there may be multiple + * leaf pages loaded at once, if necessary. It's a bit like having multiple + * "positions" within the index. + * + * The AM is no longer responsible for management of these "batches" - once + * a batch is returned from amgetbatch(), it's up to indexam.c to determine + * when it's no longer necessary, and call amfreebatch(). That is, the AM + * can no longer discard a leaf page when advancing to the next one. + * + * This allows operating on "future" index entries, e.g. to prefetch tuples + * from the table. Without the batching, we could do this within the single + * leaf page, which has limitations, e.g. inability to prefetch beyond the + * of the current leaf page, and the prefetch distance drop to 0. (Most + * indexes have many index items per leaf page, so the prefetching would + * be beneficial even with this limitation, but it's not great either.) + * + * Moving the batch management to the indexam.c also means defining a common + * batch state, instead of each index AM defining it's own opaque state. The + * AM merely "fills" the batch, and everything else is handled by code in + * indexam.c (so not AM-specific). Including prefetching. + * + * Without this "common" batch definition, each AM would need to do a fair + * bit of the prefetching on it's own. + * + * + * note: Strictly speaking, the AM may keep a second leaf page because of + * mark/restore may, but that's a minor detail. + * + * note: There are different definitions of "batch" - I use it as a synonym + * for a leaf page, or the index tuples read from one leaf page. Others use + * "batch" when talking about all the leaf pages kept in memory at a given + * moment in time (so in a way, there's a single batch, changing over time). 
+ * It's not my ambition to present a binding definition of a batch, but it's + * good to consider this when reading comments by other people. + * + * note: In theory, how the batch maps to leaf pages is mostly up to the index + * AM - as long as it can "advance" between batches, etc. it could use batches + * that represent a subset of a leaf page, or multiple leaf pages at once. + * + * note: Or maybe it doesn't need to map to leaf pages at all, at least not + * in a simple way. Consider for example ordered scans on SP-GiST indexes, + * or similar cases. I think that could be handled by having "abstract" + * batches - such indexes don't support mark/restore or changing direction, + * so this should be OK. + * + * note: When thinking about an index AM, think about BTREE, unless another + * AM is mentioned explicitly. Most AMs are based on / derived from BTREE, + * and everything about BTREE directly extends to them. + * + * note: In the following text "index AM" refers to an implementation of a + * particular index AM (e.g. BTREE), i.e. code src/backend/access/nbtree), + * while "indexam.c" is the shared executor level used to interact with + * indexes. + * + * + * index scan state + * ---------------- + * With the traditional API (amgettuple), index scan state is stored at the + * scan-level in AM-specific structs - e.g. in BTScanOpaque for BTREE). So + * there can be only a single leaf page "loaded" for a scan at a time. + * + * With the new API (amgetbatch/amfreebatch), an index scan needs to store + * multiple batches - but not in private "scan opaque" struct. Instead, + * the queue of batches and some of the other information was moved to the + * IndexScanDesc, into a common struct. So the AM-specific scan-opaque + * structs get split and moved into three places: + * + * 1) scan-opaque - Fields that are truly related to the scan as a whole + * remain in the struct (which is AM-specific, i.e. each AM method may + * keep something different). Example: scankeys/arraykeys are still + * kept in BTScanOpaque. + * + * 2) batch-opaque - AM-specific information related to a particular leaf + * page are moved to a new batch-level struct. A good example are for + * example the position of the leaf page / batch in the index (current + * page, left/righ pages, etc.). + * + * 3) batch - A significant part of the patch is introducing a common + * representation of a batch, common to all the index AMs. Until now + * each AM had it's own way of representing tuples from a leaf page, + * and accessing it required going through the AM again. The common + * representation allows accessing the batches through the indexam.c + * layer, without having to go through the AM. + * + * + * amgetbatch/amfreebatch + * ---------------------- + * To support batching, the index AM needs to implement two optional + * callbacks - amgetbatch() and amfreebatch(), which load data from the + * "next" leaf page, and then free it when the batch is no longer needed. + * + * For now the amgettuple() callback is still required even for AMs that + * support batching, so that we can fall-back to the non-batched scan + * for cases when batching is not supported (e.g. scans of system tables) + * or when batching is disabled using the enable_indexscan_batching GUC. + * + * + * batch + * ---------------------- + * A good way to visualize batching is a sliding window over the key space of + * an index. 
At any given moment, we have a "window" representing a range of + * the keys, consisting of one or more batches, each with items from a single + * leaf page. + * + * For now, each batch is exactly one whole leaf page. We might allow batches + * to be smaller or larger, but that doesn't seem very useful. It would make + * things more complex, without providing much benefit. Ultimately it's up to + * the index AM - it can produce any batches it wants, as long as it keeps + * necessary information in the batch-opaque struct, and handles this in the + * amgetbatch/amfreebatch callbacks. + * + * + * prefetching: leaf pages vs. heap pages + * -------------------------------------- + * This patch is only about prefetching pages from the indexed relation (e.g. + * heap), not about prefetching index leaf pages etc. The read_next callback + * does read leaf pages when needed (after reaching the end of the current + * batch), but this is synchronous, and the callback will block until the leaf + * page is read. + * + * + * gradual ramp up + * --------------- + * The prefetching is driven by the read_stream API / implementation. There + * are no explicit fadvise calls in the index code, that all happens in the + * read stream. The read stream does the usual gradual ramp up to not regress + * LIMIT 1 queries etc. + * + * + * kill_prior_tuples + * ----------------- + * If we decide a tuple should be "killed" in the index, the a flag is used to + * pass this information to indexam.c - the item is recorded in the batch, and + * the actual killing is postponed until the batch is freed using amfreebatch(). + * The scan flag is reset to false, so that the index AM does not get confused + * and does not do something for a different "current" item. + * + * That is, this is very similar to what happens without batching, except that + * the killed items are accumulated in indexam.c, not in the AM. + */ + +/* + * Maximum number of batches (leaf pages) we can keep in memory. + * + * The value 64 value is arbitrary, it's about 1MB of data with 8KB pages. We + * should not really need this many batches - we need a certain number of TIDs, + * to satisfy the prefetch distance, and there usually are many index tuples + * per page. In the worst case we might have one index tuple per leaf page, + * but even that may not quite work in some cases. + * + * But there may be cases when this does not work - some examples: + * + * a) the index may be bloated, with many pages only have a single index item + * + * b) the index is correlated, and we skip prefetches of duplicate blocks + * + * c) we may be doing index-only scan, and we don't prefetch all-visible pages + * + * So we might need to load huge number of batches before we find the first + * block to load from the table. Or enough pages to satisfy the prefetch + * distance. + * + * XXX Currently, once we hit this number of batches, we fail in the stream + * callback (or rather in index_batch_getnext), because that's where we load + * batches. It'd be nice to "pause" the read stream for a bit instead, but + * there's no built-in way to do that. So we can only "stop" the stream by + * returning InvalidBlockNumber. But we could also remember this, and do + * read_stream_reset() to continue, after consuming all the already scheduled + * blocks. + * + * XXX Maybe 64 is too high - it also defines the maximum amount of overhead + * allowed. In the worst case, reading a single row might trigger reading this + * many leaf pages (e.g. with IOS). 
Which might be an issue with LIMIT queries, + * when we actually won't need most of the leaf pages. + * + * XXX We could/should use a lower value for testing, to make it more likely + * we hit this issue. With 64 the whole check-world passes without hitting + * the limit, wo we wouldn't test it's handled correctly. + */ +#define INDEX_SCAN_MAX_BATCHES 64 + +#define INDEX_SCAN_BATCH_COUNT(scan) \ + ((scan)->xs_batches->nextBatch - (scan)->xs_batches->firstBatch) + +#define INDEX_SCAN_BATCH_LOADED(scan, idx) \ + ((idx) < (scan)->xs_batches->nextBatch) + +#define INDEX_SCAN_BATCH_FULL(scan) \ + (INDEX_SCAN_BATCH_COUNT(scan) == scan->xs_batches->maxBatches) + +/* + * Check that a position (batch,item) is valid with respect to the batches we + * have currently loaded. + * + * XXX The "marked" batch is an exception. The marked batch may get outside + * the range of current batches, so make sure to never check the position + * for that. + */ +static void +AssertCheckBatchPosValid(IndexScanDesc scan, IndexScanBatchPos *pos) +{ +#ifdef USE_ASSERT_CHECKING + IndexScanBatches *batch = scan->xs_batches; + + /* make sure the position is valid for currently loaded batches */ + Assert(pos->batch >= batch->firstBatch); + Assert(pos->batch < batch->nextBatch); +#endif +} + +/* + * Check a single batch is valid. + */ +static void +AssertCheckBatch(IndexScanDesc scan, IndexScanBatch batch) +{ +#ifdef USE_ASSERT_CHECKING + /* there must be valid range of items */ + Assert(batch->firstItem <= batch->lastItem); + Assert(batch->firstItem >= 0); + Assert(batch->lastItem <= MaxTIDsPerBTreePage); /* XXX tied to BTREE */ + + /* we should have items (buffer and pointers) */ + Assert(batch->items != NULL); + // Assert(batch->currTuples != NULL); + + /* + * The number of killed items must be valid, and there must be an array of + * indexes if there are items. + */ + Assert(batch->numKilled >= 0); + Assert(batch->numKilled <= MaxTIDsPerBTreePage); /* XXX tied to BTREE */ + Assert(!((batch->numKilled > 0) && (batch->killedItems == NULL))); + + /* XXX can we check some of the other batch fields? */ +#endif +} + +/* + * Check invariants on current batches + * + * Makes sure the indexes are set as expected, the buffer size is within + * limits, and so on. + */ +static void +AssertCheckBatches(IndexScanDesc scan) +{ +#ifdef USE_ASSERT_CHECKING + IndexScanBatches *batches = scan->xs_batches; + + /* we should have batches initialized */ + Assert(batches != NULL); + + /* We should not have too many batches. */ + Assert((batches->maxBatches > 0) && + (batches->maxBatches <= INDEX_SCAN_MAX_BATCHES)); + + /* + * The first/next indexes should define a valid range (in the cyclic + * buffer, and should not overflow maxBatches. 
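 *
 * (firstBatch/nextBatch grow monotonically over the scan; the physical
 * slot is picked by INDEX_SCAN_BATCH() using modulo maxBatches. For
 * example, with maxBatches = 64, batches 64 and 65 are stored in slots
 * 0 and 1, which is safe as long as nextBatch - firstBatch never
 * exceeds maxBatches.)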
+ */ + Assert((batches->firstBatch >= 0) && + (batches->firstBatch <= batches->nextBatch)); + Assert((batches->nextBatch - batches->firstBatch) <= batches->maxBatches); + + /* Check all current batches */ + for (int i = batches->firstBatch; i < batches->nextBatch; i++) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, i); + + AssertCheckBatch(scan, batch); + } +#endif +} + +/* debug: print info about current batches */ +static void +index_batch_print(const char *label, IndexScanDesc scan) +{ +#ifdef INDEXAM_DEBUG + IndexScanBatches *batches = scan->xs_batches; + + if (!scan->xs_batches) + return; + + DEBUG_LOG("%s: batches firstBatch %d nextBatch %d maxBatches %d", + label, + batches->firstBatch, batches->nextBatch, batches->maxBatches); + + for (int i = batches->firstBatch; i < batches->nextBatch; i++) + { + IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, i); + + DEBUG_LOG("%s: batch %d %p first %d last %d item %d killed %d", + label, i, batch, batch->firstItem, batch->lastItem, + batch->itemIndex, batch->numKilled); + } +#endif +} + +/* + * index_batch_pos_advance + * Advance the position to the next item, depending on scan direction. + * + * Advance the position to the next item, either in the same batch or the + * following one (if already available). + * + * We can advance only if we already have some batches loaded, and there's + * either enough items in the current batch, or some more items in the + * subsequent batches. + * + * If this is the first advance, right after loading the first batch, the + * position is still be undefined. Otherwise we expect the position to be + * valid. + * + * Returns true if the position was advanced, false otherwise. + * + * The poisition is guaranteed to be valid only after an advance. + */ +static bool +index_batch_pos_advance(IndexScanDesc scan, IndexScanBatchPos *pos) +{ + IndexScanBatchData *batch; + ScanDirection direction = scan->xs_batches->direction; + + /* make sure we have batching initialized and consistent */ + AssertCheckBatches(scan); + + /* should know direction by now */ + Assert(direction != NoMovementScanDirection); + + /* We can't advance if there are no batches available. */ + if (INDEX_SCAN_BATCH_COUNT(scan) == 0) + return false; + + /* + * If the position has not been advanced yet, it has to be right after we + * loaded the first batch. In that case just initialize it to the first + * item in the batch (or last item, if it's backwards scaa). + * + * XXX Maybe we should just explicitly initialize the postition after + * loading the first batch, without having to go through the advance. + * + * XXX Add a macro INDEX_SCAN_POS_DEFINED() or something like this, to + * make this easier to understand. + */ + if ((pos->batch == -1) && (pos->index == -1)) + { + /* we should have loaded the very first batch */ + Assert(scan->xs_batches->firstBatch == 0); + + batch = INDEX_SCAN_BATCH(scan, scan->xs_batches->firstBatch); + Assert(batch != NULL); + + pos->batch = 0; + + if (ScanDirectionIsForward(direction)) + pos->index = batch->firstItem; + else + pos->index = batch->lastItem; + + /* the position we just set has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + + /* + * The position is already defined, so we should have some batches loaded + * and the position has to be valid with respect to those. + */ + AssertCheckBatchPosValid(scan, pos); + + /* + * Advance to the next item in the same batch. If the position is for the + * last item in the batch, try advancing to the next batch (if loaded). 
+ */ + batch = INDEX_SCAN_BATCH(scan, pos->batch); + + if (ScanDirectionIsForward(direction)) + { + if (pos->index < batch->lastItem) + { + pos->index++; + + /* the position has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + } + else /* ScanDirectionIsBackward */ + { + if (pos->index > batch->firstItem) + { + pos->index--; + + /* the position has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + } + + /* + * We couldn't advance within the same batch, try advancing to the next + * batch, if it's already loaded. + */ + if (INDEX_SCAN_BATCH_LOADED(scan, pos->batch + 1)) + { + /* advance to the next batch */ + pos->batch++; + + batch = INDEX_SCAN_BATCH(scan, pos->batch); + Assert(batch != NULL); + + if (ScanDirectionIsForward(direction)) + pos->index = batch->firstItem; + else + pos->index = batch->lastItem; + + /* the position has to be valid */ + AssertCheckBatchPosValid(scan, pos); + + return true; + } + + /* can't advance */ + return false; +} + +/* + * index_batch_pos_reset + * Reset the position, so that it looks as if never advanced. + */ +static void +index_batch_pos_reset(IndexScanDesc scan, IndexScanBatchPos *pos) +{ + pos->batch = -1; + pos->index = -1; +} + +/* + * index_scan_stream_read_next + * return the next block to pass to the read stream + * + * This assumes the "current" scan direction, requested by the caller. If + * that changes before consuming all buffers, we'll reset the stream and + * start from scratch. Which may seem inefficient, but it's no worse than + * what we do now, and it's not a very common case. + * + * The position of the read_stream is stored in streamPos, which may be + * ahead of the current readPos (which is what got consumed by the scan). + * + * The scan direction change is checked / handled elsewhere. Here we rely + * on having the correct value in xs_batches->direction. + */ +static BlockNumber +index_scan_stream_read_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data) +{ + IndexScanDesc scan = (IndexScanDesc) callback_private_data; + IndexScanBatchPos *pos = &scan->xs_batches->streamPos; + + /* we should have set the direction already */ + Assert(scan->xs_batches->direction != NoMovementScanDirection); + + /* + * The read position has to be valid, because we initialize/advance it + * before maybe even attempting to read the heap tuple. And it lags behind + * the stream position, so it can't be invalid yet. If this is the first + * time for this callback, we will use the readPos to init streamPos, so + * better check it's valid. + */ + AssertCheckBatchPosValid(scan, &scan->xs_batches->readPos); + + /* + * Try to advance to the next item, and if there's none in the current + * batch, try loading the next batch. + * + * XXX This loop shouldn't happen more than twice, because if we fail to + * advance the position, we'll try to load the next batch and then in the + * next loop the advance has to succeed. + */ + while (true) + { + bool advanced = false; + + /* + * If the stream position is undefined, just use the read position. + * + * It's possible we got here only fairly late in the scan, e.g. if + * many tuples got skipped in the index-only scan, etc. In this case + * just use the read position as a starting point. + * + * The first batch is loaded from index_batch_getnext_tid(), because + * we don't get here until the first index_fetch_heap() call - only + * then can read_stream start loading more batches. 
It's also possible + * to disable prefetching (effective_io_concurrency=0), in which case + * all batches get loaded in index_batch_getnext_tid. + */ + if ((pos->batch == -1) && (pos->index == -1)) + { + *pos = scan->xs_batches->readPos; + advanced = true; + } + else if (index_batch_pos_advance(scan, pos)) + { + advanced = true; + } + + /* FIXME maybe check the streamPos is not behind readPos? */ + + /* If we advanced the position, return the block for the TID. */ + if (advanced) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, pos->batch); + ItemPointer tid = &batch->items[pos->index].heapTid; + + DEBUG_LOG("index_scan_stream_read_next: index %d TID (%u,%u)", + pos->index, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + /* + * if there's a prefetch callback, use it to decide if we will + * need to read the block + */ + if (scan->xs_batches->prefetchCallback && + !scan->xs_batches->prefetchCallback(scan, scan->xs_batches->prefetchArgument, pos)) + { + DEBUG_LOG("index_scan_stream_read_next: skip block (callback)"); + continue; + } + + return ItemPointerGetBlockNumber(tid); + } + + /* + * Couldn't advance the position, so either there are no more items in + * the current batch, or maybe we don't have any batches yet (if is + * the first time through). Try loading the next batch - if that + * succeeds, try the advance again (and this time the advance should + * work). + * + * If we fail to load the next batch, we're done. + */ + if (!index_batch_getnext(scan)) + break; + } + + /* no more items in this scan */ + return InvalidBlockNumber; +} + +/* ---------------- + * index_batch_getnext - get the next batch of TIDs from a scan + * + * Returns true if we managed to read at least some TIDs into the batch, or + * false if there are no more TIDs in the scan. The batch load may fail for + * multiple reasons - there really may not be more batches in the scan, or + * maybe we reached INDEX_SCAN_MAX_BATCHES. + * + * Returns true if the batch was loaded successfully, false otherwise. + * + * XXX This only loads the TIDs and resets the various batch fields to + * fresh state. It does not set xs_heaptid/xs_itup/xs_hitup, that's the + * responsibility of the following index_batch_getnext_tid() calls. + * ---------------- + */ +static bool +index_batch_getnext(IndexScanDesc scan) +{ + IndexScanBatchData *batch; + ItemPointerData tid; + ScanDirection direction = scan->xs_batches->direction; + IndexTuple itup; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgetbatch); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * If we already used the maximum number of batch slots available, it's + * pointless to try loading another one. This can happen for various + * reasons, e.g. for index-only scans on all-visible table, or skipping + * duplicate blocks on perfectly correlated indexes, etc. + * + * We could enlarge the array to allow more batches, but that's futile, we + * can always construct a case using more memory. Not only it would risk + * OOM, it'd also be inefficient because this happens early in the scan + * (so it'd interfere with LIMIT queries). + * + * XXX For now we just error out, but the correct solution is to pause the + * stream by returning InvalidBlockNumber and then unpause it by doing + * read_stream_reset. 
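 *
 * The reset flag set just below is a step in that direction: the
 * callback stops producing blocks, and once the already-loaded batches
 * are drained, index_batch_getnext_tid() effectively does
 *
 *     index_batch_pos_reset(scan, &scan->xs_batches->streamPos);
 *     read_stream_reset(scan->xs_heapfetch->rs);
 *
 * before trying to load further batches.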
+ */ + if (INDEX_SCAN_BATCH_FULL(scan)) + { + DEBUG_LOG("index_batch_getnext: ran out of space for batches"); + scan->xs_batches->reset = true; + } + + /* + * Did we fill the batch queue, either in this or some earlier call? + * If yes, we have to consume everything from currently loaded batch + * before we reset the stream and continue. It's a bit like 'finished' + * but it's only a temporary pause, not the end of the stream. + */ + if (scan->xs_batches->reset) + return NULL; + + /* + * Did we already read the last batch for this scan? + * + * We may read the batches in two places, so we need to remember that, + * otherwise the retry restarts the scan. + * + * XXX This comment might be obsolete, from before using the read_stream. + * + * XXX Also, maybe we should do this before calling INDEX_SCAN_BATCH_FULL? + */ + if (scan->xs_batches->finished) + return NULL; + + index_batch_print("index_batch_getnext / start", scan); + + /* + * FIXME btgetbatch calls _bt_returnitem, which however sets xs_heaptid, + * and so would interfere with index scans (because this may get executed + * from the read_stream_next_buffer callback during the scan (fetching + * heap tuples in heapam_index_fetch_tuple). Ultimately we should not do + * _bt_returnitem at all, just functions like _bt_steppage etc. while + * loading the next batch. + * + * XXX I think this is no longer true, the amgetbatch does not do that I + * believe (_bt_returnitem_batch should not set these fields). + */ + tid = scan->xs_heaptid; + itup = scan->xs_itup; + + batch = scan->indexRelation->rd_indam->amgetbatch(scan, direction); + if (batch != NULL) + { + /* + * We got the batch from the AM, but we need to add it to the queue. + * Maybe that should be part of the "batch allocation" that happens in + * the AM? + */ + int batchIndex = scan->xs_batches->nextBatch; + + INDEX_SCAN_BATCH(scan, batchIndex) = batch; + + scan->xs_batches->nextBatch++; + + /* + * XXX Why do we need currentBatch, actually? It doesn't seem to be + * used anywhere, just set ... + */ + // scan->xs_batches->currentBatch = batch; + + DEBUG_LOG("index_batch_getnext firstBatch %d nextBatch %d batch %p", + scan->xs_batches->firstBatch, scan->xs_batches->nextBatch, batch); + } + else + scan->xs_batches->finished = true; + + /* XXX see FIXME above */ + scan->xs_heaptid = tid; + scan->xs_itup = itup; + + AssertCheckBatches(scan); + + index_batch_print("index_batch_getnext / end", scan); + + return (batch != NULL); +} + +/* ---------------- + * index_getnext_batch_tid - get the next TID from the current batch + * + * The calling convention is similar to index_getnext_tid() - NULL means no + * more items in the current batch, and no more batches. + * + * If we advance to the next batch, we release the previous one (unless it's + * tracked for mark/restore). + * + * Returns the next TID, or NULL if no more items (or batches). + * + * FIXME This only sets xs_heaptid and xs_itup (if requested). Not sure if + * we need to do something with xs_hitup. Should this set xs_hitup? + * + * XXX Maybe if we advance the position to the next batch, we could keep the + * batch for a bit more, in case the scan direction changes (as long as it + * fits into maxBatches)? But maybe that's unnecessary complexity for too + * little gain, we'd need to be careful about releasing the batches lazily. 
+ * ---------------- + */ +static ItemPointer +index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + IndexScanBatchPos *pos; + + /* shouldn't get here without batching */ + AssertCheckBatches(scan); + + /* read the next TID from the index */ + pos = &scan->xs_batches->readPos; + + /* FIXME handle change of scan direction (reset stream, ...) */ + scan->xs_batches->direction = direction; + + DEBUG_LOG("index_batch_getnext_tid pos %d %d direction %d", + pos->batch, pos->index, direction); + + /* + * Try advancing the batch position. If that doesn't succeed, it means we + * don't have more items in the current batch, and there's no future batch + * loaded. So try loading another batch, and maybe retry. + * + * FIXME This loop shouldn't happen more than twice. Maybe we should have + * some protection against infinite loops? If the advance/getnext + * functions get to disagree? + */ + while (true) + { + /* + * If we manage to advance to the next items, return it and we're + * done. Otherwise try loading another batch. + */ + if (index_batch_pos_advance(scan, pos)) + { + IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, pos->batch); + + Assert(batch != NULL); + + /* set the TID / itup for the scan */ + scan->xs_heaptid = batch->items[pos->index].heapTid; + scan->xs_itup = (IndexTuple) (batch->currTuples + batch->items[pos->index].tupleOffset); + + DEBUG_LOG("pos batch %p first %d last %d pos %d/%d TID (%u,%u)", + batch, batch->firstItem, batch->lastItem, + pos->batch, pos->index, + ItemPointerGetBlockNumber(&scan->xs_heaptid), + ItemPointerGetOffsetNumber(&scan->xs_heaptid)); + + /* + * If we advanced to the next batch, release the batch we no + * longer need. The positions is the "read" position, and we can + * compare it to firstBatch. + */ + if (pos->batch != scan->xs_batches->firstBatch) + { + batch = INDEX_SCAN_BATCH(scan, scan->xs_batches->firstBatch); + Assert(batch != NULL); + + /* + * XXX When advancing readPos, the streamPos may get behind as + * we're only advancing it when actually requesting heap blocks. + * But we may not do that often enough - e.g. IOS may not need + * to access all-visible heap blocks, so the read_next callback + * does not get invoked for a long time. It's possible the + * stream gets so mucu behind the position gets invalid, as we + * already removed the batch. But that means we don't need any + * heap blocks until the current read position - if we did, we + * would not be in this situation (or it's a sign of a bug, as + * those two places are expected to be in sync). So if the + * streamPos still points at the batch we're about to free, + * just reset the position - we'll set it to readPos in the + * read_next callback later. + * + * XXX This can happen after the queue gets full, we "pause" + * the stream, and then reset it to continue. But I think that + * just increases the probability of hitting the issue, it's + * just more chance to to not advance the streamPos, which + * depends on when we try to fetch the first heap block after + * calling read_stream_reset(). + */ + if (scan->xs_batches->streamPos.batch == scan->xs_batches->firstBatch) + { + index_batch_pos_reset(scan, &scan->xs_batches->streamPos); + } + + DEBUG_LOG("index_batch_getnext_tid free batch %p firstBatch %d nextBatch %d", + batch, + scan->xs_batches->firstBatch, + scan->xs_batches->nextBatch); + + /* Free the batch (except when it's needed for mark/restore). 
*/ + index_batch_free(scan, batch); + + /* + * In any case, remove the batch from the regular queue, even + * if we kept it for mar/restore. + */ + scan->xs_batches->firstBatch++; + + DEBUG_LOG("index_batch_getnext_tid batch freed firstBatch %d nextBatch %d", + scan->xs_batches->firstBatch, + scan->xs_batches->nextBatch); + + index_batch_print("index_batch_getnext_tid / free old batch", scan); + + /* we can't skip any batches */ + Assert(scan->xs_batches->firstBatch == pos->batch); + } + + return &scan->xs_heaptid; + } + + /* + * We failed to advance, i.e. we ran out of currently loaded batches. + * So if we filled the queue, this is a good time to reset the stream + * (before we try loading the next batch). + */ + if (scan->xs_batches->reset) + { + DEBUG_LOG("resetting read stream pos %d,%d", + scan->xs_batches->readPos.batch, scan->xs_batches->readPos.index); + + scan->xs_batches->reset = false; + + /* + * Need to reset the stream position, it might be too far behind. + * Ultimately we want to set it to readPos, but we can't do that + * yet - readPos still point sat the old batch, so just reset it + * and we'll init it to readPos later in the callback. + */ + index_batch_pos_reset(scan, &scan->xs_batches->streamPos); + + read_stream_reset(scan->xs_heapfetch->rs); + } + + /* + * Failed to advance the read position, so try reading the next batch. + * If this fails, we're done - there's nothing more to load. + * + * Most of the batches should be loaded from read_stream_next_buffer, + * but we need to call index_batch_getnext here too, for two reasons. + * First, the read_stream only gets working after we try fetching the + * first heap tuple, so we need to load the first batch from here. + * Second, while most batches will be preloaded by the stream thank's + * to prefetching, it's possible to set effective_io_concurrency=0, in + * which case all the batch loads happen from here. + */ + if (!index_batch_getnext(scan)) + break; + + DEBUG_LOG("loaded next batch, retry to advance position"); + } + + /* + * If we get here, we failed to advance the position and there are no more + * batches, so we're done. + */ + DEBUG_LOG("no more batches to process"); + + return NULL; +} + +/* + * index_batch_init + * Initialize various fields / arrays needed by batching. + * + * FIXME This is a bit ad-hoc hodge podge, due to how I was adding more and + * more pieces. Some of the fields may be not quite necessary, needs cleanup. + */ +static void +index_batch_init(IndexScanDesc scan) +{ + /* init batching info, assume batching is supported by the AM */ + Assert(scan->indexRelation->rd_indam->amgetbatch != NULL); + Assert(scan->indexRelation->rd_indam->amfreebatch != NULL); + + scan->xs_batches = palloc0(sizeof(IndexScanBatches)); + + /* We don't know direction of the scan yet. */ + scan->xs_batches->direction = NoMovementScanDirection; + + /* Initialize the batch */ + scan->xs_batches->maxBatches = INDEX_SCAN_MAX_BATCHES; + scan->xs_batches->firstBatch = 0; /* first batch */ + scan->xs_batches->nextBatch = 0; /* first batch is empty */ + + scan->xs_batches->batches + = palloc(sizeof(IndexScanBatchData *) * scan->xs_batches->maxBatches); + + /* positions in the queue of batches */ + index_batch_pos_reset(scan, &scan->xs_batches->readPos); + index_batch_pos_reset(scan, &scan->xs_batches->streamPos); + index_batch_pos_reset(scan, &scan->xs_batches->markPos); + + // scan->xs_batches->currentBatch = NULL; +} + +/* + * index_batch_reset + * Reset the batch before reading the next chunk of data. 
+ * + * complete - true means we reset even marked batch + * + * XXX Should this reset the batch memory context, xs_itup, xs_hitup, etc? + */ +static void +index_batch_reset(IndexScanDesc scan, bool complete) +{ + IndexScanBatches *batches = scan->xs_batches; + + /* bail out if batching not enabled */ + if (!batches) + return; + + AssertCheckBatches(scan); + + index_batch_print("index_batch_reset", scan); + + /* With batching enabled, we should have a read stream. Reset it. */ + Assert(scan->xs_heapfetch); + read_stream_reset(scan->xs_heapfetch->rs); + + /* reset the positions */ + index_batch_pos_reset(scan, &batches->readPos); + index_batch_pos_reset(scan, &batches->streamPos); + + /* + * With "complete" reset, make sure to also free the marked batch, either + * by just forgetting it (if it's still in the queue), or by explicitly + * freeing it. + * + * XXX Do this before the loop, so that it calls the amfreebatch(). + */ + if (complete && (batches->markBatch != NULL)) + { + IndexScanBatchPos *pos = &batches->markPos; + IndexScanBatch batch = batches->markBatch; + + /* always reset the position, forget the marked batch */ + batches->markBatch = NULL; + + /* + * If we've already moved past the marked batch (it's not in the + * current queue), free it explicitly. Otherwise it'll be in the freed + * later. + */ + if ((pos->batch < batches->firstBatch) || + (pos->batch >= batches->nextBatch)) + { + index_batch_free(scan, batch); + } + + /* reset position only after the queue range check */ + index_batch_pos_reset(scan, &batches->markPos); + } + + /* release all currently loaded batches */ + while (batches->firstBatch < batches->nextBatch) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, batches->firstBatch); + + DEBUG_LOG("freeing batch %d %p", batches->firstBatch, batch); + + index_batch_free(scan, batch); + + /* update the valid range, so that asserts / debugging works */ + batches->firstBatch++; + } + + /* reset relevant IndexScanBatches fields */ + batches->maxBatches = INDEX_SCAN_MAX_BATCHES; + batches->firstBatch = 0; /* first batch */ + batches->nextBatch = 0; /* first batch is empty */ + + batches->finished = false; + batches->reset = false; + // batches->currentBatch = NULL; + + AssertCheckBatches(scan); +} + +static void +index_batch_kill_item(IndexScanDesc scan) +{ + IndexScanBatchPos *pos = &scan->xs_batches->readPos; + IndexScanBatchData *batch = INDEX_SCAN_BATCH(scan, pos->batch); + + /* FIXME mark item at current readPos as deleted */ + AssertCheckBatchPosValid(scan, pos); + + /* + * XXX Too tied to btree (through MaxTIDsPerBTreePage), we should make + * this AM agnostic. We could maybe even replace this with Bitmapset. It + * might be more expensive if we only kill items at the end of the page + * (in which case we still have to walk the first part to find the bits at + * the end). But given the lower memory usage it still sees like a good + * tradeoff overall. 
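+	 *
+	 * A Bitmapset-based variant could be as simple as this sketch (using a
+	 * hypothetical killedSet field in place of killedItems[]):
+	 *
+	 *		batch->killedSet = bms_add_member(batch->killedSet, pos->index);
+	 *
+	 * with the eventual kill pass iterating over it via bms_next_member()
+	 * instead of walking the killedItems[] array.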
+ */ + if (batch->killedItems == NULL) + batch->killedItems = (int *) + palloc(MaxTIDsPerBTreePage * sizeof(int)); + if (batch->numKilled < MaxTIDsPerBTreePage) + batch->killedItems[batch->numKilled++] = pos->index; + + /* elog(WARNING, "index_batch_kill_item (%d,%d)", pos->batch, pos->index); */ + /* FIXME index_batch_kill_item not implemented */ +} + +static void +index_batch_free(IndexScanDesc scan, IndexScanBatch batch) +{ + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amfreebatch); + + AssertCheckBatch(scan, batch); + + /* don't free the batch that is marked */ + if (batch == scan->xs_batches->markBatch) + return; + + scan->indexRelation->rd_indam->amfreebatch(scan, batch); +} + +/* */ +static void +index_batch_end(IndexScanDesc scan) +{ + index_batch_reset(scan, true); +} + +IndexScanBatch +index_batch_alloc(int maxitems, bool want_itup) +{ + IndexScanBatch batch = palloc(sizeof(IndexScanBatchData)); + + batch->firstItem = -1; + batch->lastItem = -1; + batch->itemIndex = -1; + + batch->killedItems = NULL; /* FIXME allocate an array, actually */ + batch->numKilled = 0; /* nothing killed yet */ + + /* + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the currPos and markPos respectively. Each is of size + * BLCKSZ, so it can hold as much as a full page's worth of tuples. + * + * XXX allocate + */ + batch->currTuples = NULL; /* tuple storage for currPos */ + if (want_itup) + batch->currTuples = palloc(BLCKSZ); + + /* + * XXX Maybe don't size to MaxTIDsPerBTreePage? We don't reuse batches + * (unlike currPos), so we can size it for just what we need. + */ + batch->items = palloc0(sizeof(IndexScanBatchPosItem) * maxitems); + + /* + * batch contents (TIDs, index tuples, kill bitmap, ...) + * + * XXX allocate as needed? + */ + batch->itups = NULL; /* IndexTuples, if requested */ + batch->htups = NULL; /* HeapTuples, if requested */ + batch->recheck = NULL; /* recheck flags */ + batch->privateData = NULL; /* private data for batch */ + + /* xs_orderbyvals / xs_orderbynulls */ + batch->orderbyvals = NULL; + batch->orderbynulls = NULL; + + /* AM-specific per-batch state */ + batch->opaque = NULL; + + return batch; +} diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index a56c5eceb14a..be8e02a9c452 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -217,7 +217,7 @@ table_index_fetch_tuple_check(Relation rel, bool found; slot = table_slot_create(rel, NULL); - scan = table_index_fetch_begin(rel); + scan = table_index_fetch_begin(rel, NULL); found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 3497a8221f29..8a5d79a27a66 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -106,7 +106,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) */ tmptid = checktid; { - IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); + IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation, + NULL); bool call_again = false; if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index bdf862b24062..1ec046adeffd 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -815,7 +815,17 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, retry: conflict = 
false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0); + + /* + * It doesn't seem very useful to allow batching/prefetching when checking + * exclusion/uniqueness constraints. We should only find either no or just + * one row, I think. + * + * XXX Maybe there are cases where we could find multiple "candidate" + * rows, e.g. with exclusion constraints? Not sure. + */ + index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0, + false); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42db..9c7df9b9ccbc 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -201,8 +201,13 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, /* Build scan key. */ skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); - /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0); + /* + * Start an index scan. + * + * XXX No prefetching for replication identity. We expect to find just one + * row, so prefetching would be pointless. + */ + scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0, false); retry: found = false; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca9507a..1a14f5faa68c 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -49,7 +49,13 @@ static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); static void StoreIndexTuple(IndexOnlyScanState *node, TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc); +static bool ios_prefetch_block(IndexScanDesc scan, void *data, + IndexScanBatchPos *pos); +/* values stored in ios_prefetch_block in the batch cache */ +#define IOS_UNKNOWN_VISIBILITY 0 /* default value */ +#define IOS_ALL_VISIBLE 1 +#define IOS_NOT_ALL_VISIBLE 2 /* ---------------------------------------------------------------- * IndexOnlyNext @@ -94,15 +100,26 @@ IndexOnlyNext(IndexOnlyScanState *node) estate->es_snapshot, &node->ioss_Instrument, node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + node->ioss_NumOrderByKeys, + node->ioss_CanBatch); node->ioss_ScanDesc = scandesc; - /* Set it up for index-only scan */ node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; + /* + * Set the prefetch callback info, if the scan has batching enabled + * (we only know what after index_beginscan, which also checks which + * callbacks are defined for the AM. + */ + if (scandesc->xs_batches != NULL) + { + scandesc->xs_batches->prefetchCallback = ios_prefetch_block; + scandesc->xs_batches->prefetchArgument = (void *) node; + } + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -120,10 +137,42 @@ IndexOnlyNext(IndexOnlyScanState *node) */ while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + bool all_visible; bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + /* + * Without batching, inspect the VM directly. With batching, we need + * to retrieve the visibility information seen by the read_stream + * callback (or rather by ios_prefetch_block), otherwise the + * read_stream might get out of sync (if the VM got updated since + * then). 
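+		 *
+		 * The cached result lives in the batch's privateData[] array, one
+		 * entry per item: it starts out as IOS_UNKNOWN_VISIBILITY and is set
+		 * to IOS_ALL_VISIBLE or IOS_NOT_ALL_VISIBLE the first time
+		 * ios_prefetch_block() checks the VM for that item.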
+ */ + if (scandesc->xs_batches == NULL) + { + all_visible = VM_ALL_VISIBLE(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer); + } + else + { + /* + * Reuse the previously determined page visibility info, or + * calculate it now. If we decided not to prefetch the block, the + * page had to be all-visible at that point. The VM bit might have + * changed since then, but the tuple visibility could not have. + * + * XXX It's a bit weird we use the visibility to decide if we + * should skip prefetching the block, and then deduce the + * visibility from that (even if it matches pretty clearly). But + * maybe we could/should have a more direct way to read the + * private state? + */ + all_visible = !ios_prefetch_block(scandesc, node, + &scandesc->xs_batches->readPos); + } + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -158,9 +207,7 @@ IndexOnlyNext(IndexOnlyScanState *node) * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) + if (!all_visible) { /* * Rats, we have to visit the heap to check visibility. @@ -596,6 +643,20 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->recheckqual = ExecInitQual(node->recheckqual, (PlanState *) indexstate); + /* + * All index scans can do batching. + * + * XXX Maybe this should check if the index AM supports batching, or even + * call something like "amcanbatch" (does not exist yet). Or check the + * enable_indexscan_batching GUC? + * + * XXX For now we only know if the scan gets to use batching after the + * index_beginscan() returns, so maybe this name is a bit misleading. It's + * more about "allow batching". But maybe this field is unnecessary - we + * check all the interesting stuff in index_beginscan() anyway. + */ + indexstate->ioss_CanBatch = true; + /* * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop * here. This allows an index-advisor plugin to EXPLAIN a plan containing @@ -783,13 +844,21 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, return; } + /* + * XXX Do we actually want prefetching for parallel index scans? Maybe + * not, but then we need to be careful not to call index_batch_getnext_tid + * (which now can happen, because we'll call IndexOnlyNext even for + * parallel plans). Although, that should not happen, because we only call + * that with (xs_batches != NULL). + */ node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, &node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + node->ioss_CanBatch); node->ioss_ScanDesc->xs_want_itup = true; node->ioss_VMBuffer = InvalidBuffer; @@ -849,13 +918,15 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, return; } + /* XXX Do we actually want prefetching for parallel index scans? 
*/ node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, &node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, - piscan); + piscan, + node->ioss_CanBatch); node->ioss_ScanDesc->xs_want_itup = true; /* @@ -889,3 +960,51 @@ ExecIndexOnlyScanRetrieveInstrumentation(IndexOnlyScanState *node) node->ioss_SharedInfo = palloc(size); memcpy(node->ioss_SharedInfo, SharedInfo, size); } + +/* FIXME duplicate from indexam.c */ +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->xs_batches->batches[(idx) % (scan)->xs_batches->maxBatches]) + +/* + * ios_prefetch_block + * Callback to only prefetch blocks that are not all-visible. + * + * We don't want to inspect the visibility map repeatedly, so the result of + * VM_ALL_VISIBLE is stored in the batch private data. The values are set + * to 0 by default, so we use two constants to remember if all-visible or + * not all-visible. + * + * However, this is not merely a question of performance. The VM may get + * modified during the scan, and we need to make sure the two places (the + * read_next callback and the index_fetch_heap here) make the same decision, + * otherwise we might get out of sync with the stream. For example, the + * callback might find a page is all-visible (and skips reading the block), + * and then someone might update the page, resetting the VM bit. If this + * place attempts to read the page from the stream, it'll fail because it + * will probably receive an entirely different page. + */ +static bool +ios_prefetch_block(IndexScanDesc scan, void *arg, IndexScanBatchPos *pos) +{ + IndexOnlyScanState *node = (IndexOnlyScanState *) arg; + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, pos->batch); + + if (batch->privateData == NULL) + batch->privateData = palloc0(sizeof(Datum) * (batch->lastItem + 1)); + + if (batch->privateData[pos->index] == IOS_UNKNOWN_VISIBILITY) + { + bool all_visible; + ItemPointer tid = &batch->items[pos->index].heapTid; + + all_visible = VM_ALL_VISIBLE(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->ioss_VMBuffer); + + batch->privateData[pos->index] + = all_visible ? IOS_ALL_VISIBLE : IOS_NOT_ALL_VISIBLE; + } + + /* prefetch only blocks that are not all-visible */ + return (batch->privateData[pos->index] == IOS_NOT_ALL_VISIBLE); +} diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 7fcaa37fe625..177d74c2c273 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -111,7 +111,8 @@ IndexNext(IndexScanState *node) estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + node->iss_CanBatch); node->iss_ScanDesc = scandesc; @@ -201,13 +202,16 @@ IndexNextWithReorder(IndexScanState *node) /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. + * + * XXX Should we use batching here? Does it even work for reordering? */ scandesc = index_beginscan(node->ss.ss_currentRelation, node->iss_RelationDesc, estate->es_snapshot, &node->iss_Instrument, node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + node->iss_NumOrderByKeys, + false); node->iss_ScanDesc = scandesc; @@ -965,6 +969,18 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->indexorderbyorig = ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate); + /* + * All index scans can do batching. 
+ * + * XXX Maybe this should check if the index AM supports batching, or even + * call something like "amcanbatch" (does not exist yet). Or check the + * enable_indexscan_batching GUC? + * + * XXX Well, we disable batching for reordering, so maybe we should check + * that here instead? But maybe it's unnecessary limitation? + */ + indexstate->iss_CanBatch = true; + /* * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop * here. This allows an index-advisor plugin to EXPLAIN a plan containing @@ -1719,13 +1735,17 @@ ExecIndexScanInitializeDSM(IndexScanState *node, return; } + /* + * XXX Do we actually want prefetching for parallel index scans? + */ node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->iss_RelationDesc, &node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + node->iss_CanBatch); /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -1783,13 +1803,17 @@ ExecIndexScanInitializeWorker(IndexScanState *node, return; } + /* + * XXX Do we actually want prefetching for parallel index scans? + */ node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->iss_RelationDesc, &node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, - piscan); + piscan, + node->iss_CanBatch); /* * If no run-time keys to calculate or they are ready, go ahead and pass diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index a96b1b9c0bc6..facc83bb83a5 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6719,9 +6719,14 @@ get_actual_variable_endpoint(Relation heapRel, InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); + /* + * XXX I'm not sure about batching/prefetching here. In most cases we + * expect to find the endpoints immediately, but sometimes we have a lot + * of dead tuples - and then prefetching might help. 
+ */ index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, NULL, - 1, 0); + 1, 0, false); /* Set it up for index-only scan */ index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 2f8cbd867599..36d2b7f1e68f 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -809,6 +809,16 @@ struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexscan_batching", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-scan batching."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_batching, + true, + NULL, NULL, NULL + }, { {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of index-only-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 34826d01380b..649df2b06a0d 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -415,6 +415,7 @@ #enable_hashjoin = on #enable_incremental_sort = on #enable_indexscan = on +#enable_indexscan_batching = on #enable_indexonlyscan = on #enable_material = on #enable_memoize = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 52916bab7a31..0028bb558436 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -196,6 +196,14 @@ typedef void (*amrescan_function) (IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* next batch of valid tuples */ +typedef IndexScanBatch(*amgetbatch_function) (IndexScanDesc scan, + ScanDirection direction); + +/* release batch of valid tuples */ +typedef void (*amfreebatch_function) (IndexScanDesc scan, + IndexScanBatch batch); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -307,6 +315,8 @@ typedef struct IndexAmRoutine ambeginscan_function ambeginscan; amrescan_function amrescan; amgettuple_function amgettuple; /* can be NULL */ + amgetbatch_function amgetbatch; /* can be NULL */ + amfreebatch_function amfreebatch; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 5b2ab181b5f8..8bef942b11d5 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -15,6 +15,7 @@ #define GENAM_H #include "access/htup.h" +#include "access/itup.h" #include "access/sdir.h" #include "access/skey.h" #include "nodes/tidbitmap.h" @@ -111,6 +112,7 @@ typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); /* struct definitions appear in relscan.h */ typedef struct IndexScanDescData *IndexScanDesc; +typedef struct IndexScanBatchData *IndexScanBatch; typedef struct SysScanDescData *SysScanDesc; typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc; @@ -155,6 +157,8 @@ typedef struct IndexOrderByDistance * generalized index_ interface routines (in indexam.c) */ +extern PGDLLIMPORT bool enable_indexscan_batching; + /* * IndexScanIsValid * True iff the index scan is valid. 
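/*
 * Illustration only, not part of the patch: a minimal sketch of how a caller
 * might opt in to batching with the extended index_beginscan() declared in
 * this header.  Whether batching is actually used still depends on the index
 * AM providing the optional amgetbatch/amfreebatch callbacks (and, presumably,
 * on the enable_indexscan_batching GUC); the scan is consumed through the
 * unchanged index_getnext_tid() interface either way.  The function name and
 * parameters below are made up for the example.
 */
static void
example_batched_scan(Relation heap, Relation index, Snapshot snapshot,
					 ScanKey keys, int nkeys)
{
	IndexScanDesc scan;
	ItemPointer tid;

	/* last argument is the new enable_batching flag */
	scan = index_beginscan(heap, index, snapshot, NULL, nkeys, 0, true);
	index_rescan(scan, keys, nkeys, NULL, 0);

	while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
	{
		/* process the TID, e.g. fetch the heap tuple via index_fetch_heap() */
	}

	index_endscan(scan);
}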
@@ -179,7 +183,8 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, - int nkeys, int norderbys); + int nkeys, int norderbys, + bool enable_batching); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, IndexScanInstrumentation *instrument, @@ -205,7 +210,8 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, IndexScanInstrumentation *instrument, int nkeys, int norderbys, - ParallelIndexScanDesc pscan); + ParallelIndexScanDesc pscan, + bool enable_batching); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; @@ -213,7 +219,6 @@ extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, struct TupleTableSlot *slot); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); - extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, IndexBulkDeleteResult *istat, IndexBulkDeleteCallback callback, @@ -231,7 +236,7 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan, bool recheckOrderBy); extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, bool validate); - +extern IndexScanBatch index_batch_alloc(int maxitems, bool want_itup); /* * index access method support routines (in genam.c) diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0a..b63af845ca6b 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,9 +16,11 @@ #include "access/htup_details.h" #include "access/itup.h" +#include "access/sdir.h" #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" +#include "storage/read_stream.h" #include "storage/relfilelocator.h" #include "storage/spin.h" #include "utils/relcache.h" @@ -121,10 +123,162 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; typedef struct IndexFetchTableData { Relation rel; + ReadStream *rs; } IndexFetchTableData; struct IndexScanInstrumentation; +/* Forward declaration, the prefetch callback needs IndexScanDescData. */ +typedef struct IndexScanBatchData IndexScanBatchData; + +/* + * XXX parts of BTScanOpaqueData, BTScanPosItem and BTScanPosData relevant + * for one batch. + */ +typedef struct IndexScanBatchPosItem /* what we remember about each match */ +{ + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ + LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ +} IndexScanBatchPosItem; + +/* + * Data about one batch of items returned by the index AM. This is similar + * to the AM-specific "opaque" structs, used by each AM to track items + * loaded from one leaf page, but generalized for all AMs. + * + * XXX Not sure which of there fields are 100% needed for all index AMs, + * most of this comes from nbtree. + * + * XXX Mostly a copy of BTScanPosData, but other AMs may need different (or + * only some of those) fields. + */ +typedef struct IndexScanBatchData +{ + /* + * AM-specific concept of position within the index, and other stuff the + * AM might need to store for each batch. + * + * XXX maybe "position" is not the best name, it can have other stuff the + * AM needs to keep per-batch (even only for reading the leaf items, like + * nextTupleOffset). 
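+	 *
+	 * For the nbtree implementation (the nbtree batching patch that follows
+	 * in this series) this holds a BTBatchScanPos.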
+ */ + void *opaque; + + /* + * The items array is always ordered in index order (ie, increasing + * indexoffset). When scanning backwards it is convenient to fill the + * array back-to-front, so we start at the last slot and fill downwards. + * Hence we need both a first-valid-entry and a last-valid-entry counter. + * itemIndex is a cursor showing which entry was last returned to caller. + * + * XXX Do we need all these indexes, or would it be enough to have just + * 0-indexed array with only itemIndex? + */ + int firstItem; /* first valid index in items[] */ + int lastItem; /* last valid index in items[] */ + int itemIndex; /* current index in items[] */ + + /* info about killed items if any (killedItems is NULL if never used) */ + int *killedItems; /* indexes of killed items */ + int numKilled; /* number of currently stored items */ + + /* + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the currPos and markPos respectively. Each is of size + * BLCKSZ, so it can hold as much as a full page's worth of tuples. + * + * XXX maybe currTuples should be part of the am-specific per-batch state + * stored in "position" field? + */ + char *currTuples; /* tuple storage for currPos */ + IndexScanBatchPosItem *items; /* XXX don't size to MaxTIDsPerBTreePage */ + + /* + * batch contents (TIDs, index tuples, kill bitmap, ...) + * + * XXX Shouldn't this be part of the "IndexScanBatchPosItem" struct? To + * keep everything in one place? Or why should we have separate arrays? + * One advantage is that we don't need to allocate memory for arrays that + * we don't need ... e.g. if we don't need heap tuples, we don't allocate + * that. We couldn't do that with everything in one struct. + */ + IndexTuple *itups; /* IndexTuples, if requested */ + HeapTuple *htups; /* HeapTuples, if requested */ + bool *recheck; /* recheck flags */ + + /* XXX why do we need this on top of "opaque" pointer? */ + Datum *privateData; /* private data for batch */ + + /* xs_orderbyvals / xs_orderbynulls */ + Datum *orderbyvals; + bool *orderbynulls; + +} IndexScanBatchData; + +/* + * Position in the queue of batches - index of a batch, index of item in a batch. + */ +typedef struct IndexScanBatchPos +{ + int batch; + int index; +} IndexScanBatchPos; + +typedef struct IndexScanDescData IndexScanDescData; +typedef bool (*IndexPrefetchCallback) (IndexScanDescData * scan, void *arg, IndexScanBatchPos *pos); + +/* + * Queue + */ +typedef struct IndexScanBatches +{ + /* + * Did we read the last batch? The batches may be loaded from multiple + * places, and we need to remember when we fail to load the next batch in + * a given scan (which means "no more batches"). amgetbatch may restart + * the scan on the get call, so we need to remember it's over. + */ + bool finished; + bool reset; + + /* + * Current scan direction, for the currently loaded batches. This is used + * to load data in the read stream API callback, etc. + * + * XXX May need some work to use already loaded batches after change of + * direction, instead of just throwing everything away. May need to reset + * the stream but keep the batches? + */ + ScanDirection direction; + + /* positions in the queue of batches (batch + item) */ + IndexScanBatchPos readPos; /* read position */ + IndexScanBatchPos streamPos; /* prefetch position (for read stream API) */ + IndexScanBatchPos markPos; /* mark/restore position */ + + IndexScanBatchData *markBatch; + // IndexScanBatchData *currentBatch; + + /* + * Array of batches returned by the AM. 
The array has a capacity (but can + * be resized if needed). The firstBatch is an index of the first batch, + * but needs to be translated by (modulo maxBatches) into index in the + * batches array. + * + * FIXME Maybe these fields should be uint32, or something like that? + */ + int maxBatches; /* size of the batches array */ + int firstBatch; /* first used batch slot */ + int nextBatch; /* next empty batch slot */ + + IndexScanBatchData **batches; + + /* callback to skip prefetching in IOS etc. */ + IndexPrefetchCallback prefetchCallback; + void *prefetchArgument; +} IndexScanBatches; + /* * We use the same IndexScanDescData structure for both amgettuple-based * and amgetbitmap-based index scans. Some fields are only relevant in @@ -176,6 +330,12 @@ typedef struct IndexScanDescData bool xs_recheck; /* T means scan keys must be rechecked */ + /* + * Batches index scan keep a list of batches loaded from the index in a + * circular buffer. + */ + IndexScanBatches *xs_batches; + /* * When fetching with an ordering operator, the values of the ORDER BY * expressions of the last returned tuple, according to the index. If diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8713e12cbfb9..5bed359cf135 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -413,8 +413,14 @@ typedef struct TableAmRoutine * structure with additional information. * * Tuples for an index scan can then be fetched via index_fetch_tuple. + * + * The ReadStream pointer is optional - NULL means the regular buffer + * reads are used. If a valid ReadStream is provided, the callback + * (generating the blocks to read) and index_fetch_tuple (consuming the + * buffers) need to agree on the exact order. */ - struct IndexFetchTableData *(*index_fetch_begin) (Relation rel); + struct IndexFetchTableData *(*index_fetch_begin) (Relation rel, + ReadStream *rs); /* * Reset index fetch. Typically this will release cross index fetch @@ -1149,9 +1155,9 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) * Tuples for an index scan can then be fetched via table_index_fetch_tuple(). */ static inline IndexFetchTableData * -table_index_fetch_begin(Relation rel) +table_index_fetch_begin(Relation rel, ReadStream *rs) { - return rel->rd_tableam->index_fetch_begin(rel); + return rel->rd_tableam->index_fetch_begin(rel, rs); } /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 5b6cadb5a6c1..ef672e203d0e 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1697,6 +1697,7 @@ typedef struct * OrderByTypByVals is the datatype of order by expression pass-by-value? * OrderByTypLens typlens of the datatypes of order by expressions * PscanLen size of parallel index scan descriptor + * CanBatch batching (and prefetching) enabled * ---------------- */ typedef struct IndexScanState @@ -1726,6 +1727,10 @@ typedef struct IndexScanState bool *iss_OrderByTypByVals; int16 *iss_OrderByTypLens; Size iss_PscanLen; + + /* batching/prefetching enabled? 
*/ + bool iss_CanBatch; + } IndexScanState; /* ---------------- @@ -1749,6 +1754,7 @@ typedef struct IndexScanState * PscanLen size of parallel index-only scan descriptor * NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN * NameCStringCount number of elements in the NameCStringAttNums array + * CanBatch batching (and prefetching) enabled * ---------------- */ typedef struct IndexOnlyScanState @@ -1772,6 +1778,7 @@ typedef struct IndexOnlyScanState Size ioss_PscanLen; AttrNumber *ioss_NameCStringAttNums; int ioss_NameCStringCount; + bool ioss_CanBatch; } IndexOnlyScanState; /* ---------------- diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index ae17d028ed3b..220b61fad2dc 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -158,6 +158,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexscan_batching | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -172,7 +173,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(24 rows) +(25 rows) -- There are always wait event descriptions for various types. InjectionPoint -- may be present or absent, depending on history since last postmaster start. diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index e5879e00dffe..060d964e3995 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1260,6 +1260,10 @@ IndexOrderByDistance IndexPath IndexRuntimeKeyInfo IndexScan +IndexScanBatchData +IndexScanBatchPos +IndexScanBatchPosItem +IndexScanBatches IndexScanDesc IndexScanInstrumentation IndexScanState @@ -3396,6 +3400,7 @@ amendscan_function amestimateparallelscan_function amgetbitmap_function amgettuple_function +amgetbatch_function aminitparallelscan_function aminsert_function aminsertcleanup_function From 03ecfcd5297f1ac42aa45c42972cce42b98076e3 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 30 Sep 2024 22:48:39 +0200 Subject: [PATCH 2/3] WIP: batching for nbtree indexes Adds batching/prefetching for btree indexes. Returns only batches from a single leaf page. Does not support mark/restore yet. --- src/backend/access/nbtree/nbtree.c | 319 ++++ src/backend/access/nbtree/nbtsearch.c | 1998 +++++++++++++++++++++++-- src/backend/access/nbtree/nbtutils.c | 179 +++ src/include/access/nbtree.h | 72 +- src/tools/pgindent/typedefs.list | 2 + 5 files changed, 2417 insertions(+), 153 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 765659887af7..405c601d3ffd 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -159,6 +159,8 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambeginscan = btbeginscan; amroutine->amrescan = btrescan; amroutine->amgettuple = btgettuple; + amroutine->amgetbatch = btgetbatch; + amroutine->amfreebatch = btfreebatch; amroutine->amgetbitmap = btgetbitmap; amroutine->amendscan = btendscan; amroutine->ammarkpos = btmarkpos; @@ -279,6 +281,158 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) return res; } +/* FIXME duplicate from indexam.c */ +#define INDEX_SCAN_BATCH(scan, idx) \ + ((scan)->xs_batches->batches[(idx) % (scan)->xs_batches->maxBatches]) + +/* + * btgetbatch() -- Get the next batch of tuples in the scan. 
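+ *
+ *		Returns NULL once there are no more batches to return.  Each batch
+ *		produced here covers items from a single leaf page.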
+ * + * XXX Simplified version of btgettuple(), but for batches of tuples. + */ +IndexScanBatch +btgetbatch(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + IndexScanBatch res; + BTBatchScanPos pos = NULL; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* btree indexes are never lossy */ + scan->xs_recheck = false; + + if (scan->xs_batches->firstBatch < scan->xs_batches->nextBatch) + { + IndexScanBatch batch = INDEX_SCAN_BATCH(scan, scan->xs_batches->nextBatch-1); + pos = (BTBatchScanPos) batch->opaque; + } + + /* Each loop iteration performs another primitive index scan */ + do + { + /* + * If we've already initialized this scan, we can just advance it in + * the appropriate direction. If we haven't done so yet, we call + * _bt_first() to get the first item in the scan. + */ + if (pos == NULL) + res = _bt_first_batch(scan, dir); + else + { + /* + * Now continue the scan. + */ + res = _bt_next_batch(scan, pos, dir); + } + + /* If we have a batch, return it ... */ + if (res) + break; + + /* + * XXX we need to invoke _bt_first_batch on the next iteration, to + * advance SAOP keys etc. But indexam.c already does this, but that's + * only after this returns, so maybe this should do this in some other + * way, not sure who should be responsible for setting currentBatch. + * + * XXX Maybe we don't even need that field? What is a current batch + * anyway? There seem to be at least multiple concepts of "current" + * batch, one for the read stream, another for executor ... + */ + // scan->xs_batches->currentBatch = res; + + /* + * We may do a new scan, depending on what _bt_start_prim_scan says. + * In that case we need to start from scratch, not from the position + * of the last batch. In regular non-batched scans we have currPos, + * because we have just one leaf page for the whole scan, and we + * invalidate it before loading the next one. But with batching that + * doesn't work - we have many leafs, it's not clear which one is + * 'current' (well, it's the last), and we can't invalidate it, + * that's up to amfreebatch(). For now we deduce the position and + * reset it to NULL, to indicate the same thing. + * + * XXX Maybe we should have something like 'currentBatch'? But then + * that probably should be in BTScanOpaque, not in the generic + * indexam.c part? Or it it a sufficiently generic thing? How would + * we keep it in sync with the batch queue? If freeing batches is + * up to indexam, how do we ensure the currentBatch does not point + * to already removed batch? + */ + pos = NULL; + + /* ... otherwise see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir)); + + return res; +} + +/* + * btgetbatch() -- Get the next batch of tuples in the scan. + * + * XXX Pretty much like btgettuple(), but for batches of tuples. + */ +void +btfreebatch(IndexScanDesc scan, IndexScanBatch batch) +{ + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* + * Check to see if we should kill tuples from the previous batch. 
+ */ + _bt_kill_batch(scan, batch); + + /* free all the stuff that might be allocated */ + + if (batch->items) + pfree(batch->items); + + if (batch->itups) + pfree(batch->itups); + + if (batch->htups) + pfree(batch->htups); + + if (batch->recheck) + pfree(batch->recheck); + + if (batch->privateData) + pfree(batch->privateData); + + if (batch->orderbyvals) + pfree(batch->orderbyvals); + + if (batch->orderbynulls) + pfree(batch->orderbynulls); + + if (batch->currTuples) + pfree(batch->currTuples); + + if (batch->opaque) + { + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + + BTBatchScanPosIsValid(*pos); + BTBatchScanPosIsPinned(*pos); + + BTBatchScanPosUnpinIfPinned(*pos); + + pfree(batch->opaque); + } + + /* and finally free the batch itself */ + pfree(batch); + + return; +} + /* * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap */ @@ -376,6 +530,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) /* * btrescan() -- rescan an index relation + * + * Batches should have been freed from indexam using btfreebatch() before we + * get here, but then some of the generic scan stuff needs to be reset here. + * But we shouldn't need to do anything particular here, I think. */ void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, @@ -400,6 +558,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); + /* FIXME should be in indexam.c I think */ + // if (scan->xs_batches) + // scan->xs_batches->currentBatch = NULL; + /* * Allocate tuple workspace arrays, if needed for an index-only scan and * not already done in a previous rescan call. To save on palloc @@ -433,6 +595,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, /* * btendscan() -- close down a scan + * + * Batches should have been freed from indexam using btfreebatch() before we + * get here, but then some of the generic scan stuff needs to be reset here. + * But we shouldn't need to do anything particular here, I think. */ void btendscan(IndexScanDesc scan) @@ -469,12 +635,18 @@ btendscan(IndexScanDesc scan) /* * btmarkpos() -- save current scan position + * + * With batching, all the interesting markpos() stuff happens in indexam.c. We + * should not even get here. */ void btmarkpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + /* with batching, mark/restore is handled in indexam */ + Assert(scan->xs_batches == NULL); + /* There may be an old mark with a pin (but no lock). */ BTScanPosUnpinIfPinned(so->markPos); @@ -495,12 +667,18 @@ btmarkpos(IndexScanDesc scan) /* * btrestrpos() -- restore scan to last saved position + * + * With batching, all the interesting restrpos() stuff happens in indexam.c. We + * should not even get here. */ void btrestrpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + /* with batching, mark/restore is handled in indexam */ + Assert(scan->xs_batches == NULL); + if (so->markItemIndex >= 0) { /* @@ -900,6 +1078,147 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, return status; } +/* + * _bt_parallel_seize() -- Begin the process of advancing the scan to a new + * page. Other scans must wait until we call _bt_parallel_release() + * or _bt_parallel_done(). + * + * The return value is true if we successfully seized the scan and false + * if we did not. 
The latter case occurs when no pages remain, or when + * another primitive index scan is scheduled that caller's backend cannot + * start just yet (only backends that call from _bt_first are capable of + * starting primitive index scans, which they indicate by passing first=true). + * + * If the return value is true, *next_scan_page returns the next page of the + * scan, and *last_curr_page returns the page that *next_scan_page came from. + * An invalid *next_scan_page means the scan hasn't yet started, or that + * caller needs to start the next primitive index scan (if it's the latter + * case we'll set so.needPrimScan). + * + * Callers should ignore the value of *next_scan_page and *last_curr_page if + * the return value is false. + */ +bool +_bt_parallel_seize_batch(IndexScanDesc scan, BTBatchScanPos pos, + BlockNumber *next_scan_page, + BlockNumber *last_curr_page, bool first) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool exit_loop = false, + status = true, + endscan = false; + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + *next_scan_page = InvalidBlockNumber; + *last_curr_page = InvalidBlockNumber; + + /* + * Reset so->currPos, and initialize moreLeft/moreRight such that the next + * call to _bt_readnextpage treats this backend similarly to a serial + * backend that steps from *last_curr_page to *next_scan_page (unless this + * backend's so->currPos is initialized by _bt_readfirstpage before then). + */ + BTScanPosInvalidate(so->currPos); + pos->moreLeft = pos->moreRight = true; + + if (first) + { + /* + * Initialize array related state when called from _bt_first, assuming + * that this will be the first primitive index scan for the scan + */ + so->needPrimScan = false; + so->scanBehind = false; + so->oppositeDirCheck = false; + } + else + { + /* + * Don't attempt to seize the scan when it requires another primitive + * index scan, since caller's backend cannot start it right now + */ + if (so->needPrimScan) + return false; + } + + btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan, + parallel_scan->ps_offset_am); + + while (1) + { + LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE); + + if (btscan->btps_pageStatus == BTPARALLEL_DONE) + { + /* We're done with this parallel index scan */ + status = false; + } + else if (btscan->btps_pageStatus == BTPARALLEL_IDLE && + btscan->btps_nextScanPage == P_NONE) + { + /* End this parallel index scan */ + status = false; + endscan = true; + } + else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) + { + Assert(so->numArrayKeys); + + if (first) + { + /* Can start scheduled primitive scan right away, so do so */ + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + + /* Restore scan's array keys from serialized values */ + _bt_parallel_restore_arrays(rel, btscan, so); + exit_loop = true; + } + else + { + /* + * Don't attempt to seize the scan when it requires another + * primitive index scan, since caller's backend cannot start + * it right now + */ + status = false; + } + + /* + * Either way, update backend local state to indicate that a + * pending primitive scan is required + */ + so->needPrimScan = true; + so->scanBehind = false; + so->oppositeDirCheck = false; + } + else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING) + { + /* + * We have successfully seized control of the scan for the purpose + * of advancing it to a new page! 
+ */ + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + Assert(btscan->btps_nextScanPage != P_NONE); + *next_scan_page = btscan->btps_nextScanPage; + *last_curr_page = btscan->btps_lastCurrPage; + exit_loop = true; + } + LWLockRelease(&btscan->btps_lock); + if (exit_loop || !status) + break; + ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); + } + ConditionVariableCancelSleep(); + + /* When the scan has reached the rightmost (or leftmost) page, end it */ + if (endscan) + _bt_parallel_done(scan); + + return status; +} + /* * _bt_parallel_release() -- Complete the process of advancing the scan to a * new page. We now have the new value btps_nextScanPage; another backend diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 77264ddeecb5..10b28a76c0f6 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -24,8 +24,20 @@ #include "utils/lsyscache.h" #include "utils/rel.h" +/* + * XXX A lot of the new functions are copies of the non-batching version, with + * changes to make it work with batching (which means with position provided + * by the caller, not from the BTScanOpaque). The duplication is not great, + * but it's a bit unclear what to do about it. One option would be to remove + * the amgettuple() interface altogether, once the batching API works, but we + * may also choose to keep both (e.g. for cases that don't support batching, + * like scans of catalogs). In that case we'd need to do some refactoring to + * share as much code as possible. + */ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); + +/* static void _bt_drop_lock_and_maybe_pin_batch(IndexScanDesc scan, BTBatchScanPos sp); */ static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); @@ -34,24 +46,44 @@ static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage); +static IndexScanBatch _bt_readpage_batch(IndexScanDesc scan, BTBatchScanPos pos, + ScanDirection dir, OffsetNumber offnum, + bool firstPage); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_saveitem_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, IndexTuple itup); static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, IndexTuple itup); +static int _bt_setuppostingitems_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset); +static inline void _bt_savepostingitem_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static IndexScanBatch _bt_steppage_batch(IndexScanDesc scan, BTBatchScanPos pos, + ScanDirection dir); static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir); +static IndexScanBatch _bt_readfirstpage_batch(IndexScanDesc scan, BTBatchScanPos pos, + OffsetNumber offnum, + ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection 
dir, bool seized); +static IndexScanBatch _bt_readnextpage_batch(IndexScanDesc scan, BTBatchScanPos pos, + BlockNumber blkno, BlockNumber lastcurrblkno, + ScanDirection dir, bool seized); static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, BlockNumber lastcurrblkno); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); +static IndexScanBatch _bt_endpoint_batch(IndexScanDesc scan, ScanDirection dir); /* @@ -77,6 +109,20 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) } } +/* static void */ +/* _bt_drop_lock_and_maybe_pin_batch(IndexScanDesc scan, BTBatchScanPos sp) */ +/* { */ +/* _bt_unlockbuf(scan->indexRelation, sp->buf); */ +/* */ +/* / if (IsMVCCSnapshot(scan->xs_snapshot) && */ +/* RelationNeedsWAL(scan->indexRelation) && */ +/* !scan->xs_want_itup) */ +/* { */ +/* ReleaseBuffer(sp->buf); */ +/* sp->buf = InvalidBuffer; */ +/* } */ +/* } */ + /* * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. @@ -1570,136 +1616,1344 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) } /* - * _bt_readpage() -- Load data from current index page into so->currPos + * _bt_first_batch() -- Load the first batch in a scan. * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. All other fields of so->currPos are - * initialized from scratch here. + * A batch variant of _bt_first(). Most of the comments for that function + * apply here too. * - * We scan the current page starting at offnum and moving in the indicated - * direction. All items matching the scan keys are loaded into currPos.items. - * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports - * that there can be no more matching tuples in the current scan direction - * (could just be for the current primitive index scan when scan has arrays). + * XXX This only populates the batch, it does not set any other fields like + * scan->xs_heaptid or scan->xs_itup. That happens in getnext_tid() calls. * - * In the case of a parallel scan, caller must have called _bt_parallel_seize - * prior to calling this function; this function will invoke - * _bt_parallel_release before returning. + * XXX I'm not sure it works to mix batched and non-batches calls, e.g. get + * a TID and then a batch of TIDs. It probably should work as long as we + * update itemIndex correctly, but we need to be careful about killed items + * (right now the two places use different ways to communicate which items + * should be killed). * - * Returns true if any matching items found on the page, false if none. + * XXX We probably should not rely on _bt_first/_bt_steppage, because that + * very much relies on currPos, and it's just laziness to rely on that. For + * batching we probably need something else anyway. 
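+ *
+ * Returns the first batch of matching items, or NULL when there is nothing
+ * to return (e.g. because the scan keys can never be satisfied).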
*/ -static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, - bool firstpage) +IndexScanBatch +_bt_first_batch(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - Page page; - BTPageOpaque opaque; - OffsetNumber minoff; - OffsetNumber maxoff; - BTReadPageState pstate; - bool arrayKeys; - int itemIndex, - indnatts; + BTStack stack; + OffsetNumber offnum; + BTScanInsertData inskey; + ScanKey startKeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + int keysz = 0; + StrategyNumber strat_total; + BlockNumber blkno = InvalidBlockNumber, + lastcurrblkno; + BTBatchScanPosData pos; - /* save the page/buffer block number, along with its sibling links */ - page = BufferGetPage(so->currPos.buf); - opaque = BTPageGetOpaque(page); - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); - so->currPos.prevPage = opaque->btpo_prev; - so->currPos.nextPage = opaque->btpo_next; + BTBatchScanPosInvalidate(pos); - Assert(!P_IGNORE(opaque)); - Assert(BTScanPosIsPinned(so->currPos)); - Assert(!so->needPrimScan); + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); - if (scan->parallel_scan) - { - /* allow next/prev page to be read by other worker without delay */ - if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, so->currPos.nextPage, - so->currPos.currPage); - else - _bt_parallel_release(scan, so->currPos.prevPage, - so->currPos.currPage); - } + /* FIXME maybe check there's no active batch yet */ + /* Assert(!BTScanPosIsValid(so->currPos)); */ - /* initialize remaining currPos fields related to current page */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - so->currPos.dir = dir; - so->currPos.nextTupleOffset = 0; - /* either moreLeft or moreRight should be set now (may be unset later) */ - Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : - so->currPos.moreLeft); + /* + * Examine the scan keys and eliminate any redundant keys; also mark the + * keys that must be matched to continue the scan. + */ + _bt_preprocess_keys(scan); - PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + /* + * Quit now if _bt_preprocess_keys() discovered that the scan keys can + * never be satisfied (eg, x == 1 AND x > 2). + */ + if (!so->qual_ok) + { + Assert(!so->needPrimScan); + _bt_parallel_done(scan); + return false; + } - /* initialize local variables */ - indnatts = IndexRelationGetNumberOfAttributes(rel); - arrayKeys = so->numArrayKeys != 0; - minoff = P_FIRSTDATAKEY(opaque); - maxoff = PageGetMaxOffsetNumber(page); + /* + * If this is a parallel scan, we must seize the scan. _bt_readfirstpage + * will likely release the parallel scan later on. 
+ */ + if (scan->parallel_scan != NULL && + !_bt_parallel_seize_batch(scan, &pos, &blkno, &lastcurrblkno, true)) + return false; - /* initialize page-level state that we'll pass to _bt_checkkeys */ - pstate.minoff = minoff; - pstate.maxoff = maxoff; - pstate.finaltup = NULL; - pstate.page = page; - pstate.firstpage = firstpage; - pstate.forcenonrequired = false; - pstate.startikey = 0; - pstate.offnum = InvalidOffsetNumber; - pstate.skip = InvalidOffsetNumber; - pstate.continuescan = true; /* default assumption */ - pstate.rechecks = 0; - pstate.targetdistance = 0; - pstate.nskipadvances = 0; + /* + * Initialize the scan's arrays (if any) for the current scan direction + * (except when they were already set to later values as part of + * scheduling the primitive index scan that is now underway) + */ + if (so->numArrayKeys && !so->needPrimScan) + _bt_start_array_keys(scan, dir); - if (ScanDirectionIsForward(dir)) + if (blkno != InvalidBlockNumber) { - /* SK_SEARCHARRAY forward scans must provide high key up front */ - if (arrayKeys) - { - if (!P_RIGHTMOST(opaque)) - { - ItemId iid = PageGetItemId(page, P_HIKEY); + /* + * We anticipated calling _bt_search, but another worker bet us to it. + * _bt_readnextpage releases the scan for us (not _bt_readfirstpage). + */ + Assert(scan->parallel_scan != NULL); + Assert(!so->needPrimScan); + Assert(blkno != P_NONE); - pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + return _bt_readnextpage_batch(scan, &pos, blkno, lastcurrblkno, dir, true); + } - if (so->scanBehind && - !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) - { - /* Schedule another primitive index scan after all */ - so->currPos.moreRight = false; - so->needPrimScan = true; - if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); - return false; - } - } + /* + * Count an indexscan for stats, now that we know that we'll call + * _bt_search/_bt_endpoint below + */ + pgstat_count_index_scan(rel); + if (scan->instrument) + scan->instrument->nsearches++; - so->scanBehind = so->oppositeDirCheck = false; /* reset */ - } + /*---------- + * Examine the scan keys to discover where we need to start the scan. + * + * We want to identify the keys that can be used as starting boundaries; + * these are =, >, or >= keys for a forward scan or =, <, <= keys for + * a backwards scan. We can use keys for multiple attributes so long as + * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept + * a > or < boundary or find an attribute with no boundary (which can be + * thought of as the same as "> -infinity"), we can't use keys for any + * attributes to its right, because it would break our simplistic notion + * of what initial positioning strategy to use. + * + * When the scan keys include cross-type operators, _bt_preprocess_keys + * may not be able to eliminate redundant keys; in such cases we will + * arbitrarily pick a usable one for each attribute. This is correct + * but possibly not optimal behavior. (For example, with keys like + * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when + * x=5 would be more efficient.) Since the situation only arises given + * a poorly-worded query plus an incomplete opfamily, live with it. + * + * When both equality and inequality keys appear for a single attribute + * (again, only possible when cross-type operators appear), we *must* + * select one of the equality keys for the starting point, because + * _bt_checkkeys() will stop the scan as soon as an equality qual fails. 
+ * For example, if we have keys like "x >= 4 AND x = 10" and we elect to + * start at x=4, we will fail and stop before reaching x=10. If multiple + * equality quals survive preprocessing, however, it doesn't matter which + * one we use --- by definition, they are either redundant or + * contradictory. + * + * In practice we rarely see any "attribute boundary key gaps" here. + * Preprocessing can usually backfill skip array keys for any attributes + * that were omitted from the original scan->keyData[] input keys. All + * array keys are always considered = keys, but we'll sometimes need to + * treat the current key value as if we were using an inequality strategy. + * This happens with range skip arrays, which store inequality keys in the + * array's low_compare/high_compare fields (used to find the first/last + * set of matches, when = key will lack a usable sk_argument value). + * These are always preferred over any redundant "standard" inequality + * keys on the same column (per the usual rule about preferring = keys). + * Note also that any column with an = skip array key can never have an + * additional, contradictory = key. + * + * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP + * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. + * If the index stores nulls at the end of the index we'll be starting + * from, and we have no boundary key for the column (which means the key + * we deduced NOT NULL from is an inequality key that constrains the other + * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to + * use as a boundary key. If we didn't do this, we might find ourselves + * traversing a lot of null entries at the start of the scan. + * + * In this loop, row-comparison keys are treated the same as keys on their + * first (leftmost) columns. We'll add on lower-order columns of the row + * comparison below, if possible. + * + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. + * + * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start + * the next primitive index scan (for scans with array keys) based in part + * on an understanding of how it'll enable us to reposition the scan. + * They're directly aware of how we'll sometimes cons up an explicit + * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a + * symmetric "deduce NOT NULL" rule of their own. This allows top-level + * scans to skip large groups of NULLs through repeated deductions about + * key strictness (for a required inequality key) and whether NULLs in the + * key's index column are stored last or first (relative to non-NULLs). + * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might + * need to be kept in sync. + *---------- + */ + strat_total = BTEqualStrategyNumber; + if (so->numberOfKeys > 0) + { + AttrNumber curattr; + ScanKey chosen; + ScanKey impliesNN; + ScanKey cur; /* - * Consider pstate.startikey optimization once the ongoing primitive - * index scan has already read at least one page + * chosen is the so-far-chosen key for the current attribute, if any. + * We don't cast the decision in stone until we reach keys for the + * next attribute. 
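The selection rule spelled out above is easier to follow on a toy example. The sketch below is not PostgreSQL code: it assumes a forward scan, at most one key per attribute, and it ignores arrays, row comparisons and NULL handling, but it captures the "accept = and >= keys attribute by attribute, stop at a strict inequality or at a gap" behaviour that the loop below implements in full.

typedef enum {TOY_LT, TOY_LE, TOY_EQ, TOY_GE, TOY_GT} ToyStrategy;

typedef struct ToyKey
{
	int			attno;			/* 1-based index attribute number */
	ToyStrategy strat;
} ToyKey;

/* how many leading keys become starting boundaries for a forward scan? */
static int
toy_choose_start_keys(const ToyKey *keys, int nkeys, const ToyKey **start)
{
	int			nstart = 0;

	for (int i = 0; i < nkeys; i++)
	{
		if (keys[i].attno != nstart + 1)
			break;				/* gap: no usable key for this attribute */
		if (keys[i].strat == TOY_LT || keys[i].strat == TOY_LE)
			break;				/* can't position a forward scan with these */
		start[nstart++] = &keys[i];
		if (keys[i].strat == TOY_GT)
			break;				/* strict bound: stop accepting further keys */
	}
	return nstart;
}

For "a = 1 AND b >= 2 AND c > 3 AND d = 4" this accepts the keys on a, b and c; the key on d is not usable, because the key on c is a strict inequality.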
*/ - if (!pstate.firstpage && minoff < maxoff) - _bt_set_startikey(scan, &pstate); - - /* load items[] in ascending order */ - itemIndex = 0; - - offnum = Max(offnum, minoff); + cur = so->keyData; + curattr = 1; + chosen = NULL; + /* Also remember any scankey that implies a NOT NULL constraint */ + impliesNN = NULL; - while (offnum <= maxoff) + /* + * Loop iterates from 0 to numberOfKeys inclusive; we use the last + * pass to handle after-last-key processing. Actual exit from the + * loop is at one of the "break" statements below. + */ + for (int i = 0;; cur++, i++) { - ItemId iid = PageGetItemId(page, offnum); - IndexTuple itup; + if (i >= so->numberOfKeys || cur->sk_attno != curattr) + { + /* + * Done looking at keys for curattr. + * + * If this is a scan key for a skip array whose current + * element is MINVAL, choose low_compare (when scanning + * backwards it'll be MAXVAL, and we'll choose high_compare). + * + * Note: if the array's low_compare key makes 'chosen' NULL, + * then we behave as if the array's first element is -inf, + * except when !array->null_elem implies a usable NOT NULL + * constraint. + */ + if (chosen != NULL && + (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + { + int ikey = chosen - so->keyData; + ScanKey skipequalitykey = chosen; + BTArrayKeyInfo *array = NULL; + + for (int arridx = 0; arridx < so->numArrayKeys; arridx++) + { + array = &so->arrayKeys[arridx]; + if (array->scan_key == ikey) + break; + } + + if (ScanDirectionIsForward(dir)) + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); + chosen = array->low_compare; + } + else + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); + chosen = array->high_compare; + } + + Assert(chosen == NULL || + chosen->sk_attno == skipequalitykey->sk_attno); + + if (!array->null_elem) + impliesNN = skipequalitykey; + else + Assert(chosen == NULL && impliesNN == NULL); + } + + /* + * If we didn't find a usable boundary key, see if we can + * deduce a NOT NULL key + */ + if (chosen == NULL && impliesNN != NULL && + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + ScanDirectionIsForward(dir) : + ScanDirectionIsBackward(dir))) + { + /* Yes, so build the key in notnullkeys[keysz] */ + chosen = ¬nullkeys[keysz]; + ScanKeyEntryInitialize(chosen, + (SK_SEARCHNOTNULL | SK_ISNULL | + (impliesNN->sk_flags & + (SK_BT_DESC | SK_BT_NULLS_FIRST))), + curattr, + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + BTGreaterStrategyNumber : + BTLessStrategyNumber), + InvalidOid, + InvalidOid, + InvalidOid, + (Datum) 0); + } + + /* + * If we still didn't find a usable boundary key, quit; else + * save the boundary key pointer in startKeys. + */ + if (chosen == NULL) + break; + startKeys[keysz++] = chosen; + + /* + * We can only consider adding more boundary keys when the one + * that we just chose to add uses either the = or >= strategy + * (during backwards scans we can only do so when the key that + * we just added to startKeys[] uses the = or <= strategy) + */ + strat_total = chosen->sk_strategy; + if (strat_total == BTGreaterStrategyNumber || + strat_total == BTLessStrategyNumber) + break; + + /* + * If the key that we just added to startKeys[] is a skip + * array = key whose current element is marked NEXT or PRIOR, + * make strat_total > or < (and stop adding boundary keys). + * This can only happen with opclasses that lack skip support. 
+ */ + if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + { + Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(strat_total == BTEqualStrategyNumber); + + if (ScanDirectionIsForward(dir)) + { + Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + strat_total = BTGreaterStrategyNumber; + } + else + { + Assert(!(chosen->sk_flags & SK_BT_NEXT)); + strat_total = BTLessStrategyNumber; + } + + /* + * We're done. We'll never find an exact = match for a + * NEXT or PRIOR sentinel sk_argument value. There's no + * sense in trying to add more keys to startKeys[]. + */ + break; + } + + /* + * Done if that was the last scan key output by preprocessing. + * Also done if there is a gap index attribute that lacks a + * usable key (only possible when preprocessing was unable to + * generate a skip array key to "fill in the gap"). + */ + if (i >= so->numberOfKeys || + cur->sk_attno != curattr + 1) + break; + + /* + * Reset for next attr. + */ + curattr = cur->sk_attno; + chosen = NULL; + impliesNN = NULL; + } + + /* + * Can we use this key as a starting boundary for this attr? + * + * If not, does it imply a NOT NULL constraint? (Because + * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, + * *any* inequality key works for that; we need not test.) + */ + switch (cur->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsBackward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + case BTEqualStrategyNumber: + /* override any non-equality choice */ + chosen = cur; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsForward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + } + } + } + + /* + * If we found no usable boundary keys, we have to start from one end of + * the tree. Walk down that edge to the first or last key, and scan from + * there. + * + * Note: calls _bt_readfirstpage for us, which releases the parallel scan. + */ + if (keysz == 0) + return _bt_endpoint_batch(scan, dir); + + /* + * We want to start the scan somewhere within the index. Set up an + * insertion scankey we can use to search for the boundary point we + * identified above. The insertion scankey is built using the keys + * identified by startKeys[]. (Remaining insertion scankey fields are + * initialized after initial-positioning scan keys are finalized.) + */ + Assert(keysz <= INDEX_MAX_KEYS); + for (int i = 0; i < keysz; i++) + { + ScanKey cur = startKeys[i]; + + Assert(cur->sk_attno == i + 1); + + if (cur->sk_flags & SK_ROW_HEADER) + { + /* + * Row comparison header: look to the first row member instead + */ + ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + + /* + * Cannot be a NULL in the first row member: _bt_preprocess_keys + * would've marked the qual as unsatisfiable, preventing us from + * ever getting this far + */ + Assert(subkey->sk_flags & SK_ROW_MEMBER); + Assert(subkey->sk_attno == cur->sk_attno); + Assert(!(subkey->sk_flags & SK_ISNULL)); + + /* + * The member scankeys are already in insertion format (ie, they + * have sk_func = 3-way-comparison function) + */ + memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); + + /* + * If the row comparison is the last positioning key we accepted, + * try to add additional keys from the lower-order row members. + * (If we accepted independent conditions on additional index + * columns, we use those instead --- doesn't seem worth trying to + * determine which is more restrictive.) 
Note that this is OK + * even if the row comparison is of ">" or "<" type, because the + * condition applied to all but the last row member is effectively + * ">=" or "<=", and so the extra keys don't break the positioning + * scheme. But, by the same token, if we aren't able to use all + * the row members, then the part of the row comparison that we + * did use has to be treated as just a ">=" or "<=" condition, and + * so we'd better adjust strat_total accordingly. + */ + if (i == keysz - 1) + { + bool used_all_subkeys = false; + + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) + { + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != keysz + 1) + break; /* out-of-sequence, can't use it */ + if (subkey->sk_strategy != cur->sk_strategy) + break; /* wrong direction, can't use it */ + if (subkey->sk_flags & SK_ISNULL) + break; /* can't use null keys */ + Assert(keysz < INDEX_MAX_KEYS); + memcpy(inskey.scankeys + keysz, subkey, + sizeof(ScanKeyData)); + keysz++; + if (subkey->sk_flags & SK_ROW_END) + { + used_all_subkeys = true; + break; + } + } + if (!used_all_subkeys) + { + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } + } + break; /* done with outer loop */ + } + } + else + { + /* + * Ordinary comparison key. Transform the search-style scan key + * to an insertion scan key by replacing the sk_func with the + * appropriate btree comparison function. + * + * If scankey operator is not a cross-type comparison, we can use + * the cached comparison function; otherwise gotta look it up in + * the catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * We support the convention that sk_subtype == InvalidOid means + * the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). + */ + if (cur->sk_subtype == rel->rd_opcintype[i] || + cur->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + procinfo, + cur->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + cur->sk_subtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, + cur->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + cur->sk_flags, + cur->sk_attno, + InvalidStrategy, + cur->sk_subtype, + cur->sk_collation, + cmp_proc, + cur->sk_argument); + } + } + } + + /*---------- + * Examine the selected initial-positioning strategy to determine exactly + * where we need to start the scan, and set flag variables to control the + * initial descent by _bt_search (and our _bt_binsrch call for the leaf + * page _bt_search returns). 
+ *---------- + */ + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); + inskey.anynullkeys = false; /* unused */ + inskey.scantid = NULL; + inskey.keysz = keysz; + switch (strat_total) + { + case BTLessStrategyNumber: + + inskey.nextkey = false; + inskey.backward = true; + break; + + case BTLessEqualStrategyNumber: + + inskey.nextkey = true; + inskey.backward = true; + break; + + case BTEqualStrategyNumber: + + /* + * If a backward scan was specified, need to start with last equal + * item not first one. + */ + if (ScanDirectionIsBackward(dir)) + { + /* + * This is the same as the <= strategy + */ + inskey.nextkey = true; + inskey.backward = true; + } + else + { + /* + * This is the same as the >= strategy + */ + inskey.nextkey = false; + inskey.backward = false; + } + break; + + case BTGreaterEqualStrategyNumber: + + /* + * Find first item >= scankey + */ + inskey.nextkey = false; + inskey.backward = false; + break; + + case BTGreaterStrategyNumber: + + /* + * Find first item > scankey + */ + inskey.nextkey = true; + inskey.backward = false; + break; + + default: + /* can't get here, but keep compiler quiet */ + elog(ERROR, "unrecognized strat_total: %d", (int) strat_total); + return false; + } + + /* + * Use the manufactured insertion scan key to descend the tree and + * position ourselves on the target leaf page. + */ + Assert(ScanDirectionIsBackward(dir) == inskey.backward); + stack = _bt_search(rel, NULL, &inskey, &pos.buf, BT_READ); + + /* don't need to keep the stack around... */ + _bt_freestack(stack); + + if (!BufferIsValid(pos.buf)) + { + /* + * We only get here if the index is completely empty. Lock relation + * because nothing finer to lock exists. Without a buffer lock, it's + * possible for another transaction to insert data between + * _bt_search() and PredicateLockRelation(). We have to try again + * after taking the relation-level predicate lock, to close a narrow + * window where we wouldn't scan concurrently inserted tuples, but the + * writer wouldn't see our predicate lock. + */ + if (IsolationIsSerializable()) + { + PredicateLockRelation(rel, scan->xs_snapshot); + stack = _bt_search(rel, NULL, &inskey, &pos.buf, BT_READ); + _bt_freestack(stack); + } + + if (!BufferIsValid(pos.buf)) + { + Assert(!so->needPrimScan); + _bt_parallel_done(scan); + return false; + } + } + + /* position to the precise item on the page */ + offnum = _bt_binsrch(rel, &inskey, pos.buf); + + /* + * Now load data from the first page of the scan (usually the page + * currently in so->currPos.buf). + * + * If inskey.nextkey = false and inskey.backward = false, offnum is + * positioned at the first non-pivot tuple >= inskey.scankeys. + * + * If inskey.nextkey = false and inskey.backward = true, offnum is + * positioned at the last non-pivot tuple < inskey.scankeys. + * + * If inskey.nextkey = true and inskey.backward = false, offnum is + * positioned at the first non-pivot tuple > inskey.scankeys. + * + * If inskey.nextkey = true and inskey.backward = true, offnum is + * positioned at the last non-pivot tuple <= inskey.scankeys. + * + * It's possible that _bt_binsrch returned an offnum that is out of bounds + * for the page. For example, when inskey is both < the leaf page's high + * key and > all of its non-pivot tuples, offnum will be "maxoff + 1". + */ + return _bt_readfirstpage_batch(scan, &pos, offnum, dir); +} + +/* + * _bt_next_batch() -- Get the next batch of items in a scan. + * + * A batch variant of _bt_next(). Most of the comments for that function + * apply here too. 
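How the two batch entry points fit together is easiest to see from the caller's side. The helper below is only a sketch, not the real btgetbatch()/indexam.c code (which keeps the previous batch and its position in scan->xs_batches); it merely assumes the caller still has the previous batch around:

static IndexScanBatch
get_next_leaf_batch(IndexScanDesc scan, IndexScanBatch prev, ScanDirection dir)
{
	/* no batch yet: descend the tree and read the first leaf page */
	if (prev == NULL)
		return _bt_first_batch(scan, dir);

	/* otherwise continue from the position remembered in the last batch */
	return _bt_next_batch(scan, (BTBatchScanPos) prev->opaque, dir);
}

A NULL result indicates there is nothing more to return in this direction (or, for scans with array keys, that another primitive index scan needs to be started).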
+ * + * We should only get here only when the current batch has no more items + * in the given direction. We don't get here with empty batches, that's + * handled by _bt_fist_batch(). + * + * XXX See also the comments at _bt_first_batch() about returning a single + * batch for the page, etc. + */ +IndexScanBatch +_bt_next_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir) +{ + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + // BTBatchScanPos pos; + BTBatchScanPosData tmp; + // IndexScanBatch batch; + // int idx; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* + * restore the BTScanOpaque from the current batch + * + * XXX This is pretty ugly/expensive. Ideally we'd have all the fields + * needed to determine "location" in the index (essentially BTScanPosData) + * in the batch, without cloning all the other stuff. + */ + // Assert(scan->xs_batches->currentBatch != NULL); + + /* + * Use the last batch as the "current" batch. We use the streamPos if + * initialized, or the readPos as a fallback. Alternatively, we could + * simply use the last batch in the queue, i.e. (nextBatch - 1). + * + * Even better, we could pass the "correct" batch from indexam.c, and + * let that figure out which position to move from. + */ +/* + idx = scan->xs_batches->streamPos.batch; + if (idx == -1) + idx = scan->xs_batches->readPos.batch; + + batch = INDEX_SCAN_BATCH(scan, idx); + Assert(batch != NULL); + pos = (BTBatchScanPos) batch->opaque; +*/ + + Assert(BTBatchScanPosIsPinned(*pos)); + + memcpy(&tmp, pos, sizeof(tmp)); + + /* + * Advance to next page, load the data into the index batch. + * + * FIXME It may not be quite correct to just pass the position from + * current batch, some of the functions scribble over it (e.g. + * _bt_readpage_batch). Maybe we should create a copy, or something? + * + * XXX For now we pass a local copy "tmp". + */ + return _bt_steppage_batch(scan, &tmp, dir); +} + +/* + * _bt_kill_batch() -- remember the items-to-be-killed from the current batch + * + * We simply translate the bitmap into the "regular" killedItems array, and let + * that to drive which items are killed. + */ +void +_bt_kill_batch(IndexScanDesc scan, IndexScanBatch batch) +{ + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* we should only get here for scans with batching */ + Assert(scan->xs_batches); + + /* bail out if the batch has no killed items */ + if (batch->numKilled == 0) + return; + + /* + * XXX Now what? we don't have the currPos around anymore, so we should + * load that, and apply the killed items to that, somehow? + */ + /* FIXME: _bt_kill_batch not implemented */ + + /* + * XXX maybe we should have a separate callback for this, and call it from + * the indexam.c directly whenever we think it's appropriate? And not only + * from here when freeing the batch? + */ + _bt_killitems_batch(scan, batch); +} + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here. 
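Condensed, the caller side of that contract looks roughly like this (simplified sketch; real callers also handle needPrimScan, parallel scans and stepping to a sibling page when nothing matches):

static bool
read_one_page(IndexScanDesc scan, ScanDirection dir,
			  BlockNumber blkno, OffsetNumber offnum)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* pin and read-lock the page before handing it to _bt_readpage() */
	so->currPos.buf = _bt_getbuf(scan->indexRelation, blkno, BT_READ);

	/* seed the "more pages" flags for the chosen scan direction */
	so->currPos.moreLeft = !ScanDirectionIsForward(dir);
	so->currPos.moreRight = ScanDirectionIsForward(dir);

	/* _bt_readpage() initializes the rest of so->currPos from scratch */
	return _bt_readpage(scan, dir, offnum, true);
}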
+ * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. + * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction + * (could just be for the current primitive index scan when scan has arrays). + * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * + * Returns true if any matching items found on the page, false if none. + */ +static bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, + bool firstpage) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + BTReadPageState pstate; + bool arrayKeys; + int itemIndex, + indnatts; + + /* save the page/buffer block number, along with its sibling links */ + page = BufferGetPage(so->currPos.buf); + opaque = BTPageGetOpaque(page); + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + so->currPos.prevPage = opaque->btpo_prev; + so->currPos.nextPage = opaque->btpo_next; + + Assert(!P_IGNORE(opaque)); + Assert(BTScanPosIsPinned(so->currPos)); + Assert(!so->needPrimScan); + + if (scan->parallel_scan) + { + /* allow next/prev page to be read by other worker without delay */ + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, so->currPos.nextPage, + so->currPos.currPage); + else + _bt_parallel_release(scan, so->currPos.prevPage, + so->currPos.currPage); + } + + /* initialize remaining currPos fields related to current page */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + so->currPos.dir = dir; + so->currPos.nextTupleOffset = 0; + /* either moreLeft or moreRight should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? 
so->currPos.moreRight : + so->currPos.moreLeft); + + PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); + arrayKeys = so->numArrayKeys != 0; + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* initialize page-level state that we'll pass to _bt_checkkeys */ + pstate.minoff = minoff; + pstate.maxoff = maxoff; + pstate.finaltup = NULL; + pstate.page = page; + pstate.firstpage = firstpage; + pstate.forcenonrequired = false; + pstate.startikey = 0; + pstate.offnum = InvalidOffsetNumber; + pstate.skip = InvalidOffsetNumber; + pstate.continuescan = true; /* default assumption */ + pstate.rechecks = 0; + pstate.targetdistance = 0; + pstate.nskipadvances = 0; + + if (ScanDirectionIsForward(dir)) + { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (arrayKeys) + { + if (!P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (so->scanBehind && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + so->currPos.moreRight = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + so->currPos.currPage); + return false; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + + itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); + + pstate.offnum = offnum; + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); + + /* + * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) + */ + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum < pstate.skip); + Assert(!pstate.forcenonrequired); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + + if (passes_quals) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!pstate.continuescan) + break; + + offnum = OffsetNumberNext(offnum); + } + + /* + * We don't need to visit page to the right when the high key + * indicates that no more matches 
will be found there. + * + * Checking the high key like this works out more often than you might + * think. Leaf page splits pick a split point between the two most + * dissimilar tuples (this is weighed against the need to evenly share + * free space). Leaf pages with high key attribute values that can + * only appear on non-pivot tuples on the right sibling page are + * common. + */ + if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + IndexTuple itup = (IndexTuple) PageGetItem(page, iid); + int truncatt; + + truncatt = BTreeTupleGetNAtts(itup, rel); + pstate.forcenonrequired = false; + pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */ + _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); + } + + if (!pstate.continuescan) + so->currPos.moreRight = false; + + Assert(itemIndex <= MaxTIDsPerBTreePage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + /* SK_SEARCHARRAY backward scans must provide final tuple up front */ + if (arrayKeys) + { + if (minoff <= maxoff && !P_LEFTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, minoff); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (so->scanBehind && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + so->currPos.moreLeft = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + so->currPos.currPage); + return false; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in descending order */ + itemIndex = MaxTIDsPerBTreePage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool tuple_alive; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual. Most of the + * time, it's a win to not bother examining the tuple's index + * keys, but just skip to the next tuple (previous, actually, + * since we're scanning backwards). However, if this is the first + * tuple on the page, we do check the index keys, to prevent + * uselessly advancing to the page to the left. This is similar + * to the high key optimization used by forward scans. + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + if (offnum > minoff) + { + offnum = OffsetNumberPrev(offnum); + continue; + } + + tuple_alive = false; + } + else + tuple_alive = true; + + itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); + + pstate.offnum = offnum; + if (arrayKeys && offnum == minoff && pstate.forcenonrequired) + { + pstate.forcenonrequired = false; + pstate.startikey = 0; + } + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); + + if (arrayKeys && so->scanBehind) + { + /* + * Done scanning this page, but not done with the current + * primscan. + * + * Note: Forward scans don't check this explicitly, since they + * prefer to reuse pstate.skip for this instead. 
+ */ + Assert(!passes_quals && pstate.continuescan); + Assert(!pstate.forcenonrequired); + + break; + } + + /* + * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) + */ + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum > pstate.skip); + Assert(!pstate.forcenonrequired); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + + if (passes_quals && tuple_alive) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. + */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!pstate.continuescan) + break; + + offnum = OffsetNumberPrev(offnum); + } + + /* + * We don't need to visit page to the left when no more matches will + * be found there + */ + if (!pstate.continuescan) + so->currPos.moreLeft = false; + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + } + + /* + * If _bt_set_startikey told us to temporarily treat the scan's keys as + * nonrequired (possible only during scans with array keys), there must be + * no lasting consequences for the scan's array keys. The scan's arrays + * should now have exactly the same elements as they would have had if the + * nonrequired behavior had never been used. (In general, a scan's arrays + * are expected to track its progress through the index's key space.) + * + * We are required (by _bt_set_startikey) to call _bt_checkkeys against + * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's + * arrays to recover. Assert that that step hasn't been missed. + */ + Assert(!pstate.forcenonrequired); + + return (so->currPos.firstItem <= so->currPos.lastItem); +} + +static IndexScanBatch +_bt_readpage_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir, OffsetNumber offnum, + bool firstpage) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + BTReadPageState pstate; + bool arrayKeys; + int itemIndex, + indnatts; + + /* result */ + /* IndexScanBatch batch = ddd; */ + IndexScanBatch batch; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* + * FIXME fake for _bt_checkkeys, needs to be set properly elsewhere (not + * sure where) + */ + + /* + * XXX we shouldn't be passing this info through currPos but directly, I + * guess. 
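One possible shape for that, purely as an illustration (BTReadPageState has no such field today, and _bt_checkkeys() would also have to read the direction from pstate rather than from so->currPos):

	pstate.dir = dir;			/* hypothetical new BTReadPageState field */
	passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, itup, indnatts);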
+ */ + so->currPos.dir = dir; + + /* + * XXX We can pass the exact number if items from this page, by using + * maxoff + */ + batch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup); + + /* FIXME but we don't copy the contents until the end */ + batch->opaque = palloc0(sizeof(BTBatchScanPosData)); + + /* bogus values */ + batch->firstItem = -1; + batch->lastItem = -1; + batch->itemIndex = -1; + + /* if (so->currTuples) */ + /* { */ + /* batch->currTuples = (char *) palloc(BLCKSZ); */ + /* memcpy(batch->currTuples, so->currTuples, BLCKSZ); */ + /* } */ + + /* save the page/buffer block number, along with its sibling links */ + page = BufferGetPage(pos->buf); + opaque = BTPageGetOpaque(page); + pos->currPage = BufferGetBlockNumber(pos->buf); + pos->prevPage = opaque->btpo_prev; + pos->nextPage = opaque->btpo_next; + + Assert(!P_IGNORE(opaque)); + Assert(BTBatchScanPosIsPinned(*pos)); + Assert(!so->needPrimScan); + + if (scan->parallel_scan) + { + /* allow next/prev page to be read by other worker without delay */ + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, pos->nextPage, + pos->currPage); + else + _bt_parallel_release(scan, pos->prevPage, + pos->currPage); + } + + /* initialize remaining currPos fields related to current page */ + pos->lsn = BufferGetLSNAtomic(pos->buf); + pos->dir = dir; + pos->nextTupleOffset = 0; + /* either moreLeft or moreRight should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? pos->moreRight : pos->moreLeft); + + PredicateLockPage(rel, pos->currPage, scan->xs_snapshot); + + /* initialize local variables */ + indnatts = IndexRelationGetNumberOfAttributes(rel); + arrayKeys = so->numArrayKeys != 0; + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* initialize page-level state that we'll pass to _bt_checkkeys */ + pstate.minoff = minoff; + pstate.maxoff = maxoff; + pstate.finaltup = NULL; + pstate.page = page; + pstate.firstpage = firstpage; + pstate.forcenonrequired = false; + pstate.startikey = 0; + pstate.offnum = InvalidOffsetNumber; + pstate.skip = InvalidOffsetNumber; + pstate.continuescan = true; /* default assumption */ + pstate.rechecks = 0; + pstate.targetdistance = 0; + pstate.nskipadvances = 0; + + if (ScanDirectionIsForward(dir)) + { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (arrayKeys) + { + if (!P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + if (so->scanBehind && + !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) + { + /* Schedule another primitive index scan after all */ + pos->moreRight = false; + so->needPrimScan = true; + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, + pos->currPage); + return NULL; + } + } + + so->scanBehind = so->oppositeDirCheck = false; /* reset */ + } + + /* + * Consider pstate.startikey optimization once the ongoing primitive + * index scan has already read at least one page + */ + if (!pstate.firstpage && minoff < maxoff) + _bt_set_startikey(scan, &pstate); + + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; bool passes_quals; /* @@ -1740,7 +2994,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (!BTreeTupleIsPosting(itup)) { /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem_batch(batch, itemIndex, offnum, 
itup); itemIndex++; } else @@ -1752,16 +3006,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * TID */ tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); + _bt_setuppostingitems_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); itemIndex++; /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); + _bt_savepostingitem_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); itemIndex++; } } @@ -1792,17 +3046,17 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, truncatt = BTreeTupleGetNAtts(itup, rel); pstate.forcenonrequired = false; - pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */ + pstate.startikey = 0; /* _bt_set_startikey ignores HIKEY */ _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } if (!pstate.continuescan) - so->currPos.moreRight = false; + pos->moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + batch->firstItem = 0; + batch->lastItem = itemIndex - 1; + batch->itemIndex = 0; } else { @@ -1819,12 +3073,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup)) { /* Schedule another primitive index scan after all */ - so->currPos.moreLeft = false; + pos->moreLeft = false; so->needPrimScan = true; if (scan->parallel_scan) _bt_parallel_primscan_schedule(scan, - so->currPos.currPage); - return false; + pos->currPage); + return NULL; } } @@ -1922,7 +3176,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { /* Remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem_batch(batch, itemIndex, offnum, itup); } else { @@ -1940,16 +3194,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, */ itemIndex--; tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); + _bt_setuppostingitems_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { itemIndex--; - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); + _bt_savepostingitem_batch(batch, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); } } } @@ -1965,12 +3219,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * be found there */ if (!pstate.continuescan) - so->currPos.moreLeft = false; + pos->moreLeft = false; Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxTIDsPerBTreePage - 1; - so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + batch->firstItem = itemIndex; + batch->lastItem = MaxTIDsPerBTreePage - 1; + batch->itemIndex = MaxTIDsPerBTreePage - 1; } /* @@ -1987,7 +3241,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, */ Assert(!pstate.forcenonrequired); - return (so->currPos.firstItem <= so->currPos.lastItem); + if (batch->firstItem > batch->lastItem) + return NULL; + + memcpy(batch->opaque, pos, sizeof(BTBatchScanPosData)); + + return batch; } /* Save an index item into so->currPos.items[itemIndex] */ @@ -2005,9 +3264,97 @@ _bt_saveitem(BTScanOpaque so, int 
itemIndex, { Size itupsz = IndexTupleSize(itup); - currItem->tupleOffset = so->currPos.nextTupleOffset; - memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); - so->currPos.nextTupleOffset += MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += MAXALIGN(itupsz); + } +} + +/* + * Setup state to save TIDs/items from a single posting list tuple. + * + * Saves an index item into so->currPos.items[itemIndex] for TID that is + * returned to scan first. Second or subsequent TIDs for posting list should + * be saved by calling _bt_savepostingitem(). + * + * Returns an offset into tuple storage space that main tuple is stored at if + * needed. + */ +static int +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(BTreeTupleIsPosting(itup)); + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + + return currItem->tupleOffset; + } + + return 0; +} + +/* + * Save an index item into so->currPos.items[itemIndex] for current posting + * tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. Caller passes its return value as tupleOffset. + */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every TID + * that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = tupleOffset; +} + +/* Save an index item into so->currPos.items[itemIndex] */ +static void +_bt_saveitem_batch(IndexScanBatch batch, int itemIndex, + OffsetNumber offnum, IndexTuple itup) +{ + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + + Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + + /* copy the populated part of the items array */ + batch->items[itemIndex].heapTid = itup->t_tid; + batch->items[itemIndex].indexOffset = offnum; + + if (batch->currTuples) + { + Size itupsz = IndexTupleSize(itup); + + batch->items[itemIndex].tupleOffset = pos->nextTupleOffset; + memcpy(batch->currTuples + pos->nextTupleOffset, itup, itupsz); + pos->nextTupleOffset += MAXALIGN(itupsz); } } @@ -2022,31 +3369,34 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, * needed. 
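The batch variants converted in this hunk keep the posting-list convention of the originals: the first TID of a posting tuple stores the truncated base tuple once and remembers its offset, and every later TID only records that same offset. A condensed sketch of the forward-scan pattern used by _bt_readpage_batch():

static void
save_posting_tuple(IndexScanBatch batch, int *itemIndex,
				   OffsetNumber offnum, IndexTuple itup)
{
	int			tupleOffset;

	/* first TID: copy the base tuple into batch->currTuples (if wanted) */
	tupleOffset = _bt_setuppostingitems_batch(batch, *itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
	(*itemIndex)++;

	/* remaining TIDs share the already-stored base tuple */
	for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
	{
		_bt_savepostingitem_batch(batch, *itemIndex, offnum,
								  BTreeTupleGetPostingN(itup, i),
								  tupleOffset);
		(*itemIndex)++;
	}
}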
*/ static int -_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, IndexTuple itup) +_bt_setuppostingitems_batch(IndexScanBatch batch, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + IndexScanBatchPosItem *item = &batch->items[itemIndex]; Assert(BTreeTupleIsPosting(itup)); - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; - if (so->currTuples) + /* copy the populated part of the items array */ + item->heapTid = *heapTid; + item->indexOffset = offnum; + + if (batch->currTuples) { /* Save base IndexTuple (truncate posting list) */ IndexTuple base; Size itupsz = BTreeTupleGetPostingOffset(itup); itupsz = MAXALIGN(itupsz); - currItem->tupleOffset = so->currPos.nextTupleOffset; - base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + item->tupleOffset = pos->nextTupleOffset; + base = (IndexTuple) (batch->currTuples + pos->nextTupleOffset); memcpy(base, itup, itupsz); /* Defensively reduce work area index tuple header size */ base->t_info &= ~INDEX_SIZE_MASK; base->t_info |= itupsz; - so->currPos.nextTupleOffset += itupsz; + pos->nextTupleOffset += itupsz; - return currItem->tupleOffset; + return item->tupleOffset; } return 0; @@ -2060,20 +3410,20 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, * posting list tuple. Caller passes its return value as tupleOffset. */ static inline void -_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset) +_bt_savepostingitem_batch(IndexScanBatch batch, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + IndexScanBatchPosItem *item = &batch->items[itemIndex]; - currItem->heapTid = *heapTid; - currItem->indexOffset = offnum; + item->heapTid = *heapTid; + item->indexOffset = offnum; /* * Have index-only scans return the same base IndexTuple for every TID * that originates from the same posting list */ - if (so->currTuples) - currItem->tupleOffset = tupleOffset; + if (batch->currTuples) + item->tupleOffset = tupleOffset; } /* @@ -2186,6 +3536,71 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false); } +/* + * a batching version of _bt_steppage(), ignoring irrelevant bits + */ +static IndexScanBatch +_bt_steppage_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BlockNumber blkno, + lastcurrblkno; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + /* Batching has a different concept of position, stored in the batch. */ + Assert(BTBatchScanPosIsValid(*pos)); + + /* + * killitems + * + * No need to handle killtuples here, that's going to be dealt with at the + * indexam.c level when freeing the batch, or possibly in when calling + * amfreebatch. + */ + + /* + * mark/restore + * + * Mark/restore shall also be handled at the indexam.c level, by keeping + * the correct batch around, etc. We don't discard the old batch here. + * + * In _bt_steppage this also handled primitive scans for array keys, but + * that probably would be handled at indexam.c level too. + */ + + /* Don't unpin the buffer here, keep the batch pinned until amfreebatch. 
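The cleanup counterpart is what eventually drops that pin. The real btfreebatch() is only declared in this patch, so the following is just a sketch of the expected shape, assuming it reports killed items first and unpins afterwards:

static void
btfreebatch_sketch(IndexScanDesc scan, IndexScanBatch batch)
{
	BTBatchScanPos pos = (BTBatchScanPos) batch->opaque;

	/* tell the index about dead tuples while we still know the page */
	if (batch->numKilled > 0)
		_bt_kill_batch(scan, batch);

	/* now the pin taken by _bt_readpage_batch() can finally go away */
	BTBatchScanPosUnpinIfPinned(*pos);

	/* freeing items/tuples/opaque memory is left to the real implementation */
}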
*/ + + /* Walk to the next page with data */ + if (ScanDirectionIsForward(dir)) + blkno = pos->nextPage; + else + blkno = pos->prevPage; + + lastcurrblkno = pos->currPage; + + /* + * Cancel primitive index scans that were scheduled when the call to + * _bt_readpage for currPos happened to use the opposite direction to the + * one that we're stepping in now. (It's okay to leave the scan's array + * keys as-is, since the next _bt_readpage will advance them.) + * + * XXX Not sure this is correct. Can we combine the direction from some + * older batch (with mark/restore?) and the current needPrimScan from the + * latest batch we processed? But, the mark/restore code in indexam should + * reset this somehow. + * + * XXX However, aren't primitive scans very btree-specific code? How could + * indexam.c ever handle that? + */ + if (pos->dir != dir) + so->needPrimScan = false; + + return _bt_readnextpage_batch(scan, pos, blkno, lastcurrblkno, dir, false); +} + /* * _bt_readfirstpage() -- Read first page containing valid data for _bt_first * @@ -2265,6 +3680,77 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) return true; } +static IndexScanBatch +_bt_readfirstpage_batch(IndexScanDesc scan, BTBatchScanPos pos, OffsetNumber offnum, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + IndexScanBatch batch; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + so->numKilled = 0; /* just paranoia */ + so->markItemIndex = -1; /* ditto */ + + /* copy position info from BTScanOpaque */ + + /* Initialize so->currPos for the first page (page in so->currPos.buf) */ + if (so->needPrimScan) + { + Assert(so->numArrayKeys); + + pos->moreLeft = true; + pos->moreRight = true; + so->needPrimScan = false; + } + else if (ScanDirectionIsForward(dir)) + { + pos->moreLeft = false; + pos->moreRight = true; + } + else + { + pos->moreLeft = true; + pos->moreRight = false; + } + + /* + * Attempt to load matching tuples from the first page. + * + * Note that _bt_readpage will finish initializing the so->currPos fields. + * _bt_readpage also releases parallel scan (even when it returns false). + */ + if ((batch = _bt_readpage_batch(scan, pos, dir, offnum, true)) != NULL) + { + pos = (BTBatchScanPos) batch->opaque; + + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. + */ + Assert(BTBatchScanPosIsPinned(*pos)); + + /* _bt_drop_lock_and_maybe_pin_batch(scan, pos); */ + /* XXX drop just the lock, not the pin, that's up to btfreebatch */ + /* without this btfreebatch triggers an assert when unpinning the */ + /* buffer, because that checks we're not holding a lock on it */ + _bt_unlockbuf(scan->indexRelation, pos->buf); + return batch; + } + + /* There's no actually-matching data on the page in so->currPos.buf */ + _bt_unlockbuf(scan->indexRelation, pos->buf); + + /* XXX Not sure we can drop the pin before calling steppage_batch? But */ + /* without this, \d+ reports unreleased buffer ... */ + /* And the non-batch code doesn't need to do this. 
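For the pages that do produce a batch, the intended lock/pin protocol is roughly the following (a simplified sketch; the exact pin lifetime is still marked XXX above). Keeping the pin is what later lets _bt_killitems_batch() skip the LSN recheck.

	pos->buf = _bt_getbuf(rel, blkno, BT_READ);	/* pin + read lock */
	/* ... _bt_readpage_batch() copies the matching items into the batch ... */
	_bt_unlockbuf(rel, pos->buf);		/* drop the lock, keep the pin */
	/* ... heap fetches / prefetching run with only the pin held ... */
	BTBatchScanPosUnpin(*pos);		/* btfreebatch() finally drops the pin */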
*/ + ReleaseBuffer(pos->buf); + + /* Call _bt_readnextpage using its _bt_steppage wrapper function */ + return _bt_steppage_batch(scan, pos, dir); +} + /* * _bt_readnextpage() -- Read next page containing valid data for _bt_next * @@ -2412,6 +3898,138 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, return true; } +static IndexScanBatch +_bt_readnextpage_batch(IndexScanDesc scan, BTBatchScanPos pos, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir, bool seized) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + + /* BTBatchScanPosData newpos; */ + IndexScanBatch newbatch = NULL; + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + Assert(pos->currPage == lastcurrblkno || seized); + Assert(BTBatchScanPosIsPinned(*pos) || seized); + + /* initialize the new position to the old one, we'll modify it */ + /* newpos = *pos; */ + + /* pos->moreLeft = pos->moreRight = false; */ + + /* + * Remember that the scan already read lastcurrblkno, a page to the left + * of blkno (or remember reading a page to the right, for backwards scans) + */ + if (ScanDirectionIsForward(dir)) + pos->moreLeft = true; + else + pos->moreRight = true; + + for (;;) + { + Page page; + BTPageOpaque opaque; + + if (blkno == P_NONE || + (ScanDirectionIsForward(dir) ? + !pos->moreRight : !pos->moreLeft)) + { + /* most recent _bt_readpage call (for lastcurrblkno) ended scan */ + Assert(pos->currPage == lastcurrblkno && !seized); + BTBatchScanPosInvalidate(*pos); + _bt_parallel_done(scan); /* iff !so->needPrimScan */ + return NULL; + } + + Assert(!so->needPrimScan); + + /* parallel scan must never actually visit so->currPos blkno */ + if (!seized && scan->parallel_scan != NULL && + !_bt_parallel_seize_batch(scan, pos, &blkno, &lastcurrblkno, false)) + { + /* whole scan is now done (or another primitive scan required) */ + BTBatchScanPosInvalidate(*pos); + return NULL; + } + + if (ScanDirectionIsForward(dir)) + { + /* read blkno, but check for interrupts first */ + CHECK_FOR_INTERRUPTS(); + pos->buf = _bt_getbuf(rel, blkno, BT_READ); + } + else + { + /* read blkno, avoiding race (also checks for interrupts) */ + pos->buf = _bt_lock_and_validate_left(rel, &blkno, + lastcurrblkno); + if (pos->buf == InvalidBuffer) + { + /* must have been a concurrent deletion of leftmost page */ + BTBatchScanPosInvalidate(*pos); + _bt_parallel_done(scan); + return NULL; + } + } + + page = BufferGetPage(pos->buf); + opaque = BTPageGetOpaque(page); + lastcurrblkno = blkno; + if (likely(!P_IGNORE(opaque))) + { + /* see if there are any matches on this page */ + if (ScanDirectionIsForward(dir)) + { + /* note that this will clear moreRight if we can stop */ + if ((newbatch = _bt_readpage_batch(scan, pos, dir, P_FIRSTDATAKEY(opaque), false)) != NULL) + break; + blkno = pos->nextPage; + } + else + { + /* note that this will clear moreLeft if we can stop */ + if ((newbatch = _bt_readpage_batch(scan, pos, dir, PageGetMaxOffsetNumber(page), false)) != NULL) + break; + blkno = pos->prevPage; + } + } + else + { + /* _bt_readpage not called, so do all this for ourselves */ + if (ScanDirectionIsForward(dir)) + blkno = opaque->btpo_next; + else + blkno = opaque->btpo_prev; + if (scan->parallel_scan != NULL) + _bt_parallel_release(scan, blkno, lastcurrblkno); + } + + /* no matching tuples on this page */ + _bt_relbuf(rel, pos->buf); + seized = false; /* released by 
_bt_readpage (or by us) */ + } + + /* */ + Assert(newbatch != NULL); + + pos = (BTBatchScanPos) newbatch->opaque; + + /* + * _bt_readpage succeeded. Drop the lock (and maybe the pin) on + * so->currPos.buf in preparation for btgettuple returning tuples. + */ + Assert(pos->currPage == blkno); + Assert(BTBatchScanPosIsPinned(*pos)); + /* _bt_drop_lock_and_maybe_pin_batch(scan, pos); */ + _bt_unlockbuf(scan->indexRelation, pos->buf); + + return newbatch; +} + /* * _bt_lock_and_validate_left() -- lock caller's left sibling blkno, * recovering from concurrent page splits/page deletions when necessary @@ -2693,3 +4311,79 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) _bt_returnitem(scan, so); return true; } + +/* + * _bt_endpoint() -- Find the first or last page in the index, and scan + * from there to the first key satisfying all the quals. + * + * This is used by _bt_first() to set up a scan when we've determined + * that the scan must start at the beginning or end of the index (for + * a forward or backward scan respectively). + * + * Parallel scan callers must have seized the scan before calling here. + * Exit conditions are the same as for _bt_first(). + */ +static IndexScanBatch +_bt_endpoint_batch(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber start; + BTBatchScanPosData pos; + + BTBatchScanPosInvalidate(pos); + + /* batching does not work with regular scan-level positions */ + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(so->markPos)); + + Assert(!BTScanPosIsValid(so->currPos)); + Assert(!so->needPrimScan); + + /* + * Scan down to the leftmost or rightmost leaf page. This is a simplified + * version of _bt_search(). + */ + pos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); + + if (!BufferIsValid(pos.buf)) + { + /* + * Empty index. Lock the whole relation, as nothing finer to lock + * exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + _bt_parallel_done(scan); + return false; + } + + page = BufferGetPage(pos.buf); + opaque = BTPageGetOpaque(page); + Assert(P_ISLEAF(opaque)); + + if (ScanDirectionIsForward(dir)) + { + /* There could be dead pages to the left, so not this: */ + /* Assert(P_LEFTMOST(opaque)); */ + + start = P_FIRSTDATAKEY(opaque); + } + else if (ScanDirectionIsBackward(dir)) + { + Assert(P_RIGHTMOST(opaque)); + + start = PageGetMaxOffsetNumber(page); + } + else + { + elog(ERROR, "invalid scan direction: %d", (int) dir); + start = 0; /* keep compiler quiet */ + } + + /* + * Now load data from the first page of the scan. 
+ */ + return _bt_readfirstpage_batch(scan, &pos, start, dir); +} diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 11802a4c2151..187f6fa5934b 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -3492,6 +3492,185 @@ _bt_killitems(IndexScanDesc scan) _bt_unlockbuf(scan->indexRelation, so->currPos.buf); } +/* + * _bt_killitems_batch + * a variant of _bt_killitems, using the batch-level killedItems + */ +void +_bt_killitems_batch(IndexScanDesc scan, IndexScanBatch batch) +{ + /* BTScanOpaque so = (BTScanOpaque) scan->opaque; */ + BTBatchScanPos pos = (BTBatchScanPos) batch->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int i; + int numKilled = batch->numKilled; + bool killedsomething = false; + bool droppedpin PG_USED_FOR_ASSERTS_ONLY; + + Assert(BTBatchScanPosIsValid(*pos)); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. + */ + batch->numKilled = 0; + + if (BTBatchScanPosIsPinned(*pos)) + { + /* + * We have held the pin on this page since we read the index tuples, + * so all we need to do is lock it. The pin will have prevented + * re-use of any TID on the page, so there is no need to check the + * LSN. + */ + droppedpin = false; + _bt_lockbuf(scan->indexRelation, pos->buf, BT_READ); + + page = BufferGetPage(pos->buf); + } + else + { + Buffer buf; + + droppedpin = true; + /* Attempt to re-read the buffer, getting pin and lock. */ + buf = _bt_getbuf(scan->indexRelation, pos->currPage, BT_READ); + + page = BufferGetPage(buf); + if (BufferGetLSNAtomic(buf) == pos->lsn) + pos->buf = buf; + else + { + /* Modified while not pinned means hinting is not safe. */ + _bt_relbuf(scan->indexRelation, buf); + return; + } + } + + opaque = BTPageGetOpaque(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) + { + int itemIndex = batch->killedItems[i]; + IndexScanBatchPosItem *kitem = &batch->items[itemIndex]; + OffsetNumber offnum = kitem->indexOffset; + + Assert(itemIndex >= batch->firstItem && + itemIndex <= batch->lastItem); + if (offnum < minoff) + continue; /* pure paranoia */ + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; + + if (BTreeTupleIsPosting(ituple)) + { + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + /* + * We rely on the convention that heap TIDs in the scanpos + * items array are stored in ascending heap TID order for a + * group of TIDs that originally came from a posting list + * tuple. This convention even applies during backwards + * scans, where returning the TIDs in descending order might + * seem more natural. This is about effectiveness, not + * correctness. + * + * Note that the page may have been modified in almost any way + * since we first read it (in the !droppedpin case), so it's + * possible that this posting list tuple wasn't a posting list + * tuple when we first encountered its heap TIDs. 
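Stepping back: the killedItems[] consumed by this loop are plain indexes into batch->items[], not TIDs. A minimal sketch of how the consumer side (indexam.c in this patch series) might record one such item before the batch is freed; the helper name and the assumption that killedItems[] is already sized for one entry per item are illustrative, not taken from the patch:

static void
batch_record_killed(IndexScanBatch batch, int itemIndex)
{
	Assert(itemIndex >= batch->firstItem && itemIndex <= batch->lastItem);

	/* assumes killedItems[] was sized for one entry per batch item */
	batch->killedItems[batch->numKilled++] = itemIndex;
}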
+ */ + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* + * kitem must have matching offnum when heap TIDs match, + * though only in the common case where the page can't + * have been concurrently modified + */ + Assert(kitem->indexOffset == offnum || !droppedpin); + + /* + * Read-ahead to later kitems here. + * + * We rely on the assumption that not advancing kitem here + * will prevent us from considering the posting list tuple + * fully dead by not matching its next heap TID in next + * loop iteration. + * + * If, on the other hand, this is the final heap TID in + * the posting list tuple, then tuple gets killed + * regardless (i.e. we handle the case where the last + * kitem is also the last heap TID in the last index tuple + * correctly -- posting tuple still gets killed). + */ + if (pi < numKilled) + kitem = &batch->items[batch->killedItems[pi++]]; + } + + /* + * Don't bother advancing the outermost loop's int iterator to + * avoid processing killed items that relate to the same + * offnum/posting list tuple. This micro-optimization hardly + * seems worth it. (Further iterations of the outermost loop + * will fail to match on this same posting list's first heap + * TID instead, so we'll advance to the next offnum/index + * tuple pretty quickly.) + */ + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + /* + * Mark index item as dead, if it isn't already. Since this + * happens while holding a buffer lock possibly in shared mode, + * it's possible that multiple processes attempt to do this + * simultaneously, leading to multiple full-page images being sent + * to WAL (if wal_log_hints or data checksums are enabled), which + * is undesirable. + */ + if (killtuple && !ItemIdIsDead(iid)) + { + /* found the item/all posting list items */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. + * + * Whenever we mark anything LP_DEAD, we also set the page's + * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we + * only rely on the page-level flag in !heapkeyspace indexes.) + */ + if (killedsomething) + { + opaque->btpo_flags |= BTP_HAS_GARBAGE; + MarkBufferDirtyHint(pos->buf, true); + } + + _bt_unlockbuf(scan->indexRelation, pos->buf); +} /* * The following routines manage a shared-memory area in which we track diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index ebca02588d3e..a00a1108ba51 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1001,6 +1001,38 @@ typedef struct BTScanPosData typedef BTScanPosData *BTScanPos; +/* + * Minimal AM-specific concept of "position" for batching. 
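+ *
+ * Mirrors BTScanPosData, but keeps only what is needed to re-lock the leaf
+ * page, continue the scan from it, and mark killed items later; the matching
+ * items themselves are stored in the IndexScanBatch rather than here.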
+ */ +typedef struct BTBatchScanPosData +{ + Buffer buf; /* currPage buf (invalid means unpinned) */ + + /* page details as of the saved position's call to _bt_readpage */ + BlockNumber currPage; /* page referenced by items array */ + BlockNumber prevPage; /* currPage's left link */ + BlockNumber nextPage; /* currPage's right link */ + XLogRecPtr lsn; /* currPage's LSN */ + + /* scan direction for the saved position's call to _bt_readpage */ + ScanDirection dir; + + /* + * If we are doing an index-only scan, nextTupleOffset is the first free + * location in the associated tuple storage workspace. + */ + int nextTupleOffset; + + /* + * moreLeft and moreRight track whether we think there may be matching + * index entries to the left and right of the current page, respectively. + */ + bool moreLeft; + bool moreRight; +} BTBatchScanPosData; + +typedef BTBatchScanPosData *BTBatchScanPos; + #define BTScanPosIsPinned(scanpos) \ ( \ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ @@ -1017,7 +1049,6 @@ typedef BTScanPosData *BTScanPos; if (BTScanPosIsPinned(scanpos)) \ BTScanPosUnpin(scanpos); \ } while (0) - #define BTScanPosIsValid(scanpos) \ ( \ AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ @@ -1030,6 +1061,35 @@ typedef BTScanPosData *BTScanPos; (scanpos).currPage = InvalidBlockNumber; \ } while (0) +#define BTBatchScanPosIsPinned(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BufferIsValid((scanpos).buf) \ +) +#define BTBatchScanPosUnpin(scanpos) \ + do { \ + ReleaseBuffer((scanpos).buf); \ + (scanpos).buf = InvalidBuffer; \ + } while (0) +#define BTBatchScanPosUnpinIfPinned(scanpos) \ + do { \ + if (BTBatchScanPosIsPinned(scanpos)) \ + BTBatchScanPosUnpin(scanpos); \ + } while (0) +#define BTBatchScanPosIsValid(scanpos) \ +( \ + AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ + !BufferIsValid((scanpos).buf)), \ + BlockNumberIsValid((scanpos).currPage) \ +) +#define BTBatchScanPosInvalidate(scanpos) \ + do { \ + (scanpos).buf = InvalidBuffer; \ + (scanpos).currPage = InvalidBlockNumber; \ + } while (0) + + /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo { @@ -1191,6 +1251,8 @@ extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys); extern void btinitparallelscan(void *target); extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch btgetbatch(IndexScanDesc scan, ScanDirection dir); +extern void btfreebatch(IndexScanDesc scan, IndexScanBatch batch); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); @@ -1215,6 +1277,9 @@ extern StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily); */ extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first); +extern bool _bt_parallel_seize_batch(IndexScanDesc scan, BTBatchScanPos pos, + BlockNumber *next_scan_page, + BlockNumber *last_curr_page, bool first); extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page); @@ -1308,6 +1373,10 @@ extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); +extern IndexScanBatch 
_bt_first_batch(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch _bt_next_batch(IndexScanDesc scan, BTBatchScanPos pos, ScanDirection dir); +extern void _bt_kill_batch(IndexScanDesc scan, IndexScanBatch batch); + /* * prototypes for functions in nbtutils.c */ @@ -1326,6 +1395,7 @@ extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); extern void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate); extern void _bt_killitems(IndexScanDesc scan); +extern void _bt_killitems_batch(IndexScanDesc scan, IndexScanBatch batch); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); extern void _bt_end_vacuum(Relation rel); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 060d964e3995..1e5548aacb93 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -195,6 +195,8 @@ BOOL BOOLEAN BOX BTArrayKeyInfo +BTBatchInfo +BTBatchScanPosData BTBuildState BTCallbackState BTCycleId From 4dd789b4c5340f84dbd054ec22a9190bd0e064f8 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Wed, 1 Jan 2025 22:10:37 +0100 Subject: [PATCH 3/3] WIP: Don't read the same block repeatedly --- src/backend/access/heap/heapam_handler.c | 57 ++++++++++++++++++++++-- src/backend/access/index/indexam.c | 11 +++++ src/backend/storage/buffer/bufmgr.c | 40 +++++++++++++++++ src/include/access/relscan.h | 2 + src/include/storage/bufmgr.h | 2 + 5 files changed, 109 insertions(+), 3 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index f79d97a8c64e..326d5fed681e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -136,6 +136,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, { /* Switch to correct buffer if we don't have it already */ Buffer prev_buf = hscan->xs_cbuf; + bool release_prev = true; /* * Read the block for the requested TID. With a read stream, simply @@ -157,7 +158,56 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * API. */ if (scan->rs) - hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + { + /* + * If we're trying to read the same block as the last time, don't + * try reading it from the stream again, but just return the last + * buffer. We need to check if the previous buffer is still pinned + * and contains the correct block (it might have been unpinned, + * used for a different block, so we need to be careful). + * + * The place scheduling the blocks (index_scan_stream_read_next) + * needs to do the same thing and not schedule the blocks if it + * matches the previous one. Otherwise the stream will get out of + * sync, causing confusion. + * + * This is what ReleaseAndReadBuffer does too, but it does not + * have a queue of requests scheduled from somewhere else, so it + * does not need to worry about that. + * + * XXX Maybe we should remember the block in IndexFetchTableData, + * so that we can make the check even cheaper, without looking at + * the buffer descriptor? But that assumes the buffer was not + * unpinned (or repinned) elsewhere, before we got back here. But + * can that even happen? If yes, I guess we shouldn't be releasing + * the prev buffer anyway. + * + * XXX This has undesired impact on prefetch distance. The read + * stream schedules reads for a certain number of future blocks, + * but if we skip duplicate blocks, the prefetch distance may get + * unexpectedly large (e.g. 
for correlated indexes, with long runs
+			 * of TIDs from the same heap page). This may spend a lot of CPU
+			 * time in the index_scan_stream_read_next callback, but more
+			 * importantly it may require reading (and keeping) a lot of leaf
+			 * pages from the index.
+			 *
+			 * XXX What if we pinned the buffer twice (increasing the refcount),
+			 * so that if the caller unpins the buffer, we'd still keep the
+			 * second pin? Wouldn't that mean we don't need to worry about the
+			 * possibility someone loaded another page into the buffer?
+			 *
+			 * XXX We might also keep a longer history of recent blocks, not
+			 * just the immediately preceding one. But that makes it harder,
+			 * because the two places (read_next callback and here) need to
+			 * have a slightly different view.
+			 */
+			if (BufferMatches(hscan->xs_cbuf,
+							  hscan->xs_base.rel,
+							  ItemPointerGetBlockNumber(tid)))
+				release_prev = false;
+			else
+				hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL);
+		}
 		else
 			hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
 												  hscan->xs_base.rel,
@@ -181,7 +231,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 			heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
 
 		/*
-		 * When using the read stream, release the old buffer.
+		 * When using the read stream, release the old buffer - but only if
+		 * we're reading a different block.
 		 *
 		 * XXX Not sure this is really needed, or maybe this is not the right
 		 * place to do this, and buffers should be released elsewhere. The
@@ -199,7 +250,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 		 * XXX Does this do the right thing when reading the same page? That
 		 * should return the same buffer, so won't we release it prematurely?
 		 */
-		if (scan->rs && (prev_buf != InvalidBuffer))
+		if (scan->rs && (prev_buf != InvalidBuffer) && release_prev)
 		{
 			ReleaseBuffer(prev_buf);
 		}
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 190a112e4571..ae4f3ffb0cac 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -1908,6 +1908,15 @@ index_scan_stream_read_next(ReadStream *stream,
 			continue;
 		}
 
+		/* same block as before, don't need to read it */
+		if (scan->xs_batches->lastBlock == ItemPointerGetBlockNumber(tid))
+		{
+			DEBUG_LOG("index_scan_stream_read_next: skip block (lastBlock)");
+			continue;
+		}
+
+		scan->xs_batches->lastBlock = ItemPointerGetBlockNumber(tid);
+
 		return ItemPointerGetBlockNumber(tid);
 	}
 
@@ -2268,6 +2277,7 @@ index_batch_init(IndexScanDesc scan)
 	index_batch_pos_reset(scan, &scan->xs_batches->markPos);
 
 	// scan->xs_batches->currentBatch = NULL;
+	scan->xs_batches->lastBlock = InvalidBlockNumber;
 }
 
 /*
@@ -2350,6 +2360,7 @@ index_batch_reset(IndexScanDesc scan, bool complete)
 	batches->finished = false;
 	batches->reset = false;
 	// batches->currentBatch = NULL;
+	batches->lastBlock = InvalidBlockNumber;
 
 	AssertCheckBatches(scan);
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0b317d2d809f..35c3526e2501 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3045,6 +3045,46 @@ ReleaseAndReadBuffer(Buffer buffer,
 	return ReadBuffer(relation, blockNum);
 }
 
+/*
+ * BufferMatches
+ *		Check if the buffer (still) contains the expected page.
+ *
+ * The buffer may be invalid, or valid and pinned; if it is valid, the caller
+ * must already hold a pin on it.
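+ *
+ * This only inspects the buffer tag; it never pins, unpins or reads
+ * anything, so the caller can use it to decide whether reading the block
+ * again (and releasing the previously returned buffer) can be skipped.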
+ */ +bool +BufferMatches(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + ForkNumber forkNum = MAIN_FORKNUM; + BufferDesc *bufHdr; + + if (BufferIsValid(buffer)) + { + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + { + bufHdr = GetLocalBufferDescriptor(-buffer - 1); + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + else + { + bufHdr = GetBufferDescriptor(buffer - 1); + /* we have pin, so it's ok to examine tag without spinlock */ + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + } + + return false; +} + /* * PinBuffer -- make buffer unavailable for replacement. * diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b63af845ca6b..2bbd0db0223a 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -242,6 +242,8 @@ typedef struct IndexScanBatches bool finished; bool reset; + BlockNumber lastBlock; + /* * Current scan direction, for the currently loaded batches. This is used * to load data in the read stream API callback, etc. diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 41fdc1e76938..3b7d4e6a6a28 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -237,6 +237,8 @@ extern void IncrBufferRefCount(Buffer buffer); extern void CheckBufferIsPinnedOnce(Buffer buffer); extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum); +extern bool BufferMatches(Buffer buffer, Relation relation, + BlockNumber blockNum); extern Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum,