10 files changed, 123 insertions, 10 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 070f14c8b91..36544ecfd58 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -2282,9 +2282,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  * previously-saved right link or left link.  lastcurrblkno is the page that
  * was current at the point where the blkno link was saved, which we use to
  * reason about concurrent page splits/page deletions during backwards scans.
+ * In the common case where seized=false, blkno is either so->currPos.nextPage
+ * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
  *
- * On entry, caller shouldn't hold any locks or pins on any page (we work
- * directly off of blkno and lastcurrblkno instead).  Parallel scan callers
+ * On entry, so->currPos shouldn't be locked by caller.  so->currPos.buf must
+ * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
+ * won't need to be read again in almost all cases).  Parallel scan callers
  * that seized the scan before calling here should pass seized=true; such a
  * caller's blkno and lastcurrblkno arguments come from the seized scan.
  * seized=false callers just pass us the blkno/lastcurrblkno taken from their
@@ -2301,8 +2304,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  * success exit (except during so->dropPin index scans, when we drop the pin
  * eagerly to avoid blocking VACUUM).
  *
- * If there are no more matching records in the given direction, we drop all
- * locks and pins, invalidate so->currPos, and return false.
+ * If there are no more matching records in the given direction, we invalidate
+ * so->currPos (while ensuring it retains no locks or pins), and return false.
  *
  * We always release the scan for a parallel scan caller, regardless of
  * success or failure; we'll call _bt_parallel_release as soon as possible.
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1914859b2ee..47ffc0a2307 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7498,6 +7498,10 @@ CreateCheckPoint(int flags)
 	if (PriorRedoPtr != InvalidXLogRecPtr)
 		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
 
+#ifdef USE_INJECTION_POINTS
+	INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
+#endif
+
 	/*
 	 * Delete old log files, those no longer needed for last checkpoint to
 	 * prevent the disk holding the xlog from growing full.
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 255bd795361..b5400749353 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -144,7 +144,7 @@ execTuplesHashPrepare(int numCols,
  *	hashfunctions: FmgrInfos of datatype-specific hashing functions to use
  *	collations: collations to use in comparisons
  *	nbuckets: initial estimate of hashtable size
- *	additionalsize: size of data stored in ->additional
+ *	additionalsize: size of data that may be stored along with the hash entry
  *	metacxt: memory context for long-lived allocation, but not per-entry data
  *	tablecxt: memory context in which to store table entries
  *	tempcxt: short-lived context for evaluation hash and comparison functions
@@ -288,7 +288,7 @@ ResetTupleHashTable(TupleHashTable hashtable)
  *
  * If isnew isn't NULL, then a new entry is created if no existing entry
  * matches.  On return, *isnew is true if the entry is newly created,
- * false if it existed already.  ->additional_data in the new entry has
+ * false if it existed already.  The additional data in the new entry has
  * been zeroed.
  */
 TupleHashEntry
diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c
index ab2eab9596e..26f7420b64b 100644
--- a/src/backend/executor/nodeTidrangescan.c
+++ b/src/backend/executor/nodeTidrangescan.c
@@ -128,9 +128,11 @@ TidExprListCreate(TidRangeScanState *tidrangestate)
  *		TidRangeEval
  *
  *		Compute and set node's block and offset range to scan by evaluating
- *		the trss_tidexprs.  Returns false if we detect the range cannot
+ *		node->trss_tidexprs.  Returns false if we detect the range cannot
  *		contain any tuples.  Returns true if it's possible for the range to
- *		contain tuples.
+ *		contain tuples.  We don't bother validating that trss_mintid is less
+ *		than or equal to trss_maxtid, as the scan_set_tidrange() table AM
+ *		function will handle that.
  * ----------------------------------------------------------------
  */
 static bool
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 1d56d0c4ef3..f1eb798f3e9 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -29,6 +29,7 @@
 #include "postgres.h"
 
 #include "access/xact.h"
+#include "access/xlog_internal.h"
 #include "access/xlogutils.h"
 #include "fmgr.h"
 #include "miscadmin.h"
@@ -41,6 +42,7 @@
 #include "storage/proc.h"
 #include "storage/procarray.h"
 #include "utils/builtins.h"
+#include "utils/injection_point.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
 
@@ -1825,9 +1827,13 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn)
 	{
 		bool		updated_xmin = false;
 		bool		updated_restart = false;
+		XLogRecPtr	restart_lsn pg_attribute_unused();
 
 		SpinLockAcquire(&MyReplicationSlot->mutex);
 
+		/* remember the old restart lsn */
+		restart_lsn = MyReplicationSlot->data.restart_lsn;
+
 		/*
 		 * Prevent moving the confirmed_flush backwards, as this could lead to
 		 * data duplication issues caused by replicating already replicated
@@ -1881,6 +1887,18 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn)
 		/* first write new xmin to disk, so we know what's up after a crash */
 		if (updated_xmin || updated_restart)
 		{
+#ifdef USE_INJECTION_POINTS
+			XLogSegNo	seg1,
+						seg2;
+
+			XLByteToSeg(restart_lsn, seg1, wal_segment_size);
+			XLByteToSeg(MyReplicationSlot->data.restart_lsn, seg2, wal_segment_size);
+
+			/* trigger injection point, but only if segment changes */
+			if (seg1 != seg2)
+				INJECTION_POINT("logical-replication-slot-advance-segment", NULL);
+#endif
+
 			ReplicationSlotMarkDirty();
 			ReplicationSlotSave();
 			elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart);
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 600b87fa9cb..c64f020742f 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -424,6 +424,7 @@ ReplicationSlotCreate(const char *name, bool db_specific,
 	slot->candidate_restart_valid = InvalidXLogRecPtr;
 	slot->candidate_restart_lsn = InvalidXLogRecPtr;
 	slot->last_saved_confirmed_flush = InvalidXLogRecPtr;
+	slot->last_saved_restart_lsn = InvalidXLogRecPtr;
 	slot->inactive_since = 0;
 
 	/*
@@ -1165,20 +1166,41 @@ ReplicationSlotsComputeRequiredLSN(void)
 	{
 		ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
 		XLogRecPtr	restart_lsn;
+		XLogRecPtr	last_saved_restart_lsn;
 		bool		invalidated;
+		ReplicationSlotPersistency persistency;
 
 		if (!s->in_use)
 			continue;
 
 		SpinLockAcquire(&s->mutex);
+		persistency = s->data.persistency;
 		restart_lsn = s->data.restart_lsn;
 		invalidated = s->data.invalidated != RS_INVAL_NONE;
+		last_saved_restart_lsn = s->last_saved_restart_lsn;
 		SpinLockRelease(&s->mutex);
 
 		/* invalidated slots need not apply */
 		if (invalidated)
 			continue;
 
+		/*
+		 * For persistent slot use last_saved_restart_lsn to compute the
+		 * oldest LSN for removal of WAL segments.  The segments between
+		 * last_saved_restart_lsn and restart_lsn might be needed by a
+		 * persistent slot in the case of database crash.  Non-persistent
+		 * slots can't survive the database crash, so we don't care about
+		 * last_saved_restart_lsn for them.
+		 */
+		if (persistency == RS_PERSISTENT)
+		{
+			if (last_saved_restart_lsn != InvalidXLogRecPtr &&
+				restart_lsn > last_saved_restart_lsn)
+			{
+				restart_lsn = last_saved_restart_lsn;
+			}
+		}
+
 		if (restart_lsn != InvalidXLogRecPtr &&
 			(min_required == InvalidXLogRecPtr ||
 			 restart_lsn < min_required))
@@ -1216,7 +1238,9 @@ ReplicationSlotsComputeLogicalRestartLSN(void)
 	{
 		ReplicationSlot *s;
 		XLogRecPtr	restart_lsn;
+		XLogRecPtr	last_saved_restart_lsn;
 		bool		invalidated;
+		ReplicationSlotPersistency persistency;
 
 		s = &ReplicationSlotCtl->replication_slots[i];
 
@@ -1230,14 +1254,33 @@ ReplicationSlotsComputeLogicalRestartLSN(void)
 
 		/* read once, it's ok if it increases while we're checking */
 		SpinLockAcquire(&s->mutex);
+		persistency = s->data.persistency;
 		restart_lsn = s->data.restart_lsn;
 		invalidated = s->data.invalidated != RS_INVAL_NONE;
+		last_saved_restart_lsn = s->last_saved_restart_lsn;
 		SpinLockRelease(&s->mutex);
 
 		/* invalidated slots need not apply */
 		if (invalidated)
 			continue;
 
+		/*
+		 * For persistent slot use last_saved_restart_lsn to compute the
+		 * oldest LSN for removal of WAL segments.  The segments between
+		 * last_saved_restart_lsn and restart_lsn might be needed by a
+		 * persistent slot in the case of database crash.  Non-persistent
+		 * slots can't survive the database crash, so we don't care about
+		 * last_saved_restart_lsn for them.
+		 */
+		if (persistency == RS_PERSISTENT)
+		{
+			if (last_saved_restart_lsn != InvalidXLogRecPtr &&
+				restart_lsn > last_saved_restart_lsn)
+			{
+				restart_lsn = last_saved_restart_lsn;
+			}
+		}
+
 		if (restart_lsn == InvalidXLogRecPtr)
 			continue;
 
@@ -1455,6 +1498,7 @@ ReplicationSlotReserveWal(void)
 
 	Assert(slot != NULL);
 	Assert(slot->data.restart_lsn == InvalidXLogRecPtr);
+	Assert(slot->last_saved_restart_lsn == InvalidXLogRecPtr);
 
 	/*
 	 * The replication slot mechanism is used to prevent removal of required
@@ -1766,6 +1810,8 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
 		 */
 		SpinLockAcquire(&s->mutex);
 
+		Assert(s->data.restart_lsn >= s->last_saved_restart_lsn);
+
 		restart_lsn = s->data.restart_lsn;
 
 		/* we do nothing if the slot is already invalid */
@@ -1835,7 +1881,10 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
 			 * just rely on .invalidated.
 			 */
 			if (invalidation_cause == RS_INVAL_WAL_REMOVED)
+			{
 				s->data.restart_lsn = InvalidXLogRecPtr;
+				s->last_saved_restart_lsn = InvalidXLogRecPtr;
+			}
 
 			/* Let caller know */
 			*invalidated = true;
@@ -2079,6 +2128,12 @@ CheckPointReplicationSlots(bool is_shutdown)
 		SaveSlotToPath(s, path, LOG);
 	}
 	LWLockRelease(ReplicationSlotAllocationLock);
+
+	/*
+	 * Recompute the required LSN as SaveSlotToPath() updated
+	 * last_saved_restart_lsn for slots.
+	 */
+	ReplicationSlotsComputeRequiredLSN();
 }
 
 /*
@@ -2354,6 +2409,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
 	if (!slot->just_dirtied)
 		slot->dirty = false;
 	slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush;
+	slot->last_saved_restart_lsn = cp.slotdata.restart_lsn;
 	SpinLockRelease(&slot->mutex);
 
 	LWLockRelease(&slot->io_in_progress_lock);
@@ -2569,6 +2625,7 @@ RestoreSlotFromDisk(const char *name)
 		slot->effective_xmin = cp.slotdata.xmin;
 		slot->effective_catalog_xmin = cp.slotdata.catalog_xmin;
 		slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush;
+		slot->last_saved_restart_lsn = cp.slotdata.restart_lsn;
 
 		slot->candidate_catalog_xmin = InvalidTransactionId;
 		slot->candidate_xmin_lsn = InvalidXLogRecPtr;
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index 6c6c0a908e2..3643f27ad6e 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -556,6 +556,13 @@ bool
 pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
 {
 	*state = ioh->state;
+
+	/*
+	 * Ensure that we don't see an earlier state of the handle than ioh->state
+	 * due to compiler or CPU reordering. This protects both ->generation as
+	 * directly used here, and other fields in the handle accessed in the
+	 * caller if the handle was not reused.
+	 */
 	pg_read_barrier();
 
 	return ioh->generation != ref_generation;
@@ -773,7 +780,12 @@ pgaio_io_wait_for_free(void)
 			 * Note that no interrupts are processed between the state check
 			 * and the call to reclaim - that's important as otherwise an
 			 * interrupt could have already reclaimed the handle.
+			 *
+			 * Need to ensure that there's no reordering, in the more common
+			 * paths, where we wait for IO, that's done by
+			 * pgaio_io_was_recycled().
 			 */
+			pg_read_barrier();
 			pgaio_io_reclaim(ioh);
 			reclaimed++;
 		}
@@ -852,7 +864,12 @@ pgaio_io_wait_for_free(void)
 				 * check and the call to reclaim - that's important as
 				 * otherwise an interrupt could have already reclaimed the
 				 * handle.
+				 *
+				 * Need to ensure that there's no reordering, in the more
+				 * common paths, where we wait for IO, that's done by
+				 * pgaio_io_was_recycled().
 				 */
+				pg_read_barrier();
 				pgaio_io_reclaim(ioh);
 				break;
 		}
diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c
index 0ad9795bb7e..03c9bba0802 100644
--- a/src/backend/storage/aio/aio_callback.c
+++ b/src/backend/storage/aio/aio_callback.c
@@ -256,6 +256,9 @@ pgaio_io_call_complete_shared(PgAioHandle *ioh)
 					   pgaio_result_status_string(result.status),
 					   result.id, result.error_data, result.result);
 		result = ce->cb->complete_shared(ioh, result, cb_data);
+
+		/* the callback should never transition to unknown */
+		Assert(result.status != PGAIO_RS_UNKNOWN);
 	}
 
 	ioh->distilled_result = result;
@@ -290,6 +293,7 @@ pgaio_io_call_complete_local(PgAioHandle *ioh)
 
 	/* start with distilled result from shared callback */
 	result = ioh->distilled_result;
+	Assert(result.status != PGAIO_RS_UNKNOWN);
 
 	for (int i = ioh->num_callbacks; i > 0; i--)
 	{
@@ -306,6 +310,9 @@ pgaio_io_call_complete_local(PgAioHandle *ioh)
 					   pgaio_result_status_string(result.status),
 					   result.id, result.error_data, result.result);
 		result = ce->cb->complete_local(ioh, result, cb_data);
+
+		/* the callback should never transition to unknown */
+		Assert(result.status != PGAIO_RS_UNKNOWN);
 	}
 
 	/*
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index 743cccc2acd..36be179678d 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -461,7 +461,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 		int			nwakeups = 0;
 		int			worker;
 
-		/* Try to get a job to do. */
+		/*
+		 * Try to get a job to do.
+		 *
+		 * The lwlock acquisition also provides the necessary memory barrier
+		 * to ensure that we don't see an outdated data in the handle.
+		 */
 		LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
 		if ((io_index = pgaio_worker_submission_queue_consume()) == UINT32_MAX)
 		{
diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c
index 396c2f223b4..fe6dce9cba3 100644
--- a/src/backend/utils/adt/mcxtfuncs.c
+++ b/src/backend/utils/adt/mcxtfuncs.c
@@ -38,7 +38,7 @@ typedef struct MemoryContextId
 {
 	MemoryContext context;
 	int			context_id;
-}			MemoryContextId;
+} MemoryContextId;
 
 /*
  * int_list_to_array