Adjust max_slot_wal_keep_size behavior per review

alvherre · alvherre · commit 6f7a862bed3a · 2020-06-24T14:23:39.000-04:00
In pg_replication_slot, change output from normal/reserved/lost to reserved/extended/unreserved/ lost, which better expresses the possible states particularly near the time where segments are no longer safe but checkpoint has not run yet. Under the new definition, reserved means the slot is consuming WAL that's still under the normal WAL size constraints; extended means it's consuming WAL that's being protected by wal_keep_segments or the slot itself, whose size is below max_slot_wal_keep_size; unreserved means the WAL is no longer safe, but checkpoint has not yet removed those files. Such as slot is in imminent danger, but can still continue for a little while and may catch up to the reserved WAL space. Also, there were some bugs in the calculations used to report the status; fixed those. Backpatch to 13. Reported-by: Fujii Masao <masao.fujii@oss.nttdata.com> Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com> Reviewed-by: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20200616.120236.1809496990963386593.horikyota.ntt@gmail.com
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
@@ -11239,19 +11239,29 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
        Possible values are:
        <itemizedlist>
         <listitem>
-         <para><literal>normal</literal> means that the claimed files
+         <para><literal>reserved</literal> means that the claimed files
           are within <varname>max_wal_size</varname>.</para>
         </listitem>
         <listitem>
-         <para><literal>reserved</literal> means
+         <para><literal>extended</literal> means
           that <varname>max_wal_size</varname> is exceeded but the files are
-          still held, either by some replication slot or
-          by <varname>wal_keep_segments</varname>.</para>
+          still retained, either by the replication slot or
+          by <varname>wal_keep_segments</varname>.
+         </para>
         </listitem>
         <listitem>
-         <para><literal>lost</literal> means that some WAL files are
-          definitely lost and this slot cannot be used to resume replication
-          anymore.</para>
+         <para>
+          <literal>unreserved</literal> means that the slot no longer
+          retains the required WAL files and some of them are to be removed at
+          the next checkpoint.  This state can return
+          to <literal>reserved</literal> or <literal>extended</literal>.
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>lost</literal> means that some required WAL files have
+          been removed and this slot is no longer usable.
+         </para>
         </listitem>
        </itemizedlist>
        The last two states are seen only when
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
@@ -9488,15 +9488,20 @@ CreateRestartPoint(int flags)
  *		(typically a slot's restart_lsn)
  *
  * Returns one of the following enum values:
- * * WALAVAIL_NORMAL means targetLSN is available because it is in the range
- *   of max_wal_size.
  *
- * * WALAVAIL_PRESERVED means it is still available by preserving extra
+ * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
+ *   max_wal_size.
+ *
+ * * WALAVAIL_EXTENDED means it is still available by preserving extra
  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
  *   than max_wal_size, this state is not returned.
  *
- * * WALAVAIL_REMOVED means it is definitely lost. A replication stream on
- *   a slot with this LSN cannot continue.
+ * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
+ *   remove reserved segments. The walsender using this slot may return to the
+ *   above.
+ *
+ * * WALAVAIL_REMOVED means it has been removed. A replication stream on
+ *   a slot with this LSN cannot continue after a restart.
  *
  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
  */
@@ -9512,13 +9517,18 @@ GetWALAvailability(XLogRecPtr targetLSN)
 													 * slot */
 	uint64		keepSegs;
 
-	/* slot does not reserve WAL. Either deactivated, or has never been active */
+	/*
+	 * slot does not reserve WAL. Either deactivated, or has never been active
+	 */
 	if (XLogRecPtrIsInvalid(targetLSN))
 		return WALAVAIL_INVALID_LSN;
 
 	currpos = GetXLogWriteRecPtr();
 
-	/* calculate oldest segment currently needed by slots */
+	/*
+	 * calculate the oldest segment currently reserved by all slots,
+	 * considering wal_keep_segments and max_slot_wal_keep_size
+	 */
 	XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
 	KeepLogSeg(currpos, &oldestSlotSeg);
 
@@ -9529,38 +9539,33 @@ GetWALAvailability(XLogRecPtr targetLSN)
 	 */
 	oldestSeg = XLogGetLastRemovedSegno() + 1;
 
-	/* calculate oldest segment by max_wal_size and wal_keep_segments */
+	/* calculate oldest segment by max_wal_size */
 	XLByteToSeg(currpos, currSeg, wal_segment_size);
-	keepSegs = ConvertToXSegs(Max(max_wal_size_mb, wal_keep_segments),
-							  wal_segment_size) + 1;
+	keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
 
 	if (currSeg > keepSegs)
 		oldestSegMaxWalSize = currSeg - keepSegs;
 	else
 		oldestSegMaxWalSize = 1;
 
 	/*
-	 * If max_slot_wal_keep_size has changed after the last call, the segment
-	 * that would been kept by the current setting might have been lost by the
-	 * previous setting. No point in showing normal or keeping status values
-	 * if the targetSeg is known to be lost.
+	 * No point in returning reserved or extended status values if the
+	 * targetSeg is known to be lost.
 	 */
-	if (targetSeg >= oldestSeg)
+	if (targetSeg >= oldestSlotSeg)
 	{
-		/*
-		 * show "normal" when targetSeg is within max_wal_size, even if
-		 * max_slot_wal_keep_size is smaller than max_wal_size.
-		 */
-		if ((max_slot_wal_keep_size_mb <= 0 ||
-			 max_slot_wal_keep_size_mb >= max_wal_size_mb) &&
-			oldestSegMaxWalSize <= targetSeg)
-			return WALAVAIL_NORMAL;
-
-		/* being retained by slots */
-		if (oldestSlotSeg <= targetSeg)
+		/* show "reserved" when targetSeg is within max_wal_size */
+		if (targetSeg >= oldestSegMaxWalSize)
 			return WALAVAIL_RESERVED;
+
+		/* being retained by slots exceeding max_wal_size */
+		return WALAVAIL_EXTENDED;
 	}
 
+	/* WAL segments are no longer retained but haven't been removed yet */
+	if (targetSeg >= oldestSeg)
+		return WALAVAIL_UNRESERVED;
+
 	/* Definitely lost */
 	return WALAVAIL_REMOVED;
 }
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
@@ -359,24 +359,47 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
 				nulls[i++] = true;
 				break;
 
-			case WALAVAIL_NORMAL:
-				values[i++] = CStringGetTextDatum("normal");
-				break;
-
 			case WALAVAIL_RESERVED:
 				values[i++] = CStringGetTextDatum("reserved");
 				break;
 
+			case WALAVAIL_EXTENDED:
+				values[i++] = CStringGetTextDatum("extended");
+				break;
+
+			case WALAVAIL_UNRESERVED:
+				values[i++] = CStringGetTextDatum("unreserved");
+				break;
+
 			case WALAVAIL_REMOVED:
+
+				/*
+				 * If we read the restart_lsn long enough ago, maybe that file
+				 * has been removed by now.  However, the walsender could have
+				 * moved forward enough that it jumped to another file after
+				 * we looked.  If checkpointer signalled the process to
+				 * termination, then it's definitely lost; but if a process is
+				 * still alive, then "unreserved" seems more appropriate.
+				 */
+				if (!XLogRecPtrIsInvalid(slot_contents.data.restart_lsn))
+				{
+					int			pid;
+
+					SpinLockAcquire(&slot->mutex);
+					pid = slot->active_pid;
+					SpinLockRelease(&slot->mutex);
+					if (pid != 0)
+					{
+						values[i++] = CStringGetTextDatum("unreserved");
+						break;
+					}
+				}
 				values[i++] = CStringGetTextDatum("lost");
 				break;
-
-			default:
-				elog(ERROR, "invalid walstate: %d", (int) walstate);
 		}
 
 		if (max_slot_wal_keep_size_mb >= 0 &&
-			(walstate == WALAVAIL_NORMAL || walstate == WALAVAIL_RESERVED) &&
+			(walstate == WALAVAIL_RESERVED || walstate == WALAVAIL_EXTENDED) &&
 			((last_removed_seg = XLogGetLastRemovedSegno()) != 0))
 		{
 			XLogRecPtr	min_safe_lsn;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
@@ -270,8 +270,10 @@ extern CheckpointStatsData CheckpointStats;
 typedef enum WALAvailability
 {
 	WALAVAIL_INVALID_LSN,		/* parameter error */
-	WALAVAIL_NORMAL,			/* WAL segment is within max_wal_size */
-	WALAVAIL_RESERVED,			/* WAL segment is reserved by a slot */
+	WALAVAIL_RESERVED,			/* WAL segment is within max_wal_size */
+	WALAVAIL_EXTENDED,			/* WAL segment is reserved by a slot or
+								 * wal_keep_segments */
+	WALAVAIL_UNRESERVED,		/* no longer reserved, but not removed yet */
 	WALAVAIL_REMOVED			/* WAL segment has been removed */
 } WALAvailability;
 
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
@@ -56,7 +56,7 @@
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
-is($result, "normal|t", 'check the catching-up state');
+is($result, "reserved|t", 'check the catching-up state');
 
 # Advance WAL by five segments (= 5MB) on master
 advance_wal($node_master, 1);
@@ -66,7 +66,8 @@
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
-is($result, "normal|t", 'check that it is safe if WAL fits in max_wal_size');
+is($result, "reserved|t",
+	'check that it is safe if WAL fits in max_wal_size');
 
 advance_wal($node_master, 4);
 $node_master->safe_psql('postgres', "CHECKPOINT;");
@@ -75,7 +76,7 @@
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
-is($result, "normal|t", 'check that slot is working');
+is($result, "reserved|t", 'check that slot is working');
 
 # The standby can reconnect to master
 $node_standby->start;
@@ -99,7 +100,7 @@
 
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
-is($result, "normal", 'check that max_slot_wal_keep_size is working');
+is($result, "reserved", 'check that max_slot_wal_keep_size is working');
 
 # Advance WAL again then checkpoint, reducing remain by 2 MB.
 advance_wal($node_master, 2);
@@ -108,7 +109,7 @@
 # The slot is still working
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
-is($result, "normal",
+is($result, "reserved",
 	'check that min_safe_lsn gets close to the current LSN');
 
 # The standby can reconnect to master
@@ -125,7 +126,7 @@
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status as remain FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
-is($result, "normal",
+is($result, "extended",
 	'check that wal_keep_segments overrides max_slot_wal_keep_size');
 # restore wal_keep_segments
 $result = $node_master->safe_psql('postgres',
@@ -143,19 +144,20 @@
 # Slot gets into 'reserved' state
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
-is($result, "reserved", 'check that the slot state changes to "reserved"');
+is($result, "extended", 'check that the slot state changes to "extended"');
 
 # do checkpoint so that the next checkpoint runs too early
 $node_master->safe_psql('postgres', "CHECKPOINT;");
 
 # Advance WAL again without checkpoint; remain goes to 0.
 advance_wal($node_master, 1);
 
-# Slot gets into 'lost' state
+# Slot gets into 'unreserved' state
 $result = $node_master->safe_psql('postgres',
 	"SELECT wal_status, min_safe_lsn is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
-is($result, "lost|t", 'check that the slot state changes to "lost"');
+is($result, "unreserved|t",
+	'check that the slot state changes to "unreserved"');
 
 # The standby still can connect to master before a checkpoint
 $node_standby->start;
@@ -186,7 +188,8 @@
 $result = $node_master->safe_psql('postgres',
 	"SELECT slot_name, active, restart_lsn IS NULL, wal_status, min_safe_lsn FROM pg_replication_slots WHERE slot_name = 'rep1'"
 );
-is($result, "rep1|f|t|lost|", 'check that the slot became inactive');
+is($result, "rep1|f|t|lost|",
+	'check that the slot became inactive and the state "lost" persists');
 
 # The standby no longer can connect to the master
 $logstart = get_log_size($node_standby);