Add more LOG messages when starting and ending recovery from a backup

michaelpq · michaelpq · commit edbd1b41ab5b · 2024-01-29T09:04:51.000+09:00
Three LOG messages are added in the recovery code paths, providing information that can be useful to track corruption issues depending on the state of the cluster, telling that: - Recovery has started from a backup_label. - Recovery is restarting from a backup start LSN, without a backup_label. - Recovery has completed from a backup. This was originally applied on HEAD as of 1d35f70, and there is consensus that this can be useful for older versions. This applies cleanly down to 15, so do it down to this version for now (older versions have heavily refactored the WAL recovery paths, making the change less straight-forward to do). Author: Andres Freund Reviewed-by: David Steele, Laurenz Albe, Michael Paquier Discussion: https://postgr.es/m/20231117041811.vz4vgkthwjnwp2pp@awork3.anarazel.de Backpatch-through: 15
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
@@ -623,6 +623,22 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		if (StandbyModeRequested)
 			EnableStandbyMode();
 
+		/*
+		 * Omitting backup_label when creating a new replica, PITR node etc.
+		 * unfortunately is a common cause of corruption.  Logging that
+		 * backup_label was used makes it a bit easier to exclude that as the
+		 * cause of observed corruption.
+		 *
+		 * Do so before we try to read the checkpoint record (which can fail),
+		 * as otherwise it can be hard to understand why a checkpoint other
+		 * than ControlFile->checkPoint is used.
+		 */
+		ereport(LOG,
+				(errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
+						LSN_FORMAT_ARGS(RedoStartLSN),
+						LSN_FORMAT_ARGS(CheckPointLoc),
+						CheckPointTLI)));
+
 		/*
 		 * When a backup_label file is present, we want to roll forward from
 		 * the checkpoint it identifies, rather than using pg_control.
@@ -761,6 +777,16 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 				EnableStandbyMode();
 		}
 
+		/*
+		 * For the same reason as when starting up with backup_label present,
+		 * emit a log message when we continue initializing from a base
+		 * backup.
+		 */
+		if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+			ereport(LOG,
+					(errmsg("restarting backup recovery with redo LSN %X/%X",
+							LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
+
 		/* Get the last valid checkpoint record. */
 		CheckPointLoc = ControlFile->checkPoint;
 		CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
@@ -2123,6 +2149,9 @@ CheckRecoveryConsistency(void)
 	if (!XLogRecPtrIsInvalid(backupEndPoint) &&
 		backupEndPoint <= lastReplayedEndRecPtr)
 	{
+		XLogRecPtr	saveBackupStartPoint = backupStartPoint;
+		XLogRecPtr	saveBackupEndPoint = backupEndPoint;
+
 		elog(DEBUG1, "end of backup reached");
 
 		/*
@@ -2133,6 +2162,11 @@ CheckRecoveryConsistency(void)
 		backupStartPoint = InvalidXLogRecPtr;
 		backupEndPoint = InvalidXLogRecPtr;
 		backupEndRequired = false;
+
+		ereport(LOG,
+				(errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
+						LSN_FORMAT_ARGS(saveBackupStartPoint),
+						LSN_FORMAT_ARGS(saveBackupEndPoint))));
 	}
 
 	/*