Add CheckBuffer() to check on-disk pages without shared buffer loading

michaelpq · michaelpq · commit c780a7a90a8e · 2020-10-28T11:12:46.000+09:00
CheckBuffer() is designed to be a concurrent-safe function able to run sanity checks on a relation page without loading it into the shared buffers. The operation is done using a lock on the partition involved in the shared buffer mapping hashtable and an I/O lock for the buffer itself, preventing the risk of false positives due to any concurrent activity. The primary use of this function is the detection of on-disk corruptions for relation pages. If a page is found in shared buffers, the on-disk page is checked if not dirty (a follow-up checkpoint would flush a valid version of the page if dirty anyway), as it could be possible that a page was present for a long time in shared buffers with its on-disk version corrupted. Such a scenario could lead to a corrupted cluster if a host is unplugged, for example. If the page is not found in shared buffers, its on-disk state is checked. PageIsVerifiedExtended() is used to apply the same sanity checks as when a page gets loaded into shared buffers. This function will be used by an upcoming patch able to check the state of on-disk relation pages using a SQL function. Author: Julien Rouhaud, Michael Paquier Reviewed-by: Masahiko Sawada Discussion: https://postgr.es/m/CAOBaU_aVvMjQn=ge5qPiJOPMmOj5=ii3st5Q0Y+WuLML5sR17w@mail.gmail.com
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
@@ -4585,3 +4585,95 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
 				(errcode(ERRCODE_SNAPSHOT_TOO_OLD),
 				 errmsg("snapshot too old")));
 }
+
+
+/*
+ * CheckBuffer
+ *
+ * Check the on-disk state of block "blkno" of fork "forknum" of "smgr"
+ * without loading it into the shared buffers.  Returns true if the page is
+ * skipped or passes PageIsVerifiedExtended(), false otherwise.
+ *
+ * To avoid torn pages and possible false positives when reading data, a
+ * shared LWLock is taken on the target buffer mapping partition while we
+ * check if the page is in shared buffers or not.  If it is, the buffer's
+ * I/O lock is also taken in shared mode while reading the block from disk.
+ *
+ * If the page is found as dirty (or with an invalid tag) in the shared
+ * buffers, it is ignored: a dirty page will be flushed to disk either before
+ * the end of the next checkpoint or during recovery in the event of an unsafe
+ * shutdown.
+ *
+ * If the page is found in the shared buffers but is not dirty, its on-disk
+ * data is still checked, as the page may have stayed in shared buffers for a
+ * long time while the on-disk data got corrupted.  If the page is not found
+ * in shared buffers, the block is read from disk while holding the buffer
+ * mapping partition LWLock.  The page is read into a suitably-aligned private
+ * buffer local to this function.
+ */
+bool
+CheckBuffer(SMgrRelation smgr, ForkNumber forknum, BlockNumber blkno)
+{
+	PGAlignedBlock buffer;		/* private, aligned copy of the on-disk page */
+	BufferTag	buf_tag;		/* identity of requested block */
+	uint32		buf_hash;		/* hash value for buf_tag */
+	LWLock	   *partLock;		/* buffer partition lock for the buffer */
+	BufferDesc *bufdesc;
+	int			buf_id;
+
+	Assert(smgrexists(smgr, forknum));
+
+	/* create a tag so we can look up the buffer */
+	INIT_BUFFERTAG(buf_tag, smgr->smgr_rnode.node, forknum, blkno);
+
+	/* determine its hash code and partition lock ID */
+	buf_hash = BufTableHashCode(&buf_tag);
+	partLock = BufMappingPartitionLock(buf_hash);
+
+	/* see if the block is in the buffer pool or not */
+	LWLockAcquire(partLock, LW_SHARED);
+	buf_id = BufTableLookup(&buf_tag, buf_hash);
+	if (buf_id >= 0)
+	{
+		uint32		buf_state;
+
+		/*
+		 * Found it.  Now, snapshot its state flags to know what to do with
+		 * it.  No pin is taken, and the buffer header lock is released
+		 * immediately to limit overhead as much as possible.  We keep the
+		 * shared LWLock on the target buffer mapping partition for now, so
+		 * this buffer cannot be evicted, and we acquire an I/O lock on the
+		 * buffer as we may need to read its contents from disk.
+		 */
+		bufdesc = GetBufferDescriptor(buf_id);
+
+		LWLockAcquire(BufferDescriptorGetIOLock(bufdesc), LW_SHARED);
+		buf_state = LockBufHdr(bufdesc);
+		UnlockBufHdr(bufdesc, buf_state);
+
+		/* If the page is dirty or invalid, skip it */
+		if ((buf_state & BM_DIRTY) != 0 || (buf_state & BM_TAG_VALID) == 0)
+		{
+			LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+			LWLockRelease(partLock);
+			return true;
+		}
+
+		/* Read the buffer from disk, with the I/O lock still held */
+		smgrread(smgr, forknum, blkno, buffer.data);
+		LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+	}
+	else
+	{
+		/*
+		 * Simply read the buffer.  There's no risk of modification on it as
+		 * we are holding the buffer pool partition mapping lock.
+		 */
+		smgrread(smgr, forknum, blkno, buffer.data);
+	}
+
+	/* buffer lookup done, so now do its check */
+	LWLockRelease(partLock);
+
+	return PageIsVerifiedExtended(buffer.data, blkno, PIV_REPORT_STAT);
+}
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
@@ -240,6 +240,9 @@ extern void AtProcExit_LocalBuffers(void);
 
 extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
 
+extern bool CheckBuffer(struct SMgrRelationData *smgr, ForkNumber forknum,
+						BlockNumber blkno);
+
 /* in freelist.c */
 extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);