Fix failure to remove non-first segments of temporary tables.

tglsfdc · tglsfdc · commit 0e758ae89a20 · 2022-11-07T11:36:45.000-05:00
Commit 4ab5dae broke mdunlinkfork's logic for removing additional segments of a multi-gigabyte table, because it neglected to advance "segno" after unlinking the first segment, in the code path where it chooses to unlink that one immediately. Then the main remove loop gets ENOENT at segment zero and figures it's done, so we never remove whatever additional segments might exist.

The main problem here is with large temporary tables, but WAL replay of a drop of a large regular table would also fail to remove extra segments. The third case where this path is taken is for non-main forks; but I doubt it matters for those since they probably never exceed 1GB.

The simplest fix is just to increment segno after that unlink(). (Probably this logic could do with a more thorough rethink, but not with mere hours to go before 15.1 wraps.)

While here, also fix an incautious assumption that register_forget_request cannot change errno. I don't think that that has any really bad consequences, as we'd end up trying to unlink the zeroth segment either way, but it greatly complicates reasoning about what could happen here. Also make a couple of other cosmetic fixes.

Per bug #17679 from Balazs Szilfai. Back-patch into v15, as the faulty patch was.

Discussion: https://postgr.es/m/17679-1095d04450cf6a6e@postgresql.org
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
@@ -330,11 +330,15 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 	{
 		if (!RelFileLocatorBackendIsTemp(rlocator))
 		{
+			int			save_errno;
+
 			/* Prevent other backends' fds from holding on to the disk space */
 			ret = do_truncate(path);
 
 			/* Forget any pending sync requests for the first segment */
+			save_errno = errno;
 			register_forget_request(rlocator, forknum, 0 /* first seg */ );
+			errno = save_errno;
 		}
 		else
 			ret = 0;
@@ -347,6 +351,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 				ereport(WARNING,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m", path)));
+			segno++;
 		}
 	}
 	else
@@ -359,21 +364,22 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 		 * segment later, rather than now.
 		 *
 		 * If we're performing a binary upgrade, the dangers described in the
-		 * header comments for mdunlink() do not exist, since after a crash
-		 * or even a simple ERROR, the upgrade fails and the whole new cluster
+		 * header comments for mdunlink() do not exist, since after a crash or
+		 * even a simple ERROR, the upgrade fails and the whole new cluster
 		 * must be recreated from scratch. And, on the other hand, it is
-		 * important to remove the files from disk immediately, because we
-		 * may be about to reuse the same relfilenumber.
+		 * important to remove the files from disk immediately, because we may
+		 * be about to reuse the same relfilenumber.
 		 */
 		if (!IsBinaryUpgrade)
 		{
 			register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
-			++segno;
+			segno++;
 		}
 	}
 
 	/*
-	 * Delete any additional segments.
+	 * Delete any remaining segments (we might or might not have dealt with
+	 * the first one above).
 	 */
 	if (ret >= 0)
 	{

Original file line number	Diff line number	Diff line change
`@@ -330,11 +330,15 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)`
`330`	`330`	`{`
`331`	`331`	`if (!RelFileLocatorBackendIsTemp(rlocator))`
`332`	`332`	`{`
	`333`	`+ int save_errno;`
	`334`	`+`
`333`	`335`	`/* Prevent other backends' fds from holding on to the disk space */`
`334`	`336`	`ret = do_truncate(path);`
`335`	`337`
`336`	`338`	`/* Forget any pending sync requests for the first segment */`
	`339`	`+ save_errno = errno;`
`337`	`340`	`register_forget_request(rlocator, forknum, 0 /* first seg */ );`
	`341`	`+ errno = save_errno;`
`338`	`342`	`}`
`339`	`343`	`else`
`340`	`344`	`ret = 0;`
`@@ -347,6 +351,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)`
`347`	`351`	`ereport(WARNING,`
`348`	`352`	`(errcode_for_file_access(),`
`349`	`353`	`errmsg("could not remove file \"%s\": %m", path)));`
	`354`	`+ segno++;`
`350`	`355`	`}`
`351`	`356`	`}`
`352`	`357`	`else`
`@@ -359,21 +364,22 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)`
`359`	`364`	`* segment later, rather than now.`
`360`	`365`	`*`
`361`	`366`	`* If we're performing a binary upgrade, the dangers described in the`
`362`		`- * header comments for mdunlink() do not exist, since after a crash`
`363`		`- * or even a simple ERROR, the upgrade fails and the whole new cluster`
	`367`	`+ * header comments for mdunlink() do not exist, since after a crash or`
	`368`	`+ * even a simple ERROR, the upgrade fails and the whole new cluster`
`364`	`369`	`* must be recreated from scratch. And, on the other hand, it is`
`365`		`- * important to remove the files from disk immediately, because we`
`366`		`- * may be about to reuse the same relfilenumber.`
	`370`	`+ * important to remove the files from disk immediately, because we may`
	`371`	`+ * be about to reuse the same relfilenumber.`
`367`	`372`	`*/`
`368`	`373`	`if (!IsBinaryUpgrade)`
`369`	`374`	`{`
`370`	`375`	`register_unlink_segment(rlocator, forknum, 0 /* first seg */ );`
`371`		`- ++segno;`
	`376`	`+ segno++;`
`372`	`377`	`}`
`373`	`378`	`}`
`374`	`379`
`375`	`380`	`/*`
`376`		`- * Delete any additional segments.`
	`381`	`+ * Delete any remaining segments (we might or might not have dealt with`
	`382`	`+ * the first one above).`
`377`	`383`	`*/`
`378`	`384`	`if (ret >= 0)`
`379`	`385`	`{`