Dept. of further reflection: I looked around to see if any other callers …

tglsfdc · tglsfdc · commit 1a3de15a3a4c · 2004-08-15T23:44:46.000Z
… of XLogInsert had the same sort of checkpoint interlock problem as
RecordTransactionCommit, and indeed I found some.  Btree index build
and ALTER TABLE SET TABLESPACE write data outside the friendly confines
of the buffer manager, and therefore they have to take their own
responsibility for checkpoint interlock.  The easiest solution seems to
be to force smgrimmedsync at the end of the index build or table copy,
even when the operation is being WAL-logged.  This is sufficient since
the new index or table will be of interest to no one if we don't get
as far as committing the current transaction.
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
@@ -56,7 +56,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.85 2004/07/21 22:31:20 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.86 2004/08/15 23:44:46 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -322,16 +322,15 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
 		smgrwrite(wstate->index->rd_smgr, wstate->btws_pages_written++,
 				  (char *) wstate->btws_zeropage,
-				  !wstate->btws_use_wal);
+				  true);
 	}
 
 	/*
-	 * Now write the page.  If not using WAL, say isTemp = true, to suppress
-	 * duplicate fsync.  If we are using WAL, it surely isn't a temp index,
-	 * so !use_wal is a sufficient condition.
+	 * Now write the page.  We say isTemp = true even if it's not a
+	 * temp index, because there's no need for smgr to schedule an fsync
+	 * for this write; we'll do it ourselves before ending the build.
 	 */
-	smgrwrite(wstate->index->rd_smgr, blkno, (char *) page,
-			  !wstate->btws_use_wal);
+	smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
 
 	if (blkno == wstate->btws_pages_written)
 		wstate->btws_pages_written++;
@@ -802,9 +801,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	_bt_uppershutdown(wstate, state);
 
 	/*
-	 * If we weren't using WAL, and the index isn't temp, we must fsync it
-	 * down to disk before it's safe to commit the transaction.
+	 * If the index isn't temp, we must fsync it down to disk before it's
+	 * safe to commit the transaction.  (For a temp index we don't care
+	 * since the index will be uninteresting after a crash anyway.)
+	 *
+	 * It's obvious that we must do this when not WAL-logging the build.
+	 * It's less obvious that we have to do it even if we did WAL-log the
+	 * index pages.  The reason is that since we're building outside
+	 * shared buffers, a CHECKPOINT occurring during the build has no way
+	 * to flush the previously written data to disk (indeed it won't know
+	 * the index even exists).  A crash later on would replay WAL from the
+	 * checkpoint, therefore it wouldn't replay our earlier WAL entries.
+	 * If we do not fsync those pages here, they might still not be on disk
+	 * when the crash occurs.
 	 */
-	if (!wstate->btws_use_wal && !wstate->index->rd_istemp)
+	if (!wstate->index->rd_istemp)
 		smgrimmedsync(wstate->index->rd_smgr);
 }
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.125 2004/08/13 04:50:28 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.126 2004/08/15 23:44:46 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -5479,18 +5479,29 @@ copy_relation_data(Relation rel, SMgrRelation dst)
 		}
 
 		/*
-		 * Now write the page.  If not using WAL, say isTemp = true, to
-		 * suppress duplicate fsync.  If we are using WAL, it surely isn't a
-		 * temp rel, so !use_wal is a sufficient condition.
+		 * Now write the page.  We say isTemp = true even if it's not a
+		 * temp rel, because there's no need for smgr to schedule an fsync
+		 * for this write; we'll do it ourselves below.
 		 */
-		smgrwrite(dst, blkno, buf, !use_wal);
+		smgrwrite(dst, blkno, buf, true);
 	}
 
 	/*
-	 * If we weren't using WAL, and the rel isn't temp, we must fsync it
-	 * down to disk before it's safe to commit the transaction.
+	 * If the rel isn't temp, we must fsync it down to disk before it's
+	 * safe to commit the transaction.  (For a temp rel we don't care
+	 * since the rel will be uninteresting after a crash anyway.)
+	 *
+	 * It's obvious that we must do this when not WAL-logging the copy.
+	 * It's less obvious that we have to do it even if we did WAL-log the
+	 * copied pages.  The reason is that since we're copying outside
+	 * shared buffers, a CHECKPOINT occurring during the copy has no way
+	 * to flush the previously written data to disk (indeed it won't know
+	 * the new rel even exists).  A crash later on would replay WAL from the
+	 * checkpoint, therefore it wouldn't replay our earlier WAL entries.
+	 * If we do not fsync those pages here, they might still not be on disk
+	 * when the crash occurs.
 	 */
-	if (!use_wal && !rel->rd_istemp)
+	if (!rel->rd_istemp)
 		smgrimmedsync(dst);
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.77 2004/07/17 03:28:55 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.78 2004/08/15 23:44:46 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -621,14 +621,15 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
  *
  *		Synchronously force all of the specified relation down to disk.
  *
- *		This is really only useful for non-WAL-logged index building:
- *		instead of incrementally WAL-logging the index build steps,
- *		we can just write completed index pages to disk with smgrwrite
+ *		This is useful for building completely new relations (eg, new
+ *		indexes).  Instead of incrementally WAL-logging the index build
+ *		steps, we can just write completed index pages to disk with smgrwrite
  *		or smgrextend, and then fsync the completed index file before
  *		committing the transaction.  (This is sufficient for purposes of
  *		crash recovery, since it effectively duplicates forcing a checkpoint
- *		for the completed index.  But it is *not* workable if one wishes
- *		to use the WAL log for PITR or replication purposes.)
+ *		for the completed index.  But it is *not* sufficient if one wishes
+ *		to use the WAL log for PITR or replication purposes: in that case
+ *		we have to make WAL entries as well.)
  *
  *		The preceding writes should specify isTemp = true to avoid
  *		duplicative fsyncs.

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@`
`11`	`11`	`*`
`12`	`12`	`*`
`13`	`13`	`* IDENTIFICATION`
`14`		`- * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.77 2004/07/17 03:28:55 tgl Exp $`
	`14`	`+ * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.78 2004/08/15 23:44:46 tgl Exp $`
`15`	`15`	`*`
`16`	`16`	`*-------------------------------------------------------------------------`
`17`	`17`	`*/`
`@@ -621,14 +621,15 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)`
`621`	`621`	`*`
`622`	`622`	`* Synchronously force all of the specified relation down to disk.`
`623`	`623`	`*`
`624`		`- * This is really only useful for non-WAL-logged index building:`
`625`		`- * instead of incrementally WAL-logging the index build steps,`
`626`		`- * we can just write completed index pages to disk with smgrwrite`
	`624`	`+ * This is useful for building completely new relations (eg, new`
	`625`	`+ * indexes). Instead of incrementally WAL-logging the index build`
	`626`	`+ * steps, we can just write completed index pages to disk with smgrwrite`
`627`	`627`	`* or smgrextend, and then fsync the completed index file before`
`628`	`628`	`* committing the transaction. (This is sufficient for purposes of`
`629`	`629`	`* crash recovery, since it effectively duplicates forcing a checkpoint`
`630`		`- * for the completed index. But it is *not* workable if one wishes`
`631`		`- * to use the WAL log for PITR or replication purposes.)`
	`630`	`+ * for the completed index. But it is *not* sufficient if one wishes`
	`631`	`+ * to use the WAL log for PITR or replication purposes: in that case`
	`632`	`+ * we have to make WAL entries as well.)`
`632`	`633`	`*`
`633`	`634`	`* The preceding writes should specify isTemp = true to avoid`
`634`	`635`	`* duplicative fsyncs.`