8
8
*
9
9
*
10
10
* IDENTIFICATION
11
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $
11
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
12
12
*
13
13
*-------------------------------------------------------------------------
14
14
*/
34
34
/* special values for the segno arg to RememberFsyncRequest */
35
35
#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
36
36
#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
37
+ #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
37
38
38
39
/*
39
40
* On Windows, we have to interpret EACCES as possibly meaning the same as
@@ -113,6 +114,10 @@ static MemoryContext MdCxt; /* context for all md.c allocations */
113
114
* table remembers the pending operations. We use a hash table mostly as
114
115
* a convenient way of eliminating duplicate requests.
115
116
*
117
+ * We use a similar mechanism to remember no-longer-needed files that can
118
+ * be deleted after the next checkpoint, but we use a linked list instead of
119
+ * a hash table, because we don't expect there to be any duplicate requests.
120
+ *
116
121
* (Regular backends do not track pending operations locally, but forward
117
122
* them to the bgwriter.)
118
123
*/
@@ -131,9 +136,17 @@ typedef struct
131
136
CycleCtr cycle_ctr ; /* mdsync_cycle_ctr when request was made */
132
137
} PendingOperationEntry ;
133
138
139
+ typedef struct
140
+ {
141
+ RelFileNode rnode ; /* the dead relation whose file is to be unlinked later */
142
+ CycleCtr cycle_ctr ; /* mdckpt_cycle_ctr when request was made; re-stamped by mdpreckpt() */
143
+ } PendingUnlinkEntry ;
144
+
134
145
static HTAB * pendingOpsTable = NULL ;
146
+ static List * pendingUnlinks = NIL ;
135
147
136
148
static CycleCtr mdsync_cycle_ctr = 0 ;
149
+ static CycleCtr mdckpt_cycle_ctr = 0 ;
137
150
138
151
139
152
typedef enum /* behavior for mdopen & _mdfd_getseg */
@@ -146,6 +159,7 @@ typedef enum /* behavior for mdopen & _mdfd_getseg */
146
159
/* local routines */
147
160
static MdfdVec * mdopen (SMgrRelation reln , ExtensionBehavior behavior );
148
161
static void register_dirty_segment (SMgrRelation reln , MdfdVec * seg );
162
+ static void register_unlink (RelFileNode rnode );
149
163
static MdfdVec * _fdvec_alloc (void );
150
164
151
165
#ifndef LET_OS_MANAGE_FILESIZE
@@ -188,6 +202,7 @@ mdinit(void)
188
202
100L ,
189
203
& hash_ctl ,
190
204
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT );
205
+ pendingUnlinks = NIL ;
191
206
}
192
207
}
193
208
@@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo)
254
269
* Note that we're passed a RelFileNode --- by the time this is called,
255
270
* there won't be an SMgrRelation hashtable entry anymore.
256
271
*
272
+ * Actually, we don't unlink the first segment file of the relation, but
273
+ * just truncate it to zero length, and record a request to unlink it after
274
+ * the next checkpoint. Additional segments can be unlinked immediately,
275
+ * however. Leaving the empty file in place prevents that relfilenode
276
+ * number from being reused. The scenario this protects us from is:
277
+ * 1. We delete a relation (and commit, and actually remove its file).
278
+ * 2. We create a new relation, which by chance gets the same relfilenode as
279
+ * the just-deleted one (OIDs must've wrapped around for that to happen).
280
+ * 3. We crash before another checkpoint occurs.
281
+ * During replay, we would delete the file and then recreate it, which is fine
282
+ * if the contents of the file were repopulated by subsequent WAL entries.
283
+ * But if we didn't WAL-log insertions, but instead relied on fsyncing the
284
+ * file after populating it (as for instance CLUSTER and CREATE INDEX do),
285
+ * the contents of the file would be lost forever. By leaving the empty file
286
+ * until after the next checkpoint, we prevent reassignment of the relfilenode
287
+ * number until it's safe, because relfilenode assignment skips over any
288
+ * existing file.
289
+ *
257
290
* If isRedo is true, it's okay for the relation to be already gone.
258
- * Also, any failure should be reported as WARNING not ERROR, because
291
+ * Also, we should remove the file immediately instead of queuing a request
292
+ * for later, since during redo there's no possibility of creating a
293
+ * conflicting relation.
294
+ *
295
+ * Note: any failure should be reported as WARNING not ERROR, because
259
296
* we are usually not in a transaction anymore when this is called.
260
297
*/
261
298
void
262
299
mdunlink (RelFileNode rnode , bool isRedo )
263
300
{
264
301
char * path ;
302
+ int ret ;
265
303
266
304
/*
267
305
* We have to clean out any pending fsync requests for the doomed relation,
@@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo)
271
309
272
310
path = relpath (rnode );
273
311
274
- /* Delete the first segment, or only segment if not doing segmenting */
275
- if (unlink (path ) < 0 )
312
+ /*
313
+ * Delete or truncate the first segment, or only segment if not doing
314
+ * segmenting
315
+ */
316
+ if (isRedo )
317
+ ret = unlink (path );
318
+ else
319
+ ret = truncate (path , 0 );
320
+ if (ret < 0 )
276
321
{
277
322
if (!isRedo || errno != ENOENT )
278
323
ereport (WARNING ,
@@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
316
361
#endif
317
362
318
363
pfree (path );
364
+
365
+ /* Register request to unlink first segment later */
366
+ if (!isRedo )
367
+ register_unlink (rnode );
319
368
}
320
369
321
370
/*
@@ -1063,6 +1112,91 @@ mdsync(void)
1063
1112
mdsync_in_progress = false;
1064
1113
}
1065
1114
1115
+ /*
1116
+ * mdpreckpt() -- Do pre-checkpoint work
1117
+ *
1118
+ * To distinguish unlink requests that arrived before this checkpoint
1119
+ * started from those that arrived during the checkpoint, we use a cycle
1120
+ * counter similar to the one we use for fsync requests. That cycle
1121
+ * counter is incremented here.
1122
+ *
1123
+ * This must be called *before* the checkpoint REDO point is determined.
1124
+ * That ensures that we won't delete files too soon.
1125
+ *
1126
+ * Note that we can't do anything here that depends on the assumption
1127
+ * that the checkpoint will be completed.
1128
+ */
1129
+ void
1130
+ mdpreckpt (void )
1131
+ {
1132
+ ListCell * cell ;
1133
+
1134
+ /*
1135
+ * In case the prior checkpoint wasn't completed, stamp all entries in
1136
+ * the list with the current cycle counter. Anything that's in the
1137
+ * list at the start of checkpoint can surely be deleted after the
1138
+ * checkpoint is finished, regardless of when the request was made.
1139
+ */
1140
+ foreach (cell , pendingUnlinks )
1141
+ {
1142
+ PendingUnlinkEntry * entry = (PendingUnlinkEntry * ) lfirst (cell );
1143
+
1144
+ entry -> cycle_ctr = mdckpt_cycle_ctr ;
1145
+ }
1146
+
1147
+ /*
1148
+ * Any unlink requests arriving after this point will be assigned the
1149
+ * next cycle counter, and won't be unlinked until next checkpoint.
1150
+ */
1151
+ mdckpt_cycle_ctr ++ ;
1152
+ }
1153
+
1154
+ /*
1155
+ * mdpostckpt() -- Do post-checkpoint work
1156
+ *
1157
+ * Remove any lingering files that can now be safely removed.
1158
+ */
1159
+ void
1160
+ mdpostckpt (void )
1161
+ {
1162
+ while (pendingUnlinks != NIL )
1163
+ {
1164
+ PendingUnlinkEntry * entry = (PendingUnlinkEntry * ) linitial (pendingUnlinks );
1165
+ char * path ;
1166
+
1167
+ /*
1168
+ * New entries are appended to the end, so if the entry is new
1169
+ * we've reached the end of old entries.
1170
+ */
1171
+ if (entry -> cycle_ctr == mdsync_cycle_ctr )
1172
+ break ;
1173
+
1174
+ /* Else assert we haven't missed it */
1175
+ Assert ((CycleCtr ) (entry -> cycle_ctr + 1 ) == mdckpt_cycle_ctr );
1176
+
1177
+ /* Unlink the file */
1178
+ path = relpath (entry -> rnode );
1179
+ if (unlink (path ) < 0 )
1180
+ {
1181
+ /*
1182
+ * ENOENT shouldn't happen either, but it doesn't really matter
1183
+ * because we would've deleted it now anyway.
1184
+ */
1185
+ if (errno != ENOENT )
1186
+ ereport (WARNING ,
1187
+ (errcode_for_file_access (),
1188
+ errmsg ("could not remove relation %u/%u/%u: %m" ,
1189
+ entry -> rnode .spcNode ,
1190
+ entry -> rnode .dbNode ,
1191
+ entry -> rnode .relNode )));
1192
+ }
1193
+ pfree (path );
1194
+
1195
+ pendingUnlinks = list_delete_first (pendingUnlinks );
1196
+ pfree (entry );
1197
+ }
1198
+ }
1199
+
1066
1200
/*
1067
1201
* register_dirty_segment() -- Mark a relation segment as needing fsync
1068
1202
*
@@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1096
1230
}
1097
1231
}
1098
1232
1233
+ /*
1234
+ * register_unlink() -- Schedule a file to be deleted after next checkpoint
1235
+ *
1236
+ * As with register_dirty_segment, this could involve either a local or
1237
+ * a remote pending-ops table.
1238
+ */
1239
+ static void
1240
+ register_unlink (RelFileNode rnode )
1241
+ {
1242
+ if (pendingOpsTable )
1243
+ {
1244
+ /* push it into local pending-ops table */
1245
+ RememberFsyncRequest (rnode , UNLINK_RELATION_REQUEST );
1246
+ }
1247
+ else
1248
+ {
1249
+ /*
1250
+ * Notify the bgwriter about it. If we fail to queue the request
1251
+ * message, we have to sleep and try again, because we can't simply
1252
+ * delete the file now. Ugly, but hopefully won't happen often.
1253
+ *
1254
+ * XXX should we just leave the file orphaned instead?
1255
+ */
1256
+ Assert (IsUnderPostmaster );
1257
+ while (!ForwardFsyncRequest (rnode , UNLINK_RELATION_REQUEST ))
1258
+ pg_usleep (10000L ); /* 10 msec seems a good number */
1259
+ }
1260
+ }
1261
+
1099
1262
/*
1100
1263
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
1101
1264
*
1102
- * We stuff the fsync request into the local hash table for execution
1103
- * during the bgwriter's next checkpoint.
1265
+ * We stuff most fsync requests into the local hash table for execution
1266
+ * during the bgwriter's next checkpoint. UNLINK requests go into a
1267
+ * separate linked list, however, because they get processed separately.
1104
1268
*
1105
1269
* The range of possible segment numbers is way less than the range of
1106
1270
* BlockNumber, so we can reserve high values of segno for special purposes.
1107
- * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
1108
- * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
1109
- * a whole database. (These are a tad slow because the hash table has to be
1110
- * searched linearly, but it doesn't seem worth rethinking the table structure
1111
- * for them.)
1271
+ * We define three:
1272
+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1273
+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1274
+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1275
+ * checkpoint.
1276
+ *
1277
+ * (Handling the FORGET_* requests is a tad slow because the hash table has
1278
+ * to be searched linearly, but it doesn't seem worth rethinking the table
1279
+ * structure for them.)
1112
1280
*/
1113
1281
void
1114
1282
RememberFsyncRequest (RelFileNode rnode , BlockNumber segno )
@@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1147
1315
}
1148
1316
}
1149
1317
}
1318
+ else if (segno == UNLINK_RELATION_REQUEST )
1319
+ {
1320
+ /* Unlink request: put it in the linked list */
1321
+ MemoryContext oldcxt = MemoryContextSwitchTo (MdCxt );
1322
+ PendingUnlinkEntry * entry ;
1323
+
1324
+ entry = palloc (sizeof (PendingUnlinkEntry ));
1325
+ entry -> rnode = rnode ;
1326
+ entry -> cycle_ctr = mdckpt_cycle_ctr ;
1327
+
1328
+ pendingUnlinks = lappend (pendingUnlinks , entry );
1329
+
1330
+ MemoryContextSwitchTo (oldcxt );
1331
+ }
1150
1332
else
1151
1333
{
1152
1334
/* Normal case: enter a request to fsync this segment */
0 commit comments