Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 6cc4451

Browse files
committed
Prevent re-use of a deleted relation's relfilenode until after the next
checkpoint. This guards against an unlikely data-loss scenario in which we re-use the relfilenode, then crash, then replay the deletion and recreation of the file. Even then we'd be OK if all insertions into the new relation had been WAL-logged ... but that's not guaranteed given all the no-WAL-logging optimizations that have recently been added. Patch by Heikki Linnakangas, per a discussion last month.
1 parent 7a550cb commit 6cc4451

File tree

5 files changed

+274
-25
lines changed

5 files changed

+274
-25
lines changed

src/backend/access/transam/xlog.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.286 2007/10/12 19:39:59 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.287 2007/11/15 20:36:40 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -45,6 +45,7 @@
4545
#include "storage/fd.h"
4646
#include "storage/pmsignal.h"
4747
#include "storage/procarray.h"
48+
#include "storage/smgr.h"
4849
#include "storage/spin.h"
4950
#include "utils/builtins.h"
5051
#include "utils/pg_locale.h"
@@ -5663,6 +5664,14 @@ CreateCheckPoint(int flags)
56635664
UpdateControlFile();
56645665
}
56655666

5667+
/*
5668+
* Let smgr prepare for checkpoint; this has to happen before we
5669+
* determine the REDO pointer. Note that smgr must not do anything
5670+
* that'd have to be undone if we decide no checkpoint is needed.
5671+
*/
5672+
smgrpreckpt();
5673+
5674+
/* Begin filling in the checkpoint WAL record */
56665675
MemSet(&checkPoint, 0, sizeof(checkPoint));
56675676
checkPoint.ThisTimeLineID = ThisTimeLineID;
56685677
checkPoint.time = time(NULL);
@@ -5886,6 +5895,11 @@ CreateCheckPoint(int flags)
58865895
*/
58875896
END_CRIT_SECTION();
58885897

5898+
/*
5899+
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
5900+
*/
5901+
smgrpostckpt();
5902+
58895903
/*
58905904
* Delete old log files (those no longer needed even for previous
58915905
* checkpoint).

src/backend/commands/tablespace.c

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
*
3838
*
3939
* IDENTIFICATION
40-
* $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $
40+
* $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.50 2007/11/15 20:36:40 tgl Exp $
4141
*
4242
*-------------------------------------------------------------------------
4343
*/
@@ -57,6 +57,7 @@
5757
#include "commands/comment.h"
5858
#include "commands/tablespace.h"
5959
#include "miscadmin.h"
60+
#include "postmaster/bgwriter.h"
6061
#include "storage/fd.h"
6162
#include "utils/acl.h"
6263
#include "utils/builtins.h"
@@ -460,13 +461,29 @@ DropTableSpace(DropTableSpaceStmt *stmt)
460461
LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
461462

462463
/*
463-
* Try to remove the physical infrastructure
464+
* Try to remove the physical infrastructure.
464465
*/
465466
if (!remove_tablespace_directories(tablespaceoid, false))
466-
ereport(ERROR,
467-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
468-
errmsg("tablespace \"%s\" is not empty",
469-
tablespacename)));
467+
{
468+
/*
469+
* Not all files deleted? However, there can be lingering empty files
470+
* in the directories, left behind by for example DROP TABLE, that
471+
* have been scheduled for deletion at next checkpoint (see comments
472+
* in mdunlink() for details). We could just delete them immediately,
473+
* but we can't tell them apart from important data files that we
474+
* mustn't delete. So instead, we force a checkpoint which will clean
475+
* out any lingering files, and try again.
476+
*/
477+
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
478+
if (!remove_tablespace_directories(tablespaceoid, false))
479+
{
480+
/* Still not empty, the files must be important then */
481+
ereport(ERROR,
482+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
483+
errmsg("tablespace \"%s\" is not empty",
484+
tablespacename)));
485+
}
486+
}
470487

471488
/* Record the filesystem change in XLOG */
472489
{

src/backend/storage/smgr/md.c

Lines changed: 193 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -34,6 +34,7 @@
3434
/* special values for the segno arg to RememberFsyncRequest */
3535
#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
3636
#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
37+
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
3738

3839
/*
3940
* On Windows, we have to interpret EACCES as possibly meaning the same as
@@ -113,6 +114,10 @@ static MemoryContext MdCxt; /* context for all md.c allocations */
113114
* table remembers the pending operations. We use a hash table mostly as
114115
* a convenient way of eliminating duplicate requests.
115116
*
117+
* We use a similar mechanism to remember no-longer-needed files that can
118+
* be deleted after the next checkpoint, but we use a linked list instead of
119+
* a hash table, because we don't expect there to be any duplicate requests.
120+
*
116121
* (Regular backends do not track pending operations locally, but forward
117122
* them to the bgwriter.)
118123
*/
@@ -131,9 +136,17 @@ typedef struct
131136
CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */
132137
} PendingOperationEntry;
133138

139+
typedef struct
140+
{
141+
RelFileNode rnode; /* the dead relation to delete */
142+
CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */
143+
} PendingUnlinkEntry;
144+
134145
static HTAB *pendingOpsTable = NULL;
146+
static List *pendingUnlinks = NIL;
135147

136148
static CycleCtr mdsync_cycle_ctr = 0;
149+
static CycleCtr mdckpt_cycle_ctr = 0;
137150

138151

139152
typedef enum /* behavior for mdopen & _mdfd_getseg */
@@ -146,6 +159,7 @@ typedef enum /* behavior for mdopen & _mdfd_getseg */
146159
/* local routines */
147160
static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
148161
static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
162+
static void register_unlink(RelFileNode rnode);
149163
static MdfdVec *_fdvec_alloc(void);
150164

151165
#ifndef LET_OS_MANAGE_FILESIZE
@@ -188,6 +202,7 @@ mdinit(void)
188202
100L,
189203
&hash_ctl,
190204
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
205+
pendingUnlinks = NIL;
191206
}
192207
}
193208

@@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo)
254269
* Note that we're passed a RelFileNode --- by the time this is called,
255270
* there won't be an SMgrRelation hashtable entry anymore.
256271
*
272+
* Actually, we don't unlink the first segment file of the relation, but
273+
* just truncate it to zero length, and record a request to unlink it after
274+
* the next checkpoint. Additional segments can be unlinked immediately,
275+
* however. Leaving the empty file in place prevents that relfilenode
276+
* number from being reused. The scenario this protects us from is:
277+
* 1. We delete a relation (and commit, and actually remove its file).
278+
* 2. We create a new relation, which by chance gets the same relfilenode as
279+
* the just-deleted one (OIDs must've wrapped around for that to happen).
280+
* 3. We crash before another checkpoint occurs.
281+
* During replay, we would delete the file and then recreate it, which is fine
282+
* if the contents of the file were repopulated by subsequent WAL entries.
283+
* But if we didn't WAL-log insertions, but instead relied on fsyncing the
284+
* file after populating it (as for instance CLUSTER and CREATE INDEX do),
285+
* the contents of the file would be lost forever. By leaving the empty file
286+
* until after the next checkpoint, we prevent reassignment of the relfilenode
287+
* number until it's safe, because relfilenode assignment skips over any
288+
* existing file.
289+
*
257290
* If isRedo is true, it's okay for the relation to be already gone.
258-
* Also, any failure should be reported as WARNING not ERROR, because
291+
* Also, we should remove the file immediately instead of queuing a request
292+
* for later, since during redo there's no possibility of creating a
293+
* conflicting relation.
294+
*
295+
* Note: any failure should be reported as WARNING not ERROR, because
259296
* we are usually not in a transaction anymore when this is called.
260297
*/
261298
void
262299
mdunlink(RelFileNode rnode, bool isRedo)
263300
{
264301
char *path;
302+
int ret;
265303

266304
/*
267305
* We have to clean out any pending fsync requests for the doomed relation,
@@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo)
271309

272310
path = relpath(rnode);
273311

274-
/* Delete the first segment, or only segment if not doing segmenting */
275-
if (unlink(path) < 0)
312+
/*
313+
* Delete or truncate the first segment, or only segment if not doing
314+
* segmenting
315+
*/
316+
if (isRedo)
317+
ret = unlink(path);
318+
else
319+
ret = truncate(path, 0);
320+
if (ret < 0)
276321
{
277322
if (!isRedo || errno != ENOENT)
278323
ereport(WARNING,
@@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
316361
#endif
317362

318363
pfree(path);
364+
365+
/* Register request to unlink first segment later */
366+
if (!isRedo)
367+
register_unlink(rnode);
319368
}
320369

321370
/*
@@ -1063,6 +1112,91 @@ mdsync(void)
10631112
mdsync_in_progress = false;
10641113
}
10651114

1115+
/*
1116+
* mdpreckpt() -- Do pre-checkpoint work
1117+
*
1118+
* To distinguish unlink requests that arrived before this checkpoint
1119+
* started from those that arrived during the checkpoint, we use a cycle
1120+
* counter similar to the one we use for fsync requests. That cycle
1121+
* counter is incremented here.
1122+
*
1123+
* This must be called *before* the checkpoint REDO point is determined.
1124+
* That ensures that we won't delete files too soon.
1125+
*
1126+
* Note that we can't do anything here that depends on the assumption
1127+
* that the checkpoint will be completed.
1128+
*/
1129+
void
1130+
mdpreckpt(void)
1131+
{
1132+
ListCell *cell;
1133+
1134+
/*
1135+
* In case the prior checkpoint wasn't completed, stamp all entries in
1136+
* the list with the current cycle counter. Anything that's in the
1137+
* list at the start of checkpoint can surely be deleted after the
1138+
* checkpoint is finished, regardless of when the request was made.
1139+
*/
1140+
foreach(cell, pendingUnlinks)
1141+
{
1142+
PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1143+
1144+
entry->cycle_ctr = mdckpt_cycle_ctr;
1145+
}
1146+
1147+
/*
1148+
* Any unlink requests arriving after this point will be assigned the
1149+
* next cycle counter, and won't be unlinked until next checkpoint.
1150+
*/
1151+
mdckpt_cycle_ctr++;
1152+
}
1153+
1154+
/*
1155+
* mdpostckpt() -- Do post-checkpoint work
1156+
*
1157+
* Remove any lingering files that can now be safely removed.
1158+
*/
1159+
void
1160+
mdpostckpt(void)
1161+
{
1162+
while (pendingUnlinks != NIL)
1163+
{
1164+
PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1165+
char *path;
1166+
1167+
/*
1168+
* New entries are appended to the end, so if the entry is new
1169+
* we've reached the end of old entries.
1170+
*/
1171+
if (entry->cycle_ctr == mdsync_cycle_ctr)
1172+
break;
1173+
1174+
/* Else assert we haven't missed it */
1175+
Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1176+
1177+
/* Unlink the file */
1178+
path = relpath(entry->rnode);
1179+
if (unlink(path) < 0)
1180+
{
1181+
/*
1182+
* ENOENT shouldn't happen either, but it doesn't really matter
1183+
* because we would've deleted it now anyway.
1184+
*/
1185+
if (errno != ENOENT)
1186+
ereport(WARNING,
1187+
(errcode_for_file_access(),
1188+
errmsg("could not remove relation %u/%u/%u: %m",
1189+
entry->rnode.spcNode,
1190+
entry->rnode.dbNode,
1191+
entry->rnode.relNode)));
1192+
}
1193+
pfree(path);
1194+
1195+
pendingUnlinks = list_delete_first(pendingUnlinks);
1196+
pfree(entry);
1197+
}
1198+
}
1199+
10661200
/*
10671201
* register_dirty_segment() -- Mark a relation segment as needing fsync
10681202
*
@@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
10961230
}
10971231
}
10981232

1233+
/*
1234+
* register_unlink() -- Schedule a file to be deleted after next checkpoint
1235+
*
1236+
* As with register_dirty_segment, this could involve either a local or
1237+
* a remote pending-ops table.
1238+
*/
1239+
static void
1240+
register_unlink(RelFileNode rnode)
1241+
{
1242+
if (pendingOpsTable)
1243+
{
1244+
/* push it into local pending-ops table */
1245+
RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
1246+
}
1247+
else
1248+
{
1249+
/*
1250+
* Notify the bgwriter about it. If we fail to queue the request
1251+
* message, we have to sleep and try again, because we can't simply
1252+
* delete the file now. Ugly, but hopefully won't happen often.
1253+
*
1254+
* XXX should we just leave the file orphaned instead?
1255+
*/
1256+
Assert(IsUnderPostmaster);
1257+
while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
1258+
pg_usleep(10000L); /* 10 msec seems a good number */
1259+
}
1260+
}
1261+
10991262
/*
11001263
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
11011264
*
1102-
* We stuff the fsync request into the local hash table for execution
1103-
* during the bgwriter's next checkpoint.
1265+
* We stuff most fsync requests into the local hash table for execution
1266+
* during the bgwriter's next checkpoint. UNLINK requests go into a
1267+
* separate linked list, however, because they get processed separately.
11041268
*
11051269
* The range of possible segment numbers is way less than the range of
11061270
* BlockNumber, so we can reserve high values of segno for special purposes.
1107-
* We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
1108-
* a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
1109-
* a whole database. (These are a tad slow because the hash table has to be
1110-
* searched linearly, but it doesn't seem worth rethinking the table structure
1111-
* for them.)
1271+
* We define three:
1272+
* - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1273+
* - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1274+
* - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1275+
* checkpoint.
1276+
*
1277+
* (Handling the FORGET_* requests is a tad slow because the hash table has
1278+
* to be searched linearly, but it doesn't seem worth rethinking the table
1279+
* structure for them.)
11121280
*/
11131281
void
11141282
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
@@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
11471315
}
11481316
}
11491317
}
1318+
else if (segno == UNLINK_RELATION_REQUEST)
1319+
{
1320+
/* Unlink request: put it in the linked list */
1321+
MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1322+
PendingUnlinkEntry *entry;
1323+
1324+
entry = palloc(sizeof(PendingUnlinkEntry));
1325+
entry->rnode = rnode;
1326+
entry->cycle_ctr = mdckpt_cycle_ctr;
1327+
1328+
pendingUnlinks = lappend(pendingUnlinks, entry);
1329+
1330+
MemoryContextSwitchTo(oldcxt);
1331+
}
11501332
else
11511333
{
11521334
/* Normal case: enter a request to fsync this segment */

0 commit comments

Comments
 (0)