Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 5c15606

Browse files
committed
Fix several recently introduced issues around handling new relation forks.
Most of these stem from d25f519 "tableam: relation creation, VACUUM FULL/CLUSTER, SET TABLESPACE.". 1) To pass data to the relation_set_new_filenode() RelationSetNewRelfilenode() was made to update RelationData.rd_rel directly. That's not OK however, as it makes the relcache entries temporarily inconsistent. Which among other scenarios is a problem if a REINDEX targets an index on pg_class - the CatalogTupleUpdate() in RelationSetNewRelfilenode(). Presumably that was introduced because other places in the code do so - while those aren't "good practice" they don't appear to be actively buggy (e.g. because system tables may not be targeted). I (Andres) should have caught this while reviewing and signficantly evolving the code in that commit, mea culpa. Fix that by instead passing in the new RelFileNode as separate argument to relation_set_new_filenode() and rely on the relcache to update the catalog entry. Also revert that the RelationMapUpdateMap() call was changed to immediate, and undo some other more unnecessary changes. 2) Document that the relation_set_new_filenode cannot rely on the whole relcache entry to be valid. It might be worthwhile to refactor the code to never have to rely on that, but given the way heap_create() is currently coded, that'd be a large change. 3) ATExecSetTableSpace() shouldn't do FlushRelationBuffers() itself. A table AM might not use shared buffers at all. Move to index_copy_data() and heapam_relation_copy_data(). 4) heapam_relation_set_new_filenode() previously sometimes accessed rel->rd_rel->relpersistence rather than the `persistence` argument. Code movement mistake. 5) Previously heapam_relation_set_new_filenode() re-opened the smgr relation to create the init for, if necesary. Instead have RelationCreateStorage() return the SMgrRelation and use it to create the init fork. 6) Add a note about the danger of modifying the relcache directly to ATExecSetTableSpace() - it's currently not a bug because there's a check ERRORing for catalog tables. Regression tests and assertion improvements that together trigger the bug described in 1) will be added in a later commit, as there is a related bug on all branches. Reported-By: Michael Paquier Diagnosed-By: Tom Lane and Andres Freund Author: Andres Freund Reviewed-By: Tom Lane Discussion: https://postgr.es/m/20190418011430.GA19133@paquier.xyz
1 parent 9ee7414 commit 5c15606

File tree

8 files changed

+101
-61
lines changed

8 files changed

+101
-61
lines changed

src/backend/access/heap/heapam_handler.c

+24-11
Original file line numberDiff line numberDiff line change
@@ -566,10 +566,14 @@ heapam_finish_bulk_insert(Relation relation, int options)
566566
*/
567567

568568
static void
569-
heapam_relation_set_new_filenode(Relation rel, char persistence,
569+
heapam_relation_set_new_filenode(Relation rel,
570+
const RelFileNode *newrnode,
571+
char persistence,
570572
TransactionId *freezeXid,
571573
MultiXactId *minmulti)
572574
{
575+
SMgrRelation srel;
576+
573577
/*
574578
* Initialize to the minimum XID that could put tuples in the table. We
575579
* know that no xacts older than RecentXmin are still running, so that
@@ -587,7 +591,7 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
587591
*/
588592
*minmulti = GetOldestMultiXactId();
589593

590-
RelationCreateStorage(rel->rd_node, persistence);
594+
srel = RelationCreateStorage(*newrnode, persistence);
591595

592596
/*
593597
* If required, set up an init fork for an unlogged table so that it can
@@ -598,16 +602,17 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
598602
* while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
599603
* record. Therefore, logging is necessary even if wal_level=minimal.
600604
*/
601-
if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
605+
if (persistence == RELPERSISTENCE_UNLOGGED)
602606
{
603607
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
604608
rel->rd_rel->relkind == RELKIND_MATVIEW ||
605609
rel->rd_rel->relkind == RELKIND_TOASTVALUE);
606-
RelationOpenSmgr(rel);
607-
smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
608-
log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
609-
smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
610+
smgrcreate(srel, INIT_FORKNUM, false);
611+
log_smgrcreate(newrnode, INIT_FORKNUM);
612+
smgrimmedsync(srel, INIT_FORKNUM);
610613
}
614+
615+
smgrclose(srel);
611616
}
612617

613618
static void
@@ -617,21 +622,29 @@ heapam_relation_nontransactional_truncate(Relation rel)
617622
}
618623

619624
static void
620-
heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
625+
heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode)
621626
{
622627
SMgrRelation dstrel;
623628

624-
dstrel = smgropen(newrnode, rel->rd_backend);
629+
dstrel = smgropen(*newrnode, rel->rd_backend);
625630
RelationOpenSmgr(rel);
626631

632+
/*
633+
* Since we copy the file directly without looking at the shared buffers,
634+
* we'd better first flush out any pages of the source relation that are
635+
* in shared buffers. We assume no new changes will be made while we are
636+
* holding exclusive lock on the rel.
637+
*/
638+
FlushRelationBuffers(rel);
639+
627640
/*
628641
* Create and copy all forks of the relation, and schedule unlinking of
629642
* old physical files.
630643
*
631644
* NOTE: any conflict in relfilenode value will be caught in
632645
* RelationCreateStorage().
633646
*/
634-
RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
647+
RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence);
635648

636649
/* copy main fork */
637650
RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
@@ -652,7 +665,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
652665
if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
653666
(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
654667
forkNum == INIT_FORKNUM))
655-
log_smgrcreate(&newrnode, forkNum);
668+
log_smgrcreate(newrnode, forkNum);
656669
RelationCopyStorage(rel->rd_smgr, dstrel, forkNum,
657670
rel->rd_rel->relpersistence);
658671
}

src/backend/catalog/heap.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -435,8 +435,9 @@ heap_create(const char *relname,
435435
case RELKIND_RELATION:
436436
case RELKIND_TOASTVALUE:
437437
case RELKIND_MATVIEW:
438-
table_relation_set_new_filenode(rel, relpersistence,
439-
relfrozenxid, relminmxid);
438+
table_relation_set_new_filenode(rel, &rel->rd_node,
439+
relpersistence,
440+
relfrozenxid, relminmxid);
440441
break;
441442
}
442443
}

src/backend/catalog/storage.c

+9-3
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
7575
* This function is transactional. The creation is WAL-logged, and if the
7676
* transaction aborts later on, the storage will be destroyed.
7777
*/
78-
void
78+
SMgrRelation
7979
RelationCreateStorage(RelFileNode rnode, char relpersistence)
8080
{
8181
PendingRelDelete *pending;
@@ -99,7 +99,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
9999
break;
100100
default:
101101
elog(ERROR, "invalid relpersistence: %c", relpersistence);
102-
return; /* placate compiler */
102+
return NULL; /* placate compiler */
103103
}
104104

105105
srel = smgropen(rnode, backend);
@@ -117,13 +117,15 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
117117
pending->nestLevel = GetCurrentTransactionNestLevel();
118118
pending->next = pendingDeletes;
119119
pendingDeletes = pending;
120+
121+
return srel;
120122
}
121123

122124
/*
123125
* Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
124126
*/
125127
void
126-
log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
128+
log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
127129
{
128130
xl_smgr_create xlrec;
129131

@@ -294,6 +296,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
294296

295297
/*
296298
* Copy a fork's data, block by block.
299+
*
300+
* Note that this requires that there is no dirty data in shared buffers. If
301+
* it's possible that there are, callers need to flush those using
302+
* e.g. FlushRelationBuffers(rel).
297303
*/
298304
void
299305
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,

src/backend/commands/tablecmds.c

+16-10
Original file line numberDiff line numberDiff line change
@@ -12236,14 +12236,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
1223612236
elog(ERROR, "cache lookup failed for relation %u", tableOid);
1223712237
rd_rel = (Form_pg_class) GETSTRUCT(tuple);
1223812238

12239-
/*
12240-
* Since we copy the file directly without looking at the shared buffers,
12241-
* we'd better first flush out any pages of the source relation that are
12242-
* in shared buffers. We assume no new changes will be made while we are
12243-
* holding exclusive lock on the rel.
12244-
*/
12245-
FlushRelationBuffers(rel);
12246-
1224712239
/*
1224812240
* Relfilenodes are not unique in databases across tablespaces, so we need
1224912241
* to allocate a new one in the new tablespace.
@@ -12266,10 +12258,16 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
1226612258
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
1226712259
rel->rd_rel->relkind == RELKIND_MATVIEW ||
1226812260
rel->rd_rel->relkind == RELKIND_TOASTVALUE);
12269-
table_relation_copy_data(rel, newrnode);
12261+
table_relation_copy_data(rel, &newrnode);
1227012262
}
1227112263

12272-
/* update the pg_class row */
12264+
/*
12265+
* Update the pg_class row.
12266+
*
12267+
* NB: This wouldn't work if ATExecSetTableSpace() were allowed to be
12268+
* executed on pg_class or its indexes (the above copy wouldn't contain
12269+
* the updated pg_class entry), but that's forbidden above.
12270+
*/
1227312271
rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace;
1227412272
rd_rel->relfilenode = newrelfilenode;
1227512273
CatalogTupleUpdate(pg_class, &tuple->t_self, tuple);
@@ -12537,6 +12535,14 @@ index_copy_data(Relation rel, RelFileNode newrnode)
1253712535
dstrel = smgropen(newrnode, rel->rd_backend);
1253812536
RelationOpenSmgr(rel);
1253912537

12538+
/*
12539+
* Since we copy the file directly without looking at the shared buffers,
12540+
* we'd better first flush out any pages of the source relation that are
12541+
* in shared buffers. We assume no new changes will be made while we are
12542+
* holding exclusive lock on the rel.
12543+
*/
12544+
FlushRelationBuffers(rel);
12545+
1254012546
/*
1254112547
* Create and copy all forks of the relation, and schedule unlinking of
1254212548
* old physical files.

src/backend/utils/cache/relcache.c

+29-26
Original file line numberDiff line numberDiff line change
@@ -3440,6 +3440,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
34403440
Form_pg_class classform;
34413441
MultiXactId minmulti = InvalidMultiXactId;
34423442
TransactionId freezeXid = InvalidTransactionId;
3443+
RelFileNode newrnode;
34433444

34443445
/* Allocate a new relfilenode */
34453446
newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
@@ -3462,39 +3463,23 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
34623463
*/
34633464
RelationDropStorage(relation);
34643465

3465-
/*
3466-
* Now update the pg_class row. However, if we're dealing with a mapped
3467-
* index, pg_class.relfilenode doesn't change; instead we have to send the
3468-
* update to the relation mapper.
3469-
*/
3470-
if (RelationIsMapped(relation))
3471-
RelationMapUpdateMap(RelationGetRelid(relation),
3472-
newrelfilenode,
3473-
relation->rd_rel->relisshared,
3474-
true);
3475-
else
3476-
{
3477-
relation->rd_rel->relfilenode = newrelfilenode;
3478-
classform->relfilenode = newrelfilenode;
3479-
}
3480-
3481-
RelationInitPhysicalAddr(relation);
3466+
/* initialize new relfilenode from old relfilenode */
3467+
newrnode = relation->rd_node;
34823468

34833469
/*
34843470
* Create storage for the main fork of the new relfilenode. If it's
34853471
* table-like object, call into table AM to do so, which'll also create
34863472
* the table's init fork.
34873473
*
3488-
* NOTE: any conflict in relfilenode value will be caught here, if
3489-
* GetNewRelFileNode messes up for any reason.
3474+
* NOTE: If relevant for the AM, any conflict in relfilenode value will be
3475+
* caught here, if GetNewRelFileNode messes up for any reason.
34903476
*/
3477+
newrnode = relation->rd_node;
3478+
newrnode.relNode = newrelfilenode;
34913479

3492-
/*
3493-
* Create storage for relation.
3494-
*/
34953480
switch (relation->rd_rel->relkind)
34963481
{
3497-
/* shouldn't be called for these */
3482+
/* shouldn't be called for these */
34983483
case RELKIND_VIEW:
34993484
case RELKIND_COMPOSITE_TYPE:
35003485
case RELKIND_FOREIGN_TABLE:
@@ -3505,18 +3490,36 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
35053490

35063491
case RELKIND_INDEX:
35073492
case RELKIND_SEQUENCE:
3508-
RelationCreateStorage(relation->rd_node, persistence);
3509-
RelationOpenSmgr(relation);
3493+
{
3494+
SMgrRelation srel;
3495+
3496+
srel = RelationCreateStorage(newrnode, persistence);
3497+
smgrclose(srel);
3498+
}
35103499
break;
35113500

35123501
case RELKIND_RELATION:
35133502
case RELKIND_TOASTVALUE:
35143503
case RELKIND_MATVIEW:
3515-
table_relation_set_new_filenode(relation, persistence,
3504+
table_relation_set_new_filenode(relation, &newrnode,
3505+
persistence,
35163506
&freezeXid, &minmulti);
35173507
break;
35183508
}
35193509

3510+
/*
3511+
* However, if we're dealing with a mapped index, pg_class.relfilenode
3512+
* doesn't change; instead we have to send the update to the relation
3513+
* mapper.
3514+
*/
3515+
if (RelationIsMapped(relation))
3516+
RelationMapUpdateMap(RelationGetRelid(relation),
3517+
newrelfilenode,
3518+
relation->rd_rel->relisshared,
3519+
false);
3520+
else
3521+
classform->relfilenode = newrelfilenode;
3522+
35203523
/* These changes are safe even for a mapped relation */
35213524
if (relation->rd_rel->relkind != RELKIND_SEQUENCE)
35223525
{

src/include/access/tableam.h

+18-7
Original file line numberDiff line numberDiff line change
@@ -416,14 +416,20 @@ typedef struct TableAmRoutine
416416
* This callback needs to create a new relation filenode for `rel`, with
417417
* appropriate durability behaviour for `persistence`.
418418
*
419-
* On output *freezeXid, *minmulti must be set to the values appropriate
419+
* Note that only the subset of the relcache filled by
420+
* RelationBuildLocalRelation() can be relied upon and that the relation's
421+
* catalog entries either will either not yet exist (new relation), or
422+
* will still reference the old relfilenode.
423+
*
424+
* As output *freezeXid, *minmulti must be set to the values appropriate
420425
* for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
421426
* fields to be filled they can be set to InvalidTransactionId and
422427
* InvalidMultiXactId, respectively.
423428
*
424429
* See also table_relation_set_new_filenode().
425430
*/
426431
void (*relation_set_new_filenode) (Relation rel,
432+
const RelFileNode *newrnode,
427433
char persistence,
428434
TransactionId *freezeXid,
429435
MultiXactId *minmulti);
@@ -444,7 +450,8 @@ typedef struct TableAmRoutine
444450
* This can typically be implemented by directly copying the underlying
445451
* storage, unless it contains references to the tablespace internally.
446452
*/
447-
void (*relation_copy_data) (Relation rel, RelFileNode newrnode);
453+
void (*relation_copy_data) (Relation rel,
454+
const RelFileNode *newrnode);
448455

449456
/* See table_relation_copy_for_cluster() */
450457
void (*relation_copy_for_cluster) (Relation NewHeap,
@@ -1251,21 +1258,25 @@ table_finish_bulk_insert(Relation rel, int options)
12511258
*/
12521259

12531260
/*
1254-
* Create a new relation filenode for `rel`, with persistence set to
1261+
* Create storage for `rel` in `newrode`, with persistence set to
12551262
* `persistence`.
12561263
*
12571264
* This is used both during relation creation and various DDL operations to
1258-
* create a new relfilenode that can be filled from scratch.
1265+
* create a new relfilenode that can be filled from scratch. When creating
1266+
* new storage for an existing relfilenode, this should be called before the
1267+
* relcache entry has been updated.
12591268
*
12601269
* *freezeXid, *minmulti are set to the xid / multixact horizon for the table
12611270
* that pg_class.{relfrozenxid, relminmxid} have to be set to.
12621271
*/
12631272
static inline void
1264-
table_relation_set_new_filenode(Relation rel, char persistence,
1273+
table_relation_set_new_filenode(Relation rel,
1274+
const RelFileNode *newrnode,
1275+
char persistence,
12651276
TransactionId *freezeXid,
12661277
MultiXactId *minmulti)
12671278
{
1268-
rel->rd_tableam->relation_set_new_filenode(rel, persistence,
1279+
rel->rd_tableam->relation_set_new_filenode(rel, newrnode, persistence,
12691280
freezeXid, minmulti);
12701281
}
12711282

@@ -1288,7 +1299,7 @@ table_relation_nontransactional_truncate(Relation rel)
12881299
* changing a relation's tablespace.
12891300
*/
12901301
static inline void
1291-
table_relation_copy_data(Relation rel, RelFileNode newrnode)
1302+
table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
12921303
{
12931304
rel->rd_tableam->relation_copy_data(rel, newrnode);
12941305
}

src/include/catalog/storage.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#include "storage/smgr.h"
2020
#include "utils/relcache.h"
2121

22-
extern void RelationCreateStorage(RelFileNode rnode, char relpersistence);
22+
extern SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence);
2323
extern void RelationDropStorage(Relation rel);
2424
extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
2525
extern void RelationTruncate(Relation rel, BlockNumber nblocks);

src/include/catalog/storage_xlog.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ typedef struct xl_smgr_truncate
5050
int flags;
5151
} xl_smgr_truncate;
5252

53-
extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum);
53+
extern void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum);
5454

5555
extern void smgr_redo(XLogReaderState *record);
5656
extern void smgr_desc(StringInfo buf, XLogReaderState *record);

0 commit comments

Comments
 (0)