Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 4ab5dae

Browse files
committed
Use TRUNCATE to preserve relfilenode for pg_largeobject + index.
Commit 9a974cb arranged to preserve the relfilenode of user tables across pg_upgrade, but failed to notice that pg_upgrade treats pg_largeobject as a user table and thus it needs the same treatment. Otherwise, large objects will appear to vanish after a pg_upgrade.

Commit d498e05 fixed this problem by teaching pg_dump to UPDATE pg_class.relfilenode for pg_largeobject and its index. However, because an UPDATE on the catalog rows doesn't change anything on disk, this can leave stray files behind in the new cluster. They will normally be empty, but it's a little bit untidy.

Hence, this commit arranges to do the same thing using DDL. Specifically, it makes TRUNCATE work for the pg_largeobject catalog when in binary-upgrade mode, and it then uses that command in binary-upgrade dumps as a way of setting pg_class.relfilenode for pg_largeobject and its index. That way, the old files are removed from the new cluster.

Discussion: http://postgr.es/m/CA+TgmoYYMXGUJO5GZk1-MByJGu_bB8CbOL6GJQC8=Bzt6x6vDg@mail.gmail.com
1 parent 02e5c27 commit 4ab5dae

File tree

4 files changed

+120
-15
lines changed

4 files changed

+120
-15
lines changed

src/backend/commands/tablecmds.c

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@
4040
#include "catalog/pg_depend.h"
4141
#include "catalog/pg_foreign_table.h"
4242
#include "catalog/pg_inherits.h"
43+
#include "catalog/pg_largeobject.h"
4344
#include "catalog/pg_namespace.h"
4445
#include "catalog/pg_opclass.h"
4546
#include "catalog/pg_statistic_ext.h"
@@ -2181,7 +2182,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple)
21812182
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
21822183
errmsg("\"%s\" is not a table", relname)));
21832184

2184-
if (!allowSystemTableMods && IsSystemClass(relid, reltuple))
2185+
/*
2186+
* Most system catalogs can't be truncated at all, or at least not unless
2187+
* allow_system_table_mods=on. As an exception, however, we allow
2188+
* pg_largeobject to be truncated as part of pg_upgrade, because we need
2189+
* to change its relfilenode to match the old cluster, and allowing a
2190+
* TRUNCATE command to be executed is the easiest way of doing that.
2191+
*/
2192+
if (!allowSystemTableMods && IsSystemClass(relid, reltuple)
2193+
&& (!IsBinaryUpgrade || relid != LargeObjectRelationId))
21852194
ereport(ERROR,
21862195
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
21872196
errmsg("permission denied: \"%s\" is a system catalog",

src/backend/storage/smgr/md.c

Lines changed: 22 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -319,6 +319,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
319319
{
320320
char *path;
321321
int ret;
322+
BlockNumber segno = 0;
322323

323324
path = relpath(rnode, forkNum);
324325

@@ -353,8 +354,22 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
353354
/* Prevent other backends' fds from holding on to the disk space */
354355
ret = do_truncate(path);
355356

356-
/* Register request to unlink first segment later */
357-
register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
357+
/*
358+
* Except during a binary upgrade, register request to unlink first
359+
* segment later, rather than now.
360+
*
361+
* If we're performing a binary upgrade, the dangers described in the
362+
* header comments for mdunlink() do not exist, since after a crash
363+
* or even a simple ERROR, the upgrade fails and the whole new cluster
364+
* must be recreated from scratch. And, on the other hand, it is
365+
* important to remove the files from disk immediately, because we
366+
* may be about to reuse the same relfilenode.
367+
*/
368+
if (!IsBinaryUpgrade)
369+
{
370+
register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
371+
++segno;
372+
}
358373
}
359374

360375
/*
@@ -363,15 +378,17 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
363378
if (ret >= 0)
364379
{
365380
char *segpath = (char *) palloc(strlen(path) + 12);
366-
BlockNumber segno;
367381

368382
/*
369383
* Note that because we loop until getting ENOENT, we will correctly
370384
* remove all inactive segments as well as active ones.
371385
*/
372-
for (segno = 1;; segno++)
386+
for (;; segno++)
373387
{
374-
sprintf(segpath, "%s.%u", path, segno);
388+
if (segno == 0)
389+
strcpy(segpath, path);
390+
else
391+
sprintf(segpath, "%s.%u", path, segno);
375392

376393
if (!RelFileNodeBackendIsTemp(rnode))
377394
{

src/backend/utils/cache/relcache.c

Lines changed: 61 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
#include "access/tupdesc_details.h"
4242
#include "access/xact.h"
4343
#include "access/xlog.h"
44+
#include "catalog/binary_upgrade.h"
4445
#include "catalog/catalog.h"
4546
#include "catalog/indexing.h"
4647
#include "catalog/namespace.h"
@@ -3707,9 +3708,36 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
37073708
TransactionId freezeXid = InvalidTransactionId;
37083709
RelFileNode newrnode;
37093710

3710-
/* Allocate a new relfilenode */
3711-
newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
3712-
persistence);
3711+
if (!IsBinaryUpgrade)
3712+
{
3713+
/* Allocate a new relfilenode */
3714+
newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace,
3715+
NULL, persistence);
3716+
}
3717+
else if (relation->rd_rel->relkind == RELKIND_INDEX)
3718+
{
3719+
if (!OidIsValid(binary_upgrade_next_index_pg_class_relfilenode))
3720+
ereport(ERROR,
3721+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3722+
errmsg("index relfilenode value not set when in binary upgrade mode")));
3723+
3724+
newrelfilenode = binary_upgrade_next_index_pg_class_relfilenode;
3725+
binary_upgrade_next_index_pg_class_relfilenode = InvalidOid;
3726+
}
3727+
else if (relation->rd_rel->relkind == RELKIND_RELATION)
3728+
{
3729+
if (!OidIsValid(binary_upgrade_next_heap_pg_class_relfilenode))
3730+
ereport(ERROR,
3731+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3732+
errmsg("heap relfilenode value not set when in binary upgrade mode")));
3733+
3734+
newrelfilenode = binary_upgrade_next_heap_pg_class_relfilenode;
3735+
binary_upgrade_next_heap_pg_class_relfilenode = InvalidOid;
3736+
}
3737+
else
3738+
ereport(ERROR,
3739+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3740+
errmsg("unexpected request for new relfilenode in binary upgrade mode")));
37133741

37143742
/*
37153743
* Get a writable copy of the pg_class tuple for the given relation.
@@ -3724,9 +3752,37 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
37243752
classform = (Form_pg_class) GETSTRUCT(tuple);
37253753

37263754
/*
3727-
* Schedule unlinking of the old storage at transaction commit.
3755+
* Schedule unlinking of the old storage at transaction commit, except
3756+
* when performing a binary upgrade, when we must do it immediately.
37283757
*/
3729-
RelationDropStorage(relation);
3758+
if (IsBinaryUpgrade)
3759+
{
3760+
SMgrRelation srel;
3761+
3762+
/*
3763+
* During a binary upgrade, we use this code path to ensure that
3764+
* pg_largeobject and its index have the same relfilenode values as in
3765+
* the old cluster. This is necessary because pg_upgrade treats
3766+
* pg_largeobject like a user table, not a system table. It is however
3767+
* possible that a table or index may need to end up with the same
3768+
* relfilenode in the new cluster as what it had in the old cluster.
3769+
* Hence, we can't wait until commit time to remove the old storage.
3770+
*
3771+
* In general, this function needs to have transactional semantics,
3772+
* and removing the old storage before commit time surely isn't transactional.
3773+
* However, it doesn't really matter, because if a binary upgrade
3774+
* fails at this stage, the new cluster will need to be recreated
3775+
* anyway.
3776+
*/
3777+
srel = smgropen(relation->rd_node, relation->rd_backend);
3778+
smgrdounlinkall(&srel, 1, false);
3779+
smgrclose(srel);
3780+
}
3781+
else
3782+
{
3783+
/* Not a binary upgrade, so just schedule it to happen later. */
3784+
RelationDropStorage(relation);
3785+
}
37303786

37313787
/*
37323788
* Create storage for the main fork of the new relfilenode. If it's a

src/bin/pg_dump/pg_dump.c

Lines changed: 27 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3141,6 +3141,7 @@ dumpDatabase(Archive *fout)
31413141
PGresult *lo_res;
31423142
PQExpBuffer loFrozenQry = createPQExpBuffer();
31433143
PQExpBuffer loOutQry = createPQExpBuffer();
3144+
PQExpBuffer loVacQry = createPQExpBuffer();
31443145
int i_relfrozenxid,
31453146
i_relfilenode,
31463147
i_oid,
@@ -3167,15 +3168,36 @@ dumpDatabase(Archive *fout)
31673168
i_relfilenode = PQfnumber(lo_res, "relfilenode");
31683169
i_oid = PQfnumber(lo_res, "oid");
31693170

3170-
appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, preserve values for pg_largeobject and its index\n");
3171+
appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n");
3172+
appendPQExpBufferStr(loVacQry, "\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n");
31713173
for (int i = 0; i < PQntuples(lo_res); ++i)
3174+
{
3175+
Oid oid;
3176+
Oid relfilenode;
3177+
31723178
appendPQExpBuffer(loOutQry, "UPDATE pg_catalog.pg_class\n"
3173-
"SET relfrozenxid = '%u', relminmxid = '%u', relfilenode = '%u'\n"
3179+
"SET relfrozenxid = '%u', relminmxid = '%u'\n"
31743180
"WHERE oid = %u;\n",
31753181
atooid(PQgetvalue(lo_res, i, i_relfrozenxid)),
31763182
atooid(PQgetvalue(lo_res, i, i_relminmxid)),
3177-
atooid(PQgetvalue(lo_res, i, i_relfilenode)),
3178-
atooid(PQgetvalue(lo_res, i, i_oid)));
3183+
atooid(PQgetvalue(lo_res, i, i_relfilenode)));
3184+
3185+
oid = atooid(PQgetvalue(lo_res, i, i_oid));
3186+
relfilenode = atooid(PQgetvalue(lo_res, i, i_relfilenode));
3187+
3188+
if (oid == LargeObjectRelationId)
3189+
appendPQExpBuffer(loVacQry,
3190+
"SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
3191+
relfilenode);
3192+
else if (oid == LargeObjectLOidPNIndexId)
3193+
appendPQExpBuffer(loVacQry,
3194+
"SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
3195+
relfilenode);
3196+
}
3197+
3198+
appendPQExpBufferStr(loVacQry,
3199+
"TRUNCATE pg_catalog.pg_largeobject;\n");
3200+
appendPQExpBufferStr(loOutQry, loVacQry->data);
31793201

31803202
ArchiveEntry(fout, nilCatalogId, createDumpId(),
31813203
ARCHIVE_OPTS(.tag = "pg_largeobject",
@@ -3187,6 +3209,7 @@ dumpDatabase(Archive *fout)
31873209

31883210
destroyPQExpBuffer(loFrozenQry);
31893211
destroyPQExpBuffer(loOutQry);
3212+
destroyPQExpBuffer(loVacQry);
31903213
}
31913214

31923215
PQclear(res);

0 commit comments

Comments
 (0)