Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 3d65b05

Browse files
committed
Fix bogus cache-invalidation logic in logical replication worker.
The code recorded cache invalidation events by zeroing the "localreloid" field of affected cache entries. However, it's possible for an inval event to occur even while we have the entry open and locked. So an ill-timed inval could result in "cache lookup failed for relation 0" errors, if the worker's code tried to use the cleared field. We can fix that by creating a separate bool field to record whether the entry needs to be revalidated. (In the back branches, cram the bool into what had been padding space, to avoid an ABI break in the somewhat unlikely event that any extension is looking at this struct.)

Also, rearrange the logic in logicalrep_rel_open so that it does the right thing in cases where table_open would fail. We should retry the lookup by name in that case, but we didn't.

The real-world impact of this is probably small. In the first place, the error conditions are very low probability, and in the second place, the worker would just exit and get restarted. We only noticed because in a CLOBBER_CACHE_ALWAYS build, the failure can occur repeatedly, preventing the worker from making progress. Nonetheless, it's clearly a bug, and it impedes a useful type of testing; so back-patch to v10 where this code was introduced.

Discussion: https://postgr.es/m/1032727.1600096803@sss.pgh.pa.us
1 parent e568ed0 commit 3d65b05

File tree

2 files changed

+53
-29
lines changed

2 files changed

+53
-29
lines changed

src/backend/replication/logical/relation.c

Lines changed: 44 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ logicalrep_relmap_invalidate_cb(Datum arg, Oid reloid)
7777
{
7878
if (entry->localreloid == reloid)
7979
{
80-
entry->localreloid = InvalidOid;
80+
entry->localrelvalid = false;
8181
hash_seq_term(&status);
8282
break;
8383
}
@@ -91,7 +91,7 @@ logicalrep_relmap_invalidate_cb(Datum arg, Oid reloid)
9191
hash_seq_init(&status, LogicalRepRelMap);
9292

9393
while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
94-
entry->localreloid = InvalidOid;
94+
entry->localrelvalid = false;
9595
}
9696
}
9797

@@ -230,15 +230,13 @@ logicalrep_rel_att_by_name(LogicalRepRelation *remoterel, const char *attname)
230230
/*
231231
* Open the local relation associated with the remote one.
232232
*
233-
* Optionally rebuilds the Relcache mapping if it was invalidated
234-
* by local DDL.
233+
* Rebuilds the Relcache mapping if it was invalidated by local DDL.
235234
*/
236235
LogicalRepRelMapEntry *
237236
logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode)
238237
{
239238
LogicalRepRelMapEntry *entry;
240239
bool found;
241-
Oid relid = InvalidOid;
242240
LogicalRepRelation *remoterel;
243241

244242
if (LogicalRepRelMap == NULL)
@@ -254,14 +252,45 @@ logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode)
254252

255253
remoterel = &entry->remoterel;
256254

255+
/* Ensure we don't leak a relcache refcount. */
256+
if (entry->localrel)
257+
elog(ERROR, "remote relation ID %u is already open", remoteid);
258+
257259
/*
258260
* When opening and locking a relation, pending invalidation messages are
259-
* processed which can invalidate the relation. We need to update the
260-
* local cache both when we are first time accessing the relation and when
261-
* the relation is invalidated (aka entry->localreloid is set InvalidOid).
261+
* processed which can invalidate the relation. Hence, if the entry is
262+
* currently considered valid, try to open the local relation by OID and
263+
* see if invalidation ensues.
264+
*/
265+
if (entry->localrelvalid)
266+
{
267+
entry->localrel = try_table_open(entry->localreloid, lockmode);
268+
if (!entry->localrel)
269+
{
270+
/* Table was renamed or dropped. */
271+
entry->localrelvalid = false;
272+
}
273+
else if (!entry->localrelvalid)
274+
{
275+
/* Note we release the no-longer-useful lock here. */
276+
table_close(entry->localrel, lockmode);
277+
entry->localrel = NULL;
278+
}
279+
}
280+
281+
/*
282+
* If the entry has been marked invalid since we last had lock on it,
283+
* re-open the local relation by name and rebuild all derived data.
262284
*/
263-
if (!OidIsValid(entry->localreloid))
285+
if (!entry->localrelvalid)
264286
{
287+
Oid relid;
288+
int found;
289+
Bitmapset *idkey;
290+
TupleDesc desc;
291+
MemoryContext oldctx;
292+
int i;
293+
265294
/* Try to find and lock the relation by name. */
266295
relid = RangeVarGetRelid(makeRangeVar(remoterel->nspname,
267296
remoterel->relname, -1),
@@ -272,21 +301,7 @@ logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode)
272301
errmsg("logical replication target relation \"%s.%s\" does not exist",
273302
remoterel->nspname, remoterel->relname)));
274303
entry->localrel = table_open(relid, NoLock);
275-
276-
}
277-
else
278-
{
279-
relid = entry->localreloid;
280-
entry->localrel = table_open(entry->localreloid, lockmode);
281-
}
282-
283-
if (!OidIsValid(entry->localreloid))
284-
{
285-
int found;
286-
Bitmapset *idkey;
287-
TupleDesc desc;
288-
MemoryContext oldctx;
289-
int i;
304+
entry->localreloid = relid;
290305

291306
/* Check for supported relkind. */
292307
CheckSubscriptionRelkind(entry->localrel->rd_rel->relkind,
@@ -380,7 +395,7 @@ logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode)
380395
}
381396
}
382397

383-
entry->localreloid = relid;
398+
entry->localrelvalid = true;
384399
}
385400

386401
if (entry->state != SUBREL_STATE_READY)
@@ -523,7 +538,7 @@ logicalrep_partmap_invalidate_cb(Datum arg, Oid reloid)
523538
{
524539
if (entry->localreloid == reloid)
525540
{
526-
entry->localreloid = InvalidOid;
541+
entry->localrelvalid = false;
527542
hash_seq_term(&status);
528543
break;
529544
}
@@ -537,7 +552,7 @@ logicalrep_partmap_invalidate_cb(Datum arg, Oid reloid)
537552
hash_seq_init(&status, LogicalRepPartMap);
538553

539554
while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
540-
entry->localreloid = InvalidOid;
555+
entry->localrelvalid = false;
541556
}
542557
}
543558

@@ -656,6 +671,8 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
656671

657672
entry->updatable = root->updatable;
658673

674+
entry->localrelvalid = true;
675+
659676
/* state and statelsn are left set to 0. */
660677
MemoryContextSwitchTo(oldctx);
661678

src/include/replication/logicalrelation.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,16 @@ typedef struct LogicalRepRelMapEntry
1919
{
2020
LogicalRepRelation remoterel; /* key is remoterel.remoteid */
2121

22-
/* Mapping to local relation, filled as needed. */
22+
/*
23+
* Validity flag -- when false, revalidate all derived info at next
24+
* logicalrep_rel_open. (While the localrel is open, we assume our lock
25+
* on that rel ensures the info remains good.)
26+
*/
27+
bool localrelvalid;
28+
29+
/* Mapping to local relation. */
2330
Oid localreloid; /* local relation id */
24-
Relation localrel; /* relcache entry */
31+
Relation localrel; /* relcache entry (NULL when closed) */
2532
AttrMap *attrmap; /* map of local attributes to remote ones */
2633
bool updatable; /* Can apply updates/deletes? */
2734

0 commit comments

Comments
 (0)