Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c5454f9

Browse files
committed
Fix subtransaction cleanup after an outer-subtransaction portal fails.
Formerly, we treated only portals created in the current subtransaction as having failed during subtransaction abort. However, if the error occurred while running a portal created in an outer subtransaction (ie, a cursor declared before the last savepoint), that has to be considered broken too. To allow reliable detection of which ones those are, add a bookkeeping field to struct Portal that tracks the innermost subtransaction in which each portal has actually been executed. (Without this, we'd end up failing portals containing functions that had called the subtransaction, thereby breaking plpgsql exception blocks completely.) In addition, when we fail an outer-subtransaction Portal, transfer its resources into the subtransaction's resource owner, so that they're released early in cleanup of the subxact. This fixes a problem reported by Jim Nasby in which a function executed in an outer-subtransaction cursor could cause an Assert failure or crash by referencing a relation created within the inner subtransaction. The proximate cause of the Assert failure is that AtEOSubXact_RelationCache assumed it could blow away a relcache entry without first checking that the entry had zero refcount. That was a bad idea on its own terms, so add such a check there, and to the similar coding in AtEOXact_RelationCache. This provides an independent safety measure in case there are still ways to provoke the situation despite the Portal-level changes. This has been broken since subtransactions were invented, so back-patch to all supported branches. Tom Lane and Michael Paquier
1 parent 1bbd52c commit c5454f9

File tree

8 files changed

+197
-29
lines changed

8 files changed

+197
-29
lines changed

src/backend/access/transam/xact.c

+1
Original file line numberDiff line numberDiff line change
@@ -4585,6 +4585,7 @@ AbortSubTransaction(void)
45854585
AfterTriggerEndSubXact(false);
45864586
AtSubAbort_Portals(s->subTransactionId,
45874587
s->parent->subTransactionId,
4588+
s->curTransactionOwner,
45884589
s->parent->curTransactionOwner);
45894590
AtEOSubXact_LargeObject(false, s->subTransactionId,
45904591
s->parent->subTransactionId);

src/backend/commands/portalcmds.c

+1-5
Original file line numberDiff line numberDiff line change
@@ -335,11 +335,7 @@ PersistHoldablePortal(Portal portal)
335335
/*
336336
* Check for improper portal use, and mark portal active.
337337
*/
338-
if (portal->status != PORTAL_READY)
339-
ereport(ERROR,
340-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
341-
errmsg("portal \"%s\" cannot be run", portal->name)));
342-
portal->status = PORTAL_ACTIVE;
338+
MarkPortalActive(portal);
343339

344340
/*
345341
* Set up global portal context pointers.

src/backend/tcop/pquery.c

+2-10
Original file line numberDiff line numberDiff line change
@@ -734,11 +734,7 @@ PortalRun(Portal portal, long count, bool isTopLevel,
734734
/*
735735
* Check for improper portal use, and mark portal active.
736736
*/
737-
if (portal->status != PORTAL_READY)
738-
ereport(ERROR,
739-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
740-
errmsg("portal \"%s\" cannot be run", portal->name)));
741-
portal->status = PORTAL_ACTIVE;
737+
MarkPortalActive(portal);
742738

743739
/*
744740
* Set up global portal context pointers.
@@ -1398,11 +1394,7 @@ PortalRunFetch(Portal portal,
13981394
/*
13991395
* Check for improper portal use, and mark portal active.
14001396
*/
1401-
if (portal->status != PORTAL_READY)
1402-
ereport(ERROR,
1403-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1404-
errmsg("portal \"%s\" cannot be run", portal->name)));
1405-
portal->status = PORTAL_ACTIVE;
1397+
MarkPortalActive(portal);
14061398

14071399
/*
14081400
* Set up global portal context pointers.

src/backend/utils/cache/relcache.c

+32-3
Original file line numberDiff line numberDiff line change
@@ -2048,7 +2048,9 @@ RelationClearRelation(Relation relation, bool rebuild)
20482048
{
20492049
/*
20502050
* As per notes above, a rel to be rebuilt MUST have refcnt > 0; while of
2051-
* course it would be a bad idea to blow away one with nonzero refcnt.
2051+
* course it would be an equally bad idea to blow away one with nonzero
2052+
* refcnt, since that would leave someone somewhere with a dangling
2053+
* pointer. All callers are expected to have verified that this holds.
20522054
*/
20532055
Assert(rebuild ?
20542056
!RelationHasReferenceCountZero(relation) :
@@ -2654,11 +2656,25 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
26542656
{
26552657
if (isCommit)
26562658
relation->rd_createSubid = InvalidSubTransactionId;
2657-
else
2659+
else if (RelationHasReferenceCountZero(relation))
26582660
{
26592661
RelationClearRelation(relation, false);
26602662
return;
26612663
}
2664+
else
2665+
{
2666+
/*
2667+
* Hmm, somewhere there's a (leaked?) reference to the relation.
2668+
* We daren't remove the entry for fear of dereferencing a
2669+
* dangling pointer later. Bleat, and mark it as not belonging to
2670+
* the current transaction. Hopefully it'll get cleaned up
2671+
* eventually. This must be just a WARNING to avoid
2672+
* error-during-error-recovery loops.
2673+
*/
2674+
relation->rd_createSubid = InvalidSubTransactionId;
2675+
elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
2676+
RelationGetRelationName(relation));
2677+
}
26622678
}
26632679

26642680
/*
@@ -2747,11 +2763,24 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
27472763
{
27482764
if (isCommit)
27492765
relation->rd_createSubid = parentSubid;
2750-
else
2766+
else if (RelationHasReferenceCountZero(relation))
27512767
{
27522768
RelationClearRelation(relation, false);
27532769
return;
27542770
}
2771+
else
2772+
{
2773+
/*
2774+
* Hmm, somewhere there's a (leaked?) reference to the relation.
2775+
* We daren't remove the entry for fear of dereferencing a
2776+
* dangling pointer later. Bleat, and transfer it to the parent
2777+
* subtransaction so we can try again later. This must be just a
2778+
* WARNING to avoid error-during-error-recovery loops.
2779+
*/
2780+
relation->rd_createSubid = parentSubid;
2781+
elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
2782+
RelationGetRelationName(relation));
2783+
}
27552784
}
27562785

27572786
/*

src/backend/utils/mmgr/portalmem.c

+74-8
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ CreatePortal(const char *name, bool allowDup, bool dupSilent)
232232
portal->status = PORTAL_NEW;
233233
portal->cleanup = PortalCleanup;
234234
portal->createSubid = GetCurrentSubTransactionId();
235+
portal->activeSubid = portal->createSubid;
235236
portal->strategy = PORTAL_MULTI_QUERY;
236237
portal->cursorOptions = CURSOR_OPT_NO_SCROLL;
237238
portal->atStart = true;
@@ -402,6 +403,25 @@ UnpinPortal(Portal portal)
402403
portal->portalPinned = false;
403404
}
404405

406+
/*
407+
* MarkPortalActive
408+
* Transition a portal from READY to ACTIVE state.
409+
*
410+
* NOTE: never set portal->status = PORTAL_ACTIVE directly; call this instead.
411+
*/
412+
void
413+
MarkPortalActive(Portal portal)
414+
{
415+
/* For safety, this is a runtime test not just an Assert */
416+
if (portal->status != PORTAL_READY)
417+
ereport(ERROR,
418+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
419+
errmsg("portal \"%s\" cannot be run", portal->name)));
420+
/* Perform the state transition */
421+
portal->status = PORTAL_ACTIVE;
422+
portal->activeSubid = GetCurrentSubTransactionId();
423+
}
424+
405425
/*
406426
* MarkPortalDone
407427
* Transition a portal from ACTIVE to DONE state.
@@ -690,6 +710,7 @@ PreCommit_Portals(bool isPrepare)
690710
* not belonging to this transaction.
691711
*/
692712
portal->createSubid = InvalidSubTransactionId;
713+
portal->activeSubid = InvalidSubTransactionId;
693714

694715
/* Report we changed state */
695716
result = true;
@@ -836,8 +857,8 @@ AtCleanup_Portals(void)
836857
/*
837858
* Pre-subcommit processing for portals.
838859
*
839-
* Reassign the portals created in the current subtransaction to the parent
840-
* subtransaction.
860+
* Reassign portals created or used in the current subtransaction to the
861+
* parent subtransaction.
841862
*/
842863
void
843864
AtSubCommit_Portals(SubTransactionId mySubid,
@@ -859,21 +880,24 @@ AtSubCommit_Portals(SubTransactionId mySubid,
859880
if (portal->resowner)
860881
ResourceOwnerNewParent(portal->resowner, parentXactOwner);
861882
}
883+
if (portal->activeSubid == mySubid)
884+
portal->activeSubid = parentSubid;
862885
}
863886
}
864887

865888
/*
866889
* Subtransaction abort handling for portals.
867890
*
868-
* Deactivate portals created during the failed subtransaction.
869-
* Note that per AtSubCommit_Portals, this will catch portals created
891+
* Deactivate portals created or used during the failed subtransaction.
892+
* Note that per AtSubCommit_Portals, this will catch portals created/used
870893
* in descendants of the subtransaction too.
871894
*
872895
* We don't destroy any portals here; that's done in AtSubCleanup_Portals.
873896
*/
874897
void
875898
AtSubAbort_Portals(SubTransactionId mySubid,
876899
SubTransactionId parentSubid,
900+
ResourceOwner myXactOwner,
877901
ResourceOwner parentXactOwner)
878902
{
879903
HASH_SEQ_STATUS status;
@@ -885,16 +909,58 @@ AtSubAbort_Portals(SubTransactionId mySubid,
885909
{
886910
Portal portal = hentry->portal;
887911

912+
/* Was it created in this subtransaction? */
888913
if (portal->createSubid != mySubid)
914+
{
915+
/* No, but maybe it was used in this subtransaction? */
916+
if (portal->activeSubid == mySubid)
917+
{
918+
/* Maintain activeSubid until the portal is removed */
919+
portal->activeSubid = parentSubid;
920+
921+
/*
922+
* Upper-level portals that failed while running in this
923+
* subtransaction must be forced into FAILED state, for the
924+
* same reasons discussed below.
925+
*
926+
* We assume we can get away without forcing upper-level READY
927+
* portals to fail, even if they were run and then suspended.
928+
* In theory a suspended upper-level portal could have
929+
* acquired some references to objects that are about to be
930+
* destroyed, but there should be sufficient defenses against
931+
* such cases: the portal's original query cannot contain such
932+
* references, and any references within, say, cached plans of
933+
* PL/pgSQL functions are not from active queries and should
934+
* be protected by revalidation logic.
935+
*/
936+
if (portal->status == PORTAL_ACTIVE)
937+
MarkPortalFailed(portal);
938+
939+
/*
940+
* Also, if we failed it during the current subtransaction
941+
* (either just above, or earlier), reattach its resource
942+
* owner to the current subtransaction's resource owner, so
943+
* that any resources it still holds will be released while
944+
* cleaning up this subtransaction. This prevents some corner
945+
* cases wherein we might get Asserts or worse while cleaning
946+
* up objects created during the current subtransaction
947+
* (because they're still referenced within this portal).
948+
*/
949+
if (portal->status == PORTAL_FAILED && portal->resowner)
950+
{
951+
ResourceOwnerNewParent(portal->resowner, myXactOwner);
952+
portal->resowner = NULL;
953+
}
954+
}
955+
/* Done if it wasn't created in this subtransaction */
889956
continue;
957+
}
890958

891959
/*
892960
* Force any live portals of my own subtransaction into FAILED state.
893961
* We have to do this because they might refer to objects created or
894-
* changed in the failed subtransaction, leading to crashes if
895-
* execution is resumed, or even if we just try to run ExecutorEnd.
896-
* (Note we do NOT do this to upper-level portals, since they cannot
897-
* have such references and hence may be able to continue.)
962+
* changed in the failed subtransaction, leading to crashes within
963+
* ExecutorEnd when portalcmds.c tries to close down the portal.
898964
*/
899965
if (portal->status == PORTAL_READY ||
900966
portal->status == PORTAL_ACTIVE)

src/include/utils/portal.h

+9-3
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,16 @@ typedef struct PortalData
119119
MemoryContext heap; /* subsidiary memory for portal */
120120
ResourceOwner resowner; /* resources owned by portal */
121121
void (*cleanup) (Portal portal); /* cleanup hook */
122-
SubTransactionId createSubid; /* the ID of the creating subxact */
123122

124123
/*
125-
* if createSubid is InvalidSubTransactionId, the portal is held over from
126-
* a previous transaction
124+
* State data for remembering which subtransaction(s) the portal was
125+
* created or used in. If the portal is held over from a previous
126+
* transaction, both subxids are InvalidSubTransactionId. Otherwise,
127+
* createSubid is the creating subxact and activeSubid is the last subxact
128+
* in which we ran the portal.
127129
*/
130+
SubTransactionId createSubid; /* the creating subxact */
131+
SubTransactionId activeSubid; /* the last subxact with activity */
128132

129133
/* The query or queries the portal will execute */
130134
const char *sourceText; /* text of query (as of 8.4, never NULL) */
@@ -201,12 +205,14 @@ extern void AtSubCommit_Portals(SubTransactionId mySubid,
201205
ResourceOwner parentXactOwner);
202206
extern void AtSubAbort_Portals(SubTransactionId mySubid,
203207
SubTransactionId parentSubid,
208+
ResourceOwner myXactOwner,
204209
ResourceOwner parentXactOwner);
205210
extern void AtSubCleanup_Portals(SubTransactionId mySubid);
206211
extern Portal CreatePortal(const char *name, bool allowDup, bool dupSilent);
207212
extern Portal CreateNewPortal(void);
208213
extern void PinPortal(Portal portal);
209214
extern void UnpinPortal(Portal portal);
215+
extern void MarkPortalActive(Portal portal);
210216
extern void MarkPortalDone(Portal portal);
211217
extern void MarkPortalFailed(Portal portal);
212218
extern void PortalDrop(Portal portal, bool isTopCommit);

src/test/regress/expected/transactions.out

+46
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,52 @@ fetch from foo;
613613
(1 row)
614614

615615
abort;
616+
-- Test for proper cleanup after a failure in a cursor portal
617+
-- that was created in an outer subtransaction
618+
CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS
619+
$$ begin return 1/x; end $$;
620+
CREATE FUNCTION create_temp_tab() RETURNS text
621+
LANGUAGE plpgsql AS $$
622+
BEGIN
623+
CREATE TEMP TABLE new_table (f1 float8);
624+
-- case of interest is that we fail while holding an open
625+
-- relcache reference to new_table
626+
INSERT INTO new_table SELECT invert(0.0);
627+
RETURN 'foo';
628+
END $$;
629+
BEGIN;
630+
DECLARE ok CURSOR FOR SELECT * FROM int8_tbl;
631+
DECLARE ctt CURSOR FOR SELECT create_temp_tab();
632+
FETCH ok;
633+
q1 | q2
634+
-----+-----
635+
123 | 456
636+
(1 row)
637+
638+
SAVEPOINT s1;
639+
FETCH ok; -- should work
640+
q1 | q2
641+
-----+------------------
642+
123 | 4567890123456789
643+
(1 row)
644+
645+
FETCH ctt; -- error occurs here
646+
ERROR: division by zero
647+
CONTEXT: PL/pgSQL function invert(double precision) line 1 at RETURN
648+
SQL statement "INSERT INTO new_table SELECT invert(0.0)"
649+
PL/pgSQL function create_temp_tab() line 6 at SQL statement
650+
ROLLBACK TO s1;
651+
FETCH ok; -- should work
652+
q1 | q2
653+
------------------+-----
654+
4567890123456789 | 123
655+
(1 row)
656+
657+
FETCH ctt; -- must be rejected
658+
ERROR: portal "ctt" cannot be run
659+
COMMIT;
660+
DROP FUNCTION create_temp_tab();
661+
DROP FUNCTION invert(x float8);
616662
-- Test for successful cleanup of an aborted transaction at session exit.
617663
-- THIS MUST BE THE LAST TEST IN THIS FILE.
618664
begin;

src/test/regress/sql/transactions.sql

+32
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,38 @@ fetch from foo;
387387

388388
abort;
389389

390+
391+
-- Test for proper cleanup after a failure in a cursor portal
392+
-- that was created in an outer subtransaction
393+
CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS
394+
$$ begin return 1/x; end $$;
395+
396+
CREATE FUNCTION create_temp_tab() RETURNS text
397+
LANGUAGE plpgsql AS $$
398+
BEGIN
399+
CREATE TEMP TABLE new_table (f1 float8);
400+
-- case of interest is that we fail while holding an open
401+
-- relcache reference to new_table
402+
INSERT INTO new_table SELECT invert(0.0);
403+
RETURN 'foo';
404+
END $$;
405+
406+
BEGIN;
407+
DECLARE ok CURSOR FOR SELECT * FROM int8_tbl;
408+
DECLARE ctt CURSOR FOR SELECT create_temp_tab();
409+
FETCH ok;
410+
SAVEPOINT s1;
411+
FETCH ok; -- should work
412+
FETCH ctt; -- error occurs here
413+
ROLLBACK TO s1;
414+
FETCH ok; -- should work
415+
FETCH ctt; -- must be rejected
416+
COMMIT;
417+
418+
DROP FUNCTION create_temp_tab();
419+
DROP FUNCTION invert(x float8);
420+
421+
390422
-- Test for successful cleanup of an aborted transaction at session exit.
391423
-- THIS MUST BE THE LAST TEST IN THIS FILE.
392424

0 commit comments

Comments
 (0)