Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 53c5b86

Browse files
committed
Tighten up error recovery for fast-path locking.
The previous code could cause a backend crash after BEGIN; SAVEPOINT a; LOCK TABLE foo (interrupted by ^C or statement timeout); ROLLBACK TO SAVEPOINT a; LOCK TABLE foo, and might have leaked strong-lock counts in other situations. Report by Zoltán Böszörményi; patch review by Jeff Davis.
1 parent ab77b2d commit 53c5b86

File tree

7 files changed

+94
-31
lines changed

7 files changed

+94
-31
lines changed

src/backend/access/transam/xact.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -2259,7 +2259,7 @@ AbortTransaction(void)
22592259
* Also clean up any open wait for lock, since the lock manager will choke
22602260
* if we try to wait for another lock before doing this.
22612261
*/
2262-
LockWaitCancel();
2262+
LockErrorCleanup();
22632263

22642264
/*
22652265
* check the current transaction state
@@ -4144,7 +4144,7 @@ AbortSubTransaction(void)
41444144
AbortBufferIO();
41454145
UnlockBuffers();
41464146

4147-
LockWaitCancel();
4147+
LockErrorCleanup();
41484148

41494149
/*
41504150
* check the current transaction state

src/backend/storage/lmgr/README

+1-1
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ examines every member of the wait queue (this was not true in the 7.0
560560
implementation, BTW). Therefore, if ProcLockWakeup is always invoked
561561
after a lock is released or a wait queue is rearranged, there can be no
562562
failure to wake a wakable process. One should also note that
563-
LockWaitCancel (abort a waiter due to outside factors) must run
563+
LockErrorCleanup (abort a waiter due to outside factors) must run
564564
ProcLockWakeup, in case the canceled waiter was soft-blocking other
565565
waiters.
566566

src/backend/storage/lmgr/lock.c

+75-16
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,8 @@ static HTAB *LockMethodProcLockHash;
250250
static HTAB *LockMethodLocalHash;
251251

252252

253-
/* private state for GrantAwaitedLock */
253+
/* private state for error cleanup */
254+
static LOCALLOCK *StrongLockInProgress;
254255
static LOCALLOCK *awaitedLock;
255256
static ResourceOwner awaitedOwner;
256257

@@ -338,6 +339,8 @@ static void RemoveLocalLock(LOCALLOCK *locallock);
338339
static PROCLOCK *SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
339340
const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode);
340341
static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
342+
static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode);
343+
static void FinishStrongLockAcquire(void);
341344
static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner);
342345
static void ReleaseLockForOwner(LOCALLOCK *locallock, ResourceOwner owner);
343346
static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
@@ -738,22 +741,11 @@ LockAcquireExtended(const LOCKTAG *locktag,
738741
}
739742
else if (FastPathStrongMode(lockmode))
740743
{
741-
/*
742-
* Adding to a memory location is not atomic, so we take a
743-
* spinlock to ensure we don't collide with someone else trying
744-
* to bump the count at the same time.
745-
*
746-
* XXX: It might be worth considering using an atomic fetch-and-add
747-
* instruction here, on architectures where that is supported.
748-
*/
749-
Assert(locallock->holdsStrongLockCount == FALSE);
750-
SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
751-
FastPathStrongRelationLocks->count[fasthashcode]++;
752-
locallock->holdsStrongLockCount = TRUE;
753-
SpinLockRelease(&FastPathStrongRelationLocks->mutex);
744+
BeginStrongLockAcquire(locallock, fasthashcode);
754745
if (!FastPathTransferRelationLocks(lockMethodTable, locktag,
755746
hashcode))
756747
{
748+
AbortStrongLockAcquire();
757749
if (reportMemoryError)
758750
ereport(ERROR,
759751
(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -779,6 +771,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
779771
hashcode, lockmode);
780772
if (!proclock)
781773
{
774+
AbortStrongLockAcquire();
782775
LWLockRelease(partitionLock);
783776
if (reportMemoryError)
784777
ereport(ERROR,
@@ -820,6 +813,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
820813
*/
821814
if (dontWait)
822815
{
816+
AbortStrongLockAcquire();
823817
if (proclock->holdMask == 0)
824818
{
825819
uint32 proclock_hashcode;
@@ -884,6 +878,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
884878
*/
885879
if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
886880
{
881+
AbortStrongLockAcquire();
887882
PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock);
888883
LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
889884
/* Should we retry ? */
@@ -894,6 +889,12 @@ LockAcquireExtended(const LOCKTAG *locktag,
894889
LOCK_PRINT("LockAcquire: granted", lock, lockmode);
895890
}
896891

892+
/*
893+
* Lock state is fully up-to-date now; if we error out after this, no
894+
* special error cleanup is required.
895+
*/
896+
FinishStrongLockAcquire();
897+
897898
LWLockRelease(partitionLock);
898899

899900
/*
@@ -1349,6 +1350,64 @@ GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner)
13491350
locallock->numLockOwners++;
13501351
}
13511352

1353+
/*
1354+
* BeginStrongLockAcquire - inhibit use of fastpath for a given LOCALLOCK,
1355+
* and arrange for error cleanup if it fails
1356+
*/
1357+
static void
1358+
BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode)
1359+
{
1360+
Assert(StrongLockInProgress == NULL);
1361+
Assert(locallock->holdsStrongLockCount == FALSE);
1362+
1363+
/*
1364+
* Adding to a memory location is not atomic, so we take a
1365+
* spinlock to ensure we don't collide with someone else trying
1366+
* to bump the count at the same time.
1367+
*
1368+
* XXX: It might be worth considering using an atomic fetch-and-add
1369+
* instruction here, on architectures where that is supported.
1370+
*/
1371+
1372+
SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
1373+
FastPathStrongRelationLocks->count[fasthashcode]++;
1374+
locallock->holdsStrongLockCount = TRUE;
1375+
StrongLockInProgress = locallock;
1376+
SpinLockRelease(&FastPathStrongRelationLocks->mutex);
1377+
}
1378+
1379+
/*
1380+
* FinishStrongLockAcquire - cancel pending cleanup for a strong lock
1381+
* acquisition once it's no longer needed
1382+
*/
1383+
static void
1384+
FinishStrongLockAcquire(void)
1385+
{
1386+
StrongLockInProgress = NULL;
1387+
}
1388+
1389+
/*
1390+
* AbortStrongLockAcquire - undo strong lock state changes performed by
1391+
* BeginStrongLockAcquire.
1392+
*/
1393+
void
1394+
AbortStrongLockAcquire(void)
1395+
{
1396+
uint32 fasthashcode;
1397+
LOCALLOCK *locallock = StrongLockInProgress;
1398+
1399+
if (locallock == NULL)
1400+
return;
1401+
1402+
fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
1403+
Assert(locallock->holdsStrongLockCount == TRUE);
1404+
SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
1405+
FastPathStrongRelationLocks->count[fasthashcode]--;
1406+
locallock->holdsStrongLockCount = FALSE;
1407+
StrongLockInProgress = NULL;
1408+
SpinLockRelease(&FastPathStrongRelationLocks->mutex);
1409+
}
1410+
13521411
/*
13531412
* GrantAwaitedLock -- call GrantLockLocal for the lock we are doing
13541413
* WaitOnLock on.
@@ -1414,7 +1473,7 @@ WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
14141473
* We can and do use a PG_TRY block to try to clean up after failure, but
14151474
* this still has a major limitation: elog(FATAL) can occur while waiting
14161475
* (eg, a "die" interrupt), and then control won't come back here. So all
1417-
* cleanup of essential state should happen in LockWaitCancel, not here.
1476+
* cleanup of essential state should happen in LockErrorCleanup, not here.
14181477
* We can use PG_TRY to clear the "waiting" status flags, since doing that
14191478
* is unimportant if the process exits.
14201479
*/
@@ -1441,7 +1500,7 @@ WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
14411500
}
14421501
PG_CATCH();
14431502
{
1444-
/* In this path, awaitedLock remains set until LockWaitCancel */
1503+
/* In this path, awaitedLock remains set until LockErrorCleanup */
14451504

14461505
/* Report change to non-waiting status */
14471506
pgstat_report_waiting(false);

src/backend/storage/lmgr/proc.c

+11-8
Original file line numberDiff line numberDiff line change
@@ -635,17 +635,20 @@ IsWaitingForLock(void)
635635
}
636636

637637
/*
638-
* Cancel any pending wait for lock, when aborting a transaction.
638+
* Cancel any pending wait for lock, when aborting a transaction, and revert
639+
* any strong lock count acquisition for a lock being acquired.
639640
*
640641
* (Normally, this would only happen if we accept a cancel/die
641-
* interrupt while waiting; but an ereport(ERROR) while waiting is
642-
* within the realm of possibility, too.)
642+
* interrupt while waiting; but an ereport(ERROR) before or during the lock
643+
* wait is within the realm of possibility, too.)
643644
*/
644645
void
645-
LockWaitCancel(void)
646+
LockErrorCleanup(void)
646647
{
647648
LWLockId partitionLock;
648649

650+
AbortStrongLockAcquire();
651+
649652
/* Nothing to do if we weren't waiting for a lock */
650653
if (lockAwaited == NULL)
651654
return;
@@ -709,7 +712,7 @@ ProcReleaseLocks(bool isCommit)
709712
if (!MyProc)
710713
return;
711714
/* If waiting, get off wait queue (should only be needed after error) */
712-
LockWaitCancel();
715+
LockErrorCleanup();
713716
/* Release locks */
714717
LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit);
715718

@@ -1019,7 +1022,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
10191022
* NOTE: this may also cause us to exit critical-section state, possibly
10201023
* allowing a cancel/die interrupt to be accepted. This is OK because we
10211024
* have recorded the fact that we are waiting for a lock, and so
1022-
* LockWaitCancel will clean up if cancel/die happens.
1025+
* LockErrorCleanup will clean up if cancel/die happens.
10231026
*/
10241027
LWLockRelease(partitionLock);
10251028

@@ -1062,7 +1065,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
10621065
* don't, because we have no shared-state-change work to do after being
10631066
* granted the lock (the grantor did it all). We do have to worry about
10641067
* updating the locallock table, but if we lose control to an error,
1065-
* LockWaitCancel will fix that up.
1068+
* LockErrorCleanup will fix that up.
10661069
*/
10671070
do
10681071
{
@@ -1207,7 +1210,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
12071210
LWLockAcquire(partitionLock, LW_EXCLUSIVE);
12081211

12091212
/*
1210-
* We no longer want LockWaitCancel to do anything.
1213+
* We no longer want LockErrorCleanup to do anything.
12111214
*/
12121215
lockAwaited = NULL;
12131216

src/backend/tcop/postgres.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -2575,7 +2575,7 @@ die(SIGNAL_ARGS)
25752575
/* bump holdoff count to make ProcessInterrupts() a no-op */
25762576
/* until we are done getting ready for it */
25772577
InterruptHoldoffCount++;
2578-
LockWaitCancel(); /* prevent CheckDeadLock from running */
2578+
LockErrorCleanup(); /* prevent CheckDeadLock from running */
25792579
DisableNotifyInterrupt();
25802580
DisableCatchupInterrupt();
25812581
InterruptHoldoffCount--;
@@ -2617,7 +2617,7 @@ StatementCancelHandler(SIGNAL_ARGS)
26172617
/* bump holdoff count to make ProcessInterrupts() a no-op */
26182618
/* until we are done getting ready for it */
26192619
InterruptHoldoffCount++;
2620-
LockWaitCancel(); /* prevent CheckDeadLock from running */
2620+
LockErrorCleanup(); /* prevent CheckDeadLock from running */
26212621
DisableNotifyInterrupt();
26222622
DisableCatchupInterrupt();
26232623
InterruptHoldoffCount--;
@@ -2776,7 +2776,7 @@ RecoveryConflictInterrupt(ProcSignalReason reason)
27762776
/* bump holdoff count to make ProcessInterrupts() a no-op */
27772777
/* until we are done getting ready for it */
27782778
InterruptHoldoffCount++;
2779-
LockWaitCancel(); /* prevent CheckDeadLock from running */
2779+
LockErrorCleanup(); /* prevent CheckDeadLock from running */
27802780
DisableNotifyInterrupt();
27812781
DisableCatchupInterrupt();
27822782
InterruptHoldoffCount--;

src/include/storage/lock.h

+1
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,7 @@ extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag,
489489
bool sessionLock,
490490
bool dontWait,
491491
bool report_memory_error);
492+
extern void AbortStrongLockAcquire(void);
492493
extern bool LockRelease(const LOCKTAG *locktag,
493494
LOCKMODE lockmode, bool sessionLock);
494495
extern void LockReleaseSession(LOCKMETHODID lockmethodid);

src/include/storage/proc.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ extern int ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable);
244244
extern PGPROC *ProcWakeup(PGPROC *proc, int waitStatus);
245245
extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
246246
extern bool IsWaitingForLock(void);
247-
extern void LockWaitCancel(void);
247+
extern void LockErrorCleanup(void);
248248

249249
extern void ProcWaitForSignal(void);
250250
extern void ProcSendSignal(int pid);

0 commit comments

Comments
 (0)