Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 818fefd

Browse files
committed
Fix race leading to incorrect conflict cause in InvalidatePossiblyObsoleteSlot()
The invalidation of an active slot is done in two steps: - Termination of the backend holding it, if any. - Report that the slot is obsolete, with a conflict cause depending on the slot's data. This can be racy because between these two steps the slot mutex would be released while doing system calls, which means that the effective_xmin and effective_catalog_xmin could advance during that time, detecting a conflict cause different than the one originally wanted before the process owning a slot is terminated. Holding the mutex longer is not an option, so this commit changes the code to record the LSNs stored in the slot during the termination of the process owning the slot. Bonus thanks to Alexander Lakhin for the various tests and the analysis. Author: Bertrand Drouvot Reviewed-by: Michael Paquier, Bharath Rupireddy Discussion: https://postgr.es/m/ZaTjW2Xh+TQUCOH0@ip-10-97-1-34.eu-west-3.compute.internal Backpatch-through: 16
1 parent 01ec4d8 commit 818fefd

File tree

1 file changed

+33
-6
lines changed

1 file changed

+33
-6
lines changed

src/backend/replication/slot.c

+33-6
Original file line numberDiff line numberDiff line change
@@ -1454,6 +1454,11 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
14541454
{
14551455
int last_signaled_pid = 0;
14561456
bool released_lock = false;
1457+
bool terminated = false;
1458+
XLogRecPtr initial_effective_xmin = InvalidXLogRecPtr;
1459+
XLogRecPtr initial_catalog_effective_xmin = InvalidXLogRecPtr;
1460+
XLogRecPtr initial_restart_lsn = InvalidXLogRecPtr;
1461+
ReplicationSlotInvalidationCause conflict_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE;
14571462

14581463
for (;;)
14591464
{
@@ -1488,11 +1493,24 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
14881493
*/
14891494
if (s->data.invalidated == RS_INVAL_NONE)
14901495
{
1496+
/*
1497+
* The slot's mutex will be released soon, and it is possible that
1498+
* those values change since the process holding the slot has been
1499+
* terminated (if any), so record them here to ensure that we
1500+
* would report the correct conflict cause.
1501+
*/
1502+
if (!terminated)
1503+
{
1504+
initial_restart_lsn = s->data.restart_lsn;
1505+
initial_effective_xmin = s->effective_xmin;
1506+
initial_catalog_effective_xmin = s->effective_catalog_xmin;
1507+
}
1508+
14911509
switch (cause)
14921510
{
14931511
case RS_INVAL_WAL_REMOVED:
1494-
if (s->data.restart_lsn != InvalidXLogRecPtr &&
1495-
s->data.restart_lsn < oldestLSN)
1512+
if (initial_restart_lsn != InvalidXLogRecPtr &&
1513+
initial_restart_lsn < oldestLSN)
14961514
conflict = cause;
14971515
break;
14981516
case RS_INVAL_HORIZON:
@@ -1501,12 +1519,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
15011519
/* invalid DB oid signals a shared relation */
15021520
if (dboid != InvalidOid && dboid != s->data.database)
15031521
break;
1504-
if (TransactionIdIsValid(s->effective_xmin) &&
1505-
TransactionIdPrecedesOrEquals(s->effective_xmin,
1522+
if (TransactionIdIsValid(initial_effective_xmin) &&
1523+
TransactionIdPrecedesOrEquals(initial_effective_xmin,
15061524
snapshotConflictHorizon))
15071525
conflict = cause;
1508-
else if (TransactionIdIsValid(s->effective_catalog_xmin) &&
1509-
TransactionIdPrecedesOrEquals(s->effective_catalog_xmin,
1526+
else if (TransactionIdIsValid(initial_catalog_effective_xmin) &&
1527+
TransactionIdPrecedesOrEquals(initial_catalog_effective_xmin,
15101528
snapshotConflictHorizon))
15111529
conflict = cause;
15121530
break;
@@ -1519,6 +1537,13 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
15191537
}
15201538
}
15211539

1540+
/*
1541+
* The conflict cause recorded previously should not change while the
1542+
* process owning the slot (if any) has been terminated.
1543+
*/
1544+
Assert(!(conflict_prev != RS_INVAL_NONE && terminated &&
1545+
conflict_prev != conflict));
1546+
15221547
/* if there's no conflict, we're done */
15231548
if (conflict == RS_INVAL_NONE)
15241549
{
@@ -1601,6 +1626,8 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
16011626
(void) kill(active_pid, SIGTERM);
16021627

16031628
last_signaled_pid = active_pid;
1629+
terminated = true;
1630+
conflict_prev = conflict;
16041631
}
16051632

16061633
/* Wait until the slot is released. */

0 commit comments

Comments
 (0)