Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit b2c95a3

Browse files
committed
Fix replication origin-related race conditions
Similar to what was fixed in commit 9915de6 for replication slots, but this time it's related to replication origins: DROP SUBSCRIPTION attempts to drop the replication origin, but that fails if the replication worker process hasn't yet marked it unused. This causes failures in the buildfarm: "ERROR: could not drop replication origin with OID 1, in use by PID 34069". Like the aforementioned commit, fix by having the process running DROP SUBSCRIPTION sleep until the worker marks the replication origin struct as free. This uses a condition variable on each replication origin shmem state struct, so that the session trying to drop can sleep and expect to be awakened by the process keeping the origin open. Also fix an SGML markup error in the previous commit. Discussion: https://postgr.es/m/20170808001433.rozlseaf4m2wkw3n@alvherre.pgsql
1 parent 030273b commit b2c95a3

File tree

6 files changed

+58
-15
lines changed

6 files changed

+58
-15
lines changed

doc/src/sgml/monitoring.sgml

+6-2
Original file line numberDiff line numberDiff line change
@@ -1222,11 +1222,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
12221222
</row>
12231223
<row>
12241224
<entry><literal>LibPQWalReceiverConnect</></entry>
1225-
<entry>Waiting in WAL receiver to establish connection to remote server.<entry>
1225+
<entry>Waiting in WAL receiver to establish connection to remote server.</entry>
12261226
</row>
12271227
<row>
12281228
<entry><literal>LibPQWalReceiverReceive</></entry>
1229-
<entry>Waiting in WAL receiver to receive data from remote server.<entry>
1229+
<entry>Waiting in WAL receiver to receive data from remote server.</entry>
12301230
</row>
12311231
<row>
12321232
<entry><literal>SSLOpenServer</></entry>
@@ -1302,6 +1302,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
13021302
<entry><literal>ProcArrayGroupUpdate</></entry>
13031303
<entry>Waiting for group leader to clear transaction id at transaction end.</entry>
13041304
</row>
1305+
<row>
1306+
<entry><literal>ReplicationOriginDrop</></entry>
1307+
<entry>Waiting for a replication origin to become inactive to be dropped.</entry>
1308+
</row>
13051309
<row>
13061310
<entry><literal>ReplicationSlotDrop</></entry>
13071311
<entry>Waiting for a replication slot to become inactive to be dropped.</entry>

src/backend/commands/subscriptioncmds.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel)
939939
snprintf(originname, sizeof(originname), "pg_%u", subid);
940940
originid = replorigin_by_name(originname, true);
941941
if (originid != InvalidRepOriginId)
942-
replorigin_drop(originid);
942+
replorigin_drop(originid, false);
943943

944944
/*
945945
* If there is no slot associated with the subscription, we can finish

src/backend/postmaster/pgstat.c

+3
Original file line numberDiff line numberDiff line change
@@ -3609,6 +3609,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
36093609
case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
36103610
event_name = "ProcArrayGroupUpdate";
36113611
break;
3612+
case WAIT_EVENT_REPLICATION_ORIGIN_DROP:
3613+
event_name = "ReplicationOriginDrop";
3614+
break;
36123615
case WAIT_EVENT_REPLICATION_SLOT_DROP:
36133616
event_name = "ReplicationSlotDrop";
36143617
break;

src/backend/replication/logical/origin.c

+46-11
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,15 @@
7979
#include "access/xact.h"
8080

8181
#include "catalog/indexing.h"
82-
8382
#include "nodes/execnodes.h"
8483

8584
#include "replication/origin.h"
8685
#include "replication/logical.h"
87-
86+
#include "pgstat.h"
8887
#include "storage/fd.h"
8988
#include "storage/ipc.h"
9089
#include "storage/lmgr.h"
90+
#include "storage/condition_variable.h"
9191
#include "storage/copydir.h"
9292

9393
#include "utils/builtins.h"
@@ -124,6 +124,11 @@ typedef struct ReplicationState
124124
*/
125125
int acquired_by;
126126

127+
/*
128+
* Condition variable that's signalled when acquired_by changes.
129+
*/
130+
ConditionVariable origin_cv;
131+
127132
/*
128133
* Lock protecting remote_lsn and local_lsn.
129134
*/
@@ -324,16 +329,18 @@ replorigin_create(char *roname)
324329
* Needs to be called in a transaction.
325330
*/
326331
void
327-
replorigin_drop(RepOriginId roident)
332+
replorigin_drop(RepOriginId roident, bool nowait)
328333
{
329-
HeapTuple tuple = NULL;
334+
HeapTuple tuple;
330335
Relation rel;
331336
int i;
332337

333338
Assert(IsTransactionState());
334339

335340
rel = heap_open(ReplicationOriginRelationId, ExclusiveLock);
336341

342+
restart:
343+
tuple = NULL;
337344
/* cleanup the slot state info */
338345
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
339346

@@ -346,11 +353,21 @@ replorigin_drop(RepOriginId roident)
346353
{
347354
if (state->acquired_by != 0)
348355
{
349-
ereport(ERROR,
350-
(errcode(ERRCODE_OBJECT_IN_USE),
351-
errmsg("could not drop replication origin with OID %d, in use by PID %d",
352-
state->roident,
353-
state->acquired_by)));
356+
ConditionVariable *cv;
357+
358+
if (nowait)
359+
ereport(ERROR,
360+
(errcode(ERRCODE_OBJECT_IN_USE),
361+
errmsg("could not drop replication origin with OID %d, in use by PID %d",
362+
state->roident,
363+
state->acquired_by)));
364+
cv = &state->origin_cv;
365+
366+
LWLockRelease(ReplicationOriginLock);
367+
ConditionVariablePrepareToSleep(cv);
368+
ConditionVariableSleep(cv, WAIT_EVENT_REPLICATION_ORIGIN_DROP);
369+
ConditionVariableCancelSleep();
370+
goto restart;
354371
}
355372

356373
/* first WAL log */
@@ -382,7 +399,7 @@ replorigin_drop(RepOriginId roident)
382399

383400
CommandCounterIncrement();
384401

385-
/* now release lock again, */
402+
/* now release lock again */
386403
heap_close(rel, ExclusiveLock);
387404
}
388405

@@ -476,8 +493,11 @@ ReplicationOriginShmemInit(void)
476493
MemSet(replication_states, 0, ReplicationOriginShmemSize());
477494

478495
for (i = 0; i < max_replication_slots; i++)
496+
{
479497
LWLockInitialize(&replication_states[i].lock,
480498
replication_states_ctl->tranche_id);
499+
ConditionVariableInit(&replication_states[i].origin_cv);
500+
}
481501
}
482502

483503
LWLockRegisterTranche(replication_states_ctl->tranche_id,
@@ -957,16 +977,23 @@ replorigin_get_progress(RepOriginId node, bool flush)
957977
static void
958978
ReplicationOriginExitCleanup(int code, Datum arg)
959979
{
980+
ConditionVariable *cv = NULL;
981+
960982
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
961983

962984
if (session_replication_state != NULL &&
963985
session_replication_state->acquired_by == MyProcPid)
964986
{
987+
cv = &session_replication_state->origin_cv;
988+
965989
session_replication_state->acquired_by = 0;
966990
session_replication_state = NULL;
967991
}
968992

969993
LWLockRelease(ReplicationOriginLock);
994+
995+
if (cv)
996+
ConditionVariableBroadcast(cv);
970997
}
971998

972999
/*
@@ -1056,6 +1083,9 @@ replorigin_session_setup(RepOriginId node)
10561083
session_replication_state->acquired_by = MyProcPid;
10571084

10581085
LWLockRelease(ReplicationOriginLock);
1086+
1087+
/* probably this one is pointless */
1088+
ConditionVariableBroadcast(&session_replication_state->origin_cv);
10591089
}
10601090

10611091
/*
@@ -1067,6 +1097,8 @@ replorigin_session_setup(RepOriginId node)
10671097
void
10681098
replorigin_session_reset(void)
10691099
{
1100+
ConditionVariable *cv;
1101+
10701102
Assert(max_replication_slots != 0);
10711103

10721104
if (session_replication_state == NULL)
@@ -1077,9 +1109,12 @@ replorigin_session_reset(void)
10771109
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
10781110

10791111
session_replication_state->acquired_by = 0;
1112+
cv = &session_replication_state->origin_cv;
10801113
session_replication_state = NULL;
10811114

10821115
LWLockRelease(ReplicationOriginLock);
1116+
1117+
ConditionVariableBroadcast(cv);
10831118
}
10841119

10851120
/*
@@ -1170,7 +1205,7 @@ pg_replication_origin_drop(PG_FUNCTION_ARGS)
11701205
roident = replorigin_by_name(name, false);
11711206
Assert(OidIsValid(roident));
11721207

1173-
replorigin_drop(roident);
1208+
replorigin_drop(roident, false);
11741209

11751210
pfree(name);
11761211

src/include/pgstat.h

+1
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,7 @@ typedef enum
812812
WAIT_EVENT_PARALLEL_FINISH,
813813
WAIT_EVENT_PARALLEL_BITMAP_SCAN,
814814
WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
815+
WAIT_EVENT_REPLICATION_ORIGIN_DROP,
815816
WAIT_EVENT_REPLICATION_SLOT_DROP,
816817
WAIT_EVENT_SAFE_SNAPSHOT,
817818
WAIT_EVENT_SYNC_REP

src/include/replication/origin.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ extern PGDLLIMPORT TimestampTz replorigin_session_origin_timestamp;
4141
/* API for querying & manipulating replication origins */
4242
extern RepOriginId replorigin_by_name(char *name, bool missing_ok);
4343
extern RepOriginId replorigin_create(char *name);
44-
extern void replorigin_drop(RepOriginId roident);
44+
extern void replorigin_drop(RepOriginId roident, bool nowait);
4545
extern bool replorigin_by_oid(RepOriginId roident, bool missing_ok,
4646
char **roname);
4747

0 commit comments

Comments
 (0)