@@ -157,6 +157,7 @@ ReplicationSlotsShmemInit(void)
157
157
/* everything else is zeroed by the memset above */
158
158
SpinLockInit (& slot -> mutex );
159
159
LWLockInitialize (& slot -> io_in_progress_lock , LWTRANCHE_REPLICATION_SLOT_IO_IN_PROGRESS );
160
+ ConditionVariableInit (& slot -> active_cv );
160
161
}
161
162
}
162
163
}
@@ -313,61 +314,102 @@ ReplicationSlotCreate(const char *name, bool db_specific,
313
314
LWLockRelease (ReplicationSlotControlLock );
314
315
315
316
/*
316
- * Now that the slot has been marked as in_use and in_active , it's safe to
317
+ * Now that the slot has been marked as in_use and active , it's safe to
317
318
* let somebody else try to allocate a slot.
318
319
*/
319
320
LWLockRelease (ReplicationSlotAllocationLock );
321
+
322
+ /* Let everybody know we've modified this slot */
323
+ ConditionVariableBroadcast (& slot -> active_cv );
320
324
}
321
325
322
326
/*
323
327
* Find a previously created slot and mark it as used by this backend.
324
328
*/
325
329
void
326
- ReplicationSlotAcquire (const char * name )
330
+ ReplicationSlotAcquire (const char * name , bool nowait )
327
331
{
328
- ReplicationSlot * slot = NULL ;
332
+ ReplicationSlot * slot ;
333
+ int active_pid ;
329
334
int i ;
330
- int active_pid = 0 ; /* Keep compiler quiet */
331
335
336
+ retry :
332
337
Assert (MyReplicationSlot == NULL );
333
338
334
- /* Search for the named slot and mark it active if we find it. */
339
+ /*
340
+ * Search for the named slot and mark it active if we find it. If the
341
+ * slot is already active, we exit the loop with active_pid set to the PID
342
+ * of the backend that owns it.
343
+ */
344
+ active_pid = 0 ;
345
+ slot = NULL ;
335
346
LWLockAcquire (ReplicationSlotControlLock , LW_SHARED );
336
347
for (i = 0 ; i < max_replication_slots ; i ++ )
337
348
{
338
349
ReplicationSlot * s = & ReplicationSlotCtl -> replication_slots [i ];
339
350
340
351
if (s -> in_use && strcmp (name , NameStr (s -> data .name )) == 0 )
341
352
{
353
+ /*
354
+ * This is the slot we want. We don't know yet if it's active,
355
+ * so get ready to sleep on it in case it is. (We may end up not
356
+ * sleeping, but we must prepare before taking the spinlock, not while
357
+ * holding it.)
358
+ */
359
+ ConditionVariablePrepareToSleep (& s -> active_cv );
360
+
342
361
SpinLockAcquire (& s -> mutex );
362
+
343
363
active_pid = s -> active_pid ;
344
364
if (active_pid == 0 )
345
365
active_pid = s -> active_pid = MyProcPid ;
366
+
346
367
SpinLockRelease (& s -> mutex );
347
368
slot = s ;
369
+
348
370
break ;
349
371
}
350
372
}
351
373
LWLockRelease (ReplicationSlotControlLock );
352
374
353
- /* If we did not find the slot or it was already active , error out. */
375
+ /* If we did not find the slot, error out. */
354
376
if (slot == NULL )
355
377
ereport (ERROR ,
356
378
(errcode (ERRCODE_UNDEFINED_OBJECT ),
357
379
errmsg ("replication slot \"%s\" does not exist" , name )));
380
+
381
+ /*
382
+ * If we found the slot but it's already active in another backend, we
383
+ * either error out or wait to be signaled and then retry, as caller specified.
384
+ */
358
385
if (active_pid != MyProcPid )
359
- ereport (ERROR ,
360
- (errcode (ERRCODE_OBJECT_IN_USE ),
361
- errmsg ("replication slot \"%s\" is active for PID %d" ,
362
- name , active_pid )));
386
+ {
387
+ if (nowait )
388
+ ereport (ERROR ,
389
+ (errcode (ERRCODE_OBJECT_IN_USE ),
390
+ errmsg ("replication slot \"%s\" is active for PID %d" ,
391
+ name , active_pid )));
392
+
393
+ /* Sleep until the slot's condition variable is signaled, then retry from the top */
394
+ ConditionVariableSleep (& slot -> active_cv , PG_WAIT_LOCK );
395
+ ConditionVariableCancelSleep ();
396
+ goto retry ;
397
+ }
398
+ else
399
+ ConditionVariableCancelSleep (); /* no sleep needed after all */
400
+
401
+ /* Let everybody know we've modified this slot */
402
+ ConditionVariableBroadcast (& slot -> active_cv );
363
403
364
404
/* We made this slot active, so it's ours now. */
365
405
MyReplicationSlot = slot ;
366
406
}
367
407
368
408
/*
369
- * Release a replication slot, this or another backend can ReAcquire it
370
- * later. Resources this slot requires will be preserved.
409
+ * Release the replication slot that this backend considers itself to own.
410
+ *
411
+ * This or another backend can re-acquire the slot later.
412
+ * Resources this slot requires will be preserved.
371
413
*/
372
414
void
373
415
ReplicationSlotRelease (void )
@@ -385,17 +427,6 @@ ReplicationSlotRelease(void)
385
427
*/
386
428
ReplicationSlotDropAcquired ();
387
429
}
388
- else if (slot -> data .persistency == RS_PERSISTENT )
389
- {
390
- /*
391
- * Mark persistent slot inactive. We're not freeing it, just
392
- * disconnecting.
393
- */
394
- SpinLockAcquire (& slot -> mutex );
395
- slot -> active_pid = 0 ;
396
- SpinLockRelease (& slot -> mutex );
397
- }
398
-
399
430
400
431
/*
401
432
* If slot needed to temporarily restrain both data and catalog xmin to
@@ -412,6 +443,18 @@ ReplicationSlotRelease(void)
412
443
ReplicationSlotsComputeRequiredXmin (false);
413
444
}
414
445
446
+ if (slot -> data .persistency == RS_PERSISTENT )
447
+ {
448
+ /*
449
+ * Mark persistent slot inactive. We're not freeing it, just
450
+ * disconnecting, but wake up others that may be waiting for it.
451
+ */
452
+ SpinLockAcquire (& slot -> mutex );
453
+ slot -> active_pid = 0 ;
454
+ SpinLockRelease (& slot -> mutex );
455
+ ConditionVariableBroadcast (& slot -> active_cv );
456
+ }
457
+
415
458
MyReplicationSlot = NULL ;
416
459
417
460
/* might not have been set when we've been a plain slot */
@@ -430,32 +473,43 @@ ReplicationSlotCleanup(void)
430
473
431
474
Assert (MyReplicationSlot == NULL );
432
475
433
- /*
434
- * No need for locking as we are only interested in slots active in
435
- * current process and those are not touched by other processes.
436
- */
476
+ restart :
477
+ LWLockAcquire (ReplicationSlotControlLock , LW_SHARED );
437
478
for (i = 0 ; i < max_replication_slots ; i ++ )
438
479
{
439
480
ReplicationSlot * s = & ReplicationSlotCtl -> replication_slots [i ];
440
481
482
+ if (!s -> in_use )
483
+ continue ;
484
+
485
+ SpinLockAcquire (& s -> mutex );
441
486
if (s -> active_pid == MyProcPid )
442
487
{
443
- Assert (s -> in_use && s -> data .persistency == RS_TEMPORARY );
488
+ Assert (s -> data .persistency == RS_TEMPORARY );
489
+ SpinLockRelease (& s -> mutex );
490
+ LWLockRelease (ReplicationSlotControlLock ); /* avoid deadlock */
444
491
445
492
ReplicationSlotDropPtr (s );
493
+
494
+ ConditionVariableBroadcast (& s -> active_cv );
495
+ goto restart ;
446
496
}
497
+ else
498
+ SpinLockRelease (& s -> mutex );
447
499
}
500
+
501
+ LWLockRelease (ReplicationSlotControlLock );
448
502
}
449
503
450
504
/*
451
505
* Permanently drop replication slot identified by the passed in name.
452
506
*/
453
507
void
454
- ReplicationSlotDrop (const char * name )
508
+ ReplicationSlotDrop (const char * name , bool nowait )
455
509
{
456
510
Assert (MyReplicationSlot == NULL );
457
511
458
- ReplicationSlotAcquire (name );
512
+ ReplicationSlotAcquire (name , nowait );
459
513
460
514
ReplicationSlotDropAcquired ();
461
515
}
@@ -527,6 +581,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
527
581
slot -> active_pid = 0 ;
528
582
SpinLockRelease (& slot -> mutex );
529
583
584
+ /* wake up anyone waiting on this slot */
585
+ ConditionVariableBroadcast (& slot -> active_cv );
586
+
530
587
ereport (fail_softly ? WARNING : ERROR ,
531
588
(errcode_for_file_access (),
532
589
errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
@@ -535,15 +592,18 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
535
592
536
593
/*
537
594
* The slot is definitely gone. Lock out concurrent scans of the array
538
- * long enough to kill it. It's OK to clear the active flag here without
595
+ * long enough to kill it. It's OK to clear the active PID here without
539
596
* grabbing the mutex because nobody else can be scanning the array here,
540
597
* and nobody can be attached to this slot and thus access it without
541
598
* scanning the array.
599
+ *
600
+ * Also wake up processes waiting for it.
542
601
*/
543
602
LWLockAcquire (ReplicationSlotControlLock , LW_EXCLUSIVE );
544
603
slot -> active_pid = 0 ;
545
604
slot -> in_use = false;
546
605
LWLockRelease (ReplicationSlotControlLock );
606
+ ConditionVariableBroadcast (& slot -> active_cv );
547
607
548
608
/*
549
609
* Slot is dead and doesn't prevent resource removal anymore, recompute
0 commit comments