35
35
#include "access/clog.h"
36
36
#include "access/slru.h"
37
37
#include "access/transam.h"
38
+ #include "access/twophase.h"
38
39
#include "access/xlog.h"
39
40
#include "access/xloginsert.h"
40
41
#include "access/xlogutils.h"
41
42
#include "miscadmin.h"
42
43
#include "pg_trace.h"
44
+ #include "storage/proc.h"
43
45
44
46
/*
45
47
* Defines for CLOG page sizes. A page is the same BLCKSZ as is used
@@ -86,11 +88,17 @@ static void WriteZeroPageXlogRec(int pageno);
86
88
static void WriteTruncateXlogRec (int pageno );
87
89
static void TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
88
90
TransactionId * subxids , XidStatus status ,
89
- XLogRecPtr lsn , int pageno );
91
+ XLogRecPtr lsn , int pageno ,
92
+ bool all_xact_same_page );
90
93
static void TransactionIdSetStatusBit (TransactionId xid , XidStatus status ,
91
94
XLogRecPtr lsn , int slotno );
92
95
static void set_status_by_pages (int nsubxids , TransactionId * subxids ,
93
96
XidStatus status , XLogRecPtr lsn );
97
+ static bool TransactionGroupUpdateXidStatus (TransactionId xid , XidStatus status ,
98
+ XLogRecPtr lsn , int pageno );
99
+ static void TransactionIdSetPageStatusInternal (TransactionId xid , int nsubxids ,
100
+ TransactionId * subxids , XidStatus status ,
101
+ XLogRecPtr lsn , int pageno );
94
102
95
103
96
104
/*
@@ -173,7 +181,7 @@ TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
173
181
* Set the parent and all subtransactions in a single call
174
182
*/
175
183
TransactionIdSetPageStatus (xid , nsubxids , subxids , status , lsn ,
176
- pageno );
184
+ pageno , true );
177
185
}
178
186
else
179
187
{
@@ -200,7 +208,7 @@ TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
200
208
*/
201
209
pageno = TransactionIdToPage (xid );
202
210
TransactionIdSetPageStatus (xid , nsubxids_on_first_page , subxids , status ,
203
- lsn , pageno );
211
+ lsn , pageno , false );
204
212
205
213
/*
206
214
* Now work through the rest of the subxids one clog page at a time,
@@ -238,7 +246,7 @@ set_status_by_pages(int nsubxids, TransactionId *subxids,
238
246
239
247
TransactionIdSetPageStatus (InvalidTransactionId ,
240
248
num_on_page , subxids + offset ,
241
- status , lsn , pageno );
249
+ status , lsn , pageno , false );
242
250
offset = i ;
243
251
pageno = TransactionIdToPage (subxids [offset ]);
244
252
}
@@ -248,21 +256,78 @@ set_status_by_pages(int nsubxids, TransactionId *subxids,
248
256
* Record the final state of transaction entries in the commit log for
249
257
* all entries on a single page. Atomic only on this page.
250
258
*
251
- * Otherwise API is same as TransactionIdSetTreeStatus()
259
+ * When there is contention on CLogControlLock, we try to group multiple
260
+ * updates; a single leader process will perform transaction status updates
261
+ * for multiple backends so that the number of times CLogControlLock needs
262
+ * to be acquired is reduced. We don't try to do this if a process has
263
+ * overflowed the subxids array in its PGPROC, since in that case we
264
+ * don't have a complete list of XIDs for it. We also skip it if a process
265
+ * has XIDs on more than one CLOG page, or on a different CLOG page than
266
+ * processes already waiting for a group update. This latter condition
267
+ * has a race condition (see TransactionGroupUpdateXidStatus) but the
268
+ * worst thing that happens if we mess up is a small loss of efficiency;
269
+ * the intent is to avoid having the leader access pages it wouldn't
270
+ * otherwise need to touch. Finally, we skip it for prepared transactions,
271
+ * which don't have the semaphore we would need for this optimization,
272
+ * and which are anyway probably not all that common.
252
273
*/
253
274
static void
254
275
TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
255
276
TransactionId * subxids , XidStatus status ,
256
- XLogRecPtr lsn , int pageno )
277
+ XLogRecPtr lsn , int pageno ,
278
+ bool all_xact_same_page )
279
+ {
280
+ if (all_xact_same_page &&
281
+ nsubxids < PGPROC_MAX_CACHED_SUBXIDS &&
282
+ !IsGXactActive ())
283
+ {
284
+ /*
285
+ * If we can immediately acquire CLogControlLock, we update the status
286
+ * of our own XID and release the lock. If not, try use group XID
287
+ * update. If that doesn't work out, fall back to waiting for the
288
+ * lock to perform an update for this transaction only.
289
+ */
290
+ if (LWLockConditionalAcquire (CLogControlLock , LW_EXCLUSIVE ))
291
+ {
292
+ TransactionIdSetPageStatusInternal (xid , nsubxids , subxids , status , lsn , pageno );
293
+ LWLockRelease (CLogControlLock );
294
+ }
295
+ else if (!TransactionGroupUpdateXidStatus (xid , status , lsn , pageno ))
296
+ {
297
+ LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
298
+
299
+ TransactionIdSetPageStatusInternal (xid , nsubxids , subxids , status , lsn , pageno );
300
+
301
+ LWLockRelease (CLogControlLock );
302
+ }
303
+ }
304
+ else
305
+ {
306
+ LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
307
+
308
+ TransactionIdSetPageStatusInternal (xid , nsubxids , subxids , status , lsn , pageno );
309
+
310
+ LWLockRelease (CLogControlLock );
311
+ }
312
+ }
313
+
314
+ /*
315
+ * Record the final state of transaction entry in the commit log
316
+ *
317
+ * We don't do any locking here; caller must handle that.
318
+ */
319
+ static void
320
+ TransactionIdSetPageStatusInternal (TransactionId xid , int nsubxids ,
321
+ TransactionId * subxids , XidStatus status ,
322
+ XLogRecPtr lsn , int pageno )
257
323
{
258
324
int slotno ;
259
325
int i ;
260
326
261
327
Assert (status == TRANSACTION_STATUS_COMMITTED ||
262
328
status == TRANSACTION_STATUS_ABORTED ||
263
329
(status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid (xid )));
264
-
265
- LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
330
+ Assert (LWLockHeldByMeInMode (CLogControlLock , LW_EXCLUSIVE ));
266
331
267
332
/*
268
333
* If we're doing an async commit (ie, lsn is valid), then we must wait
@@ -310,8 +375,166 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
310
375
}
311
376
312
377
ClogCtl -> shared -> page_dirty [slotno ] = true;
378
+ }
313
379
380
+ /*
381
+ * When we cannot immediately acquire CLogControlLock in exclusive mode at
382
+ * commit time, add ourselves to a list of processes that need their XIDs
383
+ * status update. The first process to add itself to the list will acquire
384
+ * CLogControlLock in exclusive mode and set transaction status as required
385
+ * on behalf of all group members. This avoids a great deal of contention
386
+ * around CLogControlLock when many processes are trying to commit at once,
387
+ * since the lock need not be repeatedly handed off from one committing
388
+ * process to the next.
389
+ *
390
+ * Returns true when transaction status has been updated in clog; returns
391
+ * false if we decided against applying the optimization because the page
392
+ * number we need to update differs from those processes already waiting.
393
+ */
394
+ static bool
395
+ TransactionGroupUpdateXidStatus (TransactionId xid , XidStatus status ,
396
+ XLogRecPtr lsn , int pageno )
397
+ {
398
+ volatile PROC_HDR * procglobal = ProcGlobal ;
399
+ PGPROC * proc = MyProc ;
400
+ uint32 nextidx ;
401
+ uint32 wakeidx ;
402
+
403
+ /* We should definitely have an XID whose status needs to be updated. */
404
+ Assert (TransactionIdIsValid (xid ));
405
+
406
+ /*
407
+ * Add ourselves to the list of processes needing a group XID status
408
+ * update.
409
+ */
410
+ proc -> clogGroupMember = true;
411
+ proc -> clogGroupMemberXid = xid ;
412
+ proc -> clogGroupMemberXidStatus = status ;
413
+ proc -> clogGroupMemberPage = pageno ;
414
+ proc -> clogGroupMemberLsn = lsn ;
415
+
416
+ nextidx = pg_atomic_read_u32 (& procglobal -> clogGroupFirst );
417
+
418
+ while (true)
419
+ {
420
+ /*
421
+ * Add the proc to list, if the clog page where we need to update the
422
+ * current transaction status is same as group leader's clog page.
423
+ *
424
+ * There is a race condition here, which is that after doing the below
425
+ * check and before adding this proc's clog update to a group, the
426
+ * group leader might have already finished the group update for this
427
+ * page and becomes group leader of another group. This will lead to a
428
+ * situation where a single group can have different clog page
429
+ * updates. This isn't likely and will still work, just maybe a bit
430
+ * less efficiently.
431
+ */
432
+ if (nextidx != INVALID_PGPROCNO &&
433
+ ProcGlobal -> allProcs [nextidx ].clogGroupMemberPage != proc -> clogGroupMemberPage )
434
+ {
435
+ proc -> clogGroupMember = false;
436
+ return false;
437
+ }
438
+
439
+ pg_atomic_write_u32 (& proc -> clogGroupNext , nextidx );
440
+
441
+ if (pg_atomic_compare_exchange_u32 (& procglobal -> clogGroupFirst ,
442
+ & nextidx ,
443
+ (uint32 ) proc -> pgprocno ))
444
+ break ;
445
+ }
446
+
447
+ /*
448
+ * If the list was not empty, the leader will update the status of our
449
+ * XID. It is impossible to have followers without a leader because the
450
+ * first process that has added itself to the list will always have
451
+ * nextidx as INVALID_PGPROCNO.
452
+ */
453
+ if (nextidx != INVALID_PGPROCNO )
454
+ {
455
+ int extraWaits = 0 ;
456
+
457
+ /* Sleep until the leader updates our XID status. */
458
+ for (;;)
459
+ {
460
+ /* acts as a read barrier */
461
+ PGSemaphoreLock (proc -> sem );
462
+ if (!proc -> clogGroupMember )
463
+ break ;
464
+ extraWaits ++ ;
465
+ }
466
+
467
+ Assert (pg_atomic_read_u32 (& proc -> clogGroupNext ) == INVALID_PGPROCNO );
468
+
469
+ /* Fix semaphore count for any absorbed wakeups */
470
+ while (extraWaits -- > 0 )
471
+ PGSemaphoreUnlock (proc -> sem );
472
+ return true;
473
+ }
474
+
475
+ /* We are the leader. Acquire the lock on behalf of everyone. */
476
+ LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
477
+
478
+ /*
479
+ * Now that we've got the lock, clear the list of processes waiting for
480
+ * group XID status update, saving a pointer to the head of the list.
481
+ * Trying to pop elements one at a time could lead to an ABA problem.
482
+ */
483
+ nextidx = pg_atomic_exchange_u32 (& procglobal -> clogGroupFirst , INVALID_PGPROCNO );
484
+
485
+ /* Remember head of list so we can perform wakeups after dropping lock. */
486
+ wakeidx = nextidx ;
487
+
488
+ /* Walk the list and update the status of all XIDs. */
489
+ while (nextidx != INVALID_PGPROCNO )
490
+ {
491
+ PGPROC * proc = & ProcGlobal -> allProcs [nextidx ];
492
+ PGXACT * pgxact = & ProcGlobal -> allPgXact [nextidx ];
493
+
494
+ /*
495
+ * Overflowed transactions should not use group XID status update
496
+ * mechanism.
497
+ */
498
+ Assert (!pgxact -> overflowed );
499
+
500
+ TransactionIdSetPageStatusInternal (proc -> clogGroupMemberXid ,
501
+ pgxact -> nxids ,
502
+ proc -> subxids .xids ,
503
+ proc -> clogGroupMemberXidStatus ,
504
+ proc -> clogGroupMemberLsn ,
505
+ proc -> clogGroupMemberPage );
506
+
507
+ /* Move to next proc in list. */
508
+ nextidx = pg_atomic_read_u32 (& proc -> clogGroupNext );
509
+ }
510
+
511
+ /* We're done with the lock now. */
314
512
LWLockRelease (CLogControlLock );
513
+
514
+ /*
515
+ * Now that we've released the lock, go back and wake everybody up. We
516
+ * don't do this under the lock so as to keep lock hold times to a
517
+ * minimum. The system calls we need to perform to wake other processes
518
+ * up are probably slower and can cause performance slowdown if done under
519
+ * lock.
520
+ */
521
+ while (wakeidx != INVALID_PGPROCNO )
522
+ {
523
+ PGPROC * proc = & ProcGlobal -> allProcs [wakeidx ];
524
+
525
+ wakeidx = pg_atomic_read_u32 (& proc -> clogGroupNext );
526
+ pg_atomic_write_u32 (& proc -> clogGroupNext , INVALID_PGPROCNO );
527
+
528
+ /* ensure all previous writes are visible before follower continues. */
529
+ pg_write_barrier ();
530
+
531
+ proc -> clogGroupMember = false;
532
+
533
+ if (proc != MyProc )
534
+ PGSemaphoreUnlock (proc -> sem );
535
+ }
536
+
537
+ return true;
315
538
}
316
539
317
540
/*
0 commit comments