@@ -302,6 +302,11 @@ static bool doPageWrites;
302
302
* so it's a plain spinlock. The other locks are held longer (potentially
303
303
* over I/O operations), so we use LWLocks for them. These locks are:
304
304
*
305
+ * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
306
+ * It is only held while initializing and changing the mapping. If the
307
+ * contents of the buffer being replaced haven't been written yet, the mapping
308
+ * lock is released while the write is done, and reacquired afterwards.
309
+ *
305
310
* WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
306
311
* XLogFlush).
307
312
*
@@ -468,32 +473,21 @@ typedef struct XLogCtlData
468
473
pg_atomic_uint64 logFlushResult ; /* last byte + 1 flushed */
469
474
470
475
/*
471
- * Latest reserved for inititalization page in the cache (last byte
472
- * position + 1).
476
+ * Latest initialized page in the cache (last byte position + 1).
473
477
*
474
- * To change the identity of a buffer, you need to advance
475
- * InitializeReserved first. To change the identity of a buffer that's
478
+ * To change the identity of a buffer (and InitializedUpTo), you need to
479
+ * hold WALBufMappingLock. To change the identity of a buffer that's
476
480
* still dirty, the old page needs to be written out first, and for that
477
481
* you need WALWriteLock, and you need to ensure that there are no
478
482
* in-progress insertions to the page by calling
479
483
* WaitXLogInsertionsToFinish().
480
484
*/
481
- pg_atomic_uint64 InitializeReserved ;
482
-
483
- /*
484
- * Latest initialized page in the cache (last byte position + 1).
485
- *
486
- * InitializedUpTo is updated after the buffer initialization. After
487
- * update, waiters got notification using InitializedUpToCondVar.
488
- */
489
- pg_atomic_uint64 InitializedUpTo ;
490
- ConditionVariable InitializedUpToCondVar ;
485
+ XLogRecPtr InitializedUpTo ;
491
486
492
487
/*
493
488
* These values do not change after startup, although the pointed-to pages
494
- * and xlblocks values certainly do. xlblocks values are changed
495
- * lock-free according to the check for the xlog write position and are
496
- * accompanied by changes of InitializeReserved and InitializedUpTo.
489
+ * and xlblocks values certainly do. xlblocks values are protected by
490
+ * WALBufMappingLock.
497
491
*/
498
492
char * pages ; /* buffers for unwritten XLOG pages */
499
493
pg_atomic_uint64 * xlblocks ; /* 1st byte ptr-s + XLOG_BLCKSZ */
@@ -816,9 +810,9 @@ XLogInsertRecord(XLogRecData *rdata,
816
810
* fullPageWrites from changing until the insertion is finished.
817
811
*
818
812
* Step 2 can usually be done completely in parallel. If the required WAL
819
- * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
820
- * which will ensure it is initialized. But the WAL writer tries to do that
821
- * ahead of insertions to avoid that from happening in the critical path.
813
+ * page is not initialized yet, you have to grab WALBufMappingLock to
814
+ * initialize it, but the WAL writer tries to do that ahead of insertions
815
+ * to avoid that from happening in the critical path.
822
816
*
823
817
*----------
824
818
*/
@@ -1997,70 +1991,32 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
1997
1991
XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr ;
1998
1992
XLogRecPtr NewPageBeginPtr ;
1999
1993
XLogPageHeader NewPage ;
2000
- XLogRecPtr ReservedPtr ;
2001
1994
int npages pg_attribute_unused () = 0 ;
2002
1995
2003
- /*
2004
- * We must run the loop below inside the critical section as we expect
2005
- * XLogCtl->InitializedUpTo to eventually keep up. The most of callers
2006
- * already run inside the critical section. Except for WAL writer, which
2007
- * passed 'opportunistic == true', and therefore we don't perform
2008
- * operations that could error out.
2009
- *
2010
- * Start an explicit critical section anyway though.
2011
- */
2012
- Assert (CritSectionCount > 0 || opportunistic );
2013
- START_CRIT_SECTION ();
1996
+ LWLockAcquire (WALBufMappingLock , LW_EXCLUSIVE );
2014
1997
2015
- /*--
2016
- * Loop till we get all the pages in WAL buffer before 'upto' reserved for
2017
- * initialization. Multiple process can initialize different buffers with
2018
- * this loop in parallel as following.
2019
- *
2020
- * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
2021
- * 2. Initialize the reserved page.
2022
- * 3. Attempt to advance XLogCtl->InitializedUpTo,
1998
+ /*
1999
+ * Now that we have the lock, check if someone initialized the page
2000
+ * already.
2023
2001
*/
2024
- ReservedPtr = pg_atomic_read_u64 (& XLogCtl -> InitializeReserved );
2025
- while (upto >= ReservedPtr || opportunistic )
2002
+ while (upto >= XLogCtl -> InitializedUpTo || opportunistic )
2026
2003
{
2027
- Assert ( ReservedPtr % XLOG_BLCKSZ == 0 );
2004
+ nextidx = XLogRecPtrToBufIdx ( XLogCtl -> InitializedUpTo );
2028
2005
2029
2006
/*
2030
- * Get ending-offset of the buffer page we need to replace.
2031
- *
2032
- * We don't lookup into xlblocks, but rather calculate position we
2033
- * must wait to be written. If it was written, xlblocks will have this
2034
- * position (or uninitialized)
2007
+ * Get ending-offset of the buffer page we need to replace (this may
2008
+ * be zero if the buffer hasn't been used yet). Fall through if it's
2009
+ * already written out.
2035
2010
*/
2036
- if (ReservedPtr + XLOG_BLCKSZ > XLOG_BLCKSZ * XLOGbuffers )
2037
- OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - XLOG_BLCKSZ * XLOGbuffers ;
2038
- else
2039
- OldPageRqstPtr = InvalidXLogRecPtr ;
2040
-
2041
- if (LogwrtResult .Write < OldPageRqstPtr && opportunistic )
2011
+ OldPageRqstPtr = pg_atomic_read_u64 (& XLogCtl -> xlblocks [nextidx ]);
2012
+ if (LogwrtResult .Write < OldPageRqstPtr )
2042
2013
{
2043
2014
/*
2044
- * If we just want to pre-initialize as much as we can without
2045
- * flushing, give up now.
2015
+ * Nope, got work to do. If we just want to pre-initialize as much
2016
+ * as we can without flushing, give up now.
2046
2017
*/
2047
- upto = ReservedPtr - 1 ;
2048
- break ;
2049
- }
2050
-
2051
- /*
2052
- * Attempt to reserve the page for initialization. Failure means that
2053
- * this page got reserved by another process.
2054
- */
2055
- if (!pg_atomic_compare_exchange_u64 (& XLogCtl -> InitializeReserved ,
2056
- & ReservedPtr ,
2057
- ReservedPtr + XLOG_BLCKSZ ))
2058
- continue ;
2059
-
2060
- /* Fall through if it's already written out. */
2061
- if (LogwrtResult .Write < OldPageRqstPtr )
2062
- {
2063
- /* Nope, got work to do. */
2018
+ if (opportunistic )
2019
+ break ;
2064
2020
2065
2021
/* Advance shared memory write request position */
2066
2022
SpinLockAcquire (& XLogCtl -> info_lck );
@@ -2075,6 +2031,14 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
2075
2031
RefreshXLogWriteResult (LogwrtResult );
2076
2032
if (LogwrtResult .Write < OldPageRqstPtr )
2077
2033
{
2034
+ /*
2035
+ * Must acquire write lock. Release WALBufMappingLock first,
2036
+ * to make sure that all insertions that we need to wait for
2037
+ * can finish (up to this same position). Otherwise we risk
2038
+ * deadlock.
2039
+ */
2040
+ LWLockRelease (WALBufMappingLock );
2041
+
2078
2042
WaitXLogInsertionsToFinish (OldPageRqstPtr );
2079
2043
2080
2044
LWLockAcquire (WALWriteLock , LW_EXCLUSIVE );
@@ -2096,24 +2060,20 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
2096
2060
pgWalUsage .wal_buffers_full ++ ;
2097
2061
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE ();
2098
2062
}
2063
+ /* Re-acquire WALBufMappingLock and retry */
2064
+ LWLockAcquire (WALBufMappingLock , LW_EXCLUSIVE );
2065
+ continue ;
2099
2066
}
2100
2067
}
2101
2068
2102
2069
/*
2103
2070
* Now the next buffer slot is free and we can set it up to be the
2104
2071
* next output page.
2105
2072
*/
2106
- NewPageBeginPtr = ReservedPtr ;
2073
+ NewPageBeginPtr = XLogCtl -> InitializedUpTo ;
2107
2074
NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ ;
2108
- nextidx = XLogRecPtrToBufIdx (ReservedPtr );
2109
2075
2110
- #ifdef USE_ASSERT_CHECKING
2111
- {
2112
- XLogRecPtr storedBound = pg_atomic_read_u64 (& XLogCtl -> xlblocks [nextidx ]);
2113
-
2114
- Assert (storedBound == OldPageRqstPtr || storedBound == InvalidXLogRecPtr );
2115
- }
2116
- #endif
2076
+ Assert (XLogRecPtrToBufIdx (NewPageBeginPtr ) == nextidx );
2117
2077
2118
2078
NewPage = (XLogPageHeader ) (XLogCtl -> pages + nextidx * (Size ) XLOG_BLCKSZ );
2119
2079
@@ -2179,50 +2139,11 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
2179
2139
pg_write_barrier ();
2180
2140
2181
2141
pg_atomic_write_u64 (& XLogCtl -> xlblocks [nextidx ], NewPageEndPtr );
2182
-
2183
- /*
2184
- * Try to advance XLogCtl->InitializedUpTo.
2185
- *
2186
- * If the CAS operation failed, then some of previous pages are not
2187
- * initialized yet, and this backend gives up.
2188
- *
2189
- * Since initializer of next page might give up on advancing of
2190
- * InitializedUpTo, this backend have to attempt advancing until it
2191
- * find page "in the past" or concurrent backend succeeded at
2192
- * advancing. When we finish advancing XLogCtl->InitializedUpTo, we
2193
- * notify all the waiters with XLogCtl->InitializedUpToCondVar.
2194
- */
2195
- while (pg_atomic_compare_exchange_u64 (& XLogCtl -> InitializedUpTo , & NewPageBeginPtr , NewPageEndPtr ))
2196
- {
2197
- NewPageBeginPtr = NewPageEndPtr ;
2198
- NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ ;
2199
- nextidx = XLogRecPtrToBufIdx (NewPageBeginPtr );
2200
-
2201
- if (pg_atomic_read_u64 (& XLogCtl -> xlblocks [nextidx ]) != NewPageEndPtr )
2202
- {
2203
- /*
2204
- * Page at nextidx wasn't initialized yet, so we cann't move
2205
- * InitializedUpto further. It will be moved by backend which
2206
- * will initialize nextidx.
2207
- */
2208
- ConditionVariableBroadcast (& XLogCtl -> InitializedUpToCondVar );
2209
- break ;
2210
- }
2211
- }
2142
+ XLogCtl -> InitializedUpTo = NewPageEndPtr ;
2212
2143
2213
2144
npages ++ ;
2214
2145
}
2215
-
2216
- END_CRIT_SECTION ();
2217
-
2218
- /*
2219
- * All the pages in WAL buffer before 'upto' were reserved for
2220
- * initialization. However, some pages might be reserved by concurrent
2221
- * processes. Wait till they finish initialization.
2222
- */
2223
- while (upto >= pg_atomic_read_u64 (& XLogCtl -> InitializedUpTo ))
2224
- ConditionVariableSleep (& XLogCtl -> InitializedUpToCondVar , WAIT_EVENT_WAL_BUFFER_INIT );
2225
- ConditionVariableCancelSleep ();
2146
+ LWLockRelease (WALBufMappingLock );
2226
2147
2227
2148
#ifdef WAL_DEBUG
2228
2149
if (XLOG_DEBUG && npages > 0 )
@@ -5123,10 +5044,6 @@ XLOGShmemInit(void)
5123
5044
pg_atomic_init_u64 (& XLogCtl -> logWriteResult , InvalidXLogRecPtr );
5124
5045
pg_atomic_init_u64 (& XLogCtl -> logFlushResult , InvalidXLogRecPtr );
5125
5046
pg_atomic_init_u64 (& XLogCtl -> unloggedLSN , InvalidXLogRecPtr );
5126
-
5127
- pg_atomic_init_u64 (& XLogCtl -> InitializeReserved , InvalidXLogRecPtr );
5128
- pg_atomic_init_u64 (& XLogCtl -> InitializedUpTo , InvalidXLogRecPtr );
5129
- ConditionVariableInit (& XLogCtl -> InitializedUpToCondVar );
5130
5047
}
5131
5048
5132
5049
/*
@@ -6146,7 +6063,7 @@ StartupXLOG(void)
6146
6063
memset (page + len , 0 , XLOG_BLCKSZ - len );
6147
6064
6148
6065
pg_atomic_write_u64 (& XLogCtl -> xlblocks [firstIdx ], endOfRecoveryInfo -> lastPageBeginPtr + XLOG_BLCKSZ );
6149
- pg_atomic_write_u64 ( & XLogCtl -> InitializedUpTo , endOfRecoveryInfo -> lastPageBeginPtr + XLOG_BLCKSZ ) ;
6066
+ XLogCtl -> InitializedUpTo = endOfRecoveryInfo -> lastPageBeginPtr + XLOG_BLCKSZ ;
6150
6067
}
6151
6068
else
6152
6069
{
@@ -6155,9 +6072,8 @@ StartupXLOG(void)
6155
6072
* let the first attempt to insert a log record to initialize the next
6156
6073
* buffer.
6157
6074
*/
6158
- pg_atomic_write_u64 ( & XLogCtl -> InitializedUpTo , EndOfLog ) ;
6075
+ XLogCtl -> InitializedUpTo = EndOfLog ;
6159
6076
}
6160
- pg_atomic_write_u64 (& XLogCtl -> InitializeReserved , pg_atomic_read_u64 (& XLogCtl -> InitializedUpTo ));
6161
6077
6162
6078
/*
6163
6079
* Update local and shared status. This is OK to do without any locks
0 commit comments