
Commit d73c359

Committed by: Commitfest Bot

[CF 5784] Optimize shared LWLock acquisition for high-core-count systems

This branch was automatically generated by a robot using patches from an email thread registered at: https://commitfest.postgresql.org/patch/5784

The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch.

Patch(es): https://www.postgresql.org/message-id/73d53acf-4f66-41df-b438-5c2e6115d4de@intel.com
Author(s): Zhiguo Zhou

2 parents: c3eda50 + d9d940e, commit d73c359

File tree: 1 file changed, +57 -16 lines

src/backend/storage/lmgr/lwlock.c

Lines changed: 57 additions & 16 deletions
@@ -97,20 +97,41 @@
 #define LW_FLAG_BITS	3
 #define LW_FLAG_MASK	(((1<<LW_FLAG_BITS)-1)<<(32-LW_FLAG_BITS))
 
-/* assumes MAX_BACKENDS is a (power of 2) - 1, checked below */
-#define LW_VAL_EXCLUSIVE	(MAX_BACKENDS + 1)
+/*
+ * already (power of 2)-1, i.e. suitable for a mask
+ *
+ * Originally, the LW_SHARED lock reference count was maintained in bits
+ * [MAX_BACKEND_BITS-1:0] of LWLock.state, with a theoretical maximum of
+ * MAX_BACKENDS (when all MAX_BACKENDS processes hold the lock concurrently).
+ *
+ * To reduce lock acquisition overhead, we optimized LWLockAttemptLock by
+ * merging the read and update operations for the LW_SHARED lock's state.
+ * This eliminates the need for separate atomic instructions - a critical
+ * improvement given the high cost of atomic operations on high-core-count
+ * systems.
+ *
+ * This optimization introduces a scenario where the reference count may
+ * temporarily increment even when a reader fails to acquire an exclusive lock.
+ * However, since each process retries lock acquisition up to *twice* before
+ * waiting on a semaphore, the reference count is bounded by MAX_BACKENDS * 2.
+ *
+ * To ensure compatibility with this upper bound:
+ * 1. LW_SHARED_MASK has been extended by 1 bit
+ * 2. LW_VAL_EXCLUSIVE is left-shifted by 1 bit
+ */
+#define LW_SHARED_MASK	((MAX_BACKENDS << 1) + 1)
+#define LW_VAL_EXCLUSIVE	(LW_SHARED_MASK + 1)
+#define LW_LOCK_MASK	(LW_SHARED_MASK | LW_VAL_EXCLUSIVE)
 #define LW_VAL_SHARED	1
 
-/* already (power of 2)-1, i.e. suitable for a mask */
-#define LW_SHARED_MASK	MAX_BACKENDS
-#define LW_LOCK_MASK	(MAX_BACKENDS | LW_VAL_EXCLUSIVE)
+/* assumes MAX_BACKENDS is a (power of 2) - 1, checked below */
 
 
 StaticAssertDecl(((MAX_BACKENDS + 1) & MAX_BACKENDS) == 0,
 				 "MAX_BACKENDS + 1 needs to be a power of 2");
 
-StaticAssertDecl((MAX_BACKENDS & LW_FLAG_MASK) == 0,
-				 "MAX_BACKENDS and LW_FLAG_MASK overlap");
+StaticAssertDecl((LW_SHARED_MASK & LW_FLAG_MASK) == 0,
+				 "LW_SHARED_MASK and LW_FLAG_MASK overlap");
 
 StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0,
 				 "LW_VAL_EXCLUSIVE and LW_FLAG_MASK overlap");
@@ -277,15 +298,17 @@ PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode)
 	if (Trace_lwlocks)
 	{
 		uint32		state = pg_atomic_read_u32(&lock->state);
+		uint32		excl = (state & LW_VAL_EXCLUSIVE) != 0;
+		uint32		shared = excl ? 0 : state & LW_SHARED_MASK;
 
 		ereport(LOG,
 				(errhidestmt(true),
 				 errhidecontext(true),
 				 errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d",
 								 MyProcPid,
 								 where, T_NAME(lock), lock,
-								 (state & LW_VAL_EXCLUSIVE) != 0,
-								 state & LW_SHARED_MASK,
+								 excl,
+								 shared,
 								 (state & LW_FLAG_HAS_WAITERS) != 0,
 								 pg_atomic_read_u32(&lock->nwaiters),
 								 (state & LW_FLAG_RELEASE_OK) != 0)));
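
For readers following the trace output: once the exclusive bit is set, any residual shared count can only be a transient increment left by the fetch-add fast path introduced further below, so it is now reported as 0. A minimal standalone decoding sketch of that convention (not part of the patch; the MAX_BACKENDS value is assumed):

#include <stdint.h>
#include <stdio.h>

#define MAX_BACKENDS      0x3FFFF	/* assumed value */
#define LW_SHARED_MASK    ((MAX_BACKENDS << 1) + 1)
#define LW_VAL_EXCLUSIVE  (LW_SHARED_MASK + 1)

/* Decode a raw state word the way the new trace output does. */
static void
decode_state(uint32_t state)
{
	unsigned	excl = (state & LW_VAL_EXCLUSIVE) != 0;
	unsigned	shared = excl ? 0 : (state & LW_SHARED_MASK);

	printf("state 0x%08X: excl %u shared %u\n", (unsigned) state, excl, shared);
}

int
main(void)
{
	decode_state(3);					/* three shared holders */
	decode_state(LW_VAL_EXCLUSIVE | 2);	/* exclusive held, two stale increments */
	return 0;
}
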
@@ -790,15 +813,30 @@ GetLWLockIdentifier(uint32 classId, uint16 eventId)
  * This function will not block waiting for a lock to become free - that's the
  * caller's job.
  *
+ * willwait: true if the caller is willing to wait for the lock to become free,
+ *           false if the caller is not willing to wait.
+ *
  * Returns true if the lock isn't free and we need to wait.
  */
 static bool
-LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+LWLockAttemptLock(LWLock *lock, LWLockMode mode, bool willwait)
 {
 	uint32		old_state;
 
 	Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED);
 
+	/*
+	 * To avoid conflicts between the reference count and the LW_VAL_EXCLUSIVE
+	 * flag, this optimization is disabled when willwait is false. See the
+	 * detailed comments where LW_SHARED_MASK is defined for more explanation.
+	 */
+	if (willwait && mode == LW_SHARED)
+	{
+		old_state = pg_atomic_fetch_add_u32(&lock->state, LW_VAL_SHARED);
+		Assert((old_state & LW_LOCK_MASK) != LW_LOCK_MASK);
+		return (old_state & LW_VAL_EXCLUSIVE) != 0;
+	}
+
 	/*
 	 * Read once outside the loop, later iterations will get the newer value
 	 * via compare & exchange.
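
The shared fast path above is the heart of the change: one unconditional atomic fetch-add replaces the read plus compare-and-exchange retry loop. The following standalone model (C11 atomics instead of the pg_atomic_* wrappers, assumed MAX_BACKENDS value; not PostgreSQL code) shows what happens when a writer already holds the lock: the caller learns it must wait, and its increment stays in the state word until the exclusive holder clears it on release:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_BACKENDS      0x3FFFF	/* assumed value */
#define LW_VAL_SHARED     1u
#define LW_SHARED_MASK    ((MAX_BACKENDS << 1) + 1)
#define LW_VAL_EXCLUSIVE  ((unsigned) LW_SHARED_MASK + 1)
#define LW_LOCK_MASK      (LW_SHARED_MASK | LW_VAL_EXCLUSIVE)

static _Atomic unsigned lock_state;

/* Returns true if the lock isn't free and the caller must wait. */
static bool
attempt_shared_willwait(void)
{
	/* one fetch-add replaces the read + compare-and-exchange retry loop */
	unsigned	old_state = atomic_fetch_add(&lock_state, LW_VAL_SHARED);

	/* sanity check mirroring the patch: the widened field cannot saturate */
	assert((old_state & LW_LOCK_MASK) != LW_LOCK_MASK);
	return (old_state & LW_VAL_EXCLUSIVE) != 0;
}

int
main(void)
{
	/* pretend a writer already holds the lock */
	atomic_store(&lock_state, LW_VAL_EXCLUSIVE);

	bool		mustwait = attempt_shared_willwait();

	/* the reader must wait, but its increment remains in the state word
	 * until the writer releases and clears LW_LOCK_MASK (see below) */
	printf("mustwait=%d state=0x%X\n", mustwait, atomic_load(&lock_state));
	return 0;
}
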
@@ -1242,7 +1280,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		 * Try to grab the lock the first time, we're not in the waitqueue
 		 * yet/anymore.
 		 */
-		mustwait = LWLockAttemptLock(lock, mode);
+		mustwait = LWLockAttemptLock(lock, mode, true);
 
 		if (!mustwait)
 		{
@@ -1265,7 +1303,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		LWLockQueueSelf(lock, mode);
 
 		/* we're now guaranteed to be woken up if necessary */
-		mustwait = LWLockAttemptLock(lock, mode);
+		mustwait = LWLockAttemptLock(lock, mode, true);
 
 		/* ok, grabbed the lock the second time round, need to undo queueing */
 		if (!mustwait)
@@ -1368,7 +1406,7 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
 	HOLD_INTERRUPTS();
 
 	/* Check for the lock */
-	mustwait = LWLockAttemptLock(lock, mode);
+	mustwait = LWLockAttemptLock(lock, mode, false);
 
 	if (mustwait)
 	{
@@ -1435,13 +1473,13 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 	 * NB: We're using nearly the same twice-in-a-row lock acquisition
 	 * protocol as LWLockAcquire(). Check its comments for details.
 	 */
-	mustwait = LWLockAttemptLock(lock, mode);
+	mustwait = LWLockAttemptLock(lock, mode, true);
 
 	if (mustwait)
 	{
 		LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
 
-		mustwait = LWLockAttemptLock(lock, mode);
+		mustwait = LWLockAttemptLock(lock, mode, true);
 
 		if (mustwait)
 		{
@@ -1843,7 +1881,10 @@ LWLockReleaseInternal(LWLock *lock, LWLockMode mode)
 	 * others, even if we still have to wakeup other waiters.
 	 */
 	if (mode == LW_EXCLUSIVE)
-		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+	{
+		oldstate = pg_atomic_fetch_and_u32(&lock->state, ~LW_LOCK_MASK);
+		oldstate &= ~LW_LOCK_MASK;
+	}
 	else
 		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
 
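
The exclusive-release hunk above is the other half of the bargain: clearing all of LW_LOCK_MASK with an atomic AND drops the exclusive bit and simultaneously discards any stale shared increments left by readers that lost the race. A standalone model of that step (C11 atomics, assumed constants, and an illustrative flag bit; not the pg_atomic_* API):

#include <stdatomic.h>
#include <stdio.h>

#define MAX_BACKENDS        0x3FFFF	/* assumed value */
#define LW_SHARED_MASK      ((MAX_BACKENDS << 1) + 1)
#define LW_VAL_EXCLUSIVE    ((unsigned) LW_SHARED_MASK + 1)
#define LW_LOCK_MASK        (LW_SHARED_MASK | LW_VAL_EXCLUSIVE)
#define LW_FLAG_HAS_WAITERS (1u << 30)	/* illustrative flag bit, assumed */

static _Atomic unsigned lock_state;

int
main(void)
{
	/* writer holds the lock, two readers left transient increments,
	 * and a waiter flag is set in the high bits */
	atomic_store(&lock_state, LW_VAL_EXCLUSIVE | 2 | LW_FLAG_HAS_WAITERS);

	/* release: clear the exclusive bit and any stale shared count in one
	 * atomic AND; flag bits outside LW_LOCK_MASK survive untouched */
	unsigned	oldstate = atomic_fetch_and(&lock_state, ~LW_LOCK_MASK);

	oldstate &= ~LW_LOCK_MASK;	/* mimic the patch: treat it as the new value */

	printf("state after release = 0x%08X (oldstate = 0x%08X)\n",
		   atomic_load(&lock_state), oldstate);
	return 0;
}
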
