
Commit 6150a1b

Move buffer I/O and content LWLocks out of the main tranche.
Move the content lock directly into the BufferDesc, so that locking and
pinning a buffer touches only one cache line rather than two. Adjust the
definition of BufferDesc slightly so that this doesn't make the BufferDesc
any larger than one cache line (at least on platforms where a spinlock is
only 1 or 2 bytes).

We can't fit the I/O locks into the BufferDesc and stay within one cache
line, so move those to a completely separate tranche. This leaves a
relatively limited number of LWLocks in the main tranche, so increase the
padding of those remaining locks to a full cache line, rather than allowing
adjacent locks to share a cache line, hopefully reducing false sharing.

Performance testing shows that these changes make little difference on
laptop-class machines, but help significantly on larger servers, especially
those with more than 2 sockets.

Andres Freund, originally based on an earlier patch by Simon Riggs. Review
and cosmetic adjustments (including heavy rewriting of the comments) by me.
1 parent 3fed417 commit 6150a1b
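
As background for the cache-line argument above, here is a minimal C sketch
(illustrative only, not part of this patch) of padding a descriptor to a full
cache line with a union, in the spirit of the existing BufferDescPadded type.
The member layout, the ExampleDesc names, and the 64-byte line size are
assumptions made for the example.

/*
 * Hypothetical stand-ins; the real BufferDesc and PG_CACHE_LINE_SIZE are
 * defined in the tree.
 */
#include <assert.h>

#define EXAMPLE_CACHE_LINE_SIZE 64

typedef struct ExampleDesc      /* stand-in for BufferDesc */
{
    int         buf_id;         /* buffer's index number */
    unsigned    refcount;       /* pin count */
    char        hdr_lock;       /* a 1- or 2-byte spinlock fits in the line */
    int         content_lock;   /* lock embedded by value, not a pointer */
} ExampleDesc;

typedef union ExampleDescPadded /* stand-in for BufferDescPadded */
{
    ExampleDesc desc;
    char        pad[EXAMPLE_CACHE_LINE_SIZE];   /* one element per cache line */
} ExampleDescPadded;

/* The padding only helps if the descriptor actually fits in one line. */
static_assert(sizeof(ExampleDesc) <= EXAMPLE_CACHE_LINE_SIZE,
              "descriptor larger than one cache line");

Allocating the shared array as padded elements and rounding its start up to a
cache-line boundary is what keeps each descriptor, and the content lock
embedded in it, within a single line.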

6 files changed: +151 −67 lines changed


src/backend/storage/buffer/buf_init.c  (+51 −10)

@@ -20,6 +20,9 @@
 
 BufferDescPadded *BufferDescriptors;
 char       *BufferBlocks;
+LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+LWLockTranche BufferIOLWLockTranche;
+LWLockTranche BufferContentLWLockTranche;
 
 
 /*
@@ -65,22 +68,45 @@ void
 InitBufferPool(void)
 {
     bool        foundBufs,
-                foundDescs;
+                foundDescs,
+                foundIOLocks;
 
     /* Align descriptors to a cacheline boundary. */
-    BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
-                    ShmemInitStruct("Buffer Descriptors",
-                    NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE,
-                    &foundDescs));
+    BufferDescriptors = (BufferDescPadded *)
+        CACHELINEALIGN(
+                       ShmemInitStruct("Buffer Descriptors",
+                                       NBuffers * sizeof(BufferDescPadded)
+                                       + PG_CACHE_LINE_SIZE,
+                                       &foundDescs));
 
     BufferBlocks = (char *)
         ShmemInitStruct("Buffer Blocks",
                         NBuffers * (Size) BLCKSZ, &foundBufs);
 
-    if (foundDescs || foundBufs)
+    /* Align lwlocks to cacheline boundary */
+    BufferIOLWLockArray = (LWLockMinimallyPadded *)
+        CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
+                                       NBuffers * (Size) sizeof(LWLockMinimallyPadded)
+                                       + PG_CACHE_LINE_SIZE,
+                                       &foundIOLocks));
+
+    BufferIOLWLockTranche.name = "Buffer IO Locks";
+    BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
+    BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
+                          &BufferIOLWLockTranche);
+
+    BufferContentLWLockTranche.name = "Buffer Content Locks";
+    BufferContentLWLockTranche.array_base =
+        ((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+    BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
+                          &BufferContentLWLockTranche);
+
+    if (foundDescs || foundBufs || foundIOLocks)
     {
-        /* both should be present or neither */
-        Assert(foundDescs && foundBufs);
+        /* should find all of these, or none of them */
+        Assert(foundDescs && foundBufs && foundIOLocks);
         /* note: this path is only taken in EXEC_BACKEND case */
     }
     else
@@ -110,8 +136,11 @@ InitBufferPool(void)
          */
        buf->freeNext = i + 1;
 
-       buf->io_in_progress_lock = LWLockAssign();
-       buf->content_lock = LWLockAssign();
+       LWLockInitialize(BufferDescriptorGetContentLock(buf),
+                        LWTRANCHE_BUFFER_CONTENT);
+
+       LWLockInitialize(BufferDescriptorGetIOLock(buf),
+                        LWTRANCHE_BUFFER_IO_IN_PROGRESS);
     }
 
     /* Correct last entry of linked list */
@@ -144,5 +173,17 @@ BufferShmemSize(void)
     /* size of stuff controlled by freelist.c */
     size = add_size(size, StrategyShmemSize());
 
+    /*
+     * It would be nice to include the I/O locks in the BufferDesc, but that
+     * would increase the size of a BufferDesc to more than one cache line,
+     * and benchmarking has shown that keeping every BufferDesc aligned on a
+     * cache line boundary is important for performance. So, instead, the
+     * array of I/O locks is allocated in a separate tranche. Because those
+     * locks are not highly contended, we lay out the array with minimal
+     * padding.
+     */
+    size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+    /* to allow aligning the above */
+    size = add_size(size, PG_CACHE_LINE_SIZE);
+
     return size;
 }
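
The BufferShmemSize()/InitBufferPool() pairing above relies on a common
pattern: reserve one extra cache line of shared memory, then round the
returned pointer up to a cache-line boundary. Below is a self-contained
sketch of that arithmetic, using malloc and invented names (cacheline_align,
CACHE_LINE) in place of ShmemInitStruct, CACHELINEALIGN, and
PG_CACHE_LINE_SIZE; the sizes are assumptions for illustration.

#include <stdint.h>
#include <stdlib.h>

#define CACHE_LINE 128          /* stand-in for PG_CACHE_LINE_SIZE */

/* Round a pointer up to the next cache-line boundary (CACHELINEALIGN-style). */
static void *
cacheline_align(void *p)
{
    uintptr_t   addr = (uintptr_t) p;

    return (void *) ((addr + CACHE_LINE - 1) & ~(uintptr_t) (CACHE_LINE - 1));
}

int
main(void)
{
    size_t      nlocks = 1024;  /* e.g. NBuffers */
    size_t      stride = 32;    /* e.g. sizeof(LWLockMinimallyPadded) */

    /* Ask for one extra cache line so the aligned base still fits. */
    void       *raw = malloc(nlocks * stride + CACHE_LINE);
    void       *base = cacheline_align(raw);

    /* Element i would live at (char *) base + i * stride. */
    (void) base;
    free(raw);
    return 0;
}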

src/backend/storage/buffer/bufmgr.c  (+30 −27)

@@ -738,7 +738,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     if (!isLocalBuf)
     {
         if (mode == RBM_ZERO_AND_LOCK)
-            LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+                          LW_EXCLUSIVE);
         else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
             LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
     }
@@ -879,7 +880,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
         !isLocalBuf)
     {
-        LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
     }
 
     if (isLocalBuf)
@@ -1045,7 +1046,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
          * happens to be trying to split the page the first one got from
          * StrategyGetBuffer.)
          */
-        if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+        if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                     LW_SHARED))
         {
             /*
              * If using a nondefault strategy, and writing the buffer
@@ -1067,7 +1069,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                 StrategyRejectBuffer(strategy, buf))
             {
                 /* Drop lock/pin and loop around for another buffer */
-                LWLockRelease(buf->content_lock);
+                LWLockRelease(BufferDescriptorGetContentLock(buf));
                 UnpinBuffer(buf, true);
                 continue;
             }
@@ -1080,7 +1082,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                               smgr->smgr_rnode.node.relNode);
 
             FlushBuffer(buf, NULL);
-            LWLockRelease(buf->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(buf));
 
             TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
                                               smgr->smgr_rnode.node.spcNode,
@@ -1395,7 +1397,7 @@ MarkBufferDirty(Buffer buffer)
 
     Assert(BufferIsPinned(buffer));
     /* unfortunately we can't check if the lock is held exclusively */
-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
     LockBufHdr(bufHdr);
 
@@ -1595,8 +1597,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
     if (ref->refcount == 0)
     {
         /* I'd better not still hold any locks on the buffer */
-        Assert(!LWLockHeldByMe(buf->content_lock));
-        Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+        Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+        Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
         LockBufHdr(buf);
 
@@ -2116,11 +2118,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
      * buffer is clean by the time we've locked it.)
      */
     PinBuffer_Locked(bufHdr);
-    LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+    LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 
     FlushBuffer(bufHdr, NULL);
 
-    LWLockRelease(bufHdr->content_lock);
+    LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
     UnpinBuffer(bufHdr, true);
 
     return result | BUF_WRITTEN;
@@ -2926,9 +2928,9 @@ FlushRelationBuffers(Relation rel)
             (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
-            LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
             FlushBuffer(bufHdr, rel->rd_smgr);
-            LWLockRelease(bufHdr->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
         else
@@ -2978,9 +2980,9 @@ FlushDatabaseBuffers(Oid dbid)
             (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
-            LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
             FlushBuffer(bufHdr, NULL);
-            LWLockRelease(bufHdr->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
         else
@@ -3004,7 +3006,7 @@ FlushOneBuffer(Buffer buffer)
 
     bufHdr = GetBufferDescriptor(buffer - 1);
 
-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
     FlushBuffer(bufHdr, NULL);
 }
@@ -3101,7 +3103,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
     Assert(GetPrivateRefCount(buffer) > 0);
     /* here, either share or exclusive lock is OK */
-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
     /*
      * This routine might get called many times on the same page, if we are
@@ -3254,11 +3256,11 @@ LockBuffer(Buffer buffer, int mode)
     buf = GetBufferDescriptor(buffer - 1);
 
     if (mode == BUFFER_LOCK_UNLOCK)
-        LWLockRelease(buf->content_lock);
+        LWLockRelease(BufferDescriptorGetContentLock(buf));
     else if (mode == BUFFER_LOCK_SHARE)
-        LWLockAcquire(buf->content_lock, LW_SHARED);
+        LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
     else if (mode == BUFFER_LOCK_EXCLUSIVE)
-        LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
     else
         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -3279,7 +3281,8 @@ ConditionalLockBuffer(Buffer buffer)
 
     buf = GetBufferDescriptor(buffer - 1);
 
-    return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+    return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                    LW_EXCLUSIVE);
 }
 
 /*
@@ -3489,8 +3492,8 @@ WaitIO(BufferDesc *buf)
         UnlockBufHdr(buf);
         if (!(sv_flags & BM_IO_IN_PROGRESS))
             break;
-        LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
     }
 }
 
@@ -3523,7 +3526,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
      * Grab the io_in_progress lock so that other processes can wait for
      * me to finish the I/O.
      */
-    LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+    LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
     LockBufHdr(buf);
 
@@ -3537,7 +3540,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
          * him to get unwedged.
          */
         UnlockBufHdr(buf);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
         WaitIO(buf);
     }
 
@@ -3547,7 +3550,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
     {
         /* someone else already did the I/O */
         UnlockBufHdr(buf);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
         return false;
     }
 
@@ -3595,7 +3598,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
 
     InProgressBuf = NULL;
 
-    LWLockRelease(buf->io_in_progress_lock);
+    LWLockRelease(BufferDescriptorGetIOLock(buf));
 }
 
 /*
@@ -3620,7 +3623,7 @@ AbortBufferIO(void)
      * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
      * buffer will be in a busy spin until we succeed in doing this.
      */
-    LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+    LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
     LockBufHdr(buf);
     Assert(buf->flags & BM_IO_IN_PROGRESS);

src/backend/storage/lmgr/lwlock.c  (+8 −7)

@@ -344,18 +344,15 @@ NumLWLocks(void)
     int         numLocks;
 
     /*
-     * Possibly this logic should be spread out among the affected modules,
-     * the same way that shmem space estimation is done. But for now, there
-     * are few enough users of LWLocks that we can get away with just keeping
-     * the knowledge here.
+     * Many users of LWLocks no longer reserve space in the main array here,
+     * but instead allocate separate tranches. The latter approach has the
+     * advantage of allowing LWLOCK_STATS and LOCK_DEBUG output to produce
+     * more useful output.
      */
 
     /* Predefined LWLocks */
     numLocks = NUM_FIXED_LWLOCKS;
 
-    /* bufmgr.c needs two for each shared buffer */
-    numLocks += 2 * NBuffers;
-
     /* proc.c needs one for each backend or auxiliary process */
     numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
 
@@ -423,6 +420,10 @@ CreateLWLocks(void)
     StaticAssertExpr(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
                      "MAX_BACKENDS too big for lwlock.c");
 
+    StaticAssertExpr(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
+                     sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+                     "Miscalculated LWLock padding");
+
     if (!IsUnderPostmaster)
     {
         int         numLocks = NumLWLocks();

src/include/storage/buf_internals.h  (+16 −7)

@@ -115,8 +115,8 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields. buf_id field never
  * changes after initialization, so does not need locking. freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock. The LWLocks can take
- * care of themselves. The buf_hdr_lock is *not* used to control access to
+ * protected by the buffer_strategy_lock not buf_hdr_lock. The LWLock can
+ * take care of itself. The buf_hdr_lock is *not* used to control access to
  * the data in the buffer!
 *
 * An exception is that if we have the buffer pinned, its tag can't change
@@ -133,22 +133,24 @@ typedef struct buftag
  *
  * We use this same struct for local buffer headers, but the lock fields
  * are not used and not all of the flag bits are useful either.
+ *
+ * Be careful to avoid increasing the size of the struct when adding or
+ * reordering members. Keeping it below 64 bytes (the most common CPU
+ * cache line size) is fairly important for performance.
  */
 typedef struct BufferDesc
 {
     BufferTag   tag;            /* ID of page contained in buffer */
     BufFlags    flags;          /* see bit definitions above */
-    uint16      usage_count;    /* usage counter for clock sweep code */
+    uint8       usage_count;    /* usage counter for clock sweep code */
+    slock_t     buf_hdr_lock;   /* protects a subset of fields, see above */
     unsigned    refcount;       /* # of backends holding pins on buffer */
     int         wait_backend_pid;       /* backend PID of pin-count waiter */
 
-    slock_t     buf_hdr_lock;   /* protects the above fields */
-
     int         buf_id;         /* buffer's index number (from 0) */
     int         freeNext;       /* link in freelist chain */
 
-    LWLock     *io_in_progress_lock;    /* to wait for I/O to complete */
-    LWLock     *content_lock;   /* to lock access to buffer contents */
+    LWLock      content_lock;   /* to lock access to buffer contents */
 } BufferDesc;
 
 /*
@@ -184,6 +186,13 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
 
+#define BufferDescriptorGetIOLock(bdesc) \
+    (&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+    ((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+
 /*
  * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
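
The two macros above encode the commit's layout decision: the I/O lock is
looked up by buf_id in a separate, minimally padded array, while the content
lock is reached by address arithmetic inside the padded descriptor array.
The following hypothetical sketch (invented Desc/DescPadded types, not the
real headers) shows why a tranche described by an array_base and array_stride,
as registered in buf_init.c above, reaches the same embedded lock as direct
member access.

#include <stddef.h>
#include <stdio.h>

typedef struct Lock { int state; } Lock;    /* stand-in for LWLock */

typedef struct Desc                         /* stand-in for BufferDesc */
{
    int     buf_id;
    Lock    content_lock;
} Desc;

typedef union DescPadded                    /* stand-in for BufferDescPadded */
{
    Desc    desc;
    char    pad[64];
} DescPadded;

int
main(void)
{
    DescPadded  descs[4];
    char       *array_base = (char *) descs + offsetof(Desc, content_lock);
    size_t      array_stride = sizeof(DescPadded);

    /* Lock i computed from base/stride matches the embedded member. */
    for (int i = 0; i < 4; i++)
    {
        Lock   *by_tranche = (Lock *) (array_base + i * array_stride);

        printf("%d\n", by_tranche == &descs[i].desc.content_lock);
    }
    return 0;
}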
