
Commit 7f242d8
Try to avoid running with a full fsync request queue.

When we need to insert a new entry and the queue is full, compact the
entire queue in the hopes of making room for the new entry.  Doing this
on every insertion might worsen contention on BgWriterCommLock, but when
the queue is full, it's far better than allowing the backend to perform
its own fsync, per testing by Greg Smith as reported in
http://archives.postgresql.org/pgsql-hackers/2011-01/msg02665.php

Original idea from Greg Smith.  Patch by me.  Review by Chris Browne and
Greg Smith.
1 parent b2826ad commit 7f242d8
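The core of the patch, in miniature: an entry can be dropped from the queue
whenever an identical entry appears later, so compaction keeps only the last
occurrence of each request while preserving the relative order of the
survivors. Here is a standalone C sketch of that idea (toy types and names,
not PostgreSQL source; the patch itself uses a dynahash table and a
skip_slot array rather than a nested loop):

/*
 * Toy model of the compaction idea: drop every entry that is followed by
 * a later identical entry, preserving the order of the survivors.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct ToyRequest
{
    int     relation;           /* stand-in for RelFileNodeBackend */
    int     segno;              /* stand-in for the segment number */
} ToyRequest;

static int
toy_compact_queue(ToyRequest *queue, int num_requests)
{
    int     n,
            preserve_count = 0;

    for (n = 0; n < num_requests; ++n)
    {
        bool    dup_later = false;
        int     m;

        /* skip this entry if an identical one appears later */
        for (m = n + 1; m < num_requests; ++m)
        {
            if (queue[m].relation == queue[n].relation &&
                queue[m].segno == queue[n].segno)
            {
                dup_later = true;
                break;
            }
        }
        if (!dup_later)
            queue[preserve_count++] = queue[n];
    }
    return preserve_count;
}

int
main(void)
{
    ToyRequest  queue[] = {{1, 0}, {2, 0}, {1, 0}, {2, 1}, {2, 0}};
    int         n = toy_compact_queue(queue, 5);

    printf("compacted from 5 entries to %d entries\n", n);  /* prints 3 */
    return 0;
}

Running the sketch compacts the five-entry example queue to three entries,
with the survivors in their original relative order.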

2 files changed (+131 -11)

src/backend/postmaster/bgwriter.c (+124 -10)
@@ -182,6 +182,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1064,14 +1065,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries.  In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
@@ -1090,10 +1092,20 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
     /* Count all backend writes regardless of if they fit in the queue */
     BgWriterShmem->num_backend_writes++;
 
+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request.  But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
-        /* Also count the subset where backends have to do their own fsync */
+        /*
+         * Count the subset of writes where backends have to do their own
+         * fsync
+         */
         BgWriterShmem->num_backend_fsync++;
         LWLockRelease(BgWriterCommLock);
         return false;
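The false return above is what pushes the fsync back onto the backend. A
hypothetical caller, loosely modeled on register_dirty_segment in md.c
(stand-in types and a stubbed-out queue, not the actual source), would look
roughly like this:

#include <stdbool.h>
#include <stdio.h>

/*
 * Stub standing in for ForwardFsyncRequest: pretend the shared queue is
 * full and CompactBgwriterRequestQueue found no duplicates to remove.
 */
static bool
forward_fsync_request(int rel, int segno)
{
    return false;
}

/* If the request cannot be queued, the backend syncs the file itself. */
static void
register_dirty_segment(int rel, int segno)
{
    if (!forward_fsync_request(rel, segno))
        printf("backend fsync: rel %d, seg %d\n", rel, segno);
}

int
main(void)
{
    register_dirty_segment(42, 0);
    return 0;
}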
@@ -1106,6 +1118,108 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
     return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *      Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue(void)
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int         slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request.  It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing less
+     * copying.  But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+     * we do it this way.  It would be possible to be even smarter if we made
+     * the code below understand the specific semantics of such requests (it
+     * could blow away preceding entries that would end up being cancelled
+     * anyhow), but it's not clear that the extra complexity would buy us
+     * anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *      Retrieve queued fsync requests and pass them to local smgr.
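The comment in CompactBgwriterRequestQueue about scanning forward rather
than backward can be made concrete. In the standalone sketch below (invented
types; segno == -1 plays the role of FORGET_RELATION_FSYNC, which cancels
pending requests for a relation), dropping the earlier of two identical
requests, as the patch does, preserves the replay semantics, while dropping
the later one would let an intervening cancel request eat the surviving
fsync:

#include <stdbool.h>
#include <stdio.h>

#define FORGET (-1)             /* stands in for FORGET_RELATION_FSYNC */

typedef struct
{
    int     rel;
    int     segno;
} Req;

/*
 * Replay a queue in order and report whether relation 'rel' still has a
 * pending fsync at the end; a FORGET entry cancels earlier requests.
 */
static bool
still_pending(const Req *q, int n, int rel)
{
    bool    pending = false;
    int     i;

    for (i = 0; i < n; ++i)
    {
        if (q[i].rel != rel)
            continue;
        if (q[i].segno == FORGET)
            pending = false;
        else
            pending = true;
    }
    return pending;
}

int
main(void)
{
    /* Original queue: fsync(A), FORGET(A), fsync(A); A must be synced. */
    Req full[] = {{1, 0}, {1, FORGET}, {1, 0}};

    /* Dropping the earlier duplicate (what the patch does) is safe... */
    Req keep_later[] = {{1, FORGET}, {1, 0}};

    /* ...but dropping the later duplicate would lose the fsync. */
    Req keep_first[] = {{1, 0}, {1, FORGET}};

    printf("full queue: pending = %d\n", still_pending(full, 3, 1));       /* 1 */
    printf("keep later: pending = %d\n", still_pending(keep_later, 2, 1)); /* 1 */
    printf("keep first: pending = %d\n", still_pending(keep_first, 2, 1)); /* 0 */
    return 0;
}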

src/backend/storage/smgr/md.c (+7 -1)
@@ -34,7 +34,13 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB 10
 
-/* special values for the segno arg to RememberFsyncRequest */
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
 #define FORGET_RELATION_FSYNC (InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
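For reference, InvalidBlockNumber is 0xFFFFFFFF in PostgreSQL, so these
sentinels occupy the top of the BlockNumber range, where no real segment
number can collide with them. A standalone illustration of how the three
request types decode (not md.c source):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the defines in the hunk above, using a local stand-in for
 * PostgreSQL's InvalidBlockNumber. */
#define INVALID_BLOCK_NUMBER    ((uint32_t) 0xFFFFFFFF)
#define FORGET_RELATION_FSYNC   (INVALID_BLOCK_NUMBER)
#define FORGET_DATABASE_FSYNC   (INVALID_BLOCK_NUMBER - 1)
#define UNLINK_RELATION_REQUEST (INVALID_BLOCK_NUMBER - 2)

int
main(void)
{
    printf("FORGET_RELATION_FSYNC   = %" PRIu32 "\n", FORGET_RELATION_FSYNC);
    printf("FORGET_DATABASE_FSYNC   = %" PRIu32 "\n", FORGET_DATABASE_FSYNC);
    printf("UNLINK_RELATION_REQUEST = %" PRIu32 "\n", UNLINK_RELATION_REQUEST);
    return 0;
}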
