@@ -182,6 +182,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1064,14 +1065,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries.  In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
@@ -1090,10 +1092,20 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
 	/* Count all backend writes regardless of if they fit in the queue */
 	BgWriterShmem->num_backend_writes++;
 
+	/*
+	 * If the background writer isn't running or the request queue is full,
+	 * the backend will have to perform its own fsync request.  But before
+	 * forcing that to happen, we can try to compact the background writer
+	 * request queue.
+	 */
 	if (BgWriterShmem->bgwriter_pid == 0 ||
-		BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+		(BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+		 && !CompactBgwriterRequestQueue()))
 	{
-		/* Also count the subset where backends have to do their own fsync */
+		/*
+		 * Count the subset of writes where backends have to do their own
+		 * fsync
+		 */
 		BgWriterShmem->num_backend_fsync++;
 		LWLockRelease(BgWriterCommLock);
 		return false;
@@ -1106,6 +1118,112 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
 	return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *		Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue(void)
+{
+	struct BgWriterSlotMapping {
+		BgWriterRequest request;
+		int			slot;
+	};
+
+	int			n,
+				preserve_count;
+	int			num_skipped = 0;
+	HASHCTL		ctl;
+	HTAB	   *htab;
+	bool	   *skip_slot;
+
+	/* must hold BgWriterCommLock in exclusive mode */
+	Assert(LWLockHeldByMe(BgWriterCommLock));
+
+	/* Initialize temporary hash table */
+	MemSet(&ctl, 0, sizeof(ctl));
+	ctl.keysize = sizeof(BgWriterRequest);
+	ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+	ctl.hash = tag_hash;
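+	/* Note: the entire BgWriterRequest struct serves as the hash key */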
+	htab = hash_create("CompactBgwriterRequestQueue",
+					   BgWriterShmem->num_requests,
+					   &ctl,
+					   HASH_ELEM | HASH_FUNCTION);
+
+	/* Initialize skip_slot array */
+	skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+	/*
+	 * The basic idea here is that a request can be skipped if it's followed
+	 * by a later, identical request.  It might seem more sensible to work
+	 * backwards from the end of the queue and check whether a request is
+	 * *preceded* by an earlier, identical request, in the hopes of doing
+	 * less copying.  But that might change the semantics, if there's an
+	 * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request,
+	 * so we do it this way.  It would be possible to be even smarter if we
+	 * made the code below understand the specific semantics of such
+	 * requests (it could blow away preceding entries that would end up
+	 * being cancelled anyhow), but it's not clear that the extra
+	 * complexity would buy us anything.
+	 */
+	for (n = 0; n < BgWriterShmem->num_requests; ++n)
+	{
+		BgWriterRequest *request;
+		struct BgWriterSlotMapping *slotmap;
+		bool		found;
+
+		request = &BgWriterShmem->requests[n];
+		slotmap = hash_search(htab, request, HASH_ENTER, &found);
+		if (found)
+		{
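+			/* Duplicate: mark the earlier occurrence for removal */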
+			skip_slot[slotmap->slot] = true;
+			++num_skipped;
+		}
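+		/* Remember the latest slot at which this request appears */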
+		slotmap->slot = n;
+	}
+
+	/* Done with the hash table. */
+	hash_destroy(htab);
+
+	/* If no duplicates, we're out of luck. */
+	if (!num_skipped)
+	{
+		pfree(skip_slot);
+		return false;
+	}
+
+	/* We found some duplicates; remove them. */
+	for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+	{
+		if (skip_slot[n])
+			continue;
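+		/* Shift this surviving request down, preserving queue order */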
+		BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+	}
+	ereport(DEBUG1,
+			(errmsg("compacted fsync request queue from %d entries to %d entries",
+					BgWriterShmem->num_requests, preserve_count)));
+	BgWriterShmem->num_requests = preserve_count;
+
+	/* Cleanup. */
+	pfree(skip_slot);
+	return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *		Retrieve queued fsync requests and pass them to local smgr.