Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 6658d78

Browse files
macdice authored and Commitfest Bot committed
Improve read stream advice for large random chunks.
read_stream.c tries not to issue advice when it thinks the kernel's readahead should be active, i.e. when using buffered I/O and reading sequential blocks. It previously gave up a little too easily: it should issue advice until it has started running sequential pread() calls, not just when it's planning to. The simpler strategy worked for random chunks of size <= io_combine_limit and entirely sequential streams, but not so well when reading random chunks > io_combine_limit. For example, a 256kB chunk of sequential data would benefit from only one fadvise(), but (assuming io_combine_limit=128kB) could suffer an I/O stall for the second half of it. Keep issuing advice until the pread() calls catch up with the start of the region we're currently issuing advice for, if ever. In practice, if there are any jumps in the lookahead window, we'll never stop issuing advice, and if the whole lookahead window becomes sequential we'll finally stop issuing advice. Discovered by Tomas Vondra's regression testing of many data clustering patterns using Melanie Plageman's streaming Bitmap Heap Scan patch, with analysis of the I/O stall-producing pattern from Andres Freund. Reviewed-by: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGJ3HSWciQCz8ekP1Zn7N213RfA4nbuotQawfpq23%2Bw-5Q%40mail.gmail.com
1 parent 76915db commit 6658d78

File tree

1 file changed

+50
-21
lines changed

1 file changed

+50
-21
lines changed

src/backend/storage/aio/read_stream.c

Lines changed: 50 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ struct ReadStream
133133

134134
/* Next expected block, for detecting sequential access. */
135135
BlockNumber seq_blocknum;
136+
BlockNumber seq_until_processed;
136137

137138
/* The read operation we are currently preparing. */
138139
BlockNumber pending_read_blocknum;
@@ -237,11 +238,11 @@ read_stream_unget_block(ReadStream *stream, BlockNumber blocknum)
237238
* distance to a level that prevents look-ahead until buffers are released.
238239
*/
239240
static bool
240-
read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
241+
read_stream_start_pending_read(ReadStream *stream)
241242
{
242243
bool need_wait;
243244
int nblocks;
244-
int flags;
245+
int flags = 0;
245246
int16 io_index;
246247
int16 overflow;
247248
int16 buffer_index;
@@ -261,16 +262,36 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
261262
else
262263
Assert(stream->next_buffer_index == stream->oldest_buffer_index);
263264

264-
/*
265-
* If advice hasn't been suppressed, this system supports it, and this
266-
* isn't a strictly sequential pattern, then we'll issue advice.
267-
*/
268-
if (!suppress_advice &&
269-
stream->advice_enabled &&
270-
stream->pending_read_blocknum != stream->seq_blocknum)
271-
flags = READ_BUFFERS_ISSUE_ADVICE;
272-
else
273-
flags = 0;
265+
/* Do we need to issue read-ahead advice? */
266+
if (stream->advice_enabled)
267+
{
268+
bool no_wait;
269+
270+
/*
271+
* We only issue advice if we won't immediately have to call
272+
* WaitReadBuffers().
273+
*/
274+
no_wait = stream->pinned_buffers > 0 ||
275+
stream->pending_read_nblocks < stream->distance;
276+
277+
if (stream->pending_read_blocknum == stream->seq_blocknum)
278+
{
279+
/*
280+
* Sequential: issue advice only until the WaitReadBuffers() calls
281+
* catch up with the first advice issued for this sequential
282+
* region, so the kernel can see sequential access.
283+
*/
284+
if (stream->seq_until_processed != InvalidBlockNumber && no_wait)
285+
flags = READ_BUFFERS_ISSUE_ADVICE;
286+
}
287+
else
288+
{
289+
/* Random jump: start tracking new region. */
290+
stream->seq_until_processed = stream->pending_read_blocknum;
291+
if (no_wait)
292+
flags = READ_BUFFERS_ISSUE_ADVICE;
293+
}
294+
}
274295

275296
/* Compute the remaining portion of the per-backend buffer limit. */
276297
if (stream->temporary)
@@ -359,7 +380,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
359380
}
360381

361382
static void
362-
read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
383+
read_stream_look_ahead(ReadStream *stream)
363384
{
364385
while (stream->ios_in_progress < stream->max_ios &&
365386
stream->pinned_buffers + stream->pending_read_nblocks < stream->distance)
@@ -370,8 +391,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
370391

371392
if (stream->pending_read_nblocks == stream->io_combine_limit)
372393
{
373-
read_stream_start_pending_read(stream, suppress_advice);
374-
suppress_advice = false;
394+
read_stream_start_pending_read(stream);
375395
continue;
376396
}
377397

@@ -404,15 +424,13 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
404424
/* We have to start the pending read before we can build another. */
405425
while (stream->pending_read_nblocks > 0)
406426
{
407-
if (!read_stream_start_pending_read(stream, suppress_advice) ||
427+
if (!read_stream_start_pending_read(stream) ||
408428
stream->ios_in_progress == stream->max_ios)
409429
{
410430
/* And we've hit a buffer or I/O limit. Rewind and wait. */
411431
read_stream_unget_block(stream, blocknum);
412432
return;
413433
}
414-
415-
suppress_advice = false;
416434
}
417435

418436
/* This is the start of a new pending read. */
@@ -436,7 +454,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
436454
stream->pinned_buffers == 0) ||
437455
stream->distance == 0) &&
438456
stream->ios_in_progress < stream->max_ios)
439-
read_stream_start_pending_read(stream, suppress_advice);
457+
read_stream_start_pending_read(stream);
440458

441459
/*
442460
* There should always be something pinned when we leave this function,
@@ -612,6 +630,8 @@ read_stream_begin_impl(int flags,
612630
stream->callback = callback;
613631
stream->callback_private_data = callback_private_data;
614632
stream->buffered_blocknum = InvalidBlockNumber;
633+
stream->seq_blocknum = InvalidBlockNumber;
634+
stream->seq_until_processed = InvalidBlockNumber;
615635
stream->temporary = SmgrIsTemp(smgr);
616636

617637
/*
@@ -792,7 +812,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
792812
* space for more, but if we're just starting up we'll need to crank
793813
* the handle to get started.
794814
*/
795-
read_stream_look_ahead(stream, true);
815+
read_stream_look_ahead(stream);
796816

797817
/* End of stream reached? */
798818
if (stream->pinned_buffers == 0)
@@ -837,6 +857,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
837857
distance = stream->distance * 2;
838858
distance = Min(distance, stream->max_pinned_buffers);
839859
stream->distance = distance;
860+
861+
/*
862+
* If we've caught up with the first advice issued for the current
863+
* sequential region, cancel further advice until the next random
864+
* jump. The kernel should be able to see the pattern now that
865+
* we're actually making sequential preadv() calls.
866+
*/
867+
if (stream->ios[io_index].op.blocknum == stream->seq_until_processed)
868+
stream->seq_until_processed = InvalidBlockNumber;
840869
}
841870
else
842871
{
@@ -898,7 +927,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
898927
stream->oldest_buffer_index = 0;
899928

900929
/* Prepare for the next call. */
901-
read_stream_look_ahead(stream, false);
930+
read_stream_look_ahead(stream);
902931

903932
#ifndef READ_STREAM_DISABLE_FAST_PATH
904933
/* See if we can take the fast path for all-cached scans next time. */

0 commit comments

Comments
 (0)