Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit eb254b3

Browse files
anarazelCommitfest Bot
authored and
Commitfest Bot
committed
aio: Add bounce buffers
1 parent a7d518a commit eb254b3

File tree

13 files changed

+496
-2
lines changed

13 files changed

+496
-2
lines changed

src/backend/storage/aio/README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,33 @@ shared memory no less!), completion callbacks instead have to encode errors in
406406
a more compact format that can be converted into an error message.
407407
408408
409+
### AIO Bounce Buffers
410+
411+
For some uses of AIO there is no convenient memory location as the source /
412+
destination of an AIO. E.g. when data checksums are enabled, writes from
413+
shared buffers currently cannot be done directly from shared buffers, as a
414+
shared buffer lock still allows some modification, e.g., for hint bits (see
415+
`FlushBuffer()`). If the write were done in-place, such modifications could
416+
cause the checksum to fail.
417+
418+
For synchronous IO this is solved by copying the buffer to separate memory
419+
before computing the checksum and using that copy as the source buffer for the
420+
IO.
421+
422+
However, for AIO that is not a workable solution:
423+
- Instead of a single buffer many buffers are required, as many IOs might be
424+
in flight
425+
- When using the [worker method](#worker), the source/target of IO needs to be
426+
in shared memory, otherwise the workers won't be able to access the memory.
427+
428+
The AIO subsystem addresses this by providing a limited number of bounce
429+
buffers that can be used as the source / target for IO. A bounce buffer can be
430+
acquired with `pgaio_bounce_buffer_get()` and multiple bounce buffers can be
431+
associated with an AIO Handle with `pgaio_io_assoc_bounce_buffer()`.
432+
433+
Bounce buffers are automatically released when the IO completes.
434+
435+
409436
## Helpers
410437
411438
Using the low-level AIO API introduces too much complexity to do so all over

src/backend/storage/aio/aio.c

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation
6262
static const char *pgaio_io_state_get_name(PgAioHandleState s);
6363
static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
6464

65+
static void pgaio_bounce_buffer_wait_for_free(void);
66+
6567

6668
/* Options for io_method. */
6769
const struct config_enum_entry io_method_options[] = {
@@ -76,6 +78,7 @@ const struct config_enum_entry io_method_options[] = {
7678
/* GUCs */
7779
int io_method = DEFAULT_IO_METHOD;
7880
int io_max_concurrency = -1;
81+
int io_bounce_buffers = -1;
7982

8083
/* global control for AIO */
8184
PgAioCtl *pgaio_ctl;
@@ -662,6 +665,21 @@ pgaio_io_reclaim(PgAioHandle *ioh)
662665
if (ioh->state != PGAIO_HS_HANDED_OUT)
663666
dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
664667

668+
/* reclaim all associated bounce buffers */
669+
if (!slist_is_empty(&ioh->bounce_buffers))
670+
{
671+
slist_mutable_iter it;
672+
673+
slist_foreach_modify(it, &ioh->bounce_buffers)
674+
{
675+
PgAioBounceBuffer *bb = slist_container(PgAioBounceBuffer, node, it.cur);
676+
677+
slist_delete_current(&it);
678+
679+
slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node);
680+
}
681+
}
682+
665683
if (ioh->resowner)
666684
{
667685
ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
@@ -1046,6 +1064,166 @@ pgaio_submit_staged(void)
10461064

10471065

10481066

1067+
/* --------------------------------------------------------------------------------
1068+
* Functions primarily related to PgAioBounceBuffer
1069+
* --------------------------------------------------------------------------------
1070+
*/
1071+
1072+
PgAioBounceBuffer *
1073+
pgaio_bounce_buffer_get(void)
1074+
{
1075+
PgAioBounceBuffer *bb = NULL;
1076+
slist_node *node;
1077+
1078+
if (pgaio_my_backend->handed_out_bb != NULL)
1079+
elog(ERROR, "can only hand out one BB");
1080+
1081+
/*
1082+
* XXX: It probably is not a good idea to have bounce buffers be per
1083+
* backend, that's a fair bit of memory.
1084+
*/
1085+
if (slist_is_empty(&pgaio_my_backend->idle_bbs))
1086+
{
1087+
pgaio_bounce_buffer_wait_for_free();
1088+
}
1089+
1090+
node = slist_pop_head_node(&pgaio_my_backend->idle_bbs);
1091+
bb = slist_container(PgAioBounceBuffer, node, node);
1092+
1093+
pgaio_my_backend->handed_out_bb = bb;
1094+
1095+
bb->resowner = CurrentResourceOwner;
1096+
ResourceOwnerRememberAioBounceBuffer(bb->resowner, &bb->resowner_node);
1097+
1098+
return bb;
1099+
}
1100+
1101+
void
1102+
pgaio_io_assoc_bounce_buffer(PgAioHandle *ioh, PgAioBounceBuffer *bb)
1103+
{
1104+
if (pgaio_my_backend->handed_out_bb != bb)
1105+
elog(ERROR, "can only assign handed out BB");
1106+
pgaio_my_backend->handed_out_bb = NULL;
1107+
1108+
/*
1109+
* There can be many bounce buffers assigned in case of vectorized IOs.
1110+
*/
1111+
slist_push_head(&ioh->bounce_buffers, &bb->node);
1112+
1113+
/* once associated with an IO, the IO has ownership */
1114+
ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node);
1115+
bb->resowner = NULL;
1116+
}
1117+
1118+
uint32
1119+
pgaio_bounce_buffer_id(PgAioBounceBuffer *bb)
1120+
{
1121+
return bb - pgaio_ctl->bounce_buffers;
1122+
}
1123+
1124+
void
1125+
pgaio_bounce_buffer_release(PgAioBounceBuffer *bb)
1126+
{
1127+
if (pgaio_my_backend->handed_out_bb != bb)
1128+
elog(ERROR, "can only release handed out BB");
1129+
1130+
slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node);
1131+
pgaio_my_backend->handed_out_bb = NULL;
1132+
1133+
ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node);
1134+
bb->resowner = NULL;
1135+
}
1136+
1137+
void
1138+
pgaio_bounce_buffer_release_resowner(dlist_node *bb_node, bool on_error)
1139+
{
1140+
PgAioBounceBuffer *bb = dlist_container(PgAioBounceBuffer, resowner_node, bb_node);
1141+
1142+
Assert(bb->resowner);
1143+
1144+
if (!on_error)
1145+
elog(WARNING, "leaked AIO bounce buffer");
1146+
1147+
pgaio_bounce_buffer_release(bb);
1148+
}
1149+
1150+
char *
1151+
pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb)
1152+
{
1153+
return bb->buffer;
1154+
}
1155+
1156+
static void
1157+
pgaio_bounce_buffer_wait_for_free(void)
1158+
{
1159+
static uint32 lastpos = 0;
1160+
1161+
if (pgaio_my_backend->num_staged_ios > 0)
1162+
{
1163+
pgaio_debug(DEBUG2, "submitting %d, while acquiring free bb",
1164+
pgaio_my_backend->num_staged_ios);
1165+
pgaio_submit_staged();
1166+
}
1167+
1168+
for (uint32 i = lastpos; i < lastpos + io_max_concurrency; i++)
1169+
{
1170+
uint32 thisoff = pgaio_my_backend->io_handle_off + (i % io_max_concurrency);
1171+
PgAioHandle *ioh = &pgaio_ctl->io_handles[thisoff];
1172+
1173+
switch (ioh->state)
1174+
{
1175+
case PGAIO_HS_IDLE:
1176+
case PGAIO_HS_HANDED_OUT:
1177+
continue;
1178+
case PGAIO_HS_DEFINED: /* should have been submitted above */
1179+
case PGAIO_HS_STAGED:
1180+
elog(ERROR, "shouldn't get here with io:%d in state %d",
1181+
pgaio_io_get_id(ioh), ioh->state);
1182+
break;
1183+
case PGAIO_HS_COMPLETED_IO:
1184+
case PGAIO_HS_SUBMITTED:
1185+
if (!slist_is_empty(&ioh->bounce_buffers))
1186+
{
1187+
pgaio_debug_io(DEBUG2, ioh,
1188+
"waiting for IO to reclaim BB with %d in flight",
1189+
dclist_count(&pgaio_my_backend->in_flight_ios));
1190+
1191+
/* see comment in pgaio_io_wait_for_free() about raciness */
1192+
pgaio_io_wait(ioh, ioh->generation);
1193+
1194+
if (slist_is_empty(&pgaio_my_backend->idle_bbs))
1195+
elog(WARNING, "empty after wait");
1196+
1197+
if (!slist_is_empty(&pgaio_my_backend->idle_bbs))
1198+
{
1199+
lastpos = i;
1200+
return;
1201+
}
1202+
}
1203+
break;
1204+
case PGAIO_HS_COMPLETED_SHARED:
1205+
case PGAIO_HS_COMPLETED_LOCAL:
1206+
/* reclaim */
1207+
pgaio_io_reclaim(ioh);
1208+
1209+
if (!slist_is_empty(&pgaio_my_backend->idle_bbs))
1210+
{
1211+
lastpos = i;
1212+
return;
1213+
}
1214+
break;
1215+
}
1216+
}
1217+
1218+
/*
1219+
* The submission above could have caused the IO to complete at any time.
1220+
*/
1221+
if (slist_is_empty(&pgaio_my_backend->idle_bbs))
1222+
elog(PANIC, "no more bbs");
1223+
}
1224+
1225+
1226+
10491227
/* --------------------------------------------------------------------------------
10501228
* Other
10511229
* --------------------------------------------------------------------------------

0 commit comments

Comments
 (0)