@@ -155,12 +155,19 @@ static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
155
155
156
156
/*
 * Forward declarations for the AIO completion and error-reporting callbacks
 * defined at the bottom of this file.  The *_complete callbacks run when an
 * asynchronous I/O finishes; the *_report callbacks format errors for a
 * given PgAioResult.
 */
static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
static void md_readv_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
static PgAioResult md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
static void md_writev_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
161
/* Callback table registered for md.c's asynchronous vectored reads. */
const PgAioHandleCallbacks aio_md_readv_cb = {
	.complete_shared = md_readv_complete,
	.report = md_readv_report,
};
166
/* Callback table registered for md.c's asynchronous vectored writes. */
const PgAioHandleCallbacks aio_md_writev_cb = {
	.complete_shared = md_writev_complete,
	.report = md_writev_report,
};
static inline int
166
173
_mdfd_open_flags (void )
@@ -1143,6 +1150,64 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
1143
1150
}
1144
1151
}
1145
1152
1153
/*
 *	mdstartwritev() -- Asynchronous version of mdwritev().
 *
 * Prepares and submits an asynchronous write of nblocks blocks starting at
 * blocknum, sourced from buffers.  The write must fit within one segment
 * file; a request crossing a segment boundary is an ERROR here, so callers
 * must split such writes themselves.  Unless skipFsync is true, segment
 * dirtying is registered later by the completion callback
 * (md_writev_complete), not here.
 */
void
mdstartwritev(PgAioHandle *ioh,
			  SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
			  const void **buffers, BlockNumber nblocks, bool skipFsync)
{
	off_t		seekpos;
	MdfdVec    *v;
	BlockNumber nblocks_this_segment;
	struct iovec *iov;
	int			iovcnt;
	int			ret;

	v = _mdfd_getseg(reln, forknum, blocknum, false,
					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

	/* byte offset within the segment file */
	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	nblocks_this_segment =
		Min(nblocks,
			RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));

	/* unlike mdwritev(), this API handles only a single segment at a time */
	if (nblocks_this_segment != nblocks)
		elog(ERROR, "write crossing segment boundary");

	iovcnt = pgaio_io_get_iovec(ioh, &iov);

	Assert(nblocks <= iovcnt);

	/* fill the handle's iovec from the caller-provided buffer array */
	iovcnt = buffers_to_iovec(iov, unconstify(void **, buffers), nblocks_this_segment);

	Assert(iovcnt <= nblocks_this_segment);

	if (!(io_direct_flags & IO_DIRECT_DATA))
		pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);

	/*
	 * Record the smgr-level target (relation, fork, block range, skipFsync)
	 * on the handle so the completion/report callbacks, which may run in a
	 * different backend, can identify what was written.
	 */
	pgaio_io_set_target_smgr(ioh,
							 reln,
							 forknum,
							 blocknum,
							 nblocks,
							 skipFsync);
	pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_WRITEV, 0);

	ret = FileStartWriteV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
	if (ret != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not start writing blocks %u..%u in file \"%s\": %m",
						blocknum,
						blocknum + nblocks_this_segment - 1,
						FilePathName(v->mdfd_vfd))));
}
1146
1211
1147
1212
/*
1148
1213
* mdwriteback() -- Tell the kernel to write pages back to storage.
@@ -1531,6 +1596,40 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1531
1596
}
1532
1597
}
1533
1598
1599
/*
 * Like register_dirty_segment(), except for use by AIO. In the completion
 * callback we don't have access to the MdfdVec (the completion callback might
 * be executed in a different backend than the issuing backend), therefore we
 * have to implement this slightly differently.
 *
 * locator/forknum/segno identify the dirtied segment file; they come from the
 * PgAioTargetData recorded at submission time, not from an open MdfdVec.
 */
static void
register_dirty_segment_aio(RelFileLocator locator, ForkNumber forknum, uint64 segno)
{
	FileTag		tag;

	INIT_MD_FILETAG(tag, locator, forknum, segno);

	/*
	 * Can't block here waiting for checkpointer to accept our sync request,
	 * as checkpointer might be waiting for this AIO to finish if offloaded to
	 * a worker.
	 */
	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
	{
		char		path[MAXPGPATH];

		/* queue full: fall back to fsyncing the file ourselves */
		ereport(DEBUG1,
				(errmsg_internal("could not forward fsync request because request queue is full")));

		/* reuse mdsyncfiletag() to avoid duplicating code */
		if (mdsyncfiletag(&tag, path))
			ereport(data_sync_elevel(ERROR),
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							path)));
	}
}
1534
1633
/*
1535
1634
* register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
1536
1635
*/
@@ -2065,3 +2164,103 @@ md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
2065
2164
td -> smgr .nblocks * (size_t ) BLCKSZ ));
2066
2165
}
2067
2166
}
2167
+
2168
/*
 * AIO completion callback for mdstartwritev().
 *
 * May run in a different backend than the one that issued the I/O (see
 * register_dirty_segment_aio()).  Translates the raw byte-count result in
 * prior_result into block terms, classifies the outcome in result.status
 * (PGAIO_RS_ERROR for hard errors or zero-length writes, PGAIO_RS_PARTIAL
 * for short writes that the caller should retry), and registers the dirtied
 * segment for sync unless the write was submitted with skipFsync.
 */
static PgAioResult
md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
{
	PgAioTargetData *td = pgaio_io_get_target_data(ioh);
	PgAioResult result = prior_result;

	if (prior_result.result < 0)
	{
		result.status = PGAIO_RS_ERROR;
		result.id = PGAIO_HCB_MD_WRITEV;
		/* For "hard" errors, track the error number in error_data */
		result.error_data = -prior_result.result;
		result.result = 0;

		pgaio_result_report(result, td, LOG);

		return result;
	}

	/*
	 * As explained above smgrstartwritev(), the smgr API operates on the
	 * level of blocks, rather than bytes. Convert.
	 */
	result.result /= BLCKSZ;

	Assert(result.result <= td->smgr.nblocks);

	if (result.result == 0)
	{
		/* consider 0 blocks written a failure */
		result.status = PGAIO_RS_ERROR;
		result.id = PGAIO_HCB_MD_WRITEV;
		/* error_data == 0 signals a short write rather than an errno */
		result.error_data = 0;

		pgaio_result_report(result, td, LOG);

		return result;
	}

	if (result.status != PGAIO_RS_ERROR &&
		result.result < td->smgr.nblocks)
	{
		/* partial writes should be retried at upper level */
		result.status = PGAIO_RS_PARTIAL;
		result.id = PGAIO_HCB_MD_WRITEV;
	}

	/*
	 * At least one block was written, so the containing segment is dirty and
	 * must be scheduled for sync (unless the issuer asked to skip it).
	 */
	if (!td->smgr.skip_fsync)
		register_dirty_segment_aio(td->smgr.rlocator, td->smgr.forkNum,
								   td->smgr.blockNum / ((BlockNumber) RELSEG_SIZE));

	return result;
}
2224
+
2225
+ /*
2226
+ * AIO error reporting callback for mdstartwritev().
2227
+ */
2228
+ static void
2229
+ md_writev_report (PgAioResult result , const PgAioTargetData * td , int elevel )
2230
+ {
2231
+ RelPathStr path ;
2232
+
2233
+ path = relpathbackend (td -> smgr .rlocator ,
2234
+ td -> smgr .is_temp ? MyProcNumber : INVALID_PROC_NUMBER ,
2235
+ td -> smgr .forkNum );
2236
+
2237
+ if (result .error_data != 0 )
2238
+ {
2239
+ errno = result .error_data ; /* for errcode_for_file_access() */
2240
+
2241
+ ereport (elevel ,
2242
+ errcode_for_file_access (),
2243
+ errmsg ("could not write blocks %u..%u in file \"%s\": %m" ,
2244
+ td -> smgr .blockNum ,
2245
+ td -> smgr .blockNum + td -> smgr .nblocks ,
2246
+ path .str )
2247
+ );
2248
+ }
2249
+ else
2250
+ {
2251
+ /*
2252
+ * NB: This will typically only be output in debug messages, while
2253
+ * retrying a partial IO.
2254
+ */
2255
+ ereport (elevel ,
2256
+ errcode (ERRCODE_DATA_CORRUPTED ),
2257
+ errmsg ("could not write blocks %u..%u in file \"%s\": wrote only %zu of %zu bytes" ,
2258
+ td -> smgr .blockNum ,
2259
+ td -> smgr .blockNum + td -> smgr .nblocks - 1 ,
2260
+ path .str ,
2261
+ result .result * (size_t ) BLCKSZ ,
2262
+ td -> smgr .nblocks * (size_t ) BLCKSZ
2263
+ )
2264
+ );
2265
+ }
2266
+ }
0 commit comments