1 /* 2 * Linux io_uring support. 3 * 4 * Copyright (C) 2009 IBM, Corp. 5 * Copyright (C) 2009 Red Hat, Inc. 6 * Copyright (C) 2019 Aarushi Mehta 7 * 8 * This work is licensed under the terms of the GNU GPL, version 2 or later. 9 * See the COPYING file in the top-level directory. 10 */ 11 #include "qemu/osdep.h" 12 #include <liburing.h> 13 #include "qemu-common.h" 14 #include "block/aio.h" 15 #include "qemu/queue.h" 16 #include "block/block.h" 17 #include "block/raw-aio.h" 18 #include "qemu/coroutine.h" 19 #include "qapi/error.h" 20 21 /* io_uring ring size */ 22 #define MAX_ENTRIES 128 23 24 typedef struct LuringAIOCB { 25 Coroutine *co; 26 struct io_uring_sqe sqeq; 27 ssize_t ret; 28 QEMUIOVector *qiov; 29 bool is_read; 30 QSIMPLEQ_ENTRY(LuringAIOCB) next; 31 32 /* 33 * Buffered reads may require resubmission, see 34 * luring_resubmit_short_read(). 35 */ 36 int total_read; 37 QEMUIOVector resubmit_qiov; 38 } LuringAIOCB; 39 40 typedef struct LuringQueue { 41 int plugged; 42 unsigned int in_queue; 43 unsigned int in_flight; 44 bool blocked; 45 QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; 46 } LuringQueue; 47 48 typedef struct LuringState { 49 AioContext *aio_context; 50 51 struct io_uring ring; 52 53 /* io queue for submit at batch. Protected by AioContext lock. */ 54 LuringQueue io_q; 55 56 /* I/O completion processing. Only runs in I/O thread. */ 57 QEMUBH *completion_bh; 58 } LuringState; 59 60 /** 61 * luring_resubmit: 62 * 63 * Resubmit a request by appending it to submit_queue. The caller must ensure 64 * that ioq_submit() is called later so that submit_queue requests are started. 65 */ 66 static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) 67 { 68 QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); 69 s->io_q.in_queue++; 70 } 71 72 /** 73 * luring_resubmit_short_read: 74 * 75 * Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async 76 * context") a buffered I/O request with the start of the file range in the 77 * page cache could result in a short read. Applications need to resubmit the 78 * remaining read request. 79 * 80 * This is a slow path but recent kernels never take it. 81 */ 82 static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, 83 int nread) 84 { 85 QEMUIOVector *resubmit_qiov; 86 size_t remaining; 87 88 /* Update read position */ 89 luringcb->total_read = nread; 90 remaining = luringcb->qiov->size - luringcb->total_read; 91 92 /* Shorten qiov */ 93 resubmit_qiov = &luringcb->resubmit_qiov; 94 if (resubmit_qiov->iov == NULL) { 95 qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov); 96 } else { 97 qemu_iovec_reset(resubmit_qiov); 98 } 99 qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read, 100 remaining); 101 102 /* Update sqe */ 103 luringcb->sqeq.off = nread; 104 luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov; 105 luringcb->sqeq.len = luringcb->resubmit_qiov.niov; 106 107 luring_resubmit(s, luringcb); 108 } 109 110 /** 111 * luring_process_completions: 112 * @s: AIO state 113 * 114 * Fetches completed I/O requests, consumes cqes and invokes their callbacks 115 * The function is somewhat tricky because it supports nested event loops, for 116 * example when a request callback invokes aio_poll(). 117 * 118 * Function schedules BH completion so it can be called again in a nested 119 * event loop. When there are no events left to complete the BH is being 120 * canceled. 121 * 122 */ 123 static void luring_process_completions(LuringState *s) 124 { 125 struct io_uring_cqe *cqes; 126 int total_bytes; 127 /* 128 * Request completion callbacks can run the nested event loop. 129 * Schedule ourselves so the nested event loop will "see" remaining 130 * completed requests and process them. Without this, completion 131 * callbacks that wait for other requests using a nested event loop 132 * would hang forever. 133 * 134 * This workaround is needed because io_uring uses poll_wait, which 135 * is woken up when new events are added to the uring, thus polling on 136 * the same uring fd will block unless more events are received. 137 * 138 * Other leaf block drivers (drivers that access the data themselves) 139 * are networking based, so they poll sockets for data and run the 140 * correct coroutine. 141 */ 142 qemu_bh_schedule(s->completion_bh); 143 144 while (io_uring_peek_cqe(&s->ring, &cqes) == 0) { 145 LuringAIOCB *luringcb; 146 int ret; 147 148 if (!cqes) { 149 break; 150 } 151 152 luringcb = io_uring_cqe_get_data(cqes); 153 ret = cqes->res; 154 io_uring_cqe_seen(&s->ring, cqes); 155 cqes = NULL; 156 157 /* Change counters one-by-one because we can be nested. */ 158 s->io_q.in_flight--; 159 160 /* total_read is non-zero only for resubmitted read requests */ 161 total_bytes = ret + luringcb->total_read; 162 163 if (ret < 0) { 164 if (ret == -EINTR) { 165 luring_resubmit(s, luringcb); 166 continue; 167 } 168 } else if (!luringcb->qiov) { 169 goto end; 170 } else if (total_bytes == luringcb->qiov->size) { 171 ret = 0; 172 /* Only read/write */ 173 } else { 174 /* Short Read/Write */ 175 if (luringcb->is_read) { 176 if (ret > 0) { 177 luring_resubmit_short_read(s, luringcb, ret); 178 continue; 179 } else { 180 /* Pad with zeroes */ 181 qemu_iovec_memset(luringcb->qiov, total_bytes, 0, 182 luringcb->qiov->size - total_bytes); 183 ret = 0; 184 } 185 } else { 186 ret = -ENOSPC;; 187 } 188 } 189 end: 190 luringcb->ret = ret; 191 qemu_iovec_destroy(&luringcb->resubmit_qiov); 192 193 /* 194 * If the coroutine is already entered it must be in ioq_submit() 195 * and will notice luringcb->ret has been filled in when it 196 * eventually runs later. Coroutines cannot be entered recursively 197 * so avoid doing that! 198 */ 199 if (!qemu_coroutine_entered(luringcb->co)) { 200 aio_co_wake(luringcb->co); 201 } 202 } 203 qemu_bh_cancel(s->completion_bh); 204 } 205 206 static int ioq_submit(LuringState *s) 207 { 208 int ret = 0; 209 LuringAIOCB *luringcb, *luringcb_next; 210 211 while (s->io_q.in_queue > 0) { 212 /* 213 * Try to fetch sqes from the ring for requests waiting in 214 * the overflow queue 215 */ 216 QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next, 217 luringcb_next) { 218 struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring); 219 if (!sqes) { 220 break; 221 } 222 /* Prep sqe for submission */ 223 *sqes = luringcb->sqeq; 224 QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next); 225 } 226 ret = io_uring_submit(&s->ring); 227 /* Prevent infinite loop if submission is refused */ 228 if (ret <= 0) { 229 if (ret == -EAGAIN) { 230 continue; 231 } 232 break; 233 } 234 s->io_q.in_flight += ret; 235 s->io_q.in_queue -= ret; 236 } 237 s->io_q.blocked = (s->io_q.in_queue > 0); 238 239 if (s->io_q.in_flight) { 240 /* 241 * We can try to complete something just right away if there are 242 * still requests in-flight. 243 */ 244 luring_process_completions(s); 245 } 246 return ret; 247 } 248 249 static void luring_process_completions_and_submit(LuringState *s) 250 { 251 aio_context_acquire(s->aio_context); 252 luring_process_completions(s); 253 254 if (!s->io_q.plugged && s->io_q.in_queue > 0) { 255 ioq_submit(s); 256 } 257 aio_context_release(s->aio_context); 258 } 259 260 static void qemu_luring_completion_bh(void *opaque) 261 { 262 LuringState *s = opaque; 263 luring_process_completions_and_submit(s); 264 } 265 266 static void qemu_luring_completion_cb(void *opaque) 267 { 268 LuringState *s = opaque; 269 luring_process_completions_and_submit(s); 270 } 271 272 static void ioq_init(LuringQueue *io_q) 273 { 274 QSIMPLEQ_INIT(&io_q->submit_queue); 275 io_q->plugged = 0; 276 io_q->in_queue = 0; 277 io_q->in_flight = 0; 278 io_q->blocked = false; 279 } 280 281 void luring_io_plug(BlockDriverState *bs, LuringState *s) 282 { 283 s->io_q.plugged++; 284 } 285 286 void luring_io_unplug(BlockDriverState *bs, LuringState *s) 287 { 288 assert(s->io_q.plugged); 289 if (--s->io_q.plugged == 0 && 290 !s->io_q.blocked && s->io_q.in_queue > 0) { 291 ioq_submit(s); 292 } 293 } 294 295 /** 296 * luring_do_submit: 297 * @fd: file descriptor for I/O 298 * @luringcb: AIO control block 299 * @s: AIO state 300 * @offset: offset for request 301 * @type: type of request 302 * 303 * Fetches sqes from ring, adds to pending queue and preps them 304 * 305 */ 306 static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, 307 uint64_t offset, int type) 308 { 309 struct io_uring_sqe *sqes = &luringcb->sqeq; 310 311 switch (type) { 312 case QEMU_AIO_WRITE: 313 io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, 314 luringcb->qiov->niov, offset); 315 break; 316 case QEMU_AIO_READ: 317 io_uring_prep_readv(sqes, fd, luringcb->qiov->iov, 318 luringcb->qiov->niov, offset); 319 break; 320 case QEMU_AIO_FLUSH: 321 io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC); 322 break; 323 default: 324 fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n", 325 __func__, type); 326 abort(); 327 } 328 io_uring_sqe_set_data(sqes, luringcb); 329 330 QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); 331 s->io_q.in_queue++; 332 333 if (!s->io_q.blocked && 334 (!s->io_q.plugged || 335 s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) { 336 return ioq_submit(s); 337 } 338 return 0; 339 } 340 341 int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, 342 uint64_t offset, QEMUIOVector *qiov, int type) 343 { 344 int ret; 345 LuringAIOCB luringcb = { 346 .co = qemu_coroutine_self(), 347 .ret = -EINPROGRESS, 348 .qiov = qiov, 349 .is_read = (type == QEMU_AIO_READ), 350 }; 351 352 ret = luring_do_submit(fd, &luringcb, s, offset, type); 353 if (ret < 0) { 354 return ret; 355 } 356 357 if (luringcb.ret == -EINPROGRESS) { 358 qemu_coroutine_yield(); 359 } 360 return luringcb.ret; 361 } 362 363 void luring_detach_aio_context(LuringState *s, AioContext *old_context) 364 { 365 aio_set_fd_handler(old_context, s->ring.ring_fd, false, NULL, NULL, NULL, 366 s); 367 qemu_bh_delete(s->completion_bh); 368 s->aio_context = NULL; 369 } 370 371 void luring_attach_aio_context(LuringState *s, AioContext *new_context) 372 { 373 s->aio_context = new_context; 374 s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); 375 aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false, 376 qemu_luring_completion_cb, NULL, NULL, s); 377 } 378 379 LuringState *luring_init(Error **errp) 380 { 381 int rc; 382 LuringState *s = g_new0(LuringState, 1); 383 struct io_uring *ring = &s->ring; 384 385 rc = io_uring_queue_init(MAX_ENTRIES, ring, 0); 386 if (rc < 0) { 387 error_setg_errno(errp, errno, "failed to init linux io_uring ring"); 388 g_free(s); 389 return NULL; 390 } 391 392 ioq_init(&s->io_q); 393 return s; 394 395 } 396 397 void luring_cleanup(LuringState *s) 398 { 399 io_uring_queue_exit(&s->ring); 400 g_free(s); 401 } 402