/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#ifndef QEMU_AIO_H
#define QEMU_AIO_H

#include "qemu/queue.h"
#include "qemu/event_notifier.h"
#include "qemu/thread.h"
#include "qemu/timer.h"

typedef struct BlockAIOCB BlockAIOCB;
typedef void BlockCompletionFunc(void *opaque, int ret);

typedef struct AIOCBInfo {
    void (*cancel_async)(BlockAIOCB *acb);
    AioContext *(*get_aio_context)(BlockAIOCB *acb);
    size_t aiocb_size;
} AIOCBInfo;

struct BlockAIOCB {
    const AIOCBInfo *aiocb_info;
    BlockDriverState *bs;
    BlockCompletionFunc *cb;
    void *opaque;
    int refcnt;
};

void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockCompletionFunc *cb, void *opaque);
void qemu_aio_unref(void *p);
void qemu_aio_ref(void *p);

typedef struct AioHandler AioHandler;
typedef void QEMUBHFunc(void *opaque);
typedef bool AioPollFn(void *opaque);
typedef void IOHandler(void *opaque);

struct Coroutine;
struct ThreadPool;
struct LinuxAioState;
struct LuringState;

struct AioContext {
    GSource source;

    /* Used by AioContext users to protect from multi-threaded access. */
    QemuRecMutex lock;

    /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
    QLIST_HEAD(, AioHandler) aio_handlers;

    /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
     * accessed with atomic primitives.  If this field is 0, everything
     * (file descriptors, bottom halves, timers) will be re-evaluated
     * before the next blocking poll(), thus the event_notifier_set call
     * can be skipped.  If it is non-zero, you may need to wake up a
     * concurrent aio_poll or the glib main event loop, making
     * event_notifier_set necessary.
     *
     * Bit 0 is reserved for GSource usage of the AioContext, and is 1
     * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
     * Bits 1-31 simply count the number of active calls to aio_poll
     * that are in the prepare or poll phase.
     *
     * The GSource and aio_poll must use a different mechanism because
     * there is no certainty that a call to GSource's prepare callback
     * (via g_main_context_prepare) is indeed followed by check and
     * dispatch.  It's not clear whether this would be a bug, but let's
     * play safe and allow it---it will just cause extra calls to
     * event_notifier_set until the next call to dispatch.
     *
     * Instead, the aio_poll calls include both the prepare and the
     * dispatch phase, hence a simple counter is enough for them.
     */
    uint32_t notify_me;

    /* A lock that protects concurrent addition and deletion of QEMUBHs and
     * AioHandlers, and ensures that no callbacks are removed while we're
     * walking and dispatching them.
     */
    QemuLockCnt list_lock;

    /* Anchor of the list of Bottom Halves belonging to the context */
    struct QEMUBH *first_bh;

    /* Used by aio_notify.
     *
     * "notified" is used to avoid expensive event_notifier_test_and_clear
     * calls.  When it is clear, the EventNotifier is clear, or one thread
     * is going to clear "notified" before processing more events.  False
     * positives are possible, i.e. "notified" could be set even though the
     * EventNotifier is clear.
     *
     * Note that event_notifier_set *cannot* be optimized the same way.  For
     * more information on the problem that would result, see "#ifdef BUG2"
     * in the docs/aio_notify_accept.promela formal model.
     */
    bool notified;
    EventNotifier notifier;

    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
    QEMUBH *co_schedule_bh;

    /* Thread pool for performing work and receiving completion callbacks.
     * Has its own locking.
     */
    struct ThreadPool *thread_pool;

#ifdef CONFIG_LINUX_AIO
    /*
     * State for native Linux AIO.  Uses aio_context_acquire/release for
     * locking.
     */
    struct LinuxAioState *linux_aio;
#endif
#ifdef CONFIG_LINUX_IO_URING
    /*
     * State for Linux io_uring.  Uses aio_context_acquire/release for
     * locking.
     */
    struct LuringState *linux_io_uring;
#endif

    /* TimerLists for calling timers - one per clock type.  Has its own
     * locking.
     */
    QEMUTimerListGroup tlg;

    int external_disable_cnt;

    /* Number of AioHandlers without .io_poll() */
    int poll_disable_cnt;

    /* Polling mode parameters */
    int64_t poll_ns;        /* current polling time in nanoseconds */
    int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
    int64_t poll_grow;      /* polling time growth factor */
    int64_t poll_shrink;    /* polling time shrink factor */

    /* Are we in polling mode or monitoring file descriptors? */
    bool poll_started;

    /* epoll(7) state used when built with CONFIG_EPOLL */
    int epollfd;
    bool epoll_enabled;
    bool epoll_available;
};

/**
 * aio_context_new: Allocate a new AioContext.
 *
 * An AioContext provides a mini event loop that can be waited on
 * synchronously.  It also provides bottom halves, a service to execute
 * a piece of code as soon as possible.
 */
AioContext *aio_context_new(Error **errp);

/**
 * aio_context_ref:
 * @ctx: The AioContext to operate on.
 *
 * Add a reference to an AioContext.
 */
void aio_context_ref(AioContext *ctx);

/**
 * aio_context_unref:
 * @ctx: The AioContext to operate on.
 *
 * Drop a reference to an AioContext.
 */
void aio_context_unref(AioContext *ctx);

/* Take ownership of the AioContext.  If the AioContext will be shared between
 * threads, and a thread does not want to be interrupted, it will have to
 * take ownership around calls to aio_poll().  Otherwise, aio_poll()
 * automatically takes care of calling aio_context_acquire and
 * aio_context_release.
 *
 * Note that this is separate from bdrv_drained_begin/bdrv_drained_end.  A
 * thread still has to call those to avoid being interrupted by the guest.
 *
 * Bottom halves, timers and callbacks can be created or removed without
 * acquiring the AioContext.
 */
void aio_context_acquire(AioContext *ctx);

/* Relinquish ownership of the AioContext. */
void aio_context_release(AioContext *ctx);
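
/*
 * Example (an illustrative sketch, not taken from QEMU code): a thread that
 * shares an AioContext with other threads can take ownership around a
 * blocking aio_poll() so that its event processing is not interrupted.
 * The has_pending_work() predicate below is hypothetical.
 *
 *     aio_context_acquire(ctx);
 *     while (has_pending_work()) {
 *         aio_poll(ctx, true);        // dispatch fd handlers, BHs and timers
 *     }
 *     aio_context_release(ctx);
 */
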
/**
 * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
 * only once and as soon as possible.
 */
void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque);

/**
 * aio_bh_new: Allocate a new bottom half structure.
 *
 * Bottom halves are lightweight callbacks whose invocation is guaranteed
 * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
 * is opaque and must be allocated prior to its use.
 */
QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque);

/**
 * aio_notify: Force processing of pending events.
 *
 * Similar to signaling a condition variable, aio_notify forces
 * aio_poll to exit, so that the next call will re-examine pending events.
 * The caller of aio_notify will usually call aio_poll again very soon,
 * or go through another iteration of the GLib main loop.  Hence, aio_notify
 * also has the side effect of recalculating the sets of file descriptors
 * that the main loop waits for.
 *
 * Calling aio_notify is rarely necessary, because for example scheduling
 * a bottom half calls it already.
 */
void aio_notify(AioContext *ctx);

/**
 * aio_notify_accept: Acknowledge receiving an aio_notify.
 *
 * aio_notify() uses an EventNotifier in order to wake up a sleeping
 * aio_poll() or g_main_context_iteration().  Calls to aio_notify() are
 * usually rare, but the AioContext has to clear the EventNotifier on
 * every aio_poll() or g_main_context_iteration() in order to avoid
 * busy waiting.  This event_notifier_test_and_clear() cannot be done
 * using the usual aio_context_set_event_notifier(), because it must
 * be done before processing all events (file descriptors, bottom halves,
 * timers).
 *
 * aio_notify_accept() is an optimized event_notifier_test_and_clear()
 * that is specific to an AioContext's notifier; it is used internally
 * to clear the EventNotifier only if aio_notify() had been called.
 */
void aio_notify_accept(AioContext *ctx);

/**
 * aio_bh_call: Execute the callback function of the specified BH.
 */
void aio_bh_call(QEMUBH *bh);

/**
 * aio_bh_poll: Poll bottom halves for an AioContext.
 *
 * These are internal functions used by the QEMU main loop.
 * Note that aio_bh_poll() must not be called concurrently with itself.
 */
int aio_bh_poll(AioContext *ctx);

/**
 * qemu_bh_schedule: Schedule a bottom half.
 *
 * Scheduling a bottom half interrupts the main loop and causes the
 * execution of the callback that was passed to qemu_bh_new.
 *
 * Bottom halves that are scheduled from a bottom half handler are instantly
 * invoked.  This can create an infinite loop if a bottom half handler
 * schedules itself.
 *
 * @bh: The bottom half to be scheduled.
 */
void qemu_bh_schedule(QEMUBH *bh);

/**
 * qemu_bh_cancel: Cancel execution of a bottom half.
 *
 * Canceling execution of a bottom half undoes the effect of calls to
 * qemu_bh_schedule without freeing its resources yet.  While cancellation
 * itself is also wait-free and thread-safe, it can of course race with the
 * loop that executes bottom halves unless you are holding the iothread
 * mutex.  This makes it mostly useless if you are not holding the mutex.
 *
 * @bh: The bottom half to be canceled.
 */
void qemu_bh_cancel(QEMUBH *bh);

/**
 * qemu_bh_delete: Cancel execution of a bottom half and free its resources.
 *
 * Deleting a bottom half frees the memory that was allocated for it by
 * qemu_bh_new.  It also implies canceling the bottom half if it was
 * scheduled.
 * This function is asynchronous: the bottom half is only removed and freed
 * later, from within the event loop.
 *
 * @bh: The bottom half to be deleted.
 */
void qemu_bh_delete(QEMUBH *bh);
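
/*
 * Example (an illustrative sketch, not taken from QEMU code): create a
 * bottom half, schedule it from any thread, and delete it when it is no
 * longer needed.  The my_bh_cb callback and MyState type are hypothetical.
 *
 *     static void my_bh_cb(void *opaque)
 *     {
 *         MyState *s = opaque;
 *         ...                         // runs in ctx's home thread
 *     }
 *
 *     s->bh = aio_bh_new(ctx, my_bh_cb, s);
 *     qemu_bh_schedule(s->bh);        // wait-free, safe from any thread
 *     ...
 *     qemu_bh_delete(s->bh);          // cancels the BH if still pending
 *
 * For one-off work, aio_bh_schedule_oneshot(ctx, my_bh_cb, s) avoids the
 * explicit allocation and deletion.
 */
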
/* Return whether there are any pending callbacks from the GSource
 * attached to the AioContext, before g_poll is invoked.
 *
 * This is used internally in the implementation of the GSource.
 */
bool aio_prepare(AioContext *ctx);

/* Return whether there are any pending callbacks from the GSource
 * attached to the AioContext, after g_poll is invoked.
 *
 * This is used internally in the implementation of the GSource.
 */
bool aio_pending(AioContext *ctx);

/* Dispatch any pending callbacks from the GSource attached to the AioContext.
 *
 * This is used internally in the implementation of the GSource.
 */
void aio_dispatch(AioContext *ctx);

/* Make progress in completing pending AIO work.  This can issue new AIO
 * requests as a result of executing I/O completion or bottom half callbacks.
 *
 * Return whether any progress was made by executing AIO or bottom half
 * handlers.  If @blocking == true, this should always be true except
 * if someone called aio_notify.
 *
 * If there are no pending bottom halves, but there are pending AIO
 * operations, it may not be possible to make any progress without
 * blocking.  If @blocking is true, this function will wait until one
 * or more AIO events have completed, to ensure something has moved
 * before returning.
 */
bool aio_poll(AioContext *ctx, bool blocking);

/* Register a file descriptor and associated callbacks.  Behaves very similarly
 * to qemu_set_fd_handler.  Unlike qemu_set_fd_handler, these callbacks will
 * be invoked when using aio_poll().
 *
 * Code that invokes AIO completion functions should rely on this function
 * instead of qemu_set_fd_handler[2].
 */
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        AioPollFn *io_poll,
                        void *opaque);
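
/*
 * Example (an illustrative sketch, not taken from QEMU code): register a
 * read handler for a socket on an AioContext, then remove it by passing
 * NULL callbacks.  The my_read_cb callback and MyConn type are hypothetical.
 *
 *     static void my_read_cb(void *opaque)
 *     {
 *         MyConn *conn = opaque;
 *         ...                         // the fd is readable, consume the data
 *     }
 *
 *     aio_set_fd_handler(ctx, conn->fd, true, my_read_cb, NULL, NULL, conn);
 *     ...
 *     aio_set_fd_handler(ctx, conn->fd, true, NULL, NULL, NULL, NULL);
 */
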
/* Set polling begin/end callbacks for a file descriptor that has already been
 * registered with aio_set_fd_handler.  Do nothing if the file descriptor is
 * not registered.
 */
void aio_set_fd_poll(AioContext *ctx, int fd,
                     IOHandler *io_poll_begin,
                     IOHandler *io_poll_end);

/* Register an event notifier and associated callbacks.  Behaves very similarly
 * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these
 * callbacks will be invoked when using aio_poll().
 *
 * Code that invokes AIO completion functions should rely on this function
 * instead of event_notifier_set_handler.
 */
void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read,
                            AioPollFn *io_poll);

/* Set polling begin/end callbacks for an event notifier that has already been
 * registered with aio_set_event_notifier.  Do nothing if the event notifier is
 * not registered.
 */
void aio_set_event_notifier_poll(AioContext *ctx,
                                 EventNotifier *notifier,
                                 EventNotifierHandler *io_poll_begin,
                                 EventNotifierHandler *io_poll_end);
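
/*
 * Example (an illustrative sketch, not taken from QEMU code): wire an
 * EventNotifier into an AioContext so that event_notifier_set() from any
 * thread wakes the context and runs the handler.  The my_notifier_cb
 * callback is hypothetical.
 *
 *     static void my_notifier_cb(EventNotifier *e)
 *     {
 *         event_notifier_test_and_clear(e);
 *         ...                         // handle the wakeup
 *     }
 *
 *     event_notifier_init(&s->notifier, 0);
 *     aio_set_event_notifier(ctx, &s->notifier, false, my_notifier_cb, NULL);
 *     ...
 *     event_notifier_set(&s->notifier);   // from any thread
 */
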
/* Return a GSource that lets the main loop poll the file descriptors attached
 * to this AioContext.
 */
GSource *aio_get_g_source(AioContext *ctx);

/* Return the ThreadPool bound to this AioContext */
struct ThreadPool *aio_get_thread_pool(AioContext *ctx);

/* Set up the LinuxAioState bound to this AioContext */
struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);

/* Return the LinuxAioState bound to this AioContext */
struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);

/* Set up the LuringState bound to this AioContext */
struct LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);

/* Return the LuringState bound to this AioContext */
struct LuringState *aio_get_linux_io_uring(AioContext *ctx);

/**
 * aio_timer_new_with_attrs:
 * @ctx: the aio context
 * @type: the clock type
 * @scale: the scale
 * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
 *              to assign
 * @cb: the callback to call on timer expiry
 * @opaque: the opaque pointer to pass to the callback
 *
 * Allocate a new timer (with attributes) attached to the context @ctx.
 * The function is responsible for memory allocation.
 *
 * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
 * Use that unless you really need dynamic memory allocation.
 *
 * Returns: a pointer to the new timer
 */
static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
                                                  QEMUClockType type,
                                                  int scale, int attributes,
                                                  QEMUTimerCB *cb, void *opaque)
{
    return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
}

/**
 * aio_timer_new:
 * @ctx: the aio context
 * @type: the clock type
 * @scale: the scale
 * @cb: the callback to call on timer expiry
 * @opaque: the opaque pointer to pass to the callback
 *
 * Allocate a new timer attached to the context @ctx.
 * See aio_timer_new_with_attrs for details.
 *
 * Returns: a pointer to the new timer
 */
static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
                                       int scale,
                                       QEMUTimerCB *cb, void *opaque)
{
    return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
}

/**
 * aio_timer_init_with_attrs:
 * @ctx: the aio context
 * @ts: the timer
 * @type: the clock type
 * @scale: the scale
 * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
 *              to assign
 * @cb: the callback to call on timer expiry
 * @opaque: the opaque pointer to pass to the callback
 *
 * Initialise a new timer (with attributes) attached to the context @ctx.
 * The caller is responsible for memory allocation.
 */
static inline void aio_timer_init_with_attrs(AioContext *ctx,
                                             QEMUTimer *ts, QEMUClockType type,
                                             int scale, int attributes,
                                             QEMUTimerCB *cb, void *opaque)
{
    timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
}

/**
 * aio_timer_init:
 * @ctx: the aio context
 * @ts: the timer
 * @type: the clock type
 * @scale: the scale
 * @cb: the callback to call on timer expiry
 * @opaque: the opaque pointer to pass to the callback
 *
 * Initialise a new timer attached to the context @ctx.
 * See aio_timer_init_with_attrs for details.
 */
static inline void aio_timer_init(AioContext *ctx,
                                  QEMUTimer *ts, QEMUClockType type,
                                  int scale,
                                  QEMUTimerCB *cb, void *opaque)
{
    timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
}
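
/*
 * Example (an illustrative sketch, not taken from QEMU code): arm a
 * millisecond-scale timer on an AioContext.  The my_timer_cb callback is
 * hypothetical; timer_mod(), timer_del(), timer_free() and
 * qemu_clock_get_ms() come from "qemu/timer.h".
 *
 *     static void my_timer_cb(void *opaque)
 *     {
 *         ...                         // runs in ctx's home thread
 *     }
 *
 *     QEMUTimer *t = aio_timer_new(ctx, QEMU_CLOCK_REALTIME, SCALE_MS,
 *                                  my_timer_cb, s);
 *     timer_mod(t, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 100);
 *     ...
 *     timer_del(t);
 *     timer_free(t);
 */
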
/**
 * aio_compute_timeout:
 * @ctx: the aio context
 *
 * Compute the timeout that a blocking aio_poll should use.
 */
int64_t aio_compute_timeout(AioContext *ctx);

/**
 * aio_disable_external:
 * @ctx: the aio context
 *
 * Disable the further processing of external clients.
 */
static inline void aio_disable_external(AioContext *ctx)
{
    atomic_inc(&ctx->external_disable_cnt);
}

/**
 * aio_enable_external:
 * @ctx: the aio context
 *
 * Enable the processing of external clients.
 */
static inline void aio_enable_external(AioContext *ctx)
{
    int old;

    old = atomic_fetch_dec(&ctx->external_disable_cnt);
    assert(old > 0);
    if (old == 1) {
        /* Kick event loop so it re-arms file descriptors */
        aio_notify(ctx);
    }
}

/**
 * aio_external_disabled:
 * @ctx: the aio context
 *
 * Return true if the external clients are disabled.
 */
static inline bool aio_external_disabled(AioContext *ctx)
{
    return atomic_read(&ctx->external_disable_cnt);
}

/**
 * aio_node_check:
 * @ctx: the aio context
 * @is_external: Whether or not the checked node is an external event source.
 *
 * Check if the node's is_external flag is okay to be polled by the ctx at this
 * moment.  True means green light.
 */
static inline bool aio_node_check(AioContext *ctx, bool is_external)
{
    return !is_external || !atomic_read(&ctx->external_disable_cnt);
}

/**
 * aio_co_schedule:
 * @ctx: the aio context
 * @co: the coroutine
 *
 * Start a coroutine on a remote AioContext.
 *
 * The coroutine must not be entered by anyone else while aio_co_schedule()
 * is active.  In addition the coroutine must have yielded unless ctx
 * is the context in which the coroutine is running (i.e. the value of
 * qemu_get_current_aio_context() from the coroutine itself).
 */
void aio_co_schedule(AioContext *ctx, struct Coroutine *co);

/**
 * aio_co_wake:
 * @co: the coroutine
 *
 * Restart a coroutine on the AioContext where it was running last, thus
 * preventing coroutines from jumping from one context to another when they
 * go to sleep.
 *
 * aio_co_wake may be executed either in coroutine or non-coroutine
 * context.  The coroutine must not be entered by anyone else while
 * aio_co_wake() is active.
 */
void aio_co_wake(struct Coroutine *co);

/**
 * aio_co_enter:
 * @ctx: the context to run the coroutine
 * @co: the coroutine to run
 *
 * Enter a coroutine in the specified AioContext.
 */
void aio_co_enter(AioContext *ctx, struct Coroutine *co);
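
/*
 * Example (an illustrative sketch, not taken from QEMU code): hand a
 * coroutine over to an AioContext and later wake it up.  The my_co_entry
 * function is hypothetical; Coroutine, qemu_coroutine_create() and
 * qemu_coroutine_yield() are declared in "qemu/coroutine.h".
 *
 *     static void coroutine_fn my_co_entry(void *opaque)
 *     {
 *         ...
 *         qemu_coroutine_yield();     // sleep until someone wakes us
 *         ...
 *     }
 *
 *     Coroutine *co = qemu_coroutine_create(my_co_entry, s);
 *     aio_co_schedule(ctx, co);       // enter the coroutine in ctx
 *
 * Later, typically from a completion callback, aio_co_wake(co) restarts the
 * coroutine in the AioContext where it last ran.
 */
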
/**
 * Return the AioContext whose event loop runs in the current thread.
 *
 * If called from an IOThread this will be the IOThread's AioContext.  If
 * called from another thread it will be the main loop AioContext.
 */
AioContext *qemu_get_current_aio_context(void);

/**
 * in_aio_context_home_thread:
 * @ctx: the aio context
 *
 * Return whether we are running in the thread that normally runs @ctx.  Note
 * that acquiring/releasing ctx does not affect the outcome; each AioContext
 * still only has one home thread that is responsible for running it.
 */
static inline bool in_aio_context_home_thread(AioContext *ctx)
{
    return ctx == qemu_get_current_aio_context();
}

/**
 * aio_context_setup:
 * @ctx: the aio context
 *
 * Initialize the aio context.
 */
void aio_context_setup(AioContext *ctx);

/**
 * aio_context_destroy:
 * @ctx: the aio context
 *
 * Destroy the aio context.
 */
void aio_context_destroy(AioContext *ctx);

/**
 * aio_context_set_poll_params:
 * @ctx: the aio context
 * @max_ns: how long to busy poll for, in nanoseconds
 * @grow: polling time growth factor
 * @shrink: polling time shrink factor
 *
 * Poll mode can be disabled by setting poll_max_ns to 0.
 */
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                 int64_t grow, int64_t shrink,
                                 Error **errp);

#endif