/*
 * Data plane event loop
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2009-2017 QEMU contributors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "block/aio.h"
#include "block/thread-pool.h"
#include "qemu/main-loop.h"
#include "qemu/atomic.h"
#include "qemu/rcu_queue.h"
#include "block/raw-aio.h"
#include "qemu/coroutine_int.h"
#include "qemu/coroutine-tls.h"
#include "trace.h"

/***********************************************************/
/* bottom halves (can be seen as timers which expire ASAP) */

/* QEMUBH::flags values */
enum {
    /* Already enqueued and waiting for aio_bh_poll() */
    BH_PENDING   = (1 << 0),

    /* Invoke the callback */
    BH_SCHEDULED = (1 << 1),

    /* Delete without invoking callback */
    BH_DELETED   = (1 << 2),

    /* Delete after invoking callback */
    BH_ONESHOT   = (1 << 3),

    /* Schedule periodically when the event loop is idle */
    BH_IDLE      = (1 << 4),
};

struct QEMUBH {
    AioContext *ctx;
    const char *name;
    QEMUBHFunc *cb;
    void *opaque;
    QSLIST_ENTRY(QEMUBH) next;
    unsigned flags;
};

/* Called concurrently from any thread */
static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags)
{
    AioContext *ctx = bh->ctx;
    unsigned old_flags;

    /*
     * The memory barrier implicit in qatomic_fetch_or makes sure that:
     * 1. idle & any writes needed by the callback are done before the
     *    locations are read in the aio_bh_poll.
     * 2. ctx is loaded before the callback has a chance to execute and bh
     *    could be freed.
     */
    old_flags = qatomic_fetch_or(&bh->flags, BH_PENDING | new_flags);
    if (!(old_flags & BH_PENDING)) {
        QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next);
    }

    aio_notify(ctx);
}

/* Only called from aio_bh_poll() and aio_ctx_finalize() */
static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags)
{
    QEMUBH *bh = QSLIST_FIRST_RCU(head);

    if (!bh) {
        return NULL;
    }

    QSLIST_REMOVE_HEAD(head, next);

    /*
     * The qatomic_and is paired with aio_bh_enqueue().  The implicit memory
     * barrier ensures that the callback sees all writes done by the scheduling
     * thread.  It also ensures that the scheduling thread sees the cleared
     * flag before bh->cb has run, and thus will call aio_notify again if
     * necessary.
     */
    *flags = qatomic_fetch_and(&bh->flags,
                               ~(BH_PENDING | BH_SCHEDULED | BH_IDLE));
    return bh;
}
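
/*
 * Illustrative sketch only (not upstream documentation): how the flag bits
 * above combine for a typical one-shot bottom half, assuming a single
 * scheduling thread and a single aio_bh_poll() caller.
 *
 *   aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT);
 *       - bh->flags becomes BH_PENDING | BH_SCHEDULED | BH_ONESHOT
 *       - the thread that first sets BH_PENDING links bh into ctx->bh_list
 *
 *   bh = aio_bh_dequeue(&slice.bh_list, &flags);
 *       - BH_PENDING, BH_SCHEDULED and BH_IDLE are cleared atomically
 *       - the snapshot in 'flags' still has BH_SCHEDULED set, so aio_bh_poll()
 *         runs the callback once, and BH_ONESHOT makes it g_free(bh) afterwards
 */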

void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb,
                                  void *opaque, const char *name)
{
    QEMUBH *bh;
    bh = g_new(QEMUBH, 1);
    *bh = (QEMUBH){
        .ctx = ctx,
        .cb = cb,
        .opaque = opaque,
        .name = name,
    };
    aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT);
}

QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
                        const char *name)
{
    QEMUBH *bh;
    bh = g_new(QEMUBH, 1);
    *bh = (QEMUBH){
        .ctx = ctx,
        .cb = cb,
        .opaque = opaque,
        .name = name,
    };
    return bh;
}

void aio_bh_call(QEMUBH *bh)
{
    bh->cb(bh->opaque);
}

/* Concurrent aio_bh_poll() calls on the same AioContext are not allowed. */
int aio_bh_poll(AioContext *ctx)
{
    BHListSlice slice;
    BHListSlice *s;
    int ret = 0;

    QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list);
    QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next);

    while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) {
        QEMUBH *bh;
        unsigned flags;

        bh = aio_bh_dequeue(&s->bh_list, &flags);
        if (!bh) {
            QSIMPLEQ_REMOVE_HEAD(&ctx->bh_slice_list, next);
            continue;
        }

        if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
            /* Idle BHs don't count as progress */
            if (!(flags & BH_IDLE)) {
                ret = 1;
            }
            aio_bh_call(bh);
        }
        if (flags & (BH_DELETED | BH_ONESHOT)) {
            g_free(bh);
        }
    }

    return ret;
}

void qemu_bh_schedule_idle(QEMUBH *bh)
{
    aio_bh_enqueue(bh, BH_SCHEDULED | BH_IDLE);
}

void qemu_bh_schedule(QEMUBH *bh)
{
    aio_bh_enqueue(bh, BH_SCHEDULED);
}

/*
 * This function is asynchronous: a callback that is already being dispatched
 * may still run, but the bottom half will not be invoked again afterwards.
 */
void qemu_bh_cancel(QEMUBH *bh)
{
    qatomic_and(&bh->flags, ~BH_SCHEDULED);
}

/*
 * This function is asynchronous: the bottom half is not freed here, but later
 * by aio_bh_poll() (or by aio_ctx_finalize()).
 */
void qemu_bh_delete(QEMUBH *bh)
{
    aio_bh_enqueue(bh, BH_DELETED);
}

static int64_t aio_compute_bh_timeout(BHList *head, int timeout)
{
    QEMUBH *bh;

    QSLIST_FOREACH_RCU(bh, head, next) {
        if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
            if (bh->flags & BH_IDLE) {
                /* idle bottom halves will be polled at least
                 * every 10ms */
                timeout = 10000000;
            } else {
                /* non-idle bottom halves will be executed
                 * immediately */
                return 0;
            }
        }
    }

    return timeout;
}

int64_t
aio_compute_timeout(AioContext *ctx)
{
    BHListSlice *s;
    int64_t deadline;
    int timeout = -1;

    timeout = aio_compute_bh_timeout(&ctx->bh_list, timeout);
    if (timeout == 0) {
        return 0;
    }

    QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
        timeout = aio_compute_bh_timeout(&s->bh_list, timeout);
        if (timeout == 0) {
            return 0;
        }
    }

    deadline = timerlistgroup_deadline_ns(&ctx->tlg);
    if (deadline == 0) {
        return 0;
    } else {
        return qemu_soonest_timeout(timeout, deadline);
    }
}
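
/*
 * Usage sketch only, not part of this file's API surface: the typical
 * lifecycle of a bottom half as seen by a caller ('my_cb' and 'my_state' are
 * hypothetical names).
 *
 *   QEMUBH *bh = aio_bh_new(ctx, my_cb, my_state);
 *   qemu_bh_schedule(bh);        // my_cb(my_state) runs from ctx's event loop
 *   ...
 *   qemu_bh_delete(bh);          // actually freed later by aio_bh_poll()
 *
 * For fire-and-forget callbacks, aio_bh_schedule_oneshot() allocates,
 * schedules and frees the bottom half in one step.
 */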

static gboolean
aio_ctx_prepare(GSource *source, gint *timeout)
{
    AioContext *ctx = (AioContext *) source;

    qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) | 1);

    /*
     * Write ctx->notify_me before computing the timeout
     * (reading bottom half flags, etc.).  Pairs with
     * smp_mb in aio_notify().
     */
    smp_mb();

    /* We assume there is no timeout already supplied */
    *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));

    if (aio_prepare(ctx)) {
        *timeout = 0;
    }

    return *timeout == 0;
}

static gboolean
aio_ctx_check(GSource *source)
{
    AioContext *ctx = (AioContext *) source;
    QEMUBH *bh;
    BHListSlice *s;

    /* Finish computing the timeout before clearing the flag.  */
    qatomic_store_release(&ctx->notify_me, qatomic_read(&ctx->notify_me) & ~1);
    aio_notify_accept(ctx);

    QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) {
        if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
            return true;
        }
    }

    QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
        QSLIST_FOREACH_RCU(bh, &s->bh_list, next) {
            if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
                return true;
            }
        }
    }
    return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0);
}

static gboolean
aio_ctx_dispatch(GSource *source,
                 GSourceFunc callback,
                 gpointer user_data)
{
    AioContext *ctx = (AioContext *) source;

    assert(callback == NULL);
    aio_dispatch(ctx);
    return true;
}
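
/*
 * The callbacks above implement GLib's GSource contract (prepare/check/
 * dispatch).  A minimal sketch, assuming the caller drives the default
 * GMainContext, of how an AioContext is hooked into a GLib main loop:
 *
 *   GSource *src = aio_get_g_source(ctx);       // takes a reference
 *   g_source_attach(src, g_main_context_default());
 *   g_source_unref(src);
 *   ...
 *   g_main_loop_run(loop);                      // prepare/check/dispatch cycle
 */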

static void
aio_ctx_finalize(GSource *source)
{
    AioContext *ctx = (AioContext *) source;
    QEMUBH *bh;
    unsigned flags;

    thread_pool_free(ctx->thread_pool);

#ifdef CONFIG_LINUX_AIO
    if (ctx->linux_aio) {
        laio_detach_aio_context(ctx->linux_aio, ctx);
        laio_cleanup(ctx->linux_aio);
        ctx->linux_aio = NULL;
    }
#endif

#ifdef CONFIG_LINUX_IO_URING
    if (ctx->linux_io_uring) {
        luring_detach_aio_context(ctx->linux_io_uring, ctx);
        luring_cleanup(ctx->linux_io_uring);
        ctx->linux_io_uring = NULL;
    }
#endif

    assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
    qemu_bh_delete(ctx->co_schedule_bh);

    /* There must be no aio_bh_poll() calls going on */
    assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list));

    while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) {
        /*
         * qemu_bh_delete() must have been called on BHs in this AioContext. In
         * many cases memory leaks, hangs, or inconsistent state occur when a
         * BH is leaked because something still expects it to run.
         *
         * If you hit this, fix the lifecycle of the BH so that
         * qemu_bh_delete() and any associated cleanup is called before the
         * AioContext is finalized.
         */
        if (unlikely(!(flags & BH_DELETED))) {
            fprintf(stderr, "%s: BH '%s' leaked, aborting...\n",
                    __func__, bh->name);
            abort();
        }

        g_free(bh);
    }

    aio_set_event_notifier(ctx, &ctx->notifier, false, NULL, NULL, NULL);
    event_notifier_cleanup(&ctx->notifier);
    qemu_rec_mutex_destroy(&ctx->lock);
    qemu_lockcnt_destroy(&ctx->list_lock);
    timerlistgroup_deinit(&ctx->tlg);
    aio_context_destroy(ctx);
}

static GSourceFuncs aio_source_funcs = {
    aio_ctx_prepare,
    aio_ctx_check,
    aio_ctx_dispatch,
    aio_ctx_finalize
};

GSource *aio_get_g_source(AioContext *ctx)
{
    aio_context_use_g_source(ctx);
    g_source_ref(&ctx->source);
    return &ctx->source;
}

ThreadPool *aio_get_thread_pool(AioContext *ctx)
{
    if (!ctx->thread_pool) {
        ctx->thread_pool = thread_pool_new(ctx);
    }
    return ctx->thread_pool;
}

#ifdef CONFIG_LINUX_AIO
LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp)
{
    if (!ctx->linux_aio) {
        ctx->linux_aio = laio_init(errp);
        if (ctx->linux_aio) {
            laio_attach_aio_context(ctx->linux_aio, ctx);
        }
    }
    return ctx->linux_aio;
}

LinuxAioState *aio_get_linux_aio(AioContext *ctx)
{
    assert(ctx->linux_aio);
    return ctx->linux_aio;
}
#endif

#ifdef CONFIG_LINUX_IO_URING
LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
{
    if (ctx->linux_io_uring) {
        return ctx->linux_io_uring;
    }

    ctx->linux_io_uring = luring_init(errp);
    if (!ctx->linux_io_uring) {
        return NULL;
    }

    luring_attach_aio_context(ctx->linux_io_uring, ctx);
    return ctx->linux_io_uring;
}

LuringState *aio_get_linux_io_uring(AioContext *ctx)
{
    assert(ctx->linux_io_uring);
    return ctx->linux_io_uring;
}
#endif

void aio_notify(AioContext *ctx)
{
    /*
     * Write e.g. bh->flags before writing ctx->notified.  Pairs with smp_mb in
     * aio_notify_accept.
     */
    smp_wmb();
    qatomic_set(&ctx->notified, true);

    /*
     * Write ctx->notified before reading ctx->notify_me.  Pairs
     * with smp_mb in aio_ctx_prepare or aio_poll.
     */
    smp_mb();
    if (qatomic_read(&ctx->notify_me)) {
        event_notifier_set(&ctx->notifier);
    }
}

void aio_notify_accept(AioContext *ctx)
{
    qatomic_set(&ctx->notified, false);

    /*
     * Write ctx->notified before reading e.g. bh->flags.  Pairs with smp_wmb
     * in aio_notify.
     */
    smp_mb();
}

static void aio_timerlist_notify(void *opaque, QEMUClockType type)
{
    aio_notify(opaque);
}

static void aio_context_notifier_cb(EventNotifier *e)
{
    AioContext *ctx = container_of(e, AioContext, notifier);

    event_notifier_test_and_clear(&ctx->notifier);
}
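
/*
 * Informal summary of the wakeup handshake between aio_notify() and the
 * poller (aio_ctx_prepare()/aio_ctx_check(), or aio_poll()), as described by
 * the barrier comments above:
 *
 *   poller                                notifier
 *   ------                                --------
 *   notify_me |= 1                        write bh->flags (etc.)
 *   smp_mb()                              smp_wmb(); notified = true
 *   compute timeout (reads bh->flags)     smp_mb(); read notify_me
 *   ...                                   if nonzero: event_notifier_set()
 *
 * Either the poller sees the new work while computing its timeout, or the
 * notifier sees notify_me and kicks ctx->notifier, so a wakeup is never lost.
 */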

/* Returns true if aio_notify() was called (e.g. a BH was scheduled) */
static bool aio_context_notifier_poll(void *opaque)
{
    EventNotifier *e = opaque;
    AioContext *ctx = container_of(e, AioContext, notifier);

    return qatomic_read(&ctx->notified);
}

static void aio_context_notifier_poll_ready(EventNotifier *e)
{
    /* Do nothing, we just wanted to kick the event loop */
}

static void co_schedule_bh_cb(void *opaque)
{
    AioContext *ctx = opaque;
    QSLIST_HEAD(, Coroutine) straight, reversed;

    QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
    QSLIST_INIT(&straight);

    while (!QSLIST_EMPTY(&reversed)) {
        Coroutine *co = QSLIST_FIRST(&reversed);
        QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
        QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
    }

    while (!QSLIST_EMPTY(&straight)) {
        Coroutine *co = QSLIST_FIRST(&straight);
        QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
        trace_aio_co_schedule_bh_cb(ctx, co);
        aio_context_acquire(ctx);

        /* Protected by write barrier in qemu_aio_coroutine_enter */
        qatomic_set(&co->scheduled, NULL);
        qemu_aio_coroutine_enter(ctx, co);
        aio_context_release(ctx);
    }
}

AioContext *aio_context_new(Error **errp)
{
    int ret;
    AioContext *ctx;

    ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
    QSLIST_INIT(&ctx->bh_list);
    QSIMPLEQ_INIT(&ctx->bh_slice_list);
    aio_context_setup(ctx);

    ret = event_notifier_init(&ctx->notifier, false);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to initialize event notifier");
        goto fail;
    }
    g_source_set_can_recurse(&ctx->source, true);
    qemu_lockcnt_init(&ctx->list_lock);

    ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
    QSLIST_INIT(&ctx->scheduled_coroutines);

    aio_set_event_notifier(ctx, &ctx->notifier,
                           false,
                           aio_context_notifier_cb,
                           aio_context_notifier_poll,
                           aio_context_notifier_poll_ready);
#ifdef CONFIG_LINUX_AIO
    ctx->linux_aio = NULL;
#endif

#ifdef CONFIG_LINUX_IO_URING
    ctx->linux_io_uring = NULL;
#endif

    ctx->thread_pool = NULL;
    qemu_rec_mutex_init(&ctx->lock);
    timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);

    ctx->poll_ns = 0;
    ctx->poll_max_ns = 0;
    ctx->poll_grow = 0;
    ctx->poll_shrink = 0;

    ctx->aio_max_batch = 0;

    return ctx;
fail:
    g_source_destroy(&ctx->source);
    return NULL;
}
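
/*
 * Usage sketch, not a definitive recipe: creating a context and handing a
 * coroutine to it ('my_co_fn', 'my_opaque' and the error handling are
 * assumptions made for illustration).
 *
 *   Error *local_err = NULL;
 *   AioContext *ctx = aio_context_new(&local_err);
 *   Coroutine *co = qemu_coroutine_create(my_co_fn, my_opaque);
 *   aio_co_schedule(ctx, co);      // entered later from ctx->co_schedule_bh
 *
 * A coroutine that is already bound to a context (co->ctx) is normally woken
 * with aio_co_wake() instead, which picks the right context automatically.
 */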

void aio_co_schedule(AioContext *ctx, Coroutine *co)
{
    trace_aio_co_schedule(ctx, co);
    const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
                                            __func__);

    if (scheduled) {
        fprintf(stderr,
                "%s: Co-routine was already scheduled in '%s'\n",
                __func__, scheduled);
        abort();
    }

    /* The coroutine might run and release the last ctx reference before we
     * invoke qemu_bh_schedule().  Take a reference to keep ctx alive until
     * we're done.
     */
    aio_context_ref(ctx);

    QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
                              co, co_scheduled_next);
    qemu_bh_schedule(ctx->co_schedule_bh);

    aio_context_unref(ctx);
}

typedef struct AioCoRescheduleSelf {
    Coroutine *co;
    AioContext *new_ctx;
} AioCoRescheduleSelf;

static void aio_co_reschedule_self_bh(void *opaque)
{
    AioCoRescheduleSelf *data = opaque;
    aio_co_schedule(data->new_ctx, data->co);
}

void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
{
    AioContext *old_ctx = qemu_get_current_aio_context();

    if (old_ctx != new_ctx) {
        AioCoRescheduleSelf data = {
            .co = qemu_coroutine_self(),
            .new_ctx = new_ctx,
        };
        /*
         * We can't directly schedule the coroutine in the target context
         * because this would be racy: The other thread could try to enter the
         * coroutine before it has yielded in this one.
         */
        aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data);
        qemu_coroutine_yield();
    }
}

void aio_co_wake(struct Coroutine *co)
{
    AioContext *ctx;

    /* Read coroutine before co->ctx.  Matches smp_wmb in
     * qemu_coroutine_enter.
     */
    smp_read_barrier_depends();
    ctx = qatomic_read(&co->ctx);

    aio_co_enter(ctx, co);
}

void aio_co_enter(AioContext *ctx, struct Coroutine *co)
{
    if (ctx != qemu_get_current_aio_context()) {
        aio_co_schedule(ctx, co);
        return;
    }

    if (qemu_in_coroutine()) {
        Coroutine *self = qemu_coroutine_self();
        assert(self != co);
        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
    } else {
        aio_context_acquire(ctx);
        qemu_aio_coroutine_enter(ctx, co);
        aio_context_release(ctx);
    }
}

void aio_context_ref(AioContext *ctx)
{
    g_source_ref(&ctx->source);
}

void aio_context_unref(AioContext *ctx)
{
    g_source_unref(&ctx->source);
}

void aio_context_acquire(AioContext *ctx)
{
    qemu_rec_mutex_lock(&ctx->lock);
}

void aio_context_release(AioContext *ctx)
{
    qemu_rec_mutex_unlock(&ctx->lock);
}

QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)

AioContext *qemu_get_current_aio_context(void)
{
    AioContext *ctx = get_my_aiocontext();
    if (ctx) {
        return ctx;
    }
    if (qemu_mutex_iothread_locked()) {
        /* Possibly in a vCPU thread.  */
        return qemu_get_aio_context();
    }
    return NULL;
}

void qemu_set_current_aio_context(AioContext *ctx)
{
    assert(!get_my_aiocontext());
    set_my_aiocontext(ctx);
}
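
/*
 * Sketch of the expected per-thread setup, e.g. in an IOThread's thread
 * function (the surrounding loop and 'iothread->ctx' are hypothetical):
 *
 *   qemu_set_current_aio_context(iothread->ctx);   // at most once per thread
 *   while (!stopping) {
 *       aio_poll(iothread->ctx, true);
 *   }
 *
 * After that call, qemu_get_current_aio_context() returns iothread->ctx for
 * code running in this thread instead of falling back to the main context.
 */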