/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "block/block.h"
#include "qemu/rcu.h"
#include "qemu/rcu_queue.h"
#include "qemu/sockets.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "aio-posix.h"

/* Stop userspace polling on a handler if it isn't active for some time */
#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)

bool aio_poll_disabled(AioContext *ctx)
{
    return atomic_read(&ctx->poll_disable_cnt);
}

void aio_add_ready_handler(AioHandlerList *ready_list,
                           AioHandler *node,
                           int revents)
{
    QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
    node->pfd.revents = revents;
    QLIST_INSERT_HEAD(ready_list, node, node_ready);
}

static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd) {
            if (!QLIST_IS_INSERTED(node, node_deleted)) {
                return node;
            }
        }
    }

    return NULL;
}

static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
{
    /* If the GSource is in the process of being destroyed then
     * g_source_remove_poll() causes an assertion failure.  Skip
     * removal in that case, because glib cleans up its state during
     * destruction anyway.
     */
    if (!g_source_is_destroyed(&ctx->source)) {
        g_source_remove_poll(&ctx->source, &node->pfd);
    }

    node->pfd.revents = 0;

    /* If the fd monitor has already marked it deleted, leave it alone */
    if (QLIST_IS_INSERTED(node, node_deleted)) {
        return false;
    }

    /* If a read is in progress, just mark the node as deleted */
    if (qemu_lockcnt_count(&ctx->list_lock)) {
        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
        return false;
    }
    /* Otherwise, delete it for real.  We can't just mark it as
     * deleted because deleted nodes are only cleaned up while
     * no one is walking the handlers list.
     */
    QLIST_SAFE_REMOVE(node, node_poll);
    QLIST_REMOVE(node, node);
    return true;
}

void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        AioPollFn *io_poll,
                        void *opaque)
{
    AioHandler *node;
    AioHandler *new_node = NULL;
    bool is_new = false;
    bool deleted = false;
    int poll_disable_change;

    qemu_lockcnt_lock(&ctx->list_lock);

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write && !io_poll) {
        if (node == NULL) {
            qemu_lockcnt_unlock(&ctx->list_lock);
            return;
        }
        /* Clean events in order to unregister fd from the ctx epoll. */
        node->pfd.events = 0;

        poll_disable_change = -!node->io_poll;
    } else {
        poll_disable_change = !io_poll - (node && !node->io_poll);
        if (node == NULL) {
            is_new = true;
        }
        /* Alloc and insert if it's not already there */
        new_node = g_new0(AioHandler, 1);

        /* Update handler with latest information */
        new_node->io_read = io_read;
        new_node->io_write = io_write;
        new_node->io_poll = io_poll;
        new_node->opaque = opaque;
        new_node->is_external = is_external;

        if (is_new) {
            new_node->pfd.fd = fd;
        } else {
            new_node->pfd = node->pfd;
        }
        g_source_add_poll(&ctx->source, &new_node->pfd);

        new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);

        QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
    }

    /* No need to order poll_disable_cnt writes against other updates;
     * the counter is only used to avoid wasting time and latency on
     * iterated polling when the system call will be ultimately necessary.
     * Changing handlers is a rare event, and a little wasted polling until
     * the aio_notify below is not an issue.
     */
    atomic_set(&ctx->poll_disable_cnt,
               atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);

    ctx->fdmon_ops->update(ctx, node, new_node);
    if (node) {
        deleted = aio_remove_fd_handler(ctx, node);
    }
    qemu_lockcnt_unlock(&ctx->list_lock);
    aio_notify(ctx);

    if (deleted) {
        g_free(node);
    }
}

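/*
 * Usage sketch (illustrative only, not part of the original file): a caller
 * registers a read handler for a file descriptor and later removes it by
 * passing NULL for every callback.  The callback and state names below are
 * hypothetical.
 *
 *     static void my_fd_read(void *opaque)
 *     {
 *         MyState *s = opaque;    // hypothetical per-fd state
 *         // ... read from s->fd and process the data ...
 *     }
 *
 *     // register: watch fd for G_IO_IN and dispatch my_fd_read()
 *     aio_set_fd_handler(ctx, s->fd, true, my_fd_read, NULL, NULL, s);
 *
 *     // unregister: all-NULL callbacks remove the handler
 *     aio_set_fd_handler(ctx, s->fd, true, NULL, NULL, NULL, NULL);
 *
 * The old handler node may be freed later (see aio_remove_fd_handler()
 * above) if another walker still holds a ctx->list_lock reference.
 */
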
void aio_set_fd_poll(AioContext *ctx, int fd,
                     IOHandler *io_poll_begin,
                     IOHandler *io_poll_end)
{
    AioHandler *node = find_aio_handler(ctx, fd);

    if (!node) {
        return;
    }

    node->io_poll_begin = io_poll_begin;
    node->io_poll_end = io_poll_end;
}

void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read,
                            AioPollFn *io_poll)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
                       (IOHandler *)io_read, NULL, io_poll, notifier);
}

void aio_set_event_notifier_poll(AioContext *ctx,
                                 EventNotifier *notifier,
                                 EventNotifierHandler *io_poll_begin,
                                 EventNotifierHandler *io_poll_end)
{
    aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
                    (IOHandler *)io_poll_begin,
                    (IOHandler *)io_poll_end);
}

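/*
 * Usage sketch (illustrative only): an EventNotifier handler that also
 * supports userspace polling.  ->io_poll() returns true when it found work
 * to do without waiting for the notifier fd to fire; io_poll_begin() and
 * io_poll_end() bracket the period during which the event loop busy-polls
 * instead of sleeping in the fd monitor (see poll_set_started() below).
 * All names here are hypothetical.
 *
 *     static void my_notifier_read(EventNotifier *n) { ... handle event ... }
 *     static bool my_notifier_poll(void *opaque)     { ... check shared state,
 *                                                        e.g. a ring index ... }
 *     static void my_poll_begin(EventNotifier *n)    { ... e.g. suppress fd
 *                                                        notifications ... }
 *     static void my_poll_end(EventNotifier *n)      { ... re-enable them ... }
 *
 *     aio_set_event_notifier(ctx, &s->notifier, true,
 *                            my_notifier_read, my_notifier_poll);
 *     aio_set_event_notifier_poll(ctx, &s->notifier,
 *                                 my_poll_begin, my_poll_end);
 */
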
static bool poll_set_started(AioContext *ctx, bool started)
{
    AioHandler *node;
    bool progress = false;

    if (started == ctx->poll_started) {
        return false;
    }

    ctx->poll_started = started;

    qemu_lockcnt_inc(&ctx->list_lock);
    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
        IOHandler *fn;

        if (QLIST_IS_INSERTED(node, node_deleted)) {
            continue;
        }

        if (started) {
            fn = node->io_poll_begin;
        } else {
            fn = node->io_poll_end;
        }

        if (fn) {
            fn(node->opaque);
        }

        /* Poll one last time in case ->io_poll_end() raced with the event */
        if (!started) {
            progress = node->io_poll(node->opaque) || progress;
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);

    return progress;
}


bool aio_prepare(AioContext *ctx)
{
    /* Poll mode cannot be used with glib's event loop, so disable it. */
    poll_set_started(ctx, false);

    return false;
}

bool aio_pending(AioContext *ctx)
{
    AioHandler *node;
    bool result = false;

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    qemu_lockcnt_inc(&ctx->list_lock);

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
            aio_node_check(ctx, node->is_external)) {
            result = true;
            break;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
            aio_node_check(ctx, node->is_external)) {
            result = true;
            break;
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);

    return result;
}

static void aio_free_deleted_handlers(AioContext *ctx)
{
    AioHandler *node;

    if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
        return;
    }
    if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
        return; /* we are nested, let the parent do the freeing */
    }

    while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
        QLIST_REMOVE(node, node);
        QLIST_REMOVE(node, node_deleted);
        QLIST_SAFE_REMOVE(node, node_poll);
        g_free(node);
    }

    qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
}

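/*
 * Sketch of the ctx->list_lock discipline used by the walkers above
 * (illustrative, assuming the usual QemuLockCnt semantics): readers take a
 * counted reference so that a concurrent aio_set_fd_handler() only marks
 * nodes as deleted instead of freeing them under the walker's feet:
 *
 *     qemu_lockcnt_inc(&ctx->list_lock);
 *     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 *         // node may be marked node_deleted here, but is never freed
 *     }
 *     qemu_lockcnt_dec(&ctx->list_lock);
 *
 * Deleted nodes are reclaimed only when aio_free_deleted_handlers() manages
 * to take the lock via qemu_lockcnt_dec_if_lock(), i.e. when no other
 * walker holds a reference.
 */
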
static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
{
    bool progress = false;
    int revents;

    revents = node->pfd.revents & node->pfd.events;
    node->pfd.revents = 0;

    /*
     * Start polling AioHandlers when they become ready because activity is
     * likely to continue.  Note that starvation is theoretically possible when
     * fdmon_supports_polling(), but only until the fd fires for the first
     * time.
     */
    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        !QLIST_IS_INSERTED(node, node_poll) &&
        node->io_poll) {
        trace_poll_add(ctx, node, node->pfd.fd, revents);
        if (ctx->poll_started && node->io_poll_begin) {
            node->io_poll_begin(node->opaque);
        }
        QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
    }

    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
        aio_node_check(ctx, node->is_external) &&
        node->io_read) {
        node->io_read(node->opaque);

        /* aio_notify() does not count as progress */
        if (node->opaque != &ctx->notifier) {
            progress = true;
        }
    }
    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        (revents & (G_IO_OUT | G_IO_ERR)) &&
        aio_node_check(ctx, node->is_external) &&
        node->io_write) {
        node->io_write(node->opaque);
        progress = true;
    }

    return progress;
}

/*
 * If we have a list of ready handlers then this is more efficient than
 * scanning all handlers with aio_dispatch_handlers().
 */
static bool aio_dispatch_ready_handlers(AioContext *ctx,
                                        AioHandlerList *ready_list)
{
    bool progress = false;
    AioHandler *node;

    while ((node = QLIST_FIRST(ready_list))) {
        QLIST_REMOVE(node, node_ready);
        progress = aio_dispatch_handler(ctx, node) || progress;
    }

    return progress;
}

/* Slower than aio_dispatch_ready_handlers() but only used via glib */
static bool aio_dispatch_handlers(AioContext *ctx)
{
    AioHandler *node, *tmp;
    bool progress = false;

    QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
        progress = aio_dispatch_handler(ctx, node) || progress;
    }

    return progress;
}

void aio_dispatch(AioContext *ctx)
{
    qemu_lockcnt_inc(&ctx->list_lock);
    aio_bh_poll(ctx);
    aio_dispatch_handlers(ctx);
    aio_free_deleted_handlers(ctx);
    qemu_lockcnt_dec(&ctx->list_lock);

    timerlistgroup_run_timers(&ctx->tlg);
}

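/*
 * Sketch (illustrative only, not part of this file) of how a file descriptor
 * monitoring backend feeds aio_dispatch_ready_handlers(): its ->wait()
 * callback appends every fired handler to the caller's ready list via
 * aio_add_ready_handler() and returns the number of ready handlers, roughly:
 *
 *     static int my_fdmon_wait(AioContext *ctx, AioHandlerList *ready_list,
 *                              int64_t timeout)
 *     {
 *         // ... block for at most timeout ns, then for each fired fd:
 *         aio_add_ready_handler(ready_list, node, revents);
 *         // ...
 *         return nready;
 *     }
 *
 * See fdmon-poll.c, fdmon-epoll.c and fdmon-io_uring.c for the real
 * implementations selected by aio_context_setup() below.
 */
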
static bool run_poll_handlers_once(AioContext *ctx,
                                   int64_t now,
                                   int64_t *timeout)
{
    bool progress = false;
    AioHandler *node;
    AioHandler *tmp;

    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
        if (aio_node_check(ctx, node->is_external) &&
            node->io_poll(node->opaque)) {
            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;

            /*
             * Polling was successful, so exit try_poll_mode immediately
             * to adjust the next polling time.
             */
            *timeout = 0;
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }

        /* Caller handles freeing deleted nodes.  Don't do it here. */
    }

    return progress;
}

static bool fdmon_supports_polling(AioContext *ctx)
{
    return ctx->fdmon_ops->need_wait != aio_poll_disabled;
}

static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
{
    AioHandler *node;
    AioHandler *tmp;
    bool progress = false;

    /*
     * File descriptor monitoring implementations without userspace polling
     * support suffer from starvation when a subset of handlers is polled
     * because fds will not be processed in a timely fashion.  Don't remove
     * idle poll handlers.
     */
    if (!fdmon_supports_polling(ctx)) {
        return false;
    }

    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
        if (node->poll_idle_timeout == 0LL) {
            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
        } else if (now >= node->poll_idle_timeout) {
            trace_poll_remove(ctx, node, node->pfd.fd);
            node->poll_idle_timeout = 0LL;
            QLIST_SAFE_REMOVE(node, node_poll);
            if (ctx->poll_started && node->io_poll_end) {
                node->io_poll_end(node->opaque);

                /*
                 * Final poll in case ->io_poll_end() races with an event.
                 * Never mind re-adding the handler in the rare case where
                 * this causes progress.
                 */
                progress = node->io_poll(node->opaque) || progress;
            }
        }
    }

    return progress;
}

/* run_poll_handlers:
 * @ctx: the AioContext
 * @max_ns: maximum time to poll for, in nanoseconds
 *
 * Polls for a given time.
 *
 * Note that ctx->notify_me must be non-zero so this function can detect
 * aio_notify().
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
{
    bool progress;
    int64_t start_time, elapsed_time;

    assert(ctx->notify_me);
    assert(qemu_lockcnt_count(&ctx->list_lock) > 0);

    trace_run_poll_handlers_begin(ctx, max_ns, *timeout);

    /*
     * Optimization: ->io_poll() handlers often contain RCU read critical
     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
     * -> rcu_read_lock() -> ... sequences with expensive memory
     * synchronization primitives.  Make the entire polling loop an RCU
     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
     * are cheap.
     */
    RCU_READ_LOCK_GUARD();

    start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    do {
        progress = run_poll_handlers_once(ctx, start_time, timeout);
        elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
        max_ns = qemu_soonest_timeout(*timeout, max_ns);
        assert(!(max_ns && progress));
    } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));

    if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
        *timeout = 0;
        progress = true;
    }

    /* If time has passed with no successful polling, adjust *timeout to
     * keep the same ending time.
     */
    if (*timeout != -1) {
        *timeout -= MIN(*timeout, elapsed_time);
    }

    trace_run_poll_handlers_end(ctx, progress, *timeout);
    return progress;
}

/* try_poll_mode:
 * @ctx: the AioContext
 * @timeout: timeout for blocking wait, computed by the caller and updated if
 *           polling succeeds.
 *
 * ctx->notify_me must be non-zero so this function can detect aio_notify().
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
{
    int64_t max_ns;

    if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
        return false;
    }

    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
    if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
        poll_set_started(ctx, true);

        if (run_poll_handlers(ctx, max_ns, timeout)) {
            return true;
        }
    }

    if (poll_set_started(ctx, false)) {
        *timeout = 0;
        return true;
    }

    return false;
}

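/*
 * Worked example of the timeout bookkeeping above (numbers are illustrative):
 * suppose aio_poll() below is blocking with no pending timer, so *timeout is
 * -1, and the adaptive window is ctx->poll_ns = 16000.  try_poll_mode() then
 * busy-polls for at most max_ns = 16000 ns:
 *
 *   - If an ->io_poll() callback returns true, run_poll_handlers_once() sets
 *     *timeout = 0, so aio_poll() either skips the fd monitor ->wait() call
 *     entirely or calls it with a zero timeout; it never blocks.
 *
 *   - If polling makes no progress, the elapsed time is subtracted from a
 *     finite *timeout (an infinite -1 timeout is left unchanged) so the
 *     overall deadline of this event loop iteration is preserved.
 */
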
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
    int ret = 0;
    bool progress;
    int64_t timeout;
    int64_t start = 0;

    assert(in_aio_context_home_thread(ctx));

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    qemu_lockcnt_inc(&ctx->list_lock);

    if (ctx->poll_max_ns) {
        start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;
    progress = try_poll_mode(ctx, &timeout);
    assert(!(timeout && progress));

    /* If polling is allowed, non-blocking aio_poll does not need the
     * system call---a single round of run_poll_handlers_once suffices.
     */
    if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
        ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
    }

    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
        aio_notify_accept(ctx);
    }

    /* Adjust polling time */
    if (ctx->poll_max_ns) {
        int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;

        if (block_ns <= ctx->poll_ns) {
            /* This is the sweet spot, no adjustment needed */
        } else if (block_ns > ctx->poll_max_ns) {
            /* We'd have to poll for too long, poll less */
            int64_t old = ctx->poll_ns;

            if (ctx->poll_shrink) {
                ctx->poll_ns /= ctx->poll_shrink;
            } else {
                ctx->poll_ns = 0;
            }

            trace_poll_shrink(ctx, old, ctx->poll_ns);
        } else if (ctx->poll_ns < ctx->poll_max_ns &&
                   block_ns < ctx->poll_max_ns) {
            /* There is room to grow, poll longer */
            int64_t old = ctx->poll_ns;
            int64_t grow = ctx->poll_grow;

            if (grow == 0) {
                grow = 2;
            }

            if (ctx->poll_ns) {
                ctx->poll_ns *= grow;
            } else {
                ctx->poll_ns = 4000; /* start polling at 4 microseconds */
            }

            if (ctx->poll_ns > ctx->poll_max_ns) {
                ctx->poll_ns = ctx->poll_max_ns;
            }

            trace_poll_grow(ctx, old, ctx->poll_ns);
        }
    }

    progress |= aio_bh_poll(ctx);

    if (ret > 0) {
        progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
    }

    aio_free_deleted_handlers(ctx);

    qemu_lockcnt_dec(&ctx->list_lock);

    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

void aio_context_setup(AioContext *ctx)
{
    ctx->fdmon_ops = &fdmon_poll_ops;
    ctx->epollfd = -1;

    /* Use the fastest fd monitoring implementation if available */
    if (fdmon_io_uring_setup(ctx)) {
        return;
    }

    fdmon_epoll_setup(ctx);
}

void aio_context_destroy(AioContext *ctx)
{
    fdmon_io_uring_destroy(ctx);
    fdmon_epoll_disable(ctx);
}

void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                 int64_t grow, int64_t shrink, Error **errp)
{
    /* No thread synchronization here, it doesn't matter if an incorrect value
     * is used once.
     */
    ctx->poll_max_ns = max_ns;
    ctx->poll_ns = 0;
    ctx->poll_grow = grow;
    ctx->poll_shrink = shrink;

    aio_notify(ctx);
}
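
/*
 * Worked example of the self-tuning polling window in aio_poll() (numbers are
 * illustrative): with poll_max_ns = 32768, poll_ns starts at 0 and no
 * busy-polling happens.  When a blocking iteration takes longer than the
 * current window but less than poll_max_ns, poll_ns is bumped to 4000 ns and
 * then multiplied by the grow factor (default 2) on later such iterations:
 * 4000, 8000, 16000, capped at 32768.  When an iteration blocks for longer
 * than poll_max_ns, poll_ns is divided by poll_shrink, or reset to 0 if
 * poll_shrink is 0.
 *
 * These values reach aio_context_set_poll_params() from the IOThread
 * poll-max-ns, poll-grow and poll-shrink properties, e.g. (shown only as an
 * illustration):
 *
 *     -object iothread,id=iothread0,poll-max-ns=32768,poll-grow=2
 */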