/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h"
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
#include "sysemu/block-backend.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */

#include "block/block-io.h"

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    uint64_t mem_region_alignment;

    /* Can we skip adding/deleting blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context, s->completion_fd,
                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready, bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
                       NULL, NULL, NULL);
}

/*
 * Called by blk_io_unplug() or immediately if not plugged. Called without
 * blkio_lock.
 */
static void blkio_unplug_fn(void *opaque)
{
    BDRVBlkioState *s = opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

/*
 * Schedule I/O submission after enqueuing a new request. Called without
 * blkio_lock.
 */
static void blkio_submit_io(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    blk_io_plug_call(blkio_unplug_fn, s);
}

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

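/*
 * The write path mirrors blkio_co_preadv(): when the libblkio driver requires
 * mapped memory regions and the request buffer is not registered, data is
 * copied into a bounce buffer from the pool before the request is enqueued.
 */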
static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
                                         int64_t bytes, QEMUIOVector *qiov,
                                         BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so mr lives at least
         * until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

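/*
 * Open helper shared by the virtio-blk-vfio-pci, virtio-blk-vhost-user, and
 * virtio-blk-vhost-vdpa drivers. All of them take a "path" option and require
 * cache.direct=on. An invocation might look like this (illustrative only; the
 * exact option names are defined by the QAPI schema):
 *
 *   --blockdev driver=virtio-blk-vhost-vdpa,node-name=drv0,path=/dev/vhost-vdpa-0,cache.direct=on
 */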
static int blkio_virtio_blk_common_open(BlockDriverState *bs,
                                        QDict *options, int flags,
                                        Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    bool fd_supported = false;
    int fd, ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
        fd_supported = true;
    }

    /*
     * If the libblkio driver supports fd passing, let's always use qemu_open()
     * to open the `path`, so we can handle fd passing from the management
     * layer through the "/dev/fdset/N" special path.
     */
    if (fd_supported) {
        int open_flags;

        if (flags & BDRV_O_RDWR) {
            open_flags = O_RDWR;
        } else {
            open_flags = O_RDONLY;
        }

        fd = qemu_open(path, open_flags, errp);
        if (fd < 0) {
            return -EINVAL;
        }

        ret = blkio_set_int(s->blkio, "fd", fd);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set fd: %s",
                             blkio_get_error_msg());
            qemu_close(fd);
            return ret;
        }
    } else {
        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    qdict_del(options, "path");

    return 0;
}

static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, "io_uring") == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Notify if libblkio drivers pin memory and prevent features like
     * virtio-mem from working.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

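/*
 * Tear down what blkio_file_open() set up: detach the completion fd handler,
 * destroy the libblkio instance, and re-enable RAM discard if it was disabled
 * because the driver may pin memory regions.
 */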
static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        return ret;
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

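/*
 * Translate the libblkio "request-alignment", "optimal-io-size",
 * "max-transfer", "buf-alignment", "optimal-buf-alignment", and
 * "max-segments" properties into QEMU BlockLimits. Inconsistent values
 * reported by the driver are rejected with an error rather than clamped.
 */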
static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

/*
 * Do not include .format_name and .protocol_name because module_block.py
 * does not parse macros in the source code.
 */
#define BLKIO_DRIVER_COMMON \
    .instance_size           = sizeof(BDRVBlkioState), \
    .bdrv_file_open          = blkio_file_open, \
    .bdrv_close              = blkio_close, \
    .bdrv_co_getlength       = blkio_co_getlength, \
    .bdrv_co_truncate        = blkio_truncate, \
    .bdrv_co_get_info        = blkio_co_get_info, \
    .bdrv_attach_aio_context = blkio_attach_aio_context, \
    .bdrv_detach_aio_context = blkio_detach_aio_context, \
    .bdrv_co_pdiscard        = blkio_co_pdiscard, \
    .bdrv_co_preadv          = blkio_co_preadv, \
    .bdrv_co_pwritev         = blkio_co_pwritev, \
    .bdrv_co_flush_to_disk   = blkio_co_flush, \
    .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
    .bdrv_refresh_limits     = blkio_refresh_limits, \
    .bdrv_register_buf       = blkio_register_buf, \
    .bdrv_unregister_buf     = blkio_unregister_buf,

/*
 * Use the same .format_name and .protocol_name as the libblkio driver name for
 * consistency.
 */

static BlockDriver bdrv_io_uring = {
    .format_name         = "io_uring",
    .protocol_name       = "io_uring",
    .bdrv_needs_filename = true,
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_nvme_io_uring = {
    .format_name   = "nvme-io_uring",
    .protocol_name = "nvme-io_uring",
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_virtio_blk_vfio_pci = {
    .format_name   = "virtio-blk-vfio-pci",
    .protocol_name = "virtio-blk-vfio-pci",
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_virtio_blk_vhost_user = {
    .format_name   = "virtio-blk-vhost-user",
    .protocol_name = "virtio-blk-vhost-user",
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_virtio_blk_vhost_vdpa = {
    .format_name   = "virtio-blk-vhost-vdpa",
    .protocol_name = "virtio-blk-vhost-vdpa",
    BLKIO_DRIVER_COMMON
};

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);