/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h"
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
#include "sysemu/block-backend.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */

#include "block/block-io.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /* Can we skip adding/deleting blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context, s->completion_fd,
                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready, bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
                       NULL, NULL, NULL);
}

/*
 * Called by blk_io_unplug() or immediately if not plugged. Called without
 * blkio_lock.
 */
static void blkio_unplug_fn(void *opaque)
{
    BDRVBlkioState *s = opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

/*
 * Schedule I/O submission after enqueuing a new request. Called without
 * blkio_lock.
 */
static void blkio_submit_io(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    blk_io_plug_call(blkio_unplug_fn, s);
}

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so mr lives at least
         * until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

static int blkio_virtio_blk_common_open(BlockDriverState *bs,
        QDict *options, int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    bool fd_supported = false;
    int fd, ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
        fd_supported = true;
    }

    /*
     * If the libblkio driver supports fd passing, let's always use qemu_open()
     * to open the `path`, so we can handle fd passing from the management
     * layer through the "/dev/fdset/N" special path.
     */
    if (fd_supported) {
        int open_flags;

        if (flags & BDRV_O_RDWR) {
            open_flags = O_RDWR;
        } else {
            open_flags = O_RDONLY;
        }

        fd = qemu_open(path, open_flags, errp);
        if (fd < 0) {
            return -EINVAL;
        }

        ret = blkio_set_int(s->blkio, "fd", fd);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set fd: %s",
                             blkio_get_error_msg());
            qemu_close(fd);
            return ret;
        }
    } else {
        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    qdict_del(options, "path");

    return 0;
}

static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Notify if libblkio drivers pin memory and prevent features like
     * virtio-mem from working.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        return -ret;
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name             = name, \
        .protocol_name           = name, \
        .instance_size           = sizeof(BDRVBlkioState), \
        .bdrv_file_open          = blkio_file_open, \
        .bdrv_close              = blkio_close, \
        .bdrv_co_getlength       = blkio_co_getlength, \
        .bdrv_co_truncate        = blkio_truncate, \
        .bdrv_co_get_info        = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard        = blkio_co_pdiscard, \
        .bdrv_co_preadv          = blkio_co_preadv, \
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
        __VA_ARGS__ \
    }

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);
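
/*
 * Illustrative usage notes (not part of the driver code): a sketch of how the
 * BlockDrivers registered above are typically selected, derived from the
 * options the open functions parse ("filename" for io_uring, "path" for the
 * other drivers, and cache.direct=on where O_DIRECT is required). The image
 * and device paths below are placeholders; the exact --blockdev syntax may
 * vary between QEMU versions.
 *
 *   qemu-system-x86_64 ... \
 *       --blockdev node-name=drv0,driver=io_uring,filename=disk.img,cache.direct=on
 *
 *   qemu-system-x86_64 ... \
 *       --blockdev node-name=drv0,driver=nvme-io_uring,path=/dev/ng0n1,cache.direct=on
 *
 *   qemu-system-x86_64 ... \
 *       --blockdev node-name=drv0,driver=virtio-blk-vhost-vdpa,path=/dev/vhost-vdpa-0,cache.direct=on
 */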