/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"

#include "block/block-io.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"

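/*
 * Illustrative usage, not part of the driver: a libblkio-backed node is
 * selected by BlockDriver name. A sketch, assuming a QEMU build with
 * libblkio support; the image path is hypothetical:
 *
 *   qemu-system-x86_64 \
 *       --blockdev io_uring,node-name=drv0,filename=/tmp/test.img,cache.direct=on \
 *       --device virtio-blk-pci,drive=drv0
 */
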
/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /* Must I/O buffers be registered as blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

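/*
 * Example pool state during the first-fit search above (illustrative):
 *
 *   pool.addr                                    pool.addr + pool.len
 *   |==bufA==|--hole--|==bufB==|----------hole----------|
 *
 * "addr" starts at pool.addr and jumps past the end of each allocated
 * buffer in address order, so every hole between the sorted buffers is
 * tried in turn; the hole after the last buffer is tried after the loop.
 */
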
static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

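/*
 * Fairness example for the waiting logic above (illustrative): if
 * coroutines A and B are queued on bounce_available and C frees a buffer,
 * qemu_co_queue_next() wakes only A. If A still cannot allocate, it waits
 * again with CO_QUEUE_WAIT_FRONT and so keeps its place ahead of B.
 */
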
/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context, s->completion_fd,
                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready, bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
                       NULL, NULL, NULL);
}

/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
static void blkio_submit_io(BlockDriverState *bs)
{
    if (qatomic_read(&bs->io_plugged) == 0) {
        BDRVBlkioState *s = bs->opaque;

        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

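/*
 * The blkio_co_*() request functions below all follow the same pattern
 * (sketch of the steps, using flush as the example):
 *
 *   BlkioCoData cod = { .coroutine = qemu_coroutine_self() };
 *
 *   WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 *       blkioq_flush(s->blkioq, &cod, 0);   // enqueue with cod as user_data
 *       blkio_submit_io(bs);                // submit unless plugged
 *   }
 *   qemu_coroutine_yield();                 // blkio_completion_fd_read()
 *   return cod.ret;                         // fills cod.ret and wakes us
 */
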
static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
                                         int64_t bytes, QEMUIOVector *qiov,
                                         BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_submit_io(bs);
    }
}

typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so the RAMBlock
         * lives at least until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

static int blkio_virtio_blk_common_open(BlockDriverState *bs,
                                        QDict *options, int flags,
                                        Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }
    return 0;
}

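/*
 * Illustrative blockdev-add command for one of the virtio-blk drivers (a
 * sketch; the vhost-vdpa device path is hypothetical, and cache.direct=on
 * is required by the check above):
 *
 *   { "execute": "blockdev-add",
 *     "arguments": {
 *         "driver": "virtio-blk-vhost-vdpa",
 *         "node-name": "drv0",
 *         "path": "/dev/vhost-vdpa-0",
 *         "cache": { "direct": true }
 *     } }
 */
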
static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * If the libblkio driver may pin memory regions, disable RAM block
     * discards. Pinned pages cannot be discarded, so this conflicts with
     * features like virtio-mem that rely on discarding RAM.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        return ret; /* negative errno from libblkio */
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name = name, \
        .protocol_name = name, \
        .instance_size = sizeof(BDRVBlkioState), \
        .bdrv_file_open = blkio_file_open, \
        .bdrv_close = blkio_close, \
        .bdrv_co_getlength = blkio_co_getlength, \
        .bdrv_co_truncate = blkio_truncate, \
        .bdrv_co_get_info = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard = blkio_co_pdiscard, \
        .bdrv_co_preadv = blkio_co_preadv, \
        .bdrv_co_pwritev = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \
        .bdrv_co_io_unplug = blkio_co_io_unplug, \
        .bdrv_refresh_limits = blkio_refresh_limits, \
        .bdrv_register_buf = blkio_register_buf, \
        .bdrv_unregister_buf = blkio_unregister_buf, \
        __VA_ARGS__ \
    }

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);