/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"
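/*
 * Example invocations (illustrative only; the image path and device nodes
 * below are hypothetical). The "filename"/"path" options and the
 * cache.direct requirements match the *_open() functions later in this file:
 *
 *   qemu-system-x86_64 --blockdev driver=io_uring,node-name=drv0,filename=test.img
 *   qemu-system-x86_64 --blockdev driver=nvme-io_uring,node-name=drv0,path=/dev/ng0n1,cache.direct=on
 *   qemu-system-x86_64 --blockdev driver=virtio-blk-vhost-vdpa,node-name=drv0,path=/dev/vhost-vdpa-0,cache.direct=on
 */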
/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /* Can we skip adding/deleting blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}
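/*
 * Bounce buffer allocation sketch (illustrative only, the layout below is
 * hypothetical): blkio_do_alloc_bounce_buffer() does a first-fit scan over
 * the address-sorted ->bounce_bufs list, e.g.:
 *
 *   bounce_pool.addr                                 bounce_pool.addr + len
 *   |<- buf A ->|<--- hole --->|<- buf B ->|<-------- free tail -------->|
 *
 * A request takes the first hole (or the free tail) that is large enough.
 * If nothing fits, the coroutine waits on ->bounce_available; if nothing is
 * in flight at all, the pool itself is grown via blkio_resize_bounce_pool().
 */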
/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}
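/*
 * Completion path sketch (descriptive only): each request coroutine fills in
 * a BlkioCoData with itself, enqueues the request on s->blkioq under
 * s->blkio_lock, submits via blkio_submit_io(), and yields. The completion fd
 * handler below (or blkio_completion_fd_poll() when the AioContext is
 * polling) then fetches the completion, stores completion.ret in
 * BlkioCoData.ret, and resumes the coroutine with aio_co_wake().
 */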
/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context,
                       s->completion_fd,
                       false,
                       blkio_completion_fd_read,
                       NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready,
                       bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs),
                       s->completion_fd,
                       false, NULL, NULL, NULL, NULL, NULL);
}

/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
static void blkio_submit_io(BlockDriverState *bs)
{
    if (qatomic_read(&bs->io_plugged) == 0) {
        BDRVBlkioState *s = bs->opaque;

        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static void blkio_io_unplug(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_submit_io(bs);
    }
}

typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so mr lives at least
         * until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

static int blkio_virtio_blk_common_open(BlockDriverState *bs,
        QDict *options, int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }
    return 0;
}

static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Notify if libblkio drivers pin memory and prevent features like
     * virtio-mem from working.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t blkio_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        /* Propagate the negative errno value as the error return */
        return ret;
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int blkio_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name = name, \
        .protocol_name = name, \
        .has_variable_length = true, \
        .instance_size = sizeof(BDRVBlkioState), \
        .bdrv_file_open = blkio_file_open, \
        .bdrv_close = blkio_close, \
        .bdrv_getlength = blkio_getlength, \
        .bdrv_co_truncate = blkio_truncate, \
        .bdrv_get_info = blkio_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard = blkio_co_pdiscard, \
        .bdrv_co_preadv = blkio_co_preadv, \
        .bdrv_co_pwritev = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \
        .bdrv_io_unplug = blkio_io_unplug, \
        .bdrv_refresh_limits = blkio_refresh_limits, \
        .bdrv_register_buf = blkio_register_buf, \
        .bdrv_unregister_buf = blkio_unregister_buf, \
        __VA_ARGS__ \
    }

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);