/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h"
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */

#include "block/block-io.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /* Must I/O buffers reside within registered blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

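/*
 * Completion fd read handler: drain the eventfd and wake the coroutines whose
 * requests have completed. Runs in the AioContext the completion fd is
 * attached to.
 */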
static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

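/* AioContext poll handler: check for a completion without blocking */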
static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context,
                       s->completion_fd,
                       false,
                       blkio_completion_fd_read,
                       NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready,
                       bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs),
                       s->completion_fd,
                       false, NULL, NULL, NULL, NULL, NULL);
}

/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
static void blkio_submit_io(BlockDriverState *bs)
{
    if (qatomic_read(&bs->io_plugged) == 0) {
        BDRVBlkioState *s = bs->opaque;

        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_submit_io(bs);
    }
}

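/*
 * Outcome of blkio_mem_region_from_host():
 * BMRR_OK   - a usable memory region was produced
 * BMRR_SKIP - the buffer cannot be described as a memory region; skip it
 *             (not an error)
 * BMRR_FAIL - an error occurred and errp has been set
 */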
typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so the RAMBlock
         * lives at least until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

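/* Shared open logic for the virtio-blk-* libblkio drivers */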
static int blkio_virtio_blk_common_open(BlockDriverState *bs,
        QDict *options, int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }
    return 0;
}

static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Notify if libblkio drivers pin memory and prevent features like
     * virtio-mem from working.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        /* libblkio returns a negative errno value, pass it through */
        return ret;
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

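/* There is no driver-specific BlockDriverInfo to report */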
static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

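/*
 * Example --blockdev usage for the drivers defined below (illustrative sketch
 * only; the authoritative option names come from the QAPI schema and the
 * device paths shown here are made up):
 *
 *   --blockdev io_uring,node-name=drv0,filename=test.img,cache.direct=on
 *   --blockdev nvme-io_uring,node-name=drv0,path=/dev/ng0n1,cache.direct=on
 *   --blockdev virtio-blk-vhost-vdpa,node-name=drv0,path=/dev/vhost-vdpa-0,cache.direct=on
 */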
#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name = name, \
        .protocol_name = name, \
        .instance_size = sizeof(BDRVBlkioState), \
        .bdrv_file_open = blkio_file_open, \
        .bdrv_close = blkio_close, \
        .bdrv_co_getlength = blkio_co_getlength, \
        .bdrv_co_truncate = blkio_truncate, \
        .bdrv_co_get_info = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard = blkio_co_pdiscard, \
        .bdrv_co_preadv = blkio_co_preadv, \
        .bdrv_co_pwritev = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \
        .bdrv_co_io_unplug = blkio_co_io_unplug, \
        .bdrv_refresh_limits = blkio_refresh_limits, \
        .bdrv_register_buf = blkio_register_buf, \
        .bdrv_unregister_buf = blkio_unregister_buf, \
        __VA_ARGS__ \
    }

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);