1 /* SPDX-License-Identifier: LGPL-2.1-or-later */ 2 /* 3 * libblkio BlockDriver 4 * 5 * Copyright Red Hat, Inc. 6 * 7 * Author: 8 * Stefan Hajnoczi <stefanha@redhat.com> 9 */ 10 11 #include "qemu/osdep.h" 12 #include <blkio.h> 13 #include "block/block_int.h" 14 #include "exec/memory.h" 15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */ 16 #include "qapi/error.h" 17 #include "qemu/error-report.h" 18 #include "qapi/qmp/qdict.h" 19 #include "qemu/module.h" 20 #include "sysemu/block-backend.h" 21 #include "exec/memory.h" /* for ram_block_discard_disable() */ 22 23 #include "block/block-io.h" 24 25 /* 26 * Allocated bounce buffers are kept in a list sorted by buffer address. 27 */ 28 typedef struct BlkioBounceBuf { 29 QLIST_ENTRY(BlkioBounceBuf) next; 30 31 /* The bounce buffer */ 32 struct iovec buf; 33 } BlkioBounceBuf; 34 35 typedef struct { 36 /* 37 * libblkio is not thread-safe so this lock protects ->blkio and 38 * ->blkioq. 39 */ 40 QemuMutex blkio_lock; 41 struct blkio *blkio; 42 struct blkioq *blkioq; /* make this multi-queue in the future... */ 43 int completion_fd; 44 45 /* 46 * Polling fetches the next completion into this field. 47 * 48 * No lock is necessary since only one thread calls aio_poll() and invokes 49 * fd and poll handlers. 50 */ 51 struct blkio_completion poll_completion; 52 53 /* 54 * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available. 55 * 56 * Lock ordering: ->bounce_lock before ->blkio_lock. 57 */ 58 CoMutex bounce_lock; 59 60 /* Bounce buffer pool */ 61 struct blkio_mem_region bounce_pool; 62 63 /* Sorted list of allocated bounce buffers */ 64 QLIST_HEAD(, BlkioBounceBuf) bounce_bufs; 65 66 /* Queue for coroutines waiting for bounce buffer space */ 67 CoQueue bounce_available; 68 69 /* The value of the "mem-region-alignment" property */ 70 size_t mem_region_alignment; 71 72 /* Can we skip adding/deleting blkio_mem_regions? */ 73 bool needs_mem_regions; 74 75 /* Are file descriptors necessary for blkio_mem_regions? */ 76 bool needs_mem_region_fd; 77 78 /* Are madvise(MADV_DONTNEED)-style operations unavailable? */ 79 bool may_pin_mem_regions; 80 } BDRVBlkioState; 81 82 /* Called with s->bounce_lock held */ 83 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) 84 { 85 /* There can be no allocated bounce buffers during resize */ 86 assert(QLIST_EMPTY(&s->bounce_bufs)); 87 88 /* Pad size to reduce frequency of resize calls */ 89 bytes += 128 * 1024; 90 91 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 92 int ret; 93 94 if (s->bounce_pool.addr) { 95 blkio_unmap_mem_region(s->blkio, &s->bounce_pool); 96 blkio_free_mem_region(s->blkio, &s->bounce_pool); 97 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 98 } 99 100 /* Automatically freed when s->blkio is destroyed */ 101 ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes); 102 if (ret < 0) { 103 return ret; 104 } 105 106 ret = blkio_map_mem_region(s->blkio, &s->bounce_pool); 107 if (ret < 0) { 108 blkio_free_mem_region(s->blkio, &s->bounce_pool); 109 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 110 return ret; 111 } 112 } 113 114 return 0; 115 } 116 117 /* Called with s->bounce_lock held */ 118 static bool 119 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 120 int64_t bytes) 121 { 122 void *addr = s->bounce_pool.addr; 123 BlkioBounceBuf *cur = NULL; 124 BlkioBounceBuf *prev = NULL; 125 ptrdiff_t space; 126 127 /* 128 * This is just a linear search over the holes between requests. An 129 * efficient allocator would be nice. 130 */ 131 QLIST_FOREACH(cur, &s->bounce_bufs, next) { 132 space = cur->buf.iov_base - addr; 133 if (bytes <= space) { 134 QLIST_INSERT_BEFORE(cur, bounce, next); 135 bounce->buf.iov_base = addr; 136 bounce->buf.iov_len = bytes; 137 return true; 138 } 139 140 addr = cur->buf.iov_base + cur->buf.iov_len; 141 prev = cur; 142 } 143 144 /* Is there space after the last request? */ 145 space = s->bounce_pool.addr + s->bounce_pool.len - addr; 146 if (bytes > space) { 147 return false; 148 } 149 if (prev) { 150 QLIST_INSERT_AFTER(prev, bounce, next); 151 } else { 152 QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next); 153 } 154 bounce->buf.iov_base = addr; 155 bounce->buf.iov_len = bytes; 156 return true; 157 } 158 159 static int coroutine_fn 160 blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 161 int64_t bytes) 162 { 163 /* 164 * Ensure fairness: first time around we join the back of the queue, 165 * subsequently we join the front so we don't lose our place. 166 */ 167 CoQueueWaitFlags wait_flags = 0; 168 169 QEMU_LOCK_GUARD(&s->bounce_lock); 170 171 /* Ensure fairness: don't even try if other requests are already waiting */ 172 if (!qemu_co_queue_empty(&s->bounce_available)) { 173 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 174 wait_flags); 175 wait_flags = CO_QUEUE_WAIT_FRONT; 176 } 177 178 while (true) { 179 if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) { 180 /* Kick the next queued request since there may be space */ 181 qemu_co_queue_next(&s->bounce_available); 182 return 0; 183 } 184 185 /* 186 * If there are no in-flight requests then the pool was simply too 187 * small. 188 */ 189 if (QLIST_EMPTY(&s->bounce_bufs)) { 190 bool ok; 191 int ret; 192 193 ret = blkio_resize_bounce_pool(s, bytes); 194 if (ret < 0) { 195 /* Kick the next queued request since that may fail too */ 196 qemu_co_queue_next(&s->bounce_available); 197 return ret; 198 } 199 200 ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes); 201 assert(ok); /* must have space this time */ 202 return 0; 203 } 204 205 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 206 wait_flags); 207 wait_flags = CO_QUEUE_WAIT_FRONT; 208 } 209 } 210 211 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s, 212 BlkioBounceBuf *bounce) 213 { 214 QEMU_LOCK_GUARD(&s->bounce_lock); 215 216 QLIST_REMOVE(bounce, next); 217 218 /* Wake up waiting coroutines since space may now be available */ 219 qemu_co_queue_next(&s->bounce_available); 220 } 221 222 /* For async to .bdrv_co_*() conversion */ 223 typedef struct { 224 Coroutine *coroutine; 225 int ret; 226 } BlkioCoData; 227 228 static void blkio_completion_fd_read(void *opaque) 229 { 230 BlockDriverState *bs = opaque; 231 BDRVBlkioState *s = bs->opaque; 232 uint64_t val; 233 int ret; 234 235 /* Polling may have already fetched a completion */ 236 if (s->poll_completion.user_data != NULL) { 237 BlkioCoData *cod = s->poll_completion.user_data; 238 cod->ret = s->poll_completion.ret; 239 240 /* Clear it in case aio_co_wake() enters a nested event loop */ 241 s->poll_completion.user_data = NULL; 242 243 aio_co_wake(cod->coroutine); 244 } 245 246 /* Reset completion fd status */ 247 ret = read(s->completion_fd, &val, sizeof(val)); 248 249 /* Ignore errors, there's nothing we can do */ 250 (void)ret; 251 252 /* 253 * Reading one completion at a time makes nested event loop re-entrancy 254 * simple. Change this loop to get multiple completions in one go if it 255 * becomes a performance bottleneck. 256 */ 257 while (true) { 258 struct blkio_completion completion; 259 260 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 261 ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL); 262 } 263 if (ret != 1) { 264 break; 265 } 266 267 BlkioCoData *cod = completion.user_data; 268 cod->ret = completion.ret; 269 aio_co_wake(cod->coroutine); 270 } 271 } 272 273 static bool blkio_completion_fd_poll(void *opaque) 274 { 275 BlockDriverState *bs = opaque; 276 BDRVBlkioState *s = bs->opaque; 277 int ret; 278 279 /* Just in case we already fetched a completion */ 280 if (s->poll_completion.user_data != NULL) { 281 return true; 282 } 283 284 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 285 ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL); 286 } 287 return ret == 1; 288 } 289 290 static void blkio_completion_fd_poll_ready(void *opaque) 291 { 292 blkio_completion_fd_read(opaque); 293 } 294 295 static void blkio_attach_aio_context(BlockDriverState *bs, 296 AioContext *new_context) 297 { 298 BDRVBlkioState *s = bs->opaque; 299 300 aio_set_fd_handler(new_context, s->completion_fd, 301 blkio_completion_fd_read, NULL, 302 blkio_completion_fd_poll, 303 blkio_completion_fd_poll_ready, bs); 304 } 305 306 static void blkio_detach_aio_context(BlockDriverState *bs) 307 { 308 BDRVBlkioState *s = bs->opaque; 309 310 aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL, 311 NULL, NULL, NULL); 312 } 313 314 /* 315 * Called by blk_io_unplug() or immediately if not plugged. Called without 316 * blkio_lock. 317 */ 318 static void blkio_unplug_fn(void *opaque) 319 { 320 BDRVBlkioState *s = opaque; 321 322 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 323 blkioq_do_io(s->blkioq, NULL, 0, 0, NULL); 324 } 325 } 326 327 /* 328 * Schedule I/O submission after enqueuing a new request. Called without 329 * blkio_lock. 330 */ 331 static void blkio_submit_io(BlockDriverState *bs) 332 { 333 BDRVBlkioState *s = bs->opaque; 334 335 blk_io_plug_call(blkio_unplug_fn, s); 336 } 337 338 static int coroutine_fn 339 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) 340 { 341 BDRVBlkioState *s = bs->opaque; 342 BlkioCoData cod = { 343 .coroutine = qemu_coroutine_self(), 344 }; 345 346 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 347 blkioq_discard(s->blkioq, offset, bytes, &cod, 0); 348 } 349 350 blkio_submit_io(bs); 351 qemu_coroutine_yield(); 352 return cod.ret; 353 } 354 355 static int coroutine_fn 356 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, 357 QEMUIOVector *qiov, BdrvRequestFlags flags) 358 { 359 BlkioCoData cod = { 360 .coroutine = qemu_coroutine_self(), 361 }; 362 BDRVBlkioState *s = bs->opaque; 363 bool use_bounce_buffer = 364 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 365 BlkioBounceBuf bounce; 366 struct iovec *iov = qiov->iov; 367 int iovcnt = qiov->niov; 368 369 if (use_bounce_buffer) { 370 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 371 if (ret < 0) { 372 return ret; 373 } 374 375 iov = &bounce.buf; 376 iovcnt = 1; 377 } 378 379 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 380 blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0); 381 } 382 383 blkio_submit_io(bs); 384 qemu_coroutine_yield(); 385 386 if (use_bounce_buffer) { 387 if (cod.ret == 0) { 388 qemu_iovec_from_buf(qiov, 0, 389 bounce.buf.iov_base, 390 bounce.buf.iov_len); 391 } 392 393 blkio_free_bounce_buffer(s, &bounce); 394 } 395 396 return cod.ret; 397 } 398 399 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset, 400 int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) 401 { 402 uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0; 403 BlkioCoData cod = { 404 .coroutine = qemu_coroutine_self(), 405 }; 406 BDRVBlkioState *s = bs->opaque; 407 bool use_bounce_buffer = 408 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 409 BlkioBounceBuf bounce; 410 struct iovec *iov = qiov->iov; 411 int iovcnt = qiov->niov; 412 413 if (use_bounce_buffer) { 414 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 415 if (ret < 0) { 416 return ret; 417 } 418 419 qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes); 420 iov = &bounce.buf; 421 iovcnt = 1; 422 } 423 424 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 425 blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags); 426 } 427 428 blkio_submit_io(bs); 429 qemu_coroutine_yield(); 430 431 if (use_bounce_buffer) { 432 blkio_free_bounce_buffer(s, &bounce); 433 } 434 435 return cod.ret; 436 } 437 438 static int coroutine_fn blkio_co_flush(BlockDriverState *bs) 439 { 440 BDRVBlkioState *s = bs->opaque; 441 BlkioCoData cod = { 442 .coroutine = qemu_coroutine_self(), 443 }; 444 445 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 446 blkioq_flush(s->blkioq, &cod, 0); 447 } 448 449 blkio_submit_io(bs); 450 qemu_coroutine_yield(); 451 return cod.ret; 452 } 453 454 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs, 455 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 456 { 457 BDRVBlkioState *s = bs->opaque; 458 BlkioCoData cod = { 459 .coroutine = qemu_coroutine_self(), 460 }; 461 uint32_t blkio_flags = 0; 462 463 if (flags & BDRV_REQ_FUA) { 464 blkio_flags |= BLKIO_REQ_FUA; 465 } 466 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 467 blkio_flags |= BLKIO_REQ_NO_UNMAP; 468 } 469 if (flags & BDRV_REQ_NO_FALLBACK) { 470 blkio_flags |= BLKIO_REQ_NO_FALLBACK; 471 } 472 473 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 474 blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags); 475 } 476 477 blkio_submit_io(bs); 478 qemu_coroutine_yield(); 479 return cod.ret; 480 } 481 482 typedef enum { 483 BMRR_OK, 484 BMRR_SKIP, 485 BMRR_FAIL, 486 } BlkioMemRegionResult; 487 488 /* 489 * Produce a struct blkio_mem_region for a given address and size. 490 * 491 * This function produces identical results when called multiple times with the 492 * same arguments. This property is necessary because blkio_unmap_mem_region() 493 * must receive the same struct blkio_mem_region field values that were passed 494 * to blkio_map_mem_region(). 495 */ 496 static BlkioMemRegionResult 497 blkio_mem_region_from_host(BlockDriverState *bs, 498 void *host, size_t size, 499 struct blkio_mem_region *region, 500 Error **errp) 501 { 502 BDRVBlkioState *s = bs->opaque; 503 int fd = -1; 504 ram_addr_t fd_offset = 0; 505 506 if (((uintptr_t)host | size) % s->mem_region_alignment) { 507 error_setg(errp, "unaligned buf %p with size %zu", host, size); 508 return BMRR_FAIL; 509 } 510 511 /* Attempt to find the fd for the underlying memory */ 512 if (s->needs_mem_region_fd) { 513 RAMBlock *ram_block; 514 RAMBlock *end_block; 515 ram_addr_t offset; 516 517 /* 518 * bdrv_register_buf() is called with the BQL held so mr lives at least 519 * until this function returns. 520 */ 521 ram_block = qemu_ram_block_from_host(host, false, &fd_offset); 522 if (ram_block) { 523 fd = qemu_ram_get_fd(ram_block); 524 } 525 if (fd == -1) { 526 /* 527 * Ideally every RAMBlock would have an fd. pc-bios and other 528 * things don't. Luckily they are usually not I/O buffers and we 529 * can just ignore them. 530 */ 531 return BMRR_SKIP; 532 } 533 534 /* Make sure the fd covers the entire range */ 535 end_block = qemu_ram_block_from_host(host + size - 1, false, &offset); 536 if (ram_block != end_block) { 537 error_setg(errp, "registered buffer at %p with size %zu extends " 538 "beyond RAMBlock", host, size); 539 return BMRR_FAIL; 540 } 541 } 542 543 *region = (struct blkio_mem_region){ 544 .addr = host, 545 .len = size, 546 .fd = fd, 547 .fd_offset = fd_offset, 548 }; 549 return BMRR_OK; 550 } 551 552 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size, 553 Error **errp) 554 { 555 BDRVBlkioState *s = bs->opaque; 556 struct blkio_mem_region region; 557 BlkioMemRegionResult region_result; 558 int ret; 559 560 /* 561 * Mapping memory regions conflicts with RAM discard (virtio-mem) when 562 * there is pinning, so only do it when necessary. 563 */ 564 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 565 return true; 566 } 567 568 region_result = blkio_mem_region_from_host(bs, host, size, ®ion, errp); 569 if (region_result == BMRR_SKIP) { 570 return true; 571 } else if (region_result != BMRR_OK) { 572 return false; 573 } 574 575 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 576 ret = blkio_map_mem_region(s->blkio, ®ion); 577 } 578 579 if (ret < 0) { 580 error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s", 581 host, size, blkio_get_error_msg()); 582 return false; 583 } 584 return true; 585 } 586 587 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size) 588 { 589 BDRVBlkioState *s = bs->opaque; 590 struct blkio_mem_region region; 591 592 /* See blkio_register_buf() */ 593 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 594 return; 595 } 596 597 if (blkio_mem_region_from_host(bs, host, size, ®ion, NULL) != BMRR_OK) { 598 return; 599 } 600 601 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 602 blkio_unmap_mem_region(s->blkio, ®ion); 603 } 604 } 605 606 static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options, 607 int flags, Error **errp) 608 { 609 const char *filename = qdict_get_str(options, "filename"); 610 BDRVBlkioState *s = bs->opaque; 611 int ret; 612 613 ret = blkio_set_str(s->blkio, "path", filename); 614 qdict_del(options, "filename"); 615 if (ret < 0) { 616 error_setg_errno(errp, -ret, "failed to set path: %s", 617 blkio_get_error_msg()); 618 return ret; 619 } 620 621 if (flags & BDRV_O_NOCACHE) { 622 ret = blkio_set_bool(s->blkio, "direct", true); 623 if (ret < 0) { 624 error_setg_errno(errp, -ret, "failed to set direct: %s", 625 blkio_get_error_msg()); 626 return ret; 627 } 628 } 629 630 ret = blkio_connect(s->blkio); 631 if (ret < 0) { 632 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 633 blkio_get_error_msg()); 634 return ret; 635 } 636 637 return 0; 638 } 639 640 static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options, 641 int flags, Error **errp) 642 { 643 const char *path = qdict_get_try_str(options, "path"); 644 BDRVBlkioState *s = bs->opaque; 645 int ret; 646 647 if (!path) { 648 error_setg(errp, "missing 'path' option"); 649 return -EINVAL; 650 } 651 652 ret = blkio_set_str(s->blkio, "path", path); 653 qdict_del(options, "path"); 654 if (ret < 0) { 655 error_setg_errno(errp, -ret, "failed to set path: %s", 656 blkio_get_error_msg()); 657 return ret; 658 } 659 660 if (!(flags & BDRV_O_NOCACHE)) { 661 error_setg(errp, "cache.direct=off is not supported"); 662 return -EINVAL; 663 } 664 665 ret = blkio_connect(s->blkio); 666 if (ret < 0) { 667 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 668 blkio_get_error_msg()); 669 return ret; 670 } 671 672 return 0; 673 } 674 675 static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, 676 int flags, Error **errp) 677 { 678 const char *path = qdict_get_try_str(options, "path"); 679 BDRVBlkioState *s = bs->opaque; 680 bool fd_supported = false; 681 int fd, ret; 682 683 if (!path) { 684 error_setg(errp, "missing 'path' option"); 685 return -EINVAL; 686 } 687 688 if (!(flags & BDRV_O_NOCACHE)) { 689 error_setg(errp, "cache.direct=off is not supported"); 690 return -EINVAL; 691 } 692 693 if (blkio_set_int(s->blkio, "fd", -1) == 0) { 694 fd_supported = true; 695 } 696 697 /* 698 * If the libblkio driver supports fd passing, let's always use qemu_open() 699 * to open the `path`, so we can handle fd passing from the management 700 * layer through the "/dev/fdset/N" special path. 701 */ 702 if (fd_supported) { 703 /* 704 * `path` can contain the path of a character device 705 * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket. 706 * 707 * So, we should always open it with O_RDWR flag, also if BDRV_O_RDWR 708 * is not set in the open flags, because the exchange of IOCTL commands 709 * for example will fail. 710 * 711 * In order to open the device read-only, we are using the `read-only` 712 * property of the libblkio driver in blkio_file_open(). 713 */ 714 fd = qemu_open(path, O_RDWR, NULL); 715 if (fd < 0) { 716 fd_supported = false; 717 } else { 718 ret = blkio_set_int(s->blkio, "fd", fd); 719 if (ret < 0) { 720 fd_supported = false; 721 qemu_close(fd); 722 } 723 } 724 } 725 726 if (!fd_supported) { 727 ret = blkio_set_str(s->blkio, "path", path); 728 if (ret < 0) { 729 error_setg_errno(errp, -ret, "failed to set path: %s", 730 blkio_get_error_msg()); 731 return ret; 732 } 733 } 734 735 ret = blkio_connect(s->blkio); 736 /* 737 * If the libblkio driver doesn't support the `fd` property, blkio_connect() 738 * will fail with -EINVAL. So let's try calling blkio_connect() again by 739 * directly setting `path`. 740 */ 741 if (fd_supported && ret == -EINVAL) { 742 qemu_close(fd); 743 744 /* 745 * We need to clear the `fd` property we set previously by setting 746 * it to -1. 747 */ 748 ret = blkio_set_int(s->blkio, "fd", -1); 749 if (ret < 0) { 750 error_setg_errno(errp, -ret, "failed to set fd: %s", 751 blkio_get_error_msg()); 752 return ret; 753 } 754 755 ret = blkio_set_str(s->blkio, "path", path); 756 if (ret < 0) { 757 error_setg_errno(errp, -ret, "failed to set path: %s", 758 blkio_get_error_msg()); 759 return ret; 760 } 761 762 ret = blkio_connect(s->blkio); 763 } 764 765 if (ret < 0) { 766 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 767 blkio_get_error_msg()); 768 return ret; 769 } 770 771 qdict_del(options, "path"); 772 773 return 0; 774 } 775 776 static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, 777 Error **errp) 778 { 779 const char *blkio_driver = bs->drv->protocol_name; 780 BDRVBlkioState *s = bs->opaque; 781 int ret; 782 783 ret = blkio_create(blkio_driver, &s->blkio); 784 if (ret < 0) { 785 error_setg_errno(errp, -ret, "blkio_create failed: %s", 786 blkio_get_error_msg()); 787 return ret; 788 } 789 790 if (!(flags & BDRV_O_RDWR)) { 791 ret = blkio_set_bool(s->blkio, "read-only", true); 792 if (ret < 0) { 793 error_setg_errno(errp, -ret, "failed to set read-only: %s", 794 blkio_get_error_msg()); 795 blkio_destroy(&s->blkio); 796 return ret; 797 } 798 } 799 800 if (strcmp(blkio_driver, "io_uring") == 0) { 801 ret = blkio_io_uring_connect(bs, options, flags, errp); 802 } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) { 803 ret = blkio_nvme_io_uring_connect(bs, options, flags, errp); 804 } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) { 805 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 806 } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) { 807 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 808 } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) { 809 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 810 } else { 811 g_assert_not_reached(); 812 } 813 if (ret < 0) { 814 blkio_destroy(&s->blkio); 815 return ret; 816 } 817 818 ret = blkio_get_bool(s->blkio, 819 "needs-mem-regions", 820 &s->needs_mem_regions); 821 if (ret < 0) { 822 error_setg_errno(errp, -ret, 823 "failed to get needs-mem-regions: %s", 824 blkio_get_error_msg()); 825 blkio_destroy(&s->blkio); 826 return ret; 827 } 828 829 ret = blkio_get_bool(s->blkio, 830 "needs-mem-region-fd", 831 &s->needs_mem_region_fd); 832 if (ret < 0) { 833 error_setg_errno(errp, -ret, 834 "failed to get needs-mem-region-fd: %s", 835 blkio_get_error_msg()); 836 blkio_destroy(&s->blkio); 837 return ret; 838 } 839 840 ret = blkio_get_uint64(s->blkio, 841 "mem-region-alignment", 842 &s->mem_region_alignment); 843 if (ret < 0) { 844 error_setg_errno(errp, -ret, 845 "failed to get mem-region-alignment: %s", 846 blkio_get_error_msg()); 847 blkio_destroy(&s->blkio); 848 return ret; 849 } 850 851 ret = blkio_get_bool(s->blkio, 852 "may-pin-mem-regions", 853 &s->may_pin_mem_regions); 854 if (ret < 0) { 855 /* Be conservative (assume pinning) if the property is not supported */ 856 s->may_pin_mem_regions = s->needs_mem_regions; 857 } 858 859 /* 860 * Notify if libblkio drivers pin memory and prevent features like 861 * virtio-mem from working. 862 */ 863 if (s->may_pin_mem_regions) { 864 ret = ram_block_discard_disable(true); 865 if (ret < 0) { 866 error_setg_errno(errp, -ret, "ram_block_discard_disable() failed"); 867 blkio_destroy(&s->blkio); 868 return ret; 869 } 870 } 871 872 ret = blkio_start(s->blkio); 873 if (ret < 0) { 874 error_setg_errno(errp, -ret, "blkio_start failed: %s", 875 blkio_get_error_msg()); 876 blkio_destroy(&s->blkio); 877 if (s->may_pin_mem_regions) { 878 ram_block_discard_disable(false); 879 } 880 return ret; 881 } 882 883 bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF; 884 bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | 885 BDRV_REQ_NO_FALLBACK; 886 887 qemu_mutex_init(&s->blkio_lock); 888 qemu_co_mutex_init(&s->bounce_lock); 889 qemu_co_queue_init(&s->bounce_available); 890 QLIST_INIT(&s->bounce_bufs); 891 s->blkioq = blkio_get_queue(s->blkio, 0); 892 s->completion_fd = blkioq_get_completion_fd(s->blkioq); 893 blkioq_set_completion_fd_enabled(s->blkioq, true); 894 895 blkio_attach_aio_context(bs, bdrv_get_aio_context(bs)); 896 return 0; 897 } 898 899 static void blkio_close(BlockDriverState *bs) 900 { 901 BDRVBlkioState *s = bs->opaque; 902 903 /* There is no destroy() API for s->bounce_lock */ 904 905 qemu_mutex_destroy(&s->blkio_lock); 906 blkio_detach_aio_context(bs); 907 blkio_destroy(&s->blkio); 908 909 if (s->may_pin_mem_regions) { 910 ram_block_discard_disable(false); 911 } 912 } 913 914 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs) 915 { 916 BDRVBlkioState *s = bs->opaque; 917 uint64_t capacity; 918 int ret; 919 920 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 921 ret = blkio_get_uint64(s->blkio, "capacity", &capacity); 922 } 923 if (ret < 0) { 924 return -ret; 925 } 926 927 return capacity; 928 } 929 930 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset, 931 bool exact, PreallocMode prealloc, 932 BdrvRequestFlags flags, Error **errp) 933 { 934 int64_t current_length; 935 936 if (prealloc != PREALLOC_MODE_OFF) { 937 error_setg(errp, "Unsupported preallocation mode '%s'", 938 PreallocMode_str(prealloc)); 939 return -ENOTSUP; 940 } 941 942 current_length = blkio_co_getlength(bs); 943 944 if (offset > current_length) { 945 error_setg(errp, "Cannot grow device"); 946 return -EINVAL; 947 } else if (exact && offset != current_length) { 948 error_setg(errp, "Cannot resize device"); 949 return -ENOTSUP; 950 } 951 952 return 0; 953 } 954 955 static int coroutine_fn 956 blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 957 { 958 return 0; 959 } 960 961 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) 962 { 963 BDRVBlkioState *s = bs->opaque; 964 QEMU_LOCK_GUARD(&s->blkio_lock); 965 int value; 966 int ret; 967 968 ret = blkio_get_int(s->blkio, "request-alignment", &value); 969 if (ret < 0) { 970 error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s", 971 blkio_get_error_msg()); 972 return; 973 } 974 bs->bl.request_alignment = value; 975 if (bs->bl.request_alignment < 1 || 976 bs->bl.request_alignment >= INT_MAX || 977 !is_power_of_2(bs->bl.request_alignment)) { 978 error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", " 979 "must be a power of 2 less than INT_MAX", 980 bs->bl.request_alignment); 981 return; 982 } 983 984 ret = blkio_get_int(s->blkio, "optimal-io-size", &value); 985 if (ret < 0) { 986 error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s", 987 blkio_get_error_msg()); 988 return; 989 } 990 bs->bl.opt_transfer = value; 991 if (bs->bl.opt_transfer > INT_MAX || 992 (bs->bl.opt_transfer % bs->bl.request_alignment)) { 993 error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must " 994 "be a multiple of %" PRIu32, bs->bl.opt_transfer, 995 bs->bl.request_alignment); 996 return; 997 } 998 999 ret = blkio_get_int(s->blkio, "max-transfer", &value); 1000 if (ret < 0) { 1001 error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s", 1002 blkio_get_error_msg()); 1003 return; 1004 } 1005 bs->bl.max_transfer = value; 1006 if ((bs->bl.max_transfer % bs->bl.request_alignment) || 1007 (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) { 1008 error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be " 1009 "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)", 1010 bs->bl.max_transfer, bs->bl.request_alignment, 1011 bs->bl.opt_transfer); 1012 return; 1013 } 1014 1015 ret = blkio_get_int(s->blkio, "buf-alignment", &value); 1016 if (ret < 0) { 1017 error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s", 1018 blkio_get_error_msg()); 1019 return; 1020 } 1021 if (value < 1) { 1022 error_setg(errp, "invalid \"buf-alignment\" value %d, must be " 1023 "positive", value); 1024 return; 1025 } 1026 bs->bl.min_mem_alignment = value; 1027 1028 ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value); 1029 if (ret < 0) { 1030 error_setg_errno(errp, -ret, 1031 "failed to get \"optimal-buf-alignment\": %s", 1032 blkio_get_error_msg()); 1033 return; 1034 } 1035 if (value < 1) { 1036 error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, " 1037 "must be positive", value); 1038 return; 1039 } 1040 bs->bl.opt_mem_alignment = value; 1041 1042 ret = blkio_get_int(s->blkio, "max-segments", &value); 1043 if (ret < 0) { 1044 error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s", 1045 blkio_get_error_msg()); 1046 return; 1047 } 1048 if (value < 1) { 1049 error_setg(errp, "invalid \"max-segments\" value %d, must be positive", 1050 value); 1051 return; 1052 } 1053 bs->bl.max_iov = value; 1054 } 1055 1056 /* 1057 * TODO 1058 * Missing libblkio APIs: 1059 * - block_status 1060 * - co_invalidate_cache 1061 * 1062 * Out of scope? 1063 * - create 1064 * - truncate 1065 */ 1066 1067 /* 1068 * Do not include .format_name and .protocol_name because module_block.py 1069 * does not parse macros in the source code. 1070 */ 1071 #define BLKIO_DRIVER_COMMON \ 1072 .instance_size = sizeof(BDRVBlkioState), \ 1073 .bdrv_file_open = blkio_file_open, \ 1074 .bdrv_close = blkio_close, \ 1075 .bdrv_co_getlength = blkio_co_getlength, \ 1076 .bdrv_co_truncate = blkio_truncate, \ 1077 .bdrv_co_get_info = blkio_co_get_info, \ 1078 .bdrv_attach_aio_context = blkio_attach_aio_context, \ 1079 .bdrv_detach_aio_context = blkio_detach_aio_context, \ 1080 .bdrv_co_pdiscard = blkio_co_pdiscard, \ 1081 .bdrv_co_preadv = blkio_co_preadv, \ 1082 .bdrv_co_pwritev = blkio_co_pwritev, \ 1083 .bdrv_co_flush_to_disk = blkio_co_flush, \ 1084 .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ 1085 .bdrv_refresh_limits = blkio_refresh_limits, \ 1086 .bdrv_register_buf = blkio_register_buf, \ 1087 .bdrv_unregister_buf = blkio_unregister_buf, 1088 1089 /* 1090 * Use the same .format_name and .protocol_name as the libblkio driver name for 1091 * consistency. 1092 */ 1093 1094 static BlockDriver bdrv_io_uring = { 1095 .format_name = "io_uring", 1096 .protocol_name = "io_uring", 1097 .bdrv_needs_filename = true, 1098 BLKIO_DRIVER_COMMON 1099 }; 1100 1101 static BlockDriver bdrv_nvme_io_uring = { 1102 .format_name = "nvme-io_uring", 1103 .protocol_name = "nvme-io_uring", 1104 BLKIO_DRIVER_COMMON 1105 }; 1106 1107 static BlockDriver bdrv_virtio_blk_vfio_pci = { 1108 .format_name = "virtio-blk-vfio-pci", 1109 .protocol_name = "virtio-blk-vfio-pci", 1110 BLKIO_DRIVER_COMMON 1111 }; 1112 1113 static BlockDriver bdrv_virtio_blk_vhost_user = { 1114 .format_name = "virtio-blk-vhost-user", 1115 .protocol_name = "virtio-blk-vhost-user", 1116 BLKIO_DRIVER_COMMON 1117 }; 1118 1119 static BlockDriver bdrv_virtio_blk_vhost_vdpa = { 1120 .format_name = "virtio-blk-vhost-vdpa", 1121 .protocol_name = "virtio-blk-vhost-vdpa", 1122 BLKIO_DRIVER_COMMON 1123 }; 1124 1125 static void bdrv_blkio_init(void) 1126 { 1127 bdrv_register(&bdrv_io_uring); 1128 bdrv_register(&bdrv_nvme_io_uring); 1129 bdrv_register(&bdrv_virtio_blk_vfio_pci); 1130 bdrv_register(&bdrv_virtio_blk_vhost_user); 1131 bdrv_register(&bdrv_virtio_blk_vhost_vdpa); 1132 } 1133 1134 block_init(bdrv_blkio_init); 1135