1 /* SPDX-License-Identifier: LGPL-2.1-or-later */ 2 /* 3 * libblkio BlockDriver 4 * 5 * Copyright Red Hat, Inc. 6 * 7 * Author: 8 * Stefan Hajnoczi <stefanha@redhat.com> 9 */ 10 11 #include "qemu/osdep.h" 12 #include <blkio.h> 13 #include "block/block_int.h" 14 #include "exec/memory.h" 15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */ 16 #include "qemu/defer-call.h" 17 #include "qapi/error.h" 18 #include "qemu/error-report.h" 19 #include "qapi/qmp/qdict.h" 20 #include "qemu/module.h" 21 #include "sysemu/block-backend.h" 22 #include "exec/memory.h" /* for ram_block_discard_disable() */ 23 24 #include "block/block-io.h" 25 26 /* 27 * Allocated bounce buffers are kept in a list sorted by buffer address. 28 */ 29 typedef struct BlkioBounceBuf { 30 QLIST_ENTRY(BlkioBounceBuf) next; 31 32 /* The bounce buffer */ 33 struct iovec buf; 34 } BlkioBounceBuf; 35 36 typedef struct { 37 /* 38 * libblkio is not thread-safe so this lock protects ->blkio and 39 * ->blkioq. 40 */ 41 QemuMutex blkio_lock; 42 struct blkio *blkio; 43 struct blkioq *blkioq; /* make this multi-queue in the future... */ 44 int completion_fd; 45 46 /* 47 * Polling fetches the next completion into this field. 48 * 49 * No lock is necessary since only one thread calls aio_poll() and invokes 50 * fd and poll handlers. 51 */ 52 struct blkio_completion poll_completion; 53 54 /* 55 * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available. 56 * 57 * Lock ordering: ->bounce_lock before ->blkio_lock. 58 */ 59 CoMutex bounce_lock; 60 61 /* Bounce buffer pool */ 62 struct blkio_mem_region bounce_pool; 63 64 /* Sorted list of allocated bounce buffers */ 65 QLIST_HEAD(, BlkioBounceBuf) bounce_bufs; 66 67 /* Queue for coroutines waiting for bounce buffer space */ 68 CoQueue bounce_available; 69 70 /* The value of the "mem-region-alignment" property */ 71 uint64_t mem_region_alignment; 72 73 /* Can we skip adding/deleting blkio_mem_regions? */ 74 bool needs_mem_regions; 75 76 /* Are file descriptors necessary for blkio_mem_regions? */ 77 bool needs_mem_region_fd; 78 79 /* Are madvise(MADV_DONTNEED)-style operations unavailable? */ 80 bool may_pin_mem_regions; 81 } BDRVBlkioState; 82 83 /* Called with s->bounce_lock held */ 84 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) 85 { 86 /* There can be no allocated bounce buffers during resize */ 87 assert(QLIST_EMPTY(&s->bounce_bufs)); 88 89 /* Pad size to reduce frequency of resize calls */ 90 bytes += 128 * 1024; 91 92 /* Align the pool size to avoid blkio_alloc_mem_region() failure */ 93 bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment); 94 95 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 96 int ret; 97 98 if (s->bounce_pool.addr) { 99 blkio_unmap_mem_region(s->blkio, &s->bounce_pool); 100 blkio_free_mem_region(s->blkio, &s->bounce_pool); 101 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 102 } 103 104 /* Automatically freed when s->blkio is destroyed */ 105 ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes); 106 if (ret < 0) { 107 return ret; 108 } 109 110 ret = blkio_map_mem_region(s->blkio, &s->bounce_pool); 111 if (ret < 0) { 112 blkio_free_mem_region(s->blkio, &s->bounce_pool); 113 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 114 return ret; 115 } 116 } 117 118 return 0; 119 } 120 121 /* Called with s->bounce_lock held */ 122 static bool 123 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 124 int64_t bytes) 125 { 126 void *addr = s->bounce_pool.addr; 127 BlkioBounceBuf *cur = NULL; 128 BlkioBounceBuf *prev = NULL; 129 ptrdiff_t space; 130 131 /* 132 * This is just a linear search over the holes between requests. An 133 * efficient allocator would be nice. 134 */ 135 QLIST_FOREACH(cur, &s->bounce_bufs, next) { 136 space = cur->buf.iov_base - addr; 137 if (bytes <= space) { 138 QLIST_INSERT_BEFORE(cur, bounce, next); 139 bounce->buf.iov_base = addr; 140 bounce->buf.iov_len = bytes; 141 return true; 142 } 143 144 addr = cur->buf.iov_base + cur->buf.iov_len; 145 prev = cur; 146 } 147 148 /* Is there space after the last request? */ 149 space = s->bounce_pool.addr + s->bounce_pool.len - addr; 150 if (bytes > space) { 151 return false; 152 } 153 if (prev) { 154 QLIST_INSERT_AFTER(prev, bounce, next); 155 } else { 156 QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next); 157 } 158 bounce->buf.iov_base = addr; 159 bounce->buf.iov_len = bytes; 160 return true; 161 } 162 163 static int coroutine_fn 164 blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 165 int64_t bytes) 166 { 167 /* 168 * Ensure fairness: first time around we join the back of the queue, 169 * subsequently we join the front so we don't lose our place. 170 */ 171 CoQueueWaitFlags wait_flags = 0; 172 173 QEMU_LOCK_GUARD(&s->bounce_lock); 174 175 /* Ensure fairness: don't even try if other requests are already waiting */ 176 if (!qemu_co_queue_empty(&s->bounce_available)) { 177 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 178 wait_flags); 179 wait_flags = CO_QUEUE_WAIT_FRONT; 180 } 181 182 while (true) { 183 if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) { 184 /* Kick the next queued request since there may be space */ 185 qemu_co_queue_next(&s->bounce_available); 186 return 0; 187 } 188 189 /* 190 * If there are no in-flight requests then the pool was simply too 191 * small. 192 */ 193 if (QLIST_EMPTY(&s->bounce_bufs)) { 194 bool ok; 195 int ret; 196 197 ret = blkio_resize_bounce_pool(s, bytes); 198 if (ret < 0) { 199 /* Kick the next queued request since that may fail too */ 200 qemu_co_queue_next(&s->bounce_available); 201 return ret; 202 } 203 204 ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes); 205 assert(ok); /* must have space this time */ 206 return 0; 207 } 208 209 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 210 wait_flags); 211 wait_flags = CO_QUEUE_WAIT_FRONT; 212 } 213 } 214 215 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s, 216 BlkioBounceBuf *bounce) 217 { 218 QEMU_LOCK_GUARD(&s->bounce_lock); 219 220 QLIST_REMOVE(bounce, next); 221 222 /* Wake up waiting coroutines since space may now be available */ 223 qemu_co_queue_next(&s->bounce_available); 224 } 225 226 /* For async to .bdrv_co_*() conversion */ 227 typedef struct { 228 Coroutine *coroutine; 229 int ret; 230 } BlkioCoData; 231 232 static void blkio_completion_fd_read(void *opaque) 233 { 234 BlockDriverState *bs = opaque; 235 BDRVBlkioState *s = bs->opaque; 236 uint64_t val; 237 int ret; 238 239 /* Polling may have already fetched a completion */ 240 if (s->poll_completion.user_data != NULL) { 241 BlkioCoData *cod = s->poll_completion.user_data; 242 cod->ret = s->poll_completion.ret; 243 244 /* Clear it in case aio_co_wake() enters a nested event loop */ 245 s->poll_completion.user_data = NULL; 246 247 aio_co_wake(cod->coroutine); 248 } 249 250 /* Reset completion fd status */ 251 ret = read(s->completion_fd, &val, sizeof(val)); 252 253 /* Ignore errors, there's nothing we can do */ 254 (void)ret; 255 256 /* 257 * Reading one completion at a time makes nested event loop re-entrancy 258 * simple. Change this loop to get multiple completions in one go if it 259 * becomes a performance bottleneck. 260 */ 261 while (true) { 262 struct blkio_completion completion; 263 264 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 265 ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL); 266 } 267 if (ret != 1) { 268 break; 269 } 270 271 BlkioCoData *cod = completion.user_data; 272 cod->ret = completion.ret; 273 aio_co_wake(cod->coroutine); 274 } 275 } 276 277 static bool blkio_completion_fd_poll(void *opaque) 278 { 279 BlockDriverState *bs = opaque; 280 BDRVBlkioState *s = bs->opaque; 281 int ret; 282 283 /* Just in case we already fetched a completion */ 284 if (s->poll_completion.user_data != NULL) { 285 return true; 286 } 287 288 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 289 ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL); 290 } 291 return ret == 1; 292 } 293 294 static void blkio_completion_fd_poll_ready(void *opaque) 295 { 296 blkio_completion_fd_read(opaque); 297 } 298 299 static void blkio_attach_aio_context(BlockDriverState *bs, 300 AioContext *new_context) 301 { 302 BDRVBlkioState *s = bs->opaque; 303 304 aio_set_fd_handler(new_context, s->completion_fd, 305 blkio_completion_fd_read, NULL, 306 blkio_completion_fd_poll, 307 blkio_completion_fd_poll_ready, bs); 308 } 309 310 static void blkio_detach_aio_context(BlockDriverState *bs) 311 { 312 BDRVBlkioState *s = bs->opaque; 313 314 aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL, 315 NULL, NULL, NULL); 316 } 317 318 /* 319 * Called by defer_call_end() or immediately if not in a deferred section. 320 * Called without blkio_lock. 321 */ 322 static void blkio_deferred_fn(void *opaque) 323 { 324 BDRVBlkioState *s = opaque; 325 326 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 327 blkioq_do_io(s->blkioq, NULL, 0, 0, NULL); 328 } 329 } 330 331 /* 332 * Schedule I/O submission after enqueuing a new request. Called without 333 * blkio_lock. 334 */ 335 static void blkio_submit_io(BlockDriverState *bs) 336 { 337 BDRVBlkioState *s = bs->opaque; 338 339 defer_call(blkio_deferred_fn, s); 340 } 341 342 static int coroutine_fn 343 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) 344 { 345 BDRVBlkioState *s = bs->opaque; 346 BlkioCoData cod = { 347 .coroutine = qemu_coroutine_self(), 348 }; 349 350 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 351 blkioq_discard(s->blkioq, offset, bytes, &cod, 0); 352 } 353 354 blkio_submit_io(bs); 355 qemu_coroutine_yield(); 356 return cod.ret; 357 } 358 359 static int coroutine_fn 360 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, 361 QEMUIOVector *qiov, BdrvRequestFlags flags) 362 { 363 BlkioCoData cod = { 364 .coroutine = qemu_coroutine_self(), 365 }; 366 BDRVBlkioState *s = bs->opaque; 367 bool use_bounce_buffer = 368 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 369 BlkioBounceBuf bounce; 370 struct iovec *iov = qiov->iov; 371 int iovcnt = qiov->niov; 372 373 if (use_bounce_buffer) { 374 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 375 if (ret < 0) { 376 return ret; 377 } 378 379 iov = &bounce.buf; 380 iovcnt = 1; 381 } 382 383 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 384 blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0); 385 } 386 387 blkio_submit_io(bs); 388 qemu_coroutine_yield(); 389 390 if (use_bounce_buffer) { 391 if (cod.ret == 0) { 392 qemu_iovec_from_buf(qiov, 0, 393 bounce.buf.iov_base, 394 bounce.buf.iov_len); 395 } 396 397 blkio_free_bounce_buffer(s, &bounce); 398 } 399 400 return cod.ret; 401 } 402 403 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset, 404 int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) 405 { 406 uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0; 407 BlkioCoData cod = { 408 .coroutine = qemu_coroutine_self(), 409 }; 410 BDRVBlkioState *s = bs->opaque; 411 bool use_bounce_buffer = 412 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 413 BlkioBounceBuf bounce; 414 struct iovec *iov = qiov->iov; 415 int iovcnt = qiov->niov; 416 417 if (use_bounce_buffer) { 418 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 419 if (ret < 0) { 420 return ret; 421 } 422 423 qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes); 424 iov = &bounce.buf; 425 iovcnt = 1; 426 } 427 428 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 429 blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags); 430 } 431 432 blkio_submit_io(bs); 433 qemu_coroutine_yield(); 434 435 if (use_bounce_buffer) { 436 blkio_free_bounce_buffer(s, &bounce); 437 } 438 439 return cod.ret; 440 } 441 442 static int coroutine_fn blkio_co_flush(BlockDriverState *bs) 443 { 444 BDRVBlkioState *s = bs->opaque; 445 BlkioCoData cod = { 446 .coroutine = qemu_coroutine_self(), 447 }; 448 449 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 450 blkioq_flush(s->blkioq, &cod, 0); 451 } 452 453 blkio_submit_io(bs); 454 qemu_coroutine_yield(); 455 return cod.ret; 456 } 457 458 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs, 459 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 460 { 461 BDRVBlkioState *s = bs->opaque; 462 BlkioCoData cod = { 463 .coroutine = qemu_coroutine_self(), 464 }; 465 uint32_t blkio_flags = 0; 466 467 if (flags & BDRV_REQ_FUA) { 468 blkio_flags |= BLKIO_REQ_FUA; 469 } 470 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 471 blkio_flags |= BLKIO_REQ_NO_UNMAP; 472 } 473 if (flags & BDRV_REQ_NO_FALLBACK) { 474 blkio_flags |= BLKIO_REQ_NO_FALLBACK; 475 } 476 477 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 478 blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags); 479 } 480 481 blkio_submit_io(bs); 482 qemu_coroutine_yield(); 483 return cod.ret; 484 } 485 486 typedef enum { 487 BMRR_OK, 488 BMRR_SKIP, 489 BMRR_FAIL, 490 } BlkioMemRegionResult; 491 492 /* 493 * Produce a struct blkio_mem_region for a given address and size. 494 * 495 * This function produces identical results when called multiple times with the 496 * same arguments. This property is necessary because blkio_unmap_mem_region() 497 * must receive the same struct blkio_mem_region field values that were passed 498 * to blkio_map_mem_region(). 499 */ 500 static BlkioMemRegionResult 501 blkio_mem_region_from_host(BlockDriverState *bs, 502 void *host, size_t size, 503 struct blkio_mem_region *region, 504 Error **errp) 505 { 506 BDRVBlkioState *s = bs->opaque; 507 int fd = -1; 508 ram_addr_t fd_offset = 0; 509 510 if (((uintptr_t)host | size) % s->mem_region_alignment) { 511 error_setg(errp, "unaligned buf %p with size %zu", host, size); 512 return BMRR_FAIL; 513 } 514 515 /* Attempt to find the fd for the underlying memory */ 516 if (s->needs_mem_region_fd) { 517 RAMBlock *ram_block; 518 RAMBlock *end_block; 519 ram_addr_t offset; 520 521 /* 522 * bdrv_register_buf() is called with the BQL held so mr lives at least 523 * until this function returns. 524 */ 525 ram_block = qemu_ram_block_from_host(host, false, &fd_offset); 526 if (ram_block) { 527 fd = qemu_ram_get_fd(ram_block); 528 } 529 if (fd == -1) { 530 /* 531 * Ideally every RAMBlock would have an fd. pc-bios and other 532 * things don't. Luckily they are usually not I/O buffers and we 533 * can just ignore them. 534 */ 535 return BMRR_SKIP; 536 } 537 538 /* Make sure the fd covers the entire range */ 539 end_block = qemu_ram_block_from_host(host + size - 1, false, &offset); 540 if (ram_block != end_block) { 541 error_setg(errp, "registered buffer at %p with size %zu extends " 542 "beyond RAMBlock", host, size); 543 return BMRR_FAIL; 544 } 545 } 546 547 *region = (struct blkio_mem_region){ 548 .addr = host, 549 .len = size, 550 .fd = fd, 551 .fd_offset = fd_offset, 552 }; 553 return BMRR_OK; 554 } 555 556 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size, 557 Error **errp) 558 { 559 BDRVBlkioState *s = bs->opaque; 560 struct blkio_mem_region region; 561 BlkioMemRegionResult region_result; 562 int ret; 563 564 /* 565 * Mapping memory regions conflicts with RAM discard (virtio-mem) when 566 * there is pinning, so only do it when necessary. 567 */ 568 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 569 return true; 570 } 571 572 region_result = blkio_mem_region_from_host(bs, host, size, ®ion, errp); 573 if (region_result == BMRR_SKIP) { 574 return true; 575 } else if (region_result != BMRR_OK) { 576 return false; 577 } 578 579 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 580 ret = blkio_map_mem_region(s->blkio, ®ion); 581 } 582 583 if (ret < 0) { 584 error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s", 585 host, size, blkio_get_error_msg()); 586 return false; 587 } 588 return true; 589 } 590 591 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size) 592 { 593 BDRVBlkioState *s = bs->opaque; 594 struct blkio_mem_region region; 595 596 /* See blkio_register_buf() */ 597 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 598 return; 599 } 600 601 if (blkio_mem_region_from_host(bs, host, size, ®ion, NULL) != BMRR_OK) { 602 return; 603 } 604 605 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 606 blkio_unmap_mem_region(s->blkio, ®ion); 607 } 608 } 609 610 static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options, 611 int flags, Error **errp) 612 { 613 const char *filename = qdict_get_str(options, "filename"); 614 BDRVBlkioState *s = bs->opaque; 615 int ret; 616 617 ret = blkio_set_str(s->blkio, "path", filename); 618 qdict_del(options, "filename"); 619 if (ret < 0) { 620 error_setg_errno(errp, -ret, "failed to set path: %s", 621 blkio_get_error_msg()); 622 return ret; 623 } 624 625 if (flags & BDRV_O_NOCACHE) { 626 ret = blkio_set_bool(s->blkio, "direct", true); 627 if (ret < 0) { 628 error_setg_errno(errp, -ret, "failed to set direct: %s", 629 blkio_get_error_msg()); 630 return ret; 631 } 632 } 633 634 ret = blkio_connect(s->blkio); 635 if (ret < 0) { 636 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 637 blkio_get_error_msg()); 638 return ret; 639 } 640 641 return 0; 642 } 643 644 static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options, 645 int flags, Error **errp) 646 { 647 const char *path = qdict_get_try_str(options, "path"); 648 BDRVBlkioState *s = bs->opaque; 649 int ret; 650 651 if (!path) { 652 error_setg(errp, "missing 'path' option"); 653 return -EINVAL; 654 } 655 656 ret = blkio_set_str(s->blkio, "path", path); 657 qdict_del(options, "path"); 658 if (ret < 0) { 659 error_setg_errno(errp, -ret, "failed to set path: %s", 660 blkio_get_error_msg()); 661 return ret; 662 } 663 664 if (!(flags & BDRV_O_NOCACHE)) { 665 error_setg(errp, "cache.direct=off is not supported"); 666 return -EINVAL; 667 } 668 669 ret = blkio_connect(s->blkio); 670 if (ret < 0) { 671 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 672 blkio_get_error_msg()); 673 return ret; 674 } 675 676 return 0; 677 } 678 679 static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, 680 int flags, Error **errp) 681 { 682 const char *path = qdict_get_try_str(options, "path"); 683 BDRVBlkioState *s = bs->opaque; 684 bool fd_supported = false; 685 int fd = -1, ret; 686 687 if (!path) { 688 error_setg(errp, "missing 'path' option"); 689 return -EINVAL; 690 } 691 692 if (!(flags & BDRV_O_NOCACHE)) { 693 error_setg(errp, "cache.direct=off is not supported"); 694 return -EINVAL; 695 } 696 697 if (blkio_set_int(s->blkio, "fd", -1) == 0) { 698 fd_supported = true; 699 } 700 701 /* 702 * If the libblkio driver supports fd passing, let's always use qemu_open() 703 * to open the `path`, so we can handle fd passing from the management 704 * layer through the "/dev/fdset/N" special path. 705 */ 706 if (fd_supported) { 707 /* 708 * `path` can contain the path of a character device 709 * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket. 710 * 711 * So, we should always open it with O_RDWR flag, also if BDRV_O_RDWR 712 * is not set in the open flags, because the exchange of IOCTL commands 713 * for example will fail. 714 * 715 * In order to open the device read-only, we are using the `read-only` 716 * property of the libblkio driver in blkio_file_open(). 717 */ 718 fd = qemu_open(path, O_RDWR, NULL); 719 if (fd < 0) { 720 /* 721 * qemu_open() can fail if the user specifies a path that is not 722 * a file or device, for example in the case of Unix Domain Socket 723 * for the virtio-blk-vhost-user driver. In such cases let's have 724 * libblkio open the path directly. 725 */ 726 fd_supported = false; 727 } else { 728 ret = blkio_set_int(s->blkio, "fd", fd); 729 if (ret < 0) { 730 fd_supported = false; 731 qemu_close(fd); 732 fd = -1; 733 } 734 } 735 } 736 737 if (!fd_supported) { 738 ret = blkio_set_str(s->blkio, "path", path); 739 if (ret < 0) { 740 error_setg_errno(errp, -ret, "failed to set path: %s", 741 blkio_get_error_msg()); 742 return ret; 743 } 744 } 745 746 ret = blkio_connect(s->blkio); 747 if (ret < 0 && fd >= 0) { 748 /* Failed to give the FD to libblkio, close it */ 749 qemu_close(fd); 750 fd = -1; 751 } 752 753 /* 754 * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208 755 * (libblkio <= v1.3.0), setting the `fd` property is not enough to check 756 * whether the driver supports the `fd` property or not. In that case, 757 * blkio_connect() will fail with -EINVAL. 758 * So let's try calling blkio_connect() again by directly setting `path` 759 * to cover this scenario. 760 */ 761 if (fd_supported && ret == -EINVAL) { 762 /* 763 * We need to clear the `fd` property we set previously by setting 764 * it to -1. 765 */ 766 ret = blkio_set_int(s->blkio, "fd", -1); 767 if (ret < 0) { 768 error_setg_errno(errp, -ret, "failed to set fd: %s", 769 blkio_get_error_msg()); 770 return ret; 771 } 772 773 ret = blkio_set_str(s->blkio, "path", path); 774 if (ret < 0) { 775 error_setg_errno(errp, -ret, "failed to set path: %s", 776 blkio_get_error_msg()); 777 return ret; 778 } 779 780 ret = blkio_connect(s->blkio); 781 } 782 783 if (ret < 0) { 784 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 785 blkio_get_error_msg()); 786 return ret; 787 } 788 789 qdict_del(options, "path"); 790 791 return 0; 792 } 793 794 static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, 795 Error **errp) 796 { 797 const char *blkio_driver = bs->drv->protocol_name; 798 BDRVBlkioState *s = bs->opaque; 799 int ret; 800 801 ret = blkio_create(blkio_driver, &s->blkio); 802 if (ret < 0) { 803 error_setg_errno(errp, -ret, "blkio_create failed: %s", 804 blkio_get_error_msg()); 805 return ret; 806 } 807 808 if (!(flags & BDRV_O_RDWR)) { 809 ret = blkio_set_bool(s->blkio, "read-only", true); 810 if (ret < 0) { 811 error_setg_errno(errp, -ret, "failed to set read-only: %s", 812 blkio_get_error_msg()); 813 blkio_destroy(&s->blkio); 814 return ret; 815 } 816 } 817 818 if (strcmp(blkio_driver, "io_uring") == 0) { 819 ret = blkio_io_uring_connect(bs, options, flags, errp); 820 } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) { 821 ret = blkio_nvme_io_uring_connect(bs, options, flags, errp); 822 } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) { 823 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 824 } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) { 825 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 826 } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) { 827 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 828 } else { 829 g_assert_not_reached(); 830 } 831 if (ret < 0) { 832 blkio_destroy(&s->blkio); 833 return ret; 834 } 835 836 ret = blkio_get_bool(s->blkio, 837 "needs-mem-regions", 838 &s->needs_mem_regions); 839 if (ret < 0) { 840 error_setg_errno(errp, -ret, 841 "failed to get needs-mem-regions: %s", 842 blkio_get_error_msg()); 843 blkio_destroy(&s->blkio); 844 return ret; 845 } 846 847 ret = blkio_get_bool(s->blkio, 848 "needs-mem-region-fd", 849 &s->needs_mem_region_fd); 850 if (ret < 0) { 851 error_setg_errno(errp, -ret, 852 "failed to get needs-mem-region-fd: %s", 853 blkio_get_error_msg()); 854 blkio_destroy(&s->blkio); 855 return ret; 856 } 857 858 ret = blkio_get_uint64(s->blkio, 859 "mem-region-alignment", 860 &s->mem_region_alignment); 861 if (ret < 0) { 862 error_setg_errno(errp, -ret, 863 "failed to get mem-region-alignment: %s", 864 blkio_get_error_msg()); 865 blkio_destroy(&s->blkio); 866 return ret; 867 } 868 869 ret = blkio_get_bool(s->blkio, 870 "may-pin-mem-regions", 871 &s->may_pin_mem_regions); 872 if (ret < 0) { 873 /* Be conservative (assume pinning) if the property is not supported */ 874 s->may_pin_mem_regions = s->needs_mem_regions; 875 } 876 877 /* 878 * Notify if libblkio drivers pin memory and prevent features like 879 * virtio-mem from working. 880 */ 881 if (s->may_pin_mem_regions) { 882 ret = ram_block_discard_disable(true); 883 if (ret < 0) { 884 error_setg_errno(errp, -ret, "ram_block_discard_disable() failed"); 885 blkio_destroy(&s->blkio); 886 return ret; 887 } 888 } 889 890 ret = blkio_start(s->blkio); 891 if (ret < 0) { 892 error_setg_errno(errp, -ret, "blkio_start failed: %s", 893 blkio_get_error_msg()); 894 blkio_destroy(&s->blkio); 895 if (s->may_pin_mem_regions) { 896 ram_block_discard_disable(false); 897 } 898 return ret; 899 } 900 901 bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF; 902 bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | 903 BDRV_REQ_NO_FALLBACK; 904 905 qemu_mutex_init(&s->blkio_lock); 906 qemu_co_mutex_init(&s->bounce_lock); 907 qemu_co_queue_init(&s->bounce_available); 908 QLIST_INIT(&s->bounce_bufs); 909 s->blkioq = blkio_get_queue(s->blkio, 0); 910 s->completion_fd = blkioq_get_completion_fd(s->blkioq); 911 blkioq_set_completion_fd_enabled(s->blkioq, true); 912 913 blkio_attach_aio_context(bs, bdrv_get_aio_context(bs)); 914 return 0; 915 } 916 917 static void blkio_close(BlockDriverState *bs) 918 { 919 BDRVBlkioState *s = bs->opaque; 920 921 /* There is no destroy() API for s->bounce_lock */ 922 923 qemu_mutex_destroy(&s->blkio_lock); 924 blkio_detach_aio_context(bs); 925 blkio_destroy(&s->blkio); 926 927 if (s->may_pin_mem_regions) { 928 ram_block_discard_disable(false); 929 } 930 } 931 932 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs) 933 { 934 BDRVBlkioState *s = bs->opaque; 935 uint64_t capacity; 936 int ret; 937 938 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 939 ret = blkio_get_uint64(s->blkio, "capacity", &capacity); 940 } 941 if (ret < 0) { 942 return -ret; 943 } 944 945 return capacity; 946 } 947 948 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset, 949 bool exact, PreallocMode prealloc, 950 BdrvRequestFlags flags, Error **errp) 951 { 952 int64_t current_length; 953 954 if (prealloc != PREALLOC_MODE_OFF) { 955 error_setg(errp, "Unsupported preallocation mode '%s'", 956 PreallocMode_str(prealloc)); 957 return -ENOTSUP; 958 } 959 960 current_length = blkio_co_getlength(bs); 961 962 if (offset > current_length) { 963 error_setg(errp, "Cannot grow device"); 964 return -EINVAL; 965 } else if (exact && offset != current_length) { 966 error_setg(errp, "Cannot resize device"); 967 return -ENOTSUP; 968 } 969 970 return 0; 971 } 972 973 static int coroutine_fn 974 blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 975 { 976 return 0; 977 } 978 979 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) 980 { 981 BDRVBlkioState *s = bs->opaque; 982 QEMU_LOCK_GUARD(&s->blkio_lock); 983 int value; 984 int ret; 985 986 ret = blkio_get_int(s->blkio, "request-alignment", &value); 987 if (ret < 0) { 988 error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s", 989 blkio_get_error_msg()); 990 return; 991 } 992 bs->bl.request_alignment = value; 993 if (bs->bl.request_alignment < 1 || 994 bs->bl.request_alignment >= INT_MAX || 995 !is_power_of_2(bs->bl.request_alignment)) { 996 error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", " 997 "must be a power of 2 less than INT_MAX", 998 bs->bl.request_alignment); 999 return; 1000 } 1001 1002 ret = blkio_get_int(s->blkio, "optimal-io-size", &value); 1003 if (ret < 0) { 1004 error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s", 1005 blkio_get_error_msg()); 1006 return; 1007 } 1008 bs->bl.opt_transfer = value; 1009 if (bs->bl.opt_transfer > INT_MAX || 1010 (bs->bl.opt_transfer % bs->bl.request_alignment)) { 1011 error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must " 1012 "be a multiple of %" PRIu32, bs->bl.opt_transfer, 1013 bs->bl.request_alignment); 1014 return; 1015 } 1016 1017 ret = blkio_get_int(s->blkio, "max-transfer", &value); 1018 if (ret < 0) { 1019 error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s", 1020 blkio_get_error_msg()); 1021 return; 1022 } 1023 bs->bl.max_transfer = value; 1024 if ((bs->bl.max_transfer % bs->bl.request_alignment) || 1025 (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) { 1026 error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be " 1027 "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)", 1028 bs->bl.max_transfer, bs->bl.request_alignment, 1029 bs->bl.opt_transfer); 1030 return; 1031 } 1032 1033 ret = blkio_get_int(s->blkio, "buf-alignment", &value); 1034 if (ret < 0) { 1035 error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s", 1036 blkio_get_error_msg()); 1037 return; 1038 } 1039 if (value < 1) { 1040 error_setg(errp, "invalid \"buf-alignment\" value %d, must be " 1041 "positive", value); 1042 return; 1043 } 1044 bs->bl.min_mem_alignment = value; 1045 1046 ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value); 1047 if (ret < 0) { 1048 error_setg_errno(errp, -ret, 1049 "failed to get \"optimal-buf-alignment\": %s", 1050 blkio_get_error_msg()); 1051 return; 1052 } 1053 if (value < 1) { 1054 error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, " 1055 "must be positive", value); 1056 return; 1057 } 1058 bs->bl.opt_mem_alignment = value; 1059 1060 ret = blkio_get_int(s->blkio, "max-segments", &value); 1061 if (ret < 0) { 1062 error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s", 1063 blkio_get_error_msg()); 1064 return; 1065 } 1066 if (value < 1) { 1067 error_setg(errp, "invalid \"max-segments\" value %d, must be positive", 1068 value); 1069 return; 1070 } 1071 bs->bl.max_iov = value; 1072 } 1073 1074 /* 1075 * TODO 1076 * Missing libblkio APIs: 1077 * - block_status 1078 * - co_invalidate_cache 1079 * 1080 * Out of scope? 1081 * - create 1082 * - truncate 1083 */ 1084 1085 /* 1086 * Do not include .format_name and .protocol_name because module_block.py 1087 * does not parse macros in the source code. 1088 */ 1089 #define BLKIO_DRIVER_COMMON \ 1090 .instance_size = sizeof(BDRVBlkioState), \ 1091 .bdrv_file_open = blkio_file_open, \ 1092 .bdrv_close = blkio_close, \ 1093 .bdrv_co_getlength = blkio_co_getlength, \ 1094 .bdrv_co_truncate = blkio_truncate, \ 1095 .bdrv_co_get_info = blkio_co_get_info, \ 1096 .bdrv_attach_aio_context = blkio_attach_aio_context, \ 1097 .bdrv_detach_aio_context = blkio_detach_aio_context, \ 1098 .bdrv_co_pdiscard = blkio_co_pdiscard, \ 1099 .bdrv_co_preadv = blkio_co_preadv, \ 1100 .bdrv_co_pwritev = blkio_co_pwritev, \ 1101 .bdrv_co_flush_to_disk = blkio_co_flush, \ 1102 .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ 1103 .bdrv_refresh_limits = blkio_refresh_limits, \ 1104 .bdrv_register_buf = blkio_register_buf, \ 1105 .bdrv_unregister_buf = blkio_unregister_buf, 1106 1107 /* 1108 * Use the same .format_name and .protocol_name as the libblkio driver name for 1109 * consistency. 1110 */ 1111 1112 static BlockDriver bdrv_io_uring = { 1113 .format_name = "io_uring", 1114 .protocol_name = "io_uring", 1115 .bdrv_needs_filename = true, 1116 BLKIO_DRIVER_COMMON 1117 }; 1118 1119 static BlockDriver bdrv_nvme_io_uring = { 1120 .format_name = "nvme-io_uring", 1121 .protocol_name = "nvme-io_uring", 1122 BLKIO_DRIVER_COMMON 1123 }; 1124 1125 static BlockDriver bdrv_virtio_blk_vfio_pci = { 1126 .format_name = "virtio-blk-vfio-pci", 1127 .protocol_name = "virtio-blk-vfio-pci", 1128 BLKIO_DRIVER_COMMON 1129 }; 1130 1131 static BlockDriver bdrv_virtio_blk_vhost_user = { 1132 .format_name = "virtio-blk-vhost-user", 1133 .protocol_name = "virtio-blk-vhost-user", 1134 BLKIO_DRIVER_COMMON 1135 }; 1136 1137 static BlockDriver bdrv_virtio_blk_vhost_vdpa = { 1138 .format_name = "virtio-blk-vhost-vdpa", 1139 .protocol_name = "virtio-blk-vhost-vdpa", 1140 BLKIO_DRIVER_COMMON 1141 }; 1142 1143 static void bdrv_blkio_init(void) 1144 { 1145 bdrv_register(&bdrv_io_uring); 1146 bdrv_register(&bdrv_nvme_io_uring); 1147 bdrv_register(&bdrv_virtio_blk_vfio_pci); 1148 bdrv_register(&bdrv_virtio_blk_vhost_user); 1149 bdrv_register(&bdrv_virtio_blk_vhost_vdpa); 1150 } 1151 1152 block_init(bdrv_blkio_init); 1153