1 /* SPDX-License-Identifier: LGPL-2.1-or-later */ 2 /* 3 * libblkio BlockDriver 4 * 5 * Copyright Red Hat, Inc. 6 * 7 * Author: 8 * Stefan Hajnoczi <stefanha@redhat.com> 9 */ 10 11 #include "qemu/osdep.h" 12 #include <blkio.h> 13 #include "block/block_int.h" 14 #include "exec/memory.h" 15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */ 16 #include "qemu/defer-call.h" 17 #include "qapi/error.h" 18 #include "qemu/error-report.h" 19 #include "qapi/qmp/qdict.h" 20 #include "qemu/module.h" 21 #include "sysemu/block-backend.h" 22 #include "exec/memory.h" /* for ram_block_discard_disable() */ 23 24 #include "block/block-io.h" 25 26 /* 27 * Allocated bounce buffers are kept in a list sorted by buffer address. 28 */ 29 typedef struct BlkioBounceBuf { 30 QLIST_ENTRY(BlkioBounceBuf) next; 31 32 /* The bounce buffer */ 33 struct iovec buf; 34 } BlkioBounceBuf; 35 36 typedef struct { 37 /* 38 * libblkio is not thread-safe so this lock protects ->blkio and 39 * ->blkioq. 40 */ 41 QemuMutex blkio_lock; 42 struct blkio *blkio; 43 struct blkioq *blkioq; /* make this multi-queue in the future... */ 44 int completion_fd; 45 46 /* 47 * Polling fetches the next completion into this field. 48 * 49 * No lock is necessary since only one thread calls aio_poll() and invokes 50 * fd and poll handlers. 51 */ 52 struct blkio_completion poll_completion; 53 54 /* 55 * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available. 56 * 57 * Lock ordering: ->bounce_lock before ->blkio_lock. 58 */ 59 CoMutex bounce_lock; 60 61 /* Bounce buffer pool */ 62 struct blkio_mem_region bounce_pool; 63 64 /* Sorted list of allocated bounce buffers */ 65 QLIST_HEAD(, BlkioBounceBuf) bounce_bufs; 66 67 /* Queue for coroutines waiting for bounce buffer space */ 68 CoQueue bounce_available; 69 70 /* The value of the "mem-region-alignment" property */ 71 uint64_t mem_region_alignment; 72 73 /* Can we skip adding/deleting blkio_mem_regions? */ 74 bool needs_mem_regions; 75 76 /* Are file descriptors necessary for blkio_mem_regions? */ 77 bool needs_mem_region_fd; 78 79 /* Are madvise(MADV_DONTNEED)-style operations unavailable? */ 80 bool may_pin_mem_regions; 81 } BDRVBlkioState; 82 83 /* Called with s->bounce_lock held */ 84 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) 85 { 86 /* There can be no allocated bounce buffers during resize */ 87 assert(QLIST_EMPTY(&s->bounce_bufs)); 88 89 /* Pad size to reduce frequency of resize calls */ 90 bytes += 128 * 1024; 91 92 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 93 int ret; 94 95 if (s->bounce_pool.addr) { 96 blkio_unmap_mem_region(s->blkio, &s->bounce_pool); 97 blkio_free_mem_region(s->blkio, &s->bounce_pool); 98 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 99 } 100 101 /* Automatically freed when s->blkio is destroyed */ 102 ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes); 103 if (ret < 0) { 104 return ret; 105 } 106 107 ret = blkio_map_mem_region(s->blkio, &s->bounce_pool); 108 if (ret < 0) { 109 blkio_free_mem_region(s->blkio, &s->bounce_pool); 110 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 111 return ret; 112 } 113 } 114 115 return 0; 116 } 117 118 /* Called with s->bounce_lock held */ 119 static bool 120 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 121 int64_t bytes) 122 { 123 void *addr = s->bounce_pool.addr; 124 BlkioBounceBuf *cur = NULL; 125 BlkioBounceBuf *prev = NULL; 126 ptrdiff_t space; 127 128 /* 129 * This is just a linear search over the holes between requests. An 130 * efficient allocator would be nice. 131 */ 132 QLIST_FOREACH(cur, &s->bounce_bufs, next) { 133 space = cur->buf.iov_base - addr; 134 if (bytes <= space) { 135 QLIST_INSERT_BEFORE(cur, bounce, next); 136 bounce->buf.iov_base = addr; 137 bounce->buf.iov_len = bytes; 138 return true; 139 } 140 141 addr = cur->buf.iov_base + cur->buf.iov_len; 142 prev = cur; 143 } 144 145 /* Is there space after the last request? */ 146 space = s->bounce_pool.addr + s->bounce_pool.len - addr; 147 if (bytes > space) { 148 return false; 149 } 150 if (prev) { 151 QLIST_INSERT_AFTER(prev, bounce, next); 152 } else { 153 QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next); 154 } 155 bounce->buf.iov_base = addr; 156 bounce->buf.iov_len = bytes; 157 return true; 158 } 159 160 static int coroutine_fn 161 blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 162 int64_t bytes) 163 { 164 /* 165 * Ensure fairness: first time around we join the back of the queue, 166 * subsequently we join the front so we don't lose our place. 167 */ 168 CoQueueWaitFlags wait_flags = 0; 169 170 QEMU_LOCK_GUARD(&s->bounce_lock); 171 172 /* Ensure fairness: don't even try if other requests are already waiting */ 173 if (!qemu_co_queue_empty(&s->bounce_available)) { 174 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 175 wait_flags); 176 wait_flags = CO_QUEUE_WAIT_FRONT; 177 } 178 179 while (true) { 180 if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) { 181 /* Kick the next queued request since there may be space */ 182 qemu_co_queue_next(&s->bounce_available); 183 return 0; 184 } 185 186 /* 187 * If there are no in-flight requests then the pool was simply too 188 * small. 189 */ 190 if (QLIST_EMPTY(&s->bounce_bufs)) { 191 bool ok; 192 int ret; 193 194 ret = blkio_resize_bounce_pool(s, bytes); 195 if (ret < 0) { 196 /* Kick the next queued request since that may fail too */ 197 qemu_co_queue_next(&s->bounce_available); 198 return ret; 199 } 200 201 ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes); 202 assert(ok); /* must have space this time */ 203 return 0; 204 } 205 206 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 207 wait_flags); 208 wait_flags = CO_QUEUE_WAIT_FRONT; 209 } 210 } 211 212 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s, 213 BlkioBounceBuf *bounce) 214 { 215 QEMU_LOCK_GUARD(&s->bounce_lock); 216 217 QLIST_REMOVE(bounce, next); 218 219 /* Wake up waiting coroutines since space may now be available */ 220 qemu_co_queue_next(&s->bounce_available); 221 } 222 223 /* For async to .bdrv_co_*() conversion */ 224 typedef struct { 225 Coroutine *coroutine; 226 int ret; 227 } BlkioCoData; 228 229 static void blkio_completion_fd_read(void *opaque) 230 { 231 BlockDriverState *bs = opaque; 232 BDRVBlkioState *s = bs->opaque; 233 uint64_t val; 234 int ret; 235 236 /* Polling may have already fetched a completion */ 237 if (s->poll_completion.user_data != NULL) { 238 BlkioCoData *cod = s->poll_completion.user_data; 239 cod->ret = s->poll_completion.ret; 240 241 /* Clear it in case aio_co_wake() enters a nested event loop */ 242 s->poll_completion.user_data = NULL; 243 244 aio_co_wake(cod->coroutine); 245 } 246 247 /* Reset completion fd status */ 248 ret = read(s->completion_fd, &val, sizeof(val)); 249 250 /* Ignore errors, there's nothing we can do */ 251 (void)ret; 252 253 /* 254 * Reading one completion at a time makes nested event loop re-entrancy 255 * simple. Change this loop to get multiple completions in one go if it 256 * becomes a performance bottleneck. 257 */ 258 while (true) { 259 struct blkio_completion completion; 260 261 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 262 ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL); 263 } 264 if (ret != 1) { 265 break; 266 } 267 268 BlkioCoData *cod = completion.user_data; 269 cod->ret = completion.ret; 270 aio_co_wake(cod->coroutine); 271 } 272 } 273 274 static bool blkio_completion_fd_poll(void *opaque) 275 { 276 BlockDriverState *bs = opaque; 277 BDRVBlkioState *s = bs->opaque; 278 int ret; 279 280 /* Just in case we already fetched a completion */ 281 if (s->poll_completion.user_data != NULL) { 282 return true; 283 } 284 285 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 286 ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL); 287 } 288 return ret == 1; 289 } 290 291 static void blkio_completion_fd_poll_ready(void *opaque) 292 { 293 blkio_completion_fd_read(opaque); 294 } 295 296 static void blkio_attach_aio_context(BlockDriverState *bs, 297 AioContext *new_context) 298 { 299 BDRVBlkioState *s = bs->opaque; 300 301 aio_set_fd_handler(new_context, s->completion_fd, 302 blkio_completion_fd_read, NULL, 303 blkio_completion_fd_poll, 304 blkio_completion_fd_poll_ready, bs); 305 } 306 307 static void blkio_detach_aio_context(BlockDriverState *bs) 308 { 309 BDRVBlkioState *s = bs->opaque; 310 311 aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL, 312 NULL, NULL, NULL); 313 } 314 315 /* 316 * Called by defer_call_end() or immediately if not in a deferred section. 317 * Called without blkio_lock. 318 */ 319 static void blkio_deferred_fn(void *opaque) 320 { 321 BDRVBlkioState *s = opaque; 322 323 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 324 blkioq_do_io(s->blkioq, NULL, 0, 0, NULL); 325 } 326 } 327 328 /* 329 * Schedule I/O submission after enqueuing a new request. Called without 330 * blkio_lock. 331 */ 332 static void blkio_submit_io(BlockDriverState *bs) 333 { 334 BDRVBlkioState *s = bs->opaque; 335 336 defer_call(blkio_deferred_fn, s); 337 } 338 339 static int coroutine_fn 340 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) 341 { 342 BDRVBlkioState *s = bs->opaque; 343 BlkioCoData cod = { 344 .coroutine = qemu_coroutine_self(), 345 }; 346 347 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 348 blkioq_discard(s->blkioq, offset, bytes, &cod, 0); 349 } 350 351 blkio_submit_io(bs); 352 qemu_coroutine_yield(); 353 return cod.ret; 354 } 355 356 static int coroutine_fn 357 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, 358 QEMUIOVector *qiov, BdrvRequestFlags flags) 359 { 360 BlkioCoData cod = { 361 .coroutine = qemu_coroutine_self(), 362 }; 363 BDRVBlkioState *s = bs->opaque; 364 bool use_bounce_buffer = 365 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 366 BlkioBounceBuf bounce; 367 struct iovec *iov = qiov->iov; 368 int iovcnt = qiov->niov; 369 370 if (use_bounce_buffer) { 371 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 372 if (ret < 0) { 373 return ret; 374 } 375 376 iov = &bounce.buf; 377 iovcnt = 1; 378 } 379 380 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 381 blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0); 382 } 383 384 blkio_submit_io(bs); 385 qemu_coroutine_yield(); 386 387 if (use_bounce_buffer) { 388 if (cod.ret == 0) { 389 qemu_iovec_from_buf(qiov, 0, 390 bounce.buf.iov_base, 391 bounce.buf.iov_len); 392 } 393 394 blkio_free_bounce_buffer(s, &bounce); 395 } 396 397 return cod.ret; 398 } 399 400 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset, 401 int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) 402 { 403 uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0; 404 BlkioCoData cod = { 405 .coroutine = qemu_coroutine_self(), 406 }; 407 BDRVBlkioState *s = bs->opaque; 408 bool use_bounce_buffer = 409 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 410 BlkioBounceBuf bounce; 411 struct iovec *iov = qiov->iov; 412 int iovcnt = qiov->niov; 413 414 if (use_bounce_buffer) { 415 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 416 if (ret < 0) { 417 return ret; 418 } 419 420 qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes); 421 iov = &bounce.buf; 422 iovcnt = 1; 423 } 424 425 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 426 blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags); 427 } 428 429 blkio_submit_io(bs); 430 qemu_coroutine_yield(); 431 432 if (use_bounce_buffer) { 433 blkio_free_bounce_buffer(s, &bounce); 434 } 435 436 return cod.ret; 437 } 438 439 static int coroutine_fn blkio_co_flush(BlockDriverState *bs) 440 { 441 BDRVBlkioState *s = bs->opaque; 442 BlkioCoData cod = { 443 .coroutine = qemu_coroutine_self(), 444 }; 445 446 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 447 blkioq_flush(s->blkioq, &cod, 0); 448 } 449 450 blkio_submit_io(bs); 451 qemu_coroutine_yield(); 452 return cod.ret; 453 } 454 455 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs, 456 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 457 { 458 BDRVBlkioState *s = bs->opaque; 459 BlkioCoData cod = { 460 .coroutine = qemu_coroutine_self(), 461 }; 462 uint32_t blkio_flags = 0; 463 464 if (flags & BDRV_REQ_FUA) { 465 blkio_flags |= BLKIO_REQ_FUA; 466 } 467 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 468 blkio_flags |= BLKIO_REQ_NO_UNMAP; 469 } 470 if (flags & BDRV_REQ_NO_FALLBACK) { 471 blkio_flags |= BLKIO_REQ_NO_FALLBACK; 472 } 473 474 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 475 blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags); 476 } 477 478 blkio_submit_io(bs); 479 qemu_coroutine_yield(); 480 return cod.ret; 481 } 482 483 typedef enum { 484 BMRR_OK, 485 BMRR_SKIP, 486 BMRR_FAIL, 487 } BlkioMemRegionResult; 488 489 /* 490 * Produce a struct blkio_mem_region for a given address and size. 491 * 492 * This function produces identical results when called multiple times with the 493 * same arguments. This property is necessary because blkio_unmap_mem_region() 494 * must receive the same struct blkio_mem_region field values that were passed 495 * to blkio_map_mem_region(). 496 */ 497 static BlkioMemRegionResult 498 blkio_mem_region_from_host(BlockDriverState *bs, 499 void *host, size_t size, 500 struct blkio_mem_region *region, 501 Error **errp) 502 { 503 BDRVBlkioState *s = bs->opaque; 504 int fd = -1; 505 ram_addr_t fd_offset = 0; 506 507 if (((uintptr_t)host | size) % s->mem_region_alignment) { 508 error_setg(errp, "unaligned buf %p with size %zu", host, size); 509 return BMRR_FAIL; 510 } 511 512 /* Attempt to find the fd for the underlying memory */ 513 if (s->needs_mem_region_fd) { 514 RAMBlock *ram_block; 515 RAMBlock *end_block; 516 ram_addr_t offset; 517 518 /* 519 * bdrv_register_buf() is called with the BQL held so mr lives at least 520 * until this function returns. 521 */ 522 ram_block = qemu_ram_block_from_host(host, false, &fd_offset); 523 if (ram_block) { 524 fd = qemu_ram_get_fd(ram_block); 525 } 526 if (fd == -1) { 527 /* 528 * Ideally every RAMBlock would have an fd. pc-bios and other 529 * things don't. Luckily they are usually not I/O buffers and we 530 * can just ignore them. 531 */ 532 return BMRR_SKIP; 533 } 534 535 /* Make sure the fd covers the entire range */ 536 end_block = qemu_ram_block_from_host(host + size - 1, false, &offset); 537 if (ram_block != end_block) { 538 error_setg(errp, "registered buffer at %p with size %zu extends " 539 "beyond RAMBlock", host, size); 540 return BMRR_FAIL; 541 } 542 } 543 544 *region = (struct blkio_mem_region){ 545 .addr = host, 546 .len = size, 547 .fd = fd, 548 .fd_offset = fd_offset, 549 }; 550 return BMRR_OK; 551 } 552 553 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size, 554 Error **errp) 555 { 556 BDRVBlkioState *s = bs->opaque; 557 struct blkio_mem_region region; 558 BlkioMemRegionResult region_result; 559 int ret; 560 561 /* 562 * Mapping memory regions conflicts with RAM discard (virtio-mem) when 563 * there is pinning, so only do it when necessary. 564 */ 565 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 566 return true; 567 } 568 569 region_result = blkio_mem_region_from_host(bs, host, size, ®ion, errp); 570 if (region_result == BMRR_SKIP) { 571 return true; 572 } else if (region_result != BMRR_OK) { 573 return false; 574 } 575 576 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 577 ret = blkio_map_mem_region(s->blkio, ®ion); 578 } 579 580 if (ret < 0) { 581 error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s", 582 host, size, blkio_get_error_msg()); 583 return false; 584 } 585 return true; 586 } 587 588 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size) 589 { 590 BDRVBlkioState *s = bs->opaque; 591 struct blkio_mem_region region; 592 593 /* See blkio_register_buf() */ 594 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 595 return; 596 } 597 598 if (blkio_mem_region_from_host(bs, host, size, ®ion, NULL) != BMRR_OK) { 599 return; 600 } 601 602 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 603 blkio_unmap_mem_region(s->blkio, ®ion); 604 } 605 } 606 607 static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options, 608 int flags, Error **errp) 609 { 610 const char *filename = qdict_get_str(options, "filename"); 611 BDRVBlkioState *s = bs->opaque; 612 int ret; 613 614 ret = blkio_set_str(s->blkio, "path", filename); 615 qdict_del(options, "filename"); 616 if (ret < 0) { 617 error_setg_errno(errp, -ret, "failed to set path: %s", 618 blkio_get_error_msg()); 619 return ret; 620 } 621 622 if (flags & BDRV_O_NOCACHE) { 623 ret = blkio_set_bool(s->blkio, "direct", true); 624 if (ret < 0) { 625 error_setg_errno(errp, -ret, "failed to set direct: %s", 626 blkio_get_error_msg()); 627 return ret; 628 } 629 } 630 631 ret = blkio_connect(s->blkio); 632 if (ret < 0) { 633 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 634 blkio_get_error_msg()); 635 return ret; 636 } 637 638 return 0; 639 } 640 641 static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options, 642 int flags, Error **errp) 643 { 644 const char *path = qdict_get_try_str(options, "path"); 645 BDRVBlkioState *s = bs->opaque; 646 int ret; 647 648 if (!path) { 649 error_setg(errp, "missing 'path' option"); 650 return -EINVAL; 651 } 652 653 ret = blkio_set_str(s->blkio, "path", path); 654 qdict_del(options, "path"); 655 if (ret < 0) { 656 error_setg_errno(errp, -ret, "failed to set path: %s", 657 blkio_get_error_msg()); 658 return ret; 659 } 660 661 if (!(flags & BDRV_O_NOCACHE)) { 662 error_setg(errp, "cache.direct=off is not supported"); 663 return -EINVAL; 664 } 665 666 ret = blkio_connect(s->blkio); 667 if (ret < 0) { 668 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 669 blkio_get_error_msg()); 670 return ret; 671 } 672 673 return 0; 674 } 675 676 static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, 677 int flags, Error **errp) 678 { 679 const char *path = qdict_get_try_str(options, "path"); 680 BDRVBlkioState *s = bs->opaque; 681 bool fd_supported = false; 682 int fd = -1, ret; 683 684 if (!path) { 685 error_setg(errp, "missing 'path' option"); 686 return -EINVAL; 687 } 688 689 if (!(flags & BDRV_O_NOCACHE)) { 690 error_setg(errp, "cache.direct=off is not supported"); 691 return -EINVAL; 692 } 693 694 if (blkio_set_int(s->blkio, "fd", -1) == 0) { 695 fd_supported = true; 696 } 697 698 /* 699 * If the libblkio driver supports fd passing, let's always use qemu_open() 700 * to open the `path`, so we can handle fd passing from the management 701 * layer through the "/dev/fdset/N" special path. 702 */ 703 if (fd_supported) { 704 /* 705 * `path` can contain the path of a character device 706 * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket. 707 * 708 * So, we should always open it with O_RDWR flag, also if BDRV_O_RDWR 709 * is not set in the open flags, because the exchange of IOCTL commands 710 * for example will fail. 711 * 712 * In order to open the device read-only, we are using the `read-only` 713 * property of the libblkio driver in blkio_file_open(). 714 */ 715 fd = qemu_open(path, O_RDWR, NULL); 716 if (fd < 0) { 717 /* 718 * qemu_open() can fail if the user specifies a path that is not 719 * a file or device, for example in the case of Unix Domain Socket 720 * for the virtio-blk-vhost-user driver. In such cases let's have 721 * libblkio open the path directly. 722 */ 723 fd_supported = false; 724 } else { 725 ret = blkio_set_int(s->blkio, "fd", fd); 726 if (ret < 0) { 727 fd_supported = false; 728 qemu_close(fd); 729 fd = -1; 730 } 731 } 732 } 733 734 if (!fd_supported) { 735 ret = blkio_set_str(s->blkio, "path", path); 736 if (ret < 0) { 737 error_setg_errno(errp, -ret, "failed to set path: %s", 738 blkio_get_error_msg()); 739 return ret; 740 } 741 } 742 743 ret = blkio_connect(s->blkio); 744 if (ret < 0 && fd >= 0) { 745 /* Failed to give the FD to libblkio, close it */ 746 qemu_close(fd); 747 fd = -1; 748 } 749 750 /* 751 * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208 752 * (libblkio <= v1.3.0), setting the `fd` property is not enough to check 753 * whether the driver supports the `fd` property or not. In that case, 754 * blkio_connect() will fail with -EINVAL. 755 * So let's try calling blkio_connect() again by directly setting `path` 756 * to cover this scenario. 757 */ 758 if (fd_supported && ret == -EINVAL) { 759 /* 760 * We need to clear the `fd` property we set previously by setting 761 * it to -1. 762 */ 763 ret = blkio_set_int(s->blkio, "fd", -1); 764 if (ret < 0) { 765 error_setg_errno(errp, -ret, "failed to set fd: %s", 766 blkio_get_error_msg()); 767 return ret; 768 } 769 770 ret = blkio_set_str(s->blkio, "path", path); 771 if (ret < 0) { 772 error_setg_errno(errp, -ret, "failed to set path: %s", 773 blkio_get_error_msg()); 774 return ret; 775 } 776 777 ret = blkio_connect(s->blkio); 778 } 779 780 if (ret < 0) { 781 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 782 blkio_get_error_msg()); 783 return ret; 784 } 785 786 qdict_del(options, "path"); 787 788 return 0; 789 } 790 791 static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, 792 Error **errp) 793 { 794 const char *blkio_driver = bs->drv->protocol_name; 795 BDRVBlkioState *s = bs->opaque; 796 int ret; 797 798 ret = blkio_create(blkio_driver, &s->blkio); 799 if (ret < 0) { 800 error_setg_errno(errp, -ret, "blkio_create failed: %s", 801 blkio_get_error_msg()); 802 return ret; 803 } 804 805 if (!(flags & BDRV_O_RDWR)) { 806 ret = blkio_set_bool(s->blkio, "read-only", true); 807 if (ret < 0) { 808 error_setg_errno(errp, -ret, "failed to set read-only: %s", 809 blkio_get_error_msg()); 810 blkio_destroy(&s->blkio); 811 return ret; 812 } 813 } 814 815 if (strcmp(blkio_driver, "io_uring") == 0) { 816 ret = blkio_io_uring_connect(bs, options, flags, errp); 817 } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) { 818 ret = blkio_nvme_io_uring_connect(bs, options, flags, errp); 819 } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) { 820 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 821 } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) { 822 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 823 } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) { 824 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 825 } else { 826 g_assert_not_reached(); 827 } 828 if (ret < 0) { 829 blkio_destroy(&s->blkio); 830 return ret; 831 } 832 833 ret = blkio_get_bool(s->blkio, 834 "needs-mem-regions", 835 &s->needs_mem_regions); 836 if (ret < 0) { 837 error_setg_errno(errp, -ret, 838 "failed to get needs-mem-regions: %s", 839 blkio_get_error_msg()); 840 blkio_destroy(&s->blkio); 841 return ret; 842 } 843 844 ret = blkio_get_bool(s->blkio, 845 "needs-mem-region-fd", 846 &s->needs_mem_region_fd); 847 if (ret < 0) { 848 error_setg_errno(errp, -ret, 849 "failed to get needs-mem-region-fd: %s", 850 blkio_get_error_msg()); 851 blkio_destroy(&s->blkio); 852 return ret; 853 } 854 855 ret = blkio_get_uint64(s->blkio, 856 "mem-region-alignment", 857 &s->mem_region_alignment); 858 if (ret < 0) { 859 error_setg_errno(errp, -ret, 860 "failed to get mem-region-alignment: %s", 861 blkio_get_error_msg()); 862 blkio_destroy(&s->blkio); 863 return ret; 864 } 865 866 ret = blkio_get_bool(s->blkio, 867 "may-pin-mem-regions", 868 &s->may_pin_mem_regions); 869 if (ret < 0) { 870 /* Be conservative (assume pinning) if the property is not supported */ 871 s->may_pin_mem_regions = s->needs_mem_regions; 872 } 873 874 /* 875 * Notify if libblkio drivers pin memory and prevent features like 876 * virtio-mem from working. 877 */ 878 if (s->may_pin_mem_regions) { 879 ret = ram_block_discard_disable(true); 880 if (ret < 0) { 881 error_setg_errno(errp, -ret, "ram_block_discard_disable() failed"); 882 blkio_destroy(&s->blkio); 883 return ret; 884 } 885 } 886 887 ret = blkio_start(s->blkio); 888 if (ret < 0) { 889 error_setg_errno(errp, -ret, "blkio_start failed: %s", 890 blkio_get_error_msg()); 891 blkio_destroy(&s->blkio); 892 if (s->may_pin_mem_regions) { 893 ram_block_discard_disable(false); 894 } 895 return ret; 896 } 897 898 bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF; 899 bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | 900 BDRV_REQ_NO_FALLBACK; 901 902 qemu_mutex_init(&s->blkio_lock); 903 qemu_co_mutex_init(&s->bounce_lock); 904 qemu_co_queue_init(&s->bounce_available); 905 QLIST_INIT(&s->bounce_bufs); 906 s->blkioq = blkio_get_queue(s->blkio, 0); 907 s->completion_fd = blkioq_get_completion_fd(s->blkioq); 908 blkioq_set_completion_fd_enabled(s->blkioq, true); 909 910 blkio_attach_aio_context(bs, bdrv_get_aio_context(bs)); 911 return 0; 912 } 913 914 static void blkio_close(BlockDriverState *bs) 915 { 916 BDRVBlkioState *s = bs->opaque; 917 918 /* There is no destroy() API for s->bounce_lock */ 919 920 qemu_mutex_destroy(&s->blkio_lock); 921 blkio_detach_aio_context(bs); 922 blkio_destroy(&s->blkio); 923 924 if (s->may_pin_mem_regions) { 925 ram_block_discard_disable(false); 926 } 927 } 928 929 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs) 930 { 931 BDRVBlkioState *s = bs->opaque; 932 uint64_t capacity; 933 int ret; 934 935 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 936 ret = blkio_get_uint64(s->blkio, "capacity", &capacity); 937 } 938 if (ret < 0) { 939 return -ret; 940 } 941 942 return capacity; 943 } 944 945 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset, 946 bool exact, PreallocMode prealloc, 947 BdrvRequestFlags flags, Error **errp) 948 { 949 int64_t current_length; 950 951 if (prealloc != PREALLOC_MODE_OFF) { 952 error_setg(errp, "Unsupported preallocation mode '%s'", 953 PreallocMode_str(prealloc)); 954 return -ENOTSUP; 955 } 956 957 current_length = blkio_co_getlength(bs); 958 959 if (offset > current_length) { 960 error_setg(errp, "Cannot grow device"); 961 return -EINVAL; 962 } else if (exact && offset != current_length) { 963 error_setg(errp, "Cannot resize device"); 964 return -ENOTSUP; 965 } 966 967 return 0; 968 } 969 970 static int coroutine_fn 971 blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 972 { 973 return 0; 974 } 975 976 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) 977 { 978 BDRVBlkioState *s = bs->opaque; 979 QEMU_LOCK_GUARD(&s->blkio_lock); 980 int value; 981 int ret; 982 983 ret = blkio_get_int(s->blkio, "request-alignment", &value); 984 if (ret < 0) { 985 error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s", 986 blkio_get_error_msg()); 987 return; 988 } 989 bs->bl.request_alignment = value; 990 if (bs->bl.request_alignment < 1 || 991 bs->bl.request_alignment >= INT_MAX || 992 !is_power_of_2(bs->bl.request_alignment)) { 993 error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", " 994 "must be a power of 2 less than INT_MAX", 995 bs->bl.request_alignment); 996 return; 997 } 998 999 ret = blkio_get_int(s->blkio, "optimal-io-size", &value); 1000 if (ret < 0) { 1001 error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s", 1002 blkio_get_error_msg()); 1003 return; 1004 } 1005 bs->bl.opt_transfer = value; 1006 if (bs->bl.opt_transfer > INT_MAX || 1007 (bs->bl.opt_transfer % bs->bl.request_alignment)) { 1008 error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must " 1009 "be a multiple of %" PRIu32, bs->bl.opt_transfer, 1010 bs->bl.request_alignment); 1011 return; 1012 } 1013 1014 ret = blkio_get_int(s->blkio, "max-transfer", &value); 1015 if (ret < 0) { 1016 error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s", 1017 blkio_get_error_msg()); 1018 return; 1019 } 1020 bs->bl.max_transfer = value; 1021 if ((bs->bl.max_transfer % bs->bl.request_alignment) || 1022 (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) { 1023 error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be " 1024 "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)", 1025 bs->bl.max_transfer, bs->bl.request_alignment, 1026 bs->bl.opt_transfer); 1027 return; 1028 } 1029 1030 ret = blkio_get_int(s->blkio, "buf-alignment", &value); 1031 if (ret < 0) { 1032 error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s", 1033 blkio_get_error_msg()); 1034 return; 1035 } 1036 if (value < 1) { 1037 error_setg(errp, "invalid \"buf-alignment\" value %d, must be " 1038 "positive", value); 1039 return; 1040 } 1041 bs->bl.min_mem_alignment = value; 1042 1043 ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value); 1044 if (ret < 0) { 1045 error_setg_errno(errp, -ret, 1046 "failed to get \"optimal-buf-alignment\": %s", 1047 blkio_get_error_msg()); 1048 return; 1049 } 1050 if (value < 1) { 1051 error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, " 1052 "must be positive", value); 1053 return; 1054 } 1055 bs->bl.opt_mem_alignment = value; 1056 1057 ret = blkio_get_int(s->blkio, "max-segments", &value); 1058 if (ret < 0) { 1059 error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s", 1060 blkio_get_error_msg()); 1061 return; 1062 } 1063 if (value < 1) { 1064 error_setg(errp, "invalid \"max-segments\" value %d, must be positive", 1065 value); 1066 return; 1067 } 1068 bs->bl.max_iov = value; 1069 } 1070 1071 /* 1072 * TODO 1073 * Missing libblkio APIs: 1074 * - block_status 1075 * - co_invalidate_cache 1076 * 1077 * Out of scope? 1078 * - create 1079 * - truncate 1080 */ 1081 1082 /* 1083 * Do not include .format_name and .protocol_name because module_block.py 1084 * does not parse macros in the source code. 1085 */ 1086 #define BLKIO_DRIVER_COMMON \ 1087 .instance_size = sizeof(BDRVBlkioState), \ 1088 .bdrv_file_open = blkio_file_open, \ 1089 .bdrv_close = blkio_close, \ 1090 .bdrv_co_getlength = blkio_co_getlength, \ 1091 .bdrv_co_truncate = blkio_truncate, \ 1092 .bdrv_co_get_info = blkio_co_get_info, \ 1093 .bdrv_attach_aio_context = blkio_attach_aio_context, \ 1094 .bdrv_detach_aio_context = blkio_detach_aio_context, \ 1095 .bdrv_co_pdiscard = blkio_co_pdiscard, \ 1096 .bdrv_co_preadv = blkio_co_preadv, \ 1097 .bdrv_co_pwritev = blkio_co_pwritev, \ 1098 .bdrv_co_flush_to_disk = blkio_co_flush, \ 1099 .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ 1100 .bdrv_refresh_limits = blkio_refresh_limits, \ 1101 .bdrv_register_buf = blkio_register_buf, \ 1102 .bdrv_unregister_buf = blkio_unregister_buf, 1103 1104 /* 1105 * Use the same .format_name and .protocol_name as the libblkio driver name for 1106 * consistency. 1107 */ 1108 1109 static BlockDriver bdrv_io_uring = { 1110 .format_name = "io_uring", 1111 .protocol_name = "io_uring", 1112 .bdrv_needs_filename = true, 1113 BLKIO_DRIVER_COMMON 1114 }; 1115 1116 static BlockDriver bdrv_nvme_io_uring = { 1117 .format_name = "nvme-io_uring", 1118 .protocol_name = "nvme-io_uring", 1119 BLKIO_DRIVER_COMMON 1120 }; 1121 1122 static BlockDriver bdrv_virtio_blk_vfio_pci = { 1123 .format_name = "virtio-blk-vfio-pci", 1124 .protocol_name = "virtio-blk-vfio-pci", 1125 BLKIO_DRIVER_COMMON 1126 }; 1127 1128 static BlockDriver bdrv_virtio_blk_vhost_user = { 1129 .format_name = "virtio-blk-vhost-user", 1130 .protocol_name = "virtio-blk-vhost-user", 1131 BLKIO_DRIVER_COMMON 1132 }; 1133 1134 static BlockDriver bdrv_virtio_blk_vhost_vdpa = { 1135 .format_name = "virtio-blk-vhost-vdpa", 1136 .protocol_name = "virtio-blk-vhost-vdpa", 1137 BLKIO_DRIVER_COMMON 1138 }; 1139 1140 static void bdrv_blkio_init(void) 1141 { 1142 bdrv_register(&bdrv_io_uring); 1143 bdrv_register(&bdrv_nvme_io_uring); 1144 bdrv_register(&bdrv_virtio_blk_vfio_pci); 1145 bdrv_register(&bdrv_virtio_blk_vhost_user); 1146 bdrv_register(&bdrv_virtio_blk_vhost_vdpa); 1147 } 1148 1149 block_init(bdrv_blkio_init); 1150