1 /* SPDX-License-Identifier: LGPL-2.1-or-later */ 2 /* 3 * libblkio BlockDriver 4 * 5 * Copyright Red Hat, Inc. 6 * 7 * Author: 8 * Stefan Hajnoczi <stefanha@redhat.com> 9 */ 10 11 #include "qemu/osdep.h" 12 #include <blkio.h> 13 #include "block/block_int.h" 14 #include "exec/memory.h" 15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */ 16 #include "qapi/error.h" 17 #include "qemu/error-report.h" 18 #include "qapi/qmp/qdict.h" 19 #include "qemu/module.h" 20 #include "sysemu/block-backend.h" 21 #include "exec/memory.h" /* for ram_block_discard_disable() */ 22 23 #include "block/block-io.h" 24 25 /* 26 * Allocated bounce buffers are kept in a list sorted by buffer address. 27 */ 28 typedef struct BlkioBounceBuf { 29 QLIST_ENTRY(BlkioBounceBuf) next; 30 31 /* The bounce buffer */ 32 struct iovec buf; 33 } BlkioBounceBuf; 34 35 typedef struct { 36 /* 37 * libblkio is not thread-safe so this lock protects ->blkio and 38 * ->blkioq. 39 */ 40 QemuMutex blkio_lock; 41 struct blkio *blkio; 42 struct blkioq *blkioq; /* make this multi-queue in the future... */ 43 int completion_fd; 44 45 /* 46 * Polling fetches the next completion into this field. 47 * 48 * No lock is necessary since only one thread calls aio_poll() and invokes 49 * fd and poll handlers. 50 */ 51 struct blkio_completion poll_completion; 52 53 /* 54 * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available. 55 * 56 * Lock ordering: ->bounce_lock before ->blkio_lock. 57 */ 58 CoMutex bounce_lock; 59 60 /* Bounce buffer pool */ 61 struct blkio_mem_region bounce_pool; 62 63 /* Sorted list of allocated bounce buffers */ 64 QLIST_HEAD(, BlkioBounceBuf) bounce_bufs; 65 66 /* Queue for coroutines waiting for bounce buffer space */ 67 CoQueue bounce_available; 68 69 /* The value of the "mem-region-alignment" property */ 70 size_t mem_region_alignment; 71 72 /* Can we skip adding/deleting blkio_mem_regions? */ 73 bool needs_mem_regions; 74 75 /* Are file descriptors necessary for blkio_mem_regions? */ 76 bool needs_mem_region_fd; 77 78 /* Are madvise(MADV_DONTNEED)-style operations unavailable? */ 79 bool may_pin_mem_regions; 80 } BDRVBlkioState; 81 82 /* Called with s->bounce_lock held */ 83 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) 84 { 85 /* There can be no allocated bounce buffers during resize */ 86 assert(QLIST_EMPTY(&s->bounce_bufs)); 87 88 /* Pad size to reduce frequency of resize calls */ 89 bytes += 128 * 1024; 90 91 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 92 int ret; 93 94 if (s->bounce_pool.addr) { 95 blkio_unmap_mem_region(s->blkio, &s->bounce_pool); 96 blkio_free_mem_region(s->blkio, &s->bounce_pool); 97 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 98 } 99 100 /* Automatically freed when s->blkio is destroyed */ 101 ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes); 102 if (ret < 0) { 103 return ret; 104 } 105 106 ret = blkio_map_mem_region(s->blkio, &s->bounce_pool); 107 if (ret < 0) { 108 blkio_free_mem_region(s->blkio, &s->bounce_pool); 109 memset(&s->bounce_pool, 0, sizeof(s->bounce_pool)); 110 return ret; 111 } 112 } 113 114 return 0; 115 } 116 117 /* Called with s->bounce_lock held */ 118 static bool 119 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 120 int64_t bytes) 121 { 122 void *addr = s->bounce_pool.addr; 123 BlkioBounceBuf *cur = NULL; 124 BlkioBounceBuf *prev = NULL; 125 ptrdiff_t space; 126 127 /* 128 * This is just a linear search over the holes between requests. An 129 * efficient allocator would be nice. 130 */ 131 QLIST_FOREACH(cur, &s->bounce_bufs, next) { 132 space = cur->buf.iov_base - addr; 133 if (bytes <= space) { 134 QLIST_INSERT_BEFORE(cur, bounce, next); 135 bounce->buf.iov_base = addr; 136 bounce->buf.iov_len = bytes; 137 return true; 138 } 139 140 addr = cur->buf.iov_base + cur->buf.iov_len; 141 prev = cur; 142 } 143 144 /* Is there space after the last request? */ 145 space = s->bounce_pool.addr + s->bounce_pool.len - addr; 146 if (bytes > space) { 147 return false; 148 } 149 if (prev) { 150 QLIST_INSERT_AFTER(prev, bounce, next); 151 } else { 152 QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next); 153 } 154 bounce->buf.iov_base = addr; 155 bounce->buf.iov_len = bytes; 156 return true; 157 } 158 159 static int coroutine_fn 160 blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce, 161 int64_t bytes) 162 { 163 /* 164 * Ensure fairness: first time around we join the back of the queue, 165 * subsequently we join the front so we don't lose our place. 166 */ 167 CoQueueWaitFlags wait_flags = 0; 168 169 QEMU_LOCK_GUARD(&s->bounce_lock); 170 171 /* Ensure fairness: don't even try if other requests are already waiting */ 172 if (!qemu_co_queue_empty(&s->bounce_available)) { 173 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 174 wait_flags); 175 wait_flags = CO_QUEUE_WAIT_FRONT; 176 } 177 178 while (true) { 179 if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) { 180 /* Kick the next queued request since there may be space */ 181 qemu_co_queue_next(&s->bounce_available); 182 return 0; 183 } 184 185 /* 186 * If there are no in-flight requests then the pool was simply too 187 * small. 188 */ 189 if (QLIST_EMPTY(&s->bounce_bufs)) { 190 bool ok; 191 int ret; 192 193 ret = blkio_resize_bounce_pool(s, bytes); 194 if (ret < 0) { 195 /* Kick the next queued request since that may fail too */ 196 qemu_co_queue_next(&s->bounce_available); 197 return ret; 198 } 199 200 ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes); 201 assert(ok); /* must have space this time */ 202 return 0; 203 } 204 205 qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock, 206 wait_flags); 207 wait_flags = CO_QUEUE_WAIT_FRONT; 208 } 209 } 210 211 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s, 212 BlkioBounceBuf *bounce) 213 { 214 QEMU_LOCK_GUARD(&s->bounce_lock); 215 216 QLIST_REMOVE(bounce, next); 217 218 /* Wake up waiting coroutines since space may now be available */ 219 qemu_co_queue_next(&s->bounce_available); 220 } 221 222 /* For async to .bdrv_co_*() conversion */ 223 typedef struct { 224 Coroutine *coroutine; 225 int ret; 226 } BlkioCoData; 227 228 static void blkio_completion_fd_read(void *opaque) 229 { 230 BlockDriverState *bs = opaque; 231 BDRVBlkioState *s = bs->opaque; 232 uint64_t val; 233 int ret; 234 235 /* Polling may have already fetched a completion */ 236 if (s->poll_completion.user_data != NULL) { 237 BlkioCoData *cod = s->poll_completion.user_data; 238 cod->ret = s->poll_completion.ret; 239 240 /* Clear it in case aio_co_wake() enters a nested event loop */ 241 s->poll_completion.user_data = NULL; 242 243 aio_co_wake(cod->coroutine); 244 } 245 246 /* Reset completion fd status */ 247 ret = read(s->completion_fd, &val, sizeof(val)); 248 249 /* Ignore errors, there's nothing we can do */ 250 (void)ret; 251 252 /* 253 * Reading one completion at a time makes nested event loop re-entrancy 254 * simple. Change this loop to get multiple completions in one go if it 255 * becomes a performance bottleneck. 256 */ 257 while (true) { 258 struct blkio_completion completion; 259 260 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 261 ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL); 262 } 263 if (ret != 1) { 264 break; 265 } 266 267 BlkioCoData *cod = completion.user_data; 268 cod->ret = completion.ret; 269 aio_co_wake(cod->coroutine); 270 } 271 } 272 273 static bool blkio_completion_fd_poll(void *opaque) 274 { 275 BlockDriverState *bs = opaque; 276 BDRVBlkioState *s = bs->opaque; 277 int ret; 278 279 /* Just in case we already fetched a completion */ 280 if (s->poll_completion.user_data != NULL) { 281 return true; 282 } 283 284 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 285 ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL); 286 } 287 return ret == 1; 288 } 289 290 static void blkio_completion_fd_poll_ready(void *opaque) 291 { 292 blkio_completion_fd_read(opaque); 293 } 294 295 static void blkio_attach_aio_context(BlockDriverState *bs, 296 AioContext *new_context) 297 { 298 BDRVBlkioState *s = bs->opaque; 299 300 aio_set_fd_handler(new_context, s->completion_fd, 301 blkio_completion_fd_read, NULL, 302 blkio_completion_fd_poll, 303 blkio_completion_fd_poll_ready, bs); 304 } 305 306 static void blkio_detach_aio_context(BlockDriverState *bs) 307 { 308 BDRVBlkioState *s = bs->opaque; 309 310 aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL, 311 NULL, NULL, NULL); 312 } 313 314 /* 315 * Called by blk_io_unplug() or immediately if not plugged. Called without 316 * blkio_lock. 317 */ 318 static void blkio_unplug_fn(void *opaque) 319 { 320 BDRVBlkioState *s = opaque; 321 322 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 323 blkioq_do_io(s->blkioq, NULL, 0, 0, NULL); 324 } 325 } 326 327 /* 328 * Schedule I/O submission after enqueuing a new request. Called without 329 * blkio_lock. 330 */ 331 static void blkio_submit_io(BlockDriverState *bs) 332 { 333 BDRVBlkioState *s = bs->opaque; 334 335 blk_io_plug_call(blkio_unplug_fn, s); 336 } 337 338 static int coroutine_fn 339 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) 340 { 341 BDRVBlkioState *s = bs->opaque; 342 BlkioCoData cod = { 343 .coroutine = qemu_coroutine_self(), 344 }; 345 346 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 347 blkioq_discard(s->blkioq, offset, bytes, &cod, 0); 348 } 349 350 blkio_submit_io(bs); 351 qemu_coroutine_yield(); 352 return cod.ret; 353 } 354 355 static int coroutine_fn 356 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, 357 QEMUIOVector *qiov, BdrvRequestFlags flags) 358 { 359 BlkioCoData cod = { 360 .coroutine = qemu_coroutine_self(), 361 }; 362 BDRVBlkioState *s = bs->opaque; 363 bool use_bounce_buffer = 364 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 365 BlkioBounceBuf bounce; 366 struct iovec *iov = qiov->iov; 367 int iovcnt = qiov->niov; 368 369 if (use_bounce_buffer) { 370 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 371 if (ret < 0) { 372 return ret; 373 } 374 375 iov = &bounce.buf; 376 iovcnt = 1; 377 } 378 379 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 380 blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0); 381 } 382 383 blkio_submit_io(bs); 384 qemu_coroutine_yield(); 385 386 if (use_bounce_buffer) { 387 if (cod.ret == 0) { 388 qemu_iovec_from_buf(qiov, 0, 389 bounce.buf.iov_base, 390 bounce.buf.iov_len); 391 } 392 393 blkio_free_bounce_buffer(s, &bounce); 394 } 395 396 return cod.ret; 397 } 398 399 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset, 400 int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) 401 { 402 uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0; 403 BlkioCoData cod = { 404 .coroutine = qemu_coroutine_self(), 405 }; 406 BDRVBlkioState *s = bs->opaque; 407 bool use_bounce_buffer = 408 s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF); 409 BlkioBounceBuf bounce; 410 struct iovec *iov = qiov->iov; 411 int iovcnt = qiov->niov; 412 413 if (use_bounce_buffer) { 414 int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes); 415 if (ret < 0) { 416 return ret; 417 } 418 419 qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes); 420 iov = &bounce.buf; 421 iovcnt = 1; 422 } 423 424 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 425 blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags); 426 } 427 428 blkio_submit_io(bs); 429 qemu_coroutine_yield(); 430 431 if (use_bounce_buffer) { 432 blkio_free_bounce_buffer(s, &bounce); 433 } 434 435 return cod.ret; 436 } 437 438 static int coroutine_fn blkio_co_flush(BlockDriverState *bs) 439 { 440 BDRVBlkioState *s = bs->opaque; 441 BlkioCoData cod = { 442 .coroutine = qemu_coroutine_self(), 443 }; 444 445 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 446 blkioq_flush(s->blkioq, &cod, 0); 447 } 448 449 blkio_submit_io(bs); 450 qemu_coroutine_yield(); 451 return cod.ret; 452 } 453 454 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs, 455 int64_t offset, int64_t bytes, BdrvRequestFlags flags) 456 { 457 BDRVBlkioState *s = bs->opaque; 458 BlkioCoData cod = { 459 .coroutine = qemu_coroutine_self(), 460 }; 461 uint32_t blkio_flags = 0; 462 463 if (flags & BDRV_REQ_FUA) { 464 blkio_flags |= BLKIO_REQ_FUA; 465 } 466 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 467 blkio_flags |= BLKIO_REQ_NO_UNMAP; 468 } 469 if (flags & BDRV_REQ_NO_FALLBACK) { 470 blkio_flags |= BLKIO_REQ_NO_FALLBACK; 471 } 472 473 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 474 blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags); 475 } 476 477 blkio_submit_io(bs); 478 qemu_coroutine_yield(); 479 return cod.ret; 480 } 481 482 typedef enum { 483 BMRR_OK, 484 BMRR_SKIP, 485 BMRR_FAIL, 486 } BlkioMemRegionResult; 487 488 /* 489 * Produce a struct blkio_mem_region for a given address and size. 490 * 491 * This function produces identical results when called multiple times with the 492 * same arguments. This property is necessary because blkio_unmap_mem_region() 493 * must receive the same struct blkio_mem_region field values that were passed 494 * to blkio_map_mem_region(). 495 */ 496 static BlkioMemRegionResult 497 blkio_mem_region_from_host(BlockDriverState *bs, 498 void *host, size_t size, 499 struct blkio_mem_region *region, 500 Error **errp) 501 { 502 BDRVBlkioState *s = bs->opaque; 503 int fd = -1; 504 ram_addr_t fd_offset = 0; 505 506 if (((uintptr_t)host | size) % s->mem_region_alignment) { 507 error_setg(errp, "unaligned buf %p with size %zu", host, size); 508 return BMRR_FAIL; 509 } 510 511 /* Attempt to find the fd for the underlying memory */ 512 if (s->needs_mem_region_fd) { 513 RAMBlock *ram_block; 514 RAMBlock *end_block; 515 ram_addr_t offset; 516 517 /* 518 * bdrv_register_buf() is called with the BQL held so mr lives at least 519 * until this function returns. 520 */ 521 ram_block = qemu_ram_block_from_host(host, false, &fd_offset); 522 if (ram_block) { 523 fd = qemu_ram_get_fd(ram_block); 524 } 525 if (fd == -1) { 526 /* 527 * Ideally every RAMBlock would have an fd. pc-bios and other 528 * things don't. Luckily they are usually not I/O buffers and we 529 * can just ignore them. 530 */ 531 return BMRR_SKIP; 532 } 533 534 /* Make sure the fd covers the entire range */ 535 end_block = qemu_ram_block_from_host(host + size - 1, false, &offset); 536 if (ram_block != end_block) { 537 error_setg(errp, "registered buffer at %p with size %zu extends " 538 "beyond RAMBlock", host, size); 539 return BMRR_FAIL; 540 } 541 } 542 543 *region = (struct blkio_mem_region){ 544 .addr = host, 545 .len = size, 546 .fd = fd, 547 .fd_offset = fd_offset, 548 }; 549 return BMRR_OK; 550 } 551 552 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size, 553 Error **errp) 554 { 555 BDRVBlkioState *s = bs->opaque; 556 struct blkio_mem_region region; 557 BlkioMemRegionResult region_result; 558 int ret; 559 560 /* 561 * Mapping memory regions conflicts with RAM discard (virtio-mem) when 562 * there is pinning, so only do it when necessary. 563 */ 564 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 565 return true; 566 } 567 568 region_result = blkio_mem_region_from_host(bs, host, size, ®ion, errp); 569 if (region_result == BMRR_SKIP) { 570 return true; 571 } else if (region_result != BMRR_OK) { 572 return false; 573 } 574 575 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 576 ret = blkio_map_mem_region(s->blkio, ®ion); 577 } 578 579 if (ret < 0) { 580 error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s", 581 host, size, blkio_get_error_msg()); 582 return false; 583 } 584 return true; 585 } 586 587 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size) 588 { 589 BDRVBlkioState *s = bs->opaque; 590 struct blkio_mem_region region; 591 592 /* See blkio_register_buf() */ 593 if (!s->needs_mem_regions && s->may_pin_mem_regions) { 594 return; 595 } 596 597 if (blkio_mem_region_from_host(bs, host, size, ®ion, NULL) != BMRR_OK) { 598 return; 599 } 600 601 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 602 blkio_unmap_mem_region(s->blkio, ®ion); 603 } 604 } 605 606 static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options, 607 int flags, Error **errp) 608 { 609 const char *filename = qdict_get_str(options, "filename"); 610 BDRVBlkioState *s = bs->opaque; 611 int ret; 612 613 ret = blkio_set_str(s->blkio, "path", filename); 614 qdict_del(options, "filename"); 615 if (ret < 0) { 616 error_setg_errno(errp, -ret, "failed to set path: %s", 617 blkio_get_error_msg()); 618 return ret; 619 } 620 621 if (flags & BDRV_O_NOCACHE) { 622 ret = blkio_set_bool(s->blkio, "direct", true); 623 if (ret < 0) { 624 error_setg_errno(errp, -ret, "failed to set direct: %s", 625 blkio_get_error_msg()); 626 return ret; 627 } 628 } 629 630 ret = blkio_connect(s->blkio); 631 if (ret < 0) { 632 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 633 blkio_get_error_msg()); 634 return ret; 635 } 636 637 return 0; 638 } 639 640 static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options, 641 int flags, Error **errp) 642 { 643 const char *path = qdict_get_try_str(options, "path"); 644 BDRVBlkioState *s = bs->opaque; 645 int ret; 646 647 if (!path) { 648 error_setg(errp, "missing 'path' option"); 649 return -EINVAL; 650 } 651 652 ret = blkio_set_str(s->blkio, "path", path); 653 qdict_del(options, "path"); 654 if (ret < 0) { 655 error_setg_errno(errp, -ret, "failed to set path: %s", 656 blkio_get_error_msg()); 657 return ret; 658 } 659 660 if (!(flags & BDRV_O_NOCACHE)) { 661 error_setg(errp, "cache.direct=off is not supported"); 662 return -EINVAL; 663 } 664 665 ret = blkio_connect(s->blkio); 666 if (ret < 0) { 667 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 668 blkio_get_error_msg()); 669 return ret; 670 } 671 672 return 0; 673 } 674 675 static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, 676 int flags, Error **errp) 677 { 678 const char *path = qdict_get_try_str(options, "path"); 679 BDRVBlkioState *s = bs->opaque; 680 bool fd_supported = false; 681 int fd = -1, ret; 682 683 if (!path) { 684 error_setg(errp, "missing 'path' option"); 685 return -EINVAL; 686 } 687 688 if (!(flags & BDRV_O_NOCACHE)) { 689 error_setg(errp, "cache.direct=off is not supported"); 690 return -EINVAL; 691 } 692 693 if (blkio_set_int(s->blkio, "fd", -1) == 0) { 694 fd_supported = true; 695 } 696 697 /* 698 * If the libblkio driver supports fd passing, let's always use qemu_open() 699 * to open the `path`, so we can handle fd passing from the management 700 * layer through the "/dev/fdset/N" special path. 701 */ 702 if (fd_supported) { 703 /* 704 * `path` can contain the path of a character device 705 * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket. 706 * 707 * So, we should always open it with O_RDWR flag, also if BDRV_O_RDWR 708 * is not set in the open flags, because the exchange of IOCTL commands 709 * for example will fail. 710 * 711 * In order to open the device read-only, we are using the `read-only` 712 * property of the libblkio driver in blkio_file_open(). 713 */ 714 fd = qemu_open(path, O_RDWR, NULL); 715 if (fd < 0) { 716 /* 717 * qemu_open() can fail if the user specifies a path that is not 718 * a file or device, for example in the case of Unix Domain Socket 719 * for the virtio-blk-vhost-user driver. In such cases let's have 720 * libblkio open the path directly. 721 */ 722 fd_supported = false; 723 } else { 724 ret = blkio_set_int(s->blkio, "fd", fd); 725 if (ret < 0) { 726 fd_supported = false; 727 qemu_close(fd); 728 fd = -1; 729 } 730 } 731 } 732 733 if (!fd_supported) { 734 ret = blkio_set_str(s->blkio, "path", path); 735 if (ret < 0) { 736 error_setg_errno(errp, -ret, "failed to set path: %s", 737 blkio_get_error_msg()); 738 return ret; 739 } 740 } 741 742 ret = blkio_connect(s->blkio); 743 if (ret < 0 && fd >= 0) { 744 /* Failed to give the FD to libblkio, close it */ 745 qemu_close(fd); 746 fd = -1; 747 } 748 749 /* 750 * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208 751 * (libblkio <= v1.3.0), setting the `fd` property is not enough to check 752 * whether the driver supports the `fd` property or not. In that case, 753 * blkio_connect() will fail with -EINVAL. 754 * So let's try calling blkio_connect() again by directly setting `path` 755 * to cover this scenario. 756 */ 757 if (fd_supported && ret == -EINVAL) { 758 /* 759 * We need to clear the `fd` property we set previously by setting 760 * it to -1. 761 */ 762 ret = blkio_set_int(s->blkio, "fd", -1); 763 if (ret < 0) { 764 error_setg_errno(errp, -ret, "failed to set fd: %s", 765 blkio_get_error_msg()); 766 return ret; 767 } 768 769 ret = blkio_set_str(s->blkio, "path", path); 770 if (ret < 0) { 771 error_setg_errno(errp, -ret, "failed to set path: %s", 772 blkio_get_error_msg()); 773 return ret; 774 } 775 776 ret = blkio_connect(s->blkio); 777 } 778 779 if (ret < 0) { 780 error_setg_errno(errp, -ret, "blkio_connect failed: %s", 781 blkio_get_error_msg()); 782 return ret; 783 } 784 785 qdict_del(options, "path"); 786 787 return 0; 788 } 789 790 static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, 791 Error **errp) 792 { 793 const char *blkio_driver = bs->drv->protocol_name; 794 BDRVBlkioState *s = bs->opaque; 795 int ret; 796 797 ret = blkio_create(blkio_driver, &s->blkio); 798 if (ret < 0) { 799 error_setg_errno(errp, -ret, "blkio_create failed: %s", 800 blkio_get_error_msg()); 801 return ret; 802 } 803 804 if (!(flags & BDRV_O_RDWR)) { 805 ret = blkio_set_bool(s->blkio, "read-only", true); 806 if (ret < 0) { 807 error_setg_errno(errp, -ret, "failed to set read-only: %s", 808 blkio_get_error_msg()); 809 blkio_destroy(&s->blkio); 810 return ret; 811 } 812 } 813 814 if (strcmp(blkio_driver, "io_uring") == 0) { 815 ret = blkio_io_uring_connect(bs, options, flags, errp); 816 } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) { 817 ret = blkio_nvme_io_uring_connect(bs, options, flags, errp); 818 } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) { 819 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 820 } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) { 821 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 822 } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) { 823 ret = blkio_virtio_blk_connect(bs, options, flags, errp); 824 } else { 825 g_assert_not_reached(); 826 } 827 if (ret < 0) { 828 blkio_destroy(&s->blkio); 829 return ret; 830 } 831 832 ret = blkio_get_bool(s->blkio, 833 "needs-mem-regions", 834 &s->needs_mem_regions); 835 if (ret < 0) { 836 error_setg_errno(errp, -ret, 837 "failed to get needs-mem-regions: %s", 838 blkio_get_error_msg()); 839 blkio_destroy(&s->blkio); 840 return ret; 841 } 842 843 ret = blkio_get_bool(s->blkio, 844 "needs-mem-region-fd", 845 &s->needs_mem_region_fd); 846 if (ret < 0) { 847 error_setg_errno(errp, -ret, 848 "failed to get needs-mem-region-fd: %s", 849 blkio_get_error_msg()); 850 blkio_destroy(&s->blkio); 851 return ret; 852 } 853 854 ret = blkio_get_uint64(s->blkio, 855 "mem-region-alignment", 856 &s->mem_region_alignment); 857 if (ret < 0) { 858 error_setg_errno(errp, -ret, 859 "failed to get mem-region-alignment: %s", 860 blkio_get_error_msg()); 861 blkio_destroy(&s->blkio); 862 return ret; 863 } 864 865 ret = blkio_get_bool(s->blkio, 866 "may-pin-mem-regions", 867 &s->may_pin_mem_regions); 868 if (ret < 0) { 869 /* Be conservative (assume pinning) if the property is not supported */ 870 s->may_pin_mem_regions = s->needs_mem_regions; 871 } 872 873 /* 874 * Notify if libblkio drivers pin memory and prevent features like 875 * virtio-mem from working. 876 */ 877 if (s->may_pin_mem_regions) { 878 ret = ram_block_discard_disable(true); 879 if (ret < 0) { 880 error_setg_errno(errp, -ret, "ram_block_discard_disable() failed"); 881 blkio_destroy(&s->blkio); 882 return ret; 883 } 884 } 885 886 ret = blkio_start(s->blkio); 887 if (ret < 0) { 888 error_setg_errno(errp, -ret, "blkio_start failed: %s", 889 blkio_get_error_msg()); 890 blkio_destroy(&s->blkio); 891 if (s->may_pin_mem_regions) { 892 ram_block_discard_disable(false); 893 } 894 return ret; 895 } 896 897 bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF; 898 bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | 899 BDRV_REQ_NO_FALLBACK; 900 901 qemu_mutex_init(&s->blkio_lock); 902 qemu_co_mutex_init(&s->bounce_lock); 903 qemu_co_queue_init(&s->bounce_available); 904 QLIST_INIT(&s->bounce_bufs); 905 s->blkioq = blkio_get_queue(s->blkio, 0); 906 s->completion_fd = blkioq_get_completion_fd(s->blkioq); 907 blkioq_set_completion_fd_enabled(s->blkioq, true); 908 909 blkio_attach_aio_context(bs, bdrv_get_aio_context(bs)); 910 return 0; 911 } 912 913 static void blkio_close(BlockDriverState *bs) 914 { 915 BDRVBlkioState *s = bs->opaque; 916 917 /* There is no destroy() API for s->bounce_lock */ 918 919 qemu_mutex_destroy(&s->blkio_lock); 920 blkio_detach_aio_context(bs); 921 blkio_destroy(&s->blkio); 922 923 if (s->may_pin_mem_regions) { 924 ram_block_discard_disable(false); 925 } 926 } 927 928 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs) 929 { 930 BDRVBlkioState *s = bs->opaque; 931 uint64_t capacity; 932 int ret; 933 934 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { 935 ret = blkio_get_uint64(s->blkio, "capacity", &capacity); 936 } 937 if (ret < 0) { 938 return -ret; 939 } 940 941 return capacity; 942 } 943 944 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset, 945 bool exact, PreallocMode prealloc, 946 BdrvRequestFlags flags, Error **errp) 947 { 948 int64_t current_length; 949 950 if (prealloc != PREALLOC_MODE_OFF) { 951 error_setg(errp, "Unsupported preallocation mode '%s'", 952 PreallocMode_str(prealloc)); 953 return -ENOTSUP; 954 } 955 956 current_length = blkio_co_getlength(bs); 957 958 if (offset > current_length) { 959 error_setg(errp, "Cannot grow device"); 960 return -EINVAL; 961 } else if (exact && offset != current_length) { 962 error_setg(errp, "Cannot resize device"); 963 return -ENOTSUP; 964 } 965 966 return 0; 967 } 968 969 static int coroutine_fn 970 blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 971 { 972 return 0; 973 } 974 975 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) 976 { 977 BDRVBlkioState *s = bs->opaque; 978 QEMU_LOCK_GUARD(&s->blkio_lock); 979 int value; 980 int ret; 981 982 ret = blkio_get_int(s->blkio, "request-alignment", &value); 983 if (ret < 0) { 984 error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s", 985 blkio_get_error_msg()); 986 return; 987 } 988 bs->bl.request_alignment = value; 989 if (bs->bl.request_alignment < 1 || 990 bs->bl.request_alignment >= INT_MAX || 991 !is_power_of_2(bs->bl.request_alignment)) { 992 error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", " 993 "must be a power of 2 less than INT_MAX", 994 bs->bl.request_alignment); 995 return; 996 } 997 998 ret = blkio_get_int(s->blkio, "optimal-io-size", &value); 999 if (ret < 0) { 1000 error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s", 1001 blkio_get_error_msg()); 1002 return; 1003 } 1004 bs->bl.opt_transfer = value; 1005 if (bs->bl.opt_transfer > INT_MAX || 1006 (bs->bl.opt_transfer % bs->bl.request_alignment)) { 1007 error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must " 1008 "be a multiple of %" PRIu32, bs->bl.opt_transfer, 1009 bs->bl.request_alignment); 1010 return; 1011 } 1012 1013 ret = blkio_get_int(s->blkio, "max-transfer", &value); 1014 if (ret < 0) { 1015 error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s", 1016 blkio_get_error_msg()); 1017 return; 1018 } 1019 bs->bl.max_transfer = value; 1020 if ((bs->bl.max_transfer % bs->bl.request_alignment) || 1021 (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) { 1022 error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be " 1023 "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)", 1024 bs->bl.max_transfer, bs->bl.request_alignment, 1025 bs->bl.opt_transfer); 1026 return; 1027 } 1028 1029 ret = blkio_get_int(s->blkio, "buf-alignment", &value); 1030 if (ret < 0) { 1031 error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s", 1032 blkio_get_error_msg()); 1033 return; 1034 } 1035 if (value < 1) { 1036 error_setg(errp, "invalid \"buf-alignment\" value %d, must be " 1037 "positive", value); 1038 return; 1039 } 1040 bs->bl.min_mem_alignment = value; 1041 1042 ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value); 1043 if (ret < 0) { 1044 error_setg_errno(errp, -ret, 1045 "failed to get \"optimal-buf-alignment\": %s", 1046 blkio_get_error_msg()); 1047 return; 1048 } 1049 if (value < 1) { 1050 error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, " 1051 "must be positive", value); 1052 return; 1053 } 1054 bs->bl.opt_mem_alignment = value; 1055 1056 ret = blkio_get_int(s->blkio, "max-segments", &value); 1057 if (ret < 0) { 1058 error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s", 1059 blkio_get_error_msg()); 1060 return; 1061 } 1062 if (value < 1) { 1063 error_setg(errp, "invalid \"max-segments\" value %d, must be positive", 1064 value); 1065 return; 1066 } 1067 bs->bl.max_iov = value; 1068 } 1069 1070 /* 1071 * TODO 1072 * Missing libblkio APIs: 1073 * - block_status 1074 * - co_invalidate_cache 1075 * 1076 * Out of scope? 1077 * - create 1078 * - truncate 1079 */ 1080 1081 /* 1082 * Do not include .format_name and .protocol_name because module_block.py 1083 * does not parse macros in the source code. 1084 */ 1085 #define BLKIO_DRIVER_COMMON \ 1086 .instance_size = sizeof(BDRVBlkioState), \ 1087 .bdrv_file_open = blkio_file_open, \ 1088 .bdrv_close = blkio_close, \ 1089 .bdrv_co_getlength = blkio_co_getlength, \ 1090 .bdrv_co_truncate = blkio_truncate, \ 1091 .bdrv_co_get_info = blkio_co_get_info, \ 1092 .bdrv_attach_aio_context = blkio_attach_aio_context, \ 1093 .bdrv_detach_aio_context = blkio_detach_aio_context, \ 1094 .bdrv_co_pdiscard = blkio_co_pdiscard, \ 1095 .bdrv_co_preadv = blkio_co_preadv, \ 1096 .bdrv_co_pwritev = blkio_co_pwritev, \ 1097 .bdrv_co_flush_to_disk = blkio_co_flush, \ 1098 .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ 1099 .bdrv_refresh_limits = blkio_refresh_limits, \ 1100 .bdrv_register_buf = blkio_register_buf, \ 1101 .bdrv_unregister_buf = blkio_unregister_buf, 1102 1103 /* 1104 * Use the same .format_name and .protocol_name as the libblkio driver name for 1105 * consistency. 1106 */ 1107 1108 static BlockDriver bdrv_io_uring = { 1109 .format_name = "io_uring", 1110 .protocol_name = "io_uring", 1111 .bdrv_needs_filename = true, 1112 BLKIO_DRIVER_COMMON 1113 }; 1114 1115 static BlockDriver bdrv_nvme_io_uring = { 1116 .format_name = "nvme-io_uring", 1117 .protocol_name = "nvme-io_uring", 1118 BLKIO_DRIVER_COMMON 1119 }; 1120 1121 static BlockDriver bdrv_virtio_blk_vfio_pci = { 1122 .format_name = "virtio-blk-vfio-pci", 1123 .protocol_name = "virtio-blk-vfio-pci", 1124 BLKIO_DRIVER_COMMON 1125 }; 1126 1127 static BlockDriver bdrv_virtio_blk_vhost_user = { 1128 .format_name = "virtio-blk-vhost-user", 1129 .protocol_name = "virtio-blk-vhost-user", 1130 BLKIO_DRIVER_COMMON 1131 }; 1132 1133 static BlockDriver bdrv_virtio_blk_vhost_vdpa = { 1134 .format_name = "virtio-blk-vhost-vdpa", 1135 .protocol_name = "virtio-blk-vhost-vdpa", 1136 BLKIO_DRIVER_COMMON 1137 }; 1138 1139 static void bdrv_blkio_init(void) 1140 { 1141 bdrv_register(&bdrv_io_uring); 1142 bdrv_register(&bdrv_nvme_io_uring); 1143 bdrv_register(&bdrv_virtio_blk_vfio_pci); 1144 bdrv_register(&bdrv_virtio_blk_vhost_user); 1145 bdrv_register(&bdrv_virtio_blk_vhost_vdpa); 1146 } 1147 1148 block_init(bdrv_blkio_init); 1149