/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h"
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
#include "sysemu/block-backend.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */

#include "block/block-io.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"
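
/*
 * Example usage (node names and device paths below are illustrative only):
 *
 *   --blockdev driver=io_uring,node-name=drv0,filename=test.img
 *   --blockdev driver=virtio-blk-vhost-vdpa,node-name=drv0,path=/dev/vhost-vdpa-0,cache.direct=on
 */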

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /* Must I/O buffers be placed inside mapped blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

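/*
 * Reserve space for a request in the bounce buffer pool, growing the pool if
 * necessary. May yield in qemu_co_queue_wait_flags() until space becomes
 * available.
 */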
static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

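/*
 * Completion fd handler: drain completions and wake the coroutine that
 * submitted each completed request.
 */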
static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

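/*
 * Polling handler: opportunistically fetch one completion without sleeping.
 * Returning true causes the poll_ready callback to run.
 */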
static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context, s->completion_fd,
                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready, bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
                       NULL, NULL, NULL);
}

/*
 * Called by blk_io_unplug() or immediately if not plugged. Called without
 * blkio_lock.
 */
static void blkio_unplug_fn(void *opaque)
{
    BDRVBlkioState *s = opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

/*
 * Schedule I/O submission after enqueuing a new request. Called without
 * blkio_lock.
 */
static void blkio_submit_io(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    blk_io_plug_call(blkio_unplug_fn, s);
}

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

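/*
 * The read/write paths below bounce through the pre-mapped pool when the
 * libblkio driver requires mem regions and the caller's buffer is not already
 * registered (BDRV_REQ_REGISTERED_BUF).
 */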
static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so the RAMBlock
         * lives at least until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

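/*
 * bdrv_register_buf() hook: map a long-lived I/O buffer as a blkio_mem_region
 * so that requests using it can skip the bounce buffer path.
 */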
static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

static int blkio_virtio_blk_common_open(BlockDriverState *bs,
        QDict *options, int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    bool fd_supported = false;
    int fd, ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
        fd_supported = true;
    }

    /*
     * If the libblkio driver supports fd passing, let's always use qemu_open()
     * to open the `path`, so we can handle fd passing from the management
     * layer through the "/dev/fdset/N" special path.
     */
    if (fd_supported) {
        int open_flags;

        if (flags & BDRV_O_RDWR) {
            open_flags = O_RDWR;
        } else {
            open_flags = O_RDONLY;
        }

        fd = qemu_open(path, open_flags, errp);
        if (fd < 0) {
            return -EINVAL;
        }

        ret = blkio_set_int(s->blkio, "fd", fd);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set fd: %s",
                             blkio_get_error_msg());
            qemu_close(fd);
            return ret;
        }
    } else {
        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    qdict_del(options, "path");

    return 0;
}

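/*
 * Open flow: create the blkio instance, apply driver-specific options,
 * connect, query the memory-region requirements, and finally start the
 * instance before installing the completion fd handler.
 */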
static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Drivers that may pin memory are incompatible with RAM discard
     * (e.g. virtio-mem), so disable it. This fails if RAM discard is already
     * in use.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        return ret;
    }

    return capacity;
}

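/*
 * libblkio has no resize API (truncate is listed as out of scope in the TODO
 * below), so only requests that leave the size unchanged can succeed.
 */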
static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

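/*
 * Each libblkio driver gets its own BlockDriver that differs only in its name
 * and in the extra fields passed via the variadic arguments.
 */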
#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name             = name, \
        .protocol_name           = name, \
        .instance_size           = sizeof(BDRVBlkioState), \
        .bdrv_file_open          = blkio_file_open, \
        .bdrv_close              = blkio_close, \
        .bdrv_co_getlength       = blkio_co_getlength, \
        .bdrv_co_truncate        = blkio_truncate, \
        .bdrv_co_get_info        = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard        = blkio_co_pdiscard, \
        .bdrv_co_preadv          = blkio_co_preadv, \
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
        __VA_ARGS__ \
    }

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);