xref: /openbmc/qemu/block/blkio.c (revision d5657258)
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 /*
3  * libblkio BlockDriver
4  *
5  * Copyright Red Hat, Inc.
6  *
7  * Author:
8  *   Stefan Hajnoczi <stefanha@redhat.com>
9  */
10 
11 #include "qemu/osdep.h"
12 #include <blkio.h>
13 #include "block/block_int.h"
14 #include "exec/memory.h"
15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
16 #include "qapi/error.h"
17 #include "qemu/error-report.h"
18 #include "qapi/qmp/qdict.h"
19 #include "qemu/module.h"
20 #include "exec/memory.h" /* for ram_block_discard_disable() */
21 
22 #include "block/block-io.h"
23 
/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 *
 * Each name is used both when declaring a BlockDriver with BLKIO_DRIVER() and
 * when blkio_file_open() compares bs->drv->protocol_name to pick the
 * driver-specific open routine.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"
33 
/*
 * One allocation carved out of the bounce buffer pool.
 *
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    /* Linkage in BDRVBlkioState.bounce_bufs */
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer: start address and length within the pool */
    struct iovec buf;
} BlkioBounceBuf;
43 
/* Per-BlockDriverState state of a libblkio-backed block driver */
typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */

    /* fd of ->blkioq watched by the AioContext for request completions */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * A non-NULL ->user_data means a completion is pending here and has not
     * yet been delivered by blkio_completion_fd_read().
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /*
     * The value of the "needs-mem-regions" property. When true, requests
     * that are not flagged BDRV_REQ_REGISTERED_BUF go through a bounce
     * buffer. When false, adding/deleting blkio_mem_regions may be skipped.
     */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /*
     * Are madvise(MADV_DONTNEED)-style operations unavailable?
     *
     * While true, RAM block discard is kept disabled between open and close.
     */
    bool may_pin_mem_regions;
} BDRVBlkioState;
90 
91 /* Called with s->bounce_lock held */
92 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
93 {
94     /* There can be no allocated bounce buffers during resize */
95     assert(QLIST_EMPTY(&s->bounce_bufs));
96 
97     /* Pad size to reduce frequency of resize calls */
98     bytes += 128 * 1024;
99 
100     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
101         int ret;
102 
103         if (s->bounce_pool.addr) {
104             blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
105             blkio_free_mem_region(s->blkio, &s->bounce_pool);
106             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
107         }
108 
109         /* Automatically freed when s->blkio is destroyed */
110         ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
111         if (ret < 0) {
112             return ret;
113         }
114 
115         ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
116         if (ret < 0) {
117             blkio_free_mem_region(s->blkio, &s->bounce_pool);
118             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
119             return ret;
120         }
121     }
122 
123     return 0;
124 }
125 
126 /* Called with s->bounce_lock held */
127 static bool
128 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
129                              int64_t bytes)
130 {
131     void *addr = s->bounce_pool.addr;
132     BlkioBounceBuf *cur = NULL;
133     BlkioBounceBuf *prev = NULL;
134     ptrdiff_t space;
135 
136     /*
137      * This is just a linear search over the holes between requests. An
138      * efficient allocator would be nice.
139      */
140     QLIST_FOREACH(cur, &s->bounce_bufs, next) {
141         space = cur->buf.iov_base - addr;
142         if (bytes <= space) {
143             QLIST_INSERT_BEFORE(cur, bounce, next);
144             bounce->buf.iov_base = addr;
145             bounce->buf.iov_len = bytes;
146             return true;
147         }
148 
149         addr = cur->buf.iov_base + cur->buf.iov_len;
150         prev = cur;
151     }
152 
153     /* Is there space after the last request? */
154     space = s->bounce_pool.addr + s->bounce_pool.len - addr;
155     if (bytes > space) {
156         return false;
157     }
158     if (prev) {
159         QLIST_INSERT_AFTER(prev, bounce, next);
160     } else {
161         QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
162     }
163     bounce->buf.iov_base = addr;
164     bounce->buf.iov_len = bytes;
165     return true;
166 }
167 
/*
 * Allocate @bytes from the bounce buffer pool and record the allocation in
 * @bounce, waiting on s->bounce_available until space can be found.
 *
 * s->bounce_lock is taken for the duration of the call and is handed to
 * qemu_co_queue_wait_flags() whenever the coroutine has to wait.
 *
 * Returns 0 on success. If the pool has to be resized and that fails, the
 * negative value from blkio_resize_bounce_pool() is returned.
 */
static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            /* The resized pool is empty and holds at least @bytes */
            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        /* Other requests hold pool space; wait until one of them frees it */
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}
219 
220 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
221                                                   BlkioBounceBuf *bounce)
222 {
223     QEMU_LOCK_GUARD(&s->bounce_lock);
224 
225     QLIST_REMOVE(bounce, next);
226 
227     /* Wake up waiting coroutines since space may now be available */
228     qemu_co_queue_next(&s->bounce_available);
229 }
230 
/*
 * For async to .bdrv_co_*() conversion
 *
 * A pointer to this structure is passed as the user_data of each request
 * enqueued on the blkioq; the completion handler fills in ->ret and wakes
 * ->coroutine.
 */
typedef struct {
    Coroutine *coroutine; /* coroutine that issued the request */
    int ret;              /* completion status copied from blkio_completion */
} BlkioCoData;
236 
/*
 * Completion fd handler: wake the coroutines whose requests have completed.
 *
 * @opaque is the BlockDriverState registered in blkio_attach_aio_context().
 *
 * A completion stashed in s->poll_completion by blkio_completion_fd_poll()
 * is delivered first, then the completion fd is read and the queue is drained
 * one completion at a time.
 */
static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        /* user_data is the BlkioCoData passed when the request was queued */
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        /* Stop unless exactly one completion was fetched */
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}
281 
282 static bool blkio_completion_fd_poll(void *opaque)
283 {
284     BlockDriverState *bs = opaque;
285     BDRVBlkioState *s = bs->opaque;
286     int ret;
287 
288     /* Just in case we already fetched a completion */
289     if (s->poll_completion.user_data != NULL) {
290         return true;
291     }
292 
293     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
294         ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
295     }
296     return ret == 1;
297 }
298 
/*
 * Poll-ready handler: delegates to blkio_completion_fd_read(), which delivers
 * the completion stashed by blkio_completion_fd_poll() and drains the queue.
 */
static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
303 
304 static void blkio_attach_aio_context(BlockDriverState *bs,
305                                      AioContext *new_context)
306 {
307     BDRVBlkioState *s = bs->opaque;
308 
309     aio_set_fd_handler(new_context,
310                        s->completion_fd,
311                        false,
312                        blkio_completion_fd_read,
313                        NULL,
314                        blkio_completion_fd_poll,
315                        blkio_completion_fd_poll_ready,
316                        bs);
317 }
318 
319 static void blkio_detach_aio_context(BlockDriverState *bs)
320 {
321     BDRVBlkioState *s = bs->opaque;
322 
323     aio_set_fd_handler(bdrv_get_aio_context(bs),
324                        s->completion_fd,
325                        false, NULL, NULL, NULL, NULL, NULL);
326 }
327 
328 /* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
329 static void blkio_submit_io(BlockDriverState *bs)
330 {
331     if (qatomic_read(&bs->io_plugged) == 0) {
332         BDRVBlkioState *s = bs->opaque;
333 
334         blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
335     }
336 }
337 
338 static int coroutine_fn
339 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
340 {
341     BDRVBlkioState *s = bs->opaque;
342     BlkioCoData cod = {
343         .coroutine = qemu_coroutine_self(),
344     };
345 
346     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
347         blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
348         blkio_submit_io(bs);
349     }
350 
351     qemu_coroutine_yield();
352     return cod.ret;
353 }
354 
355 static int coroutine_fn
356 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
357                 QEMUIOVector *qiov, BdrvRequestFlags flags)
358 {
359     BlkioCoData cod = {
360         .coroutine = qemu_coroutine_self(),
361     };
362     BDRVBlkioState *s = bs->opaque;
363     bool use_bounce_buffer =
364         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
365     BlkioBounceBuf bounce;
366     struct iovec *iov = qiov->iov;
367     int iovcnt = qiov->niov;
368 
369     if (use_bounce_buffer) {
370         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
371         if (ret < 0) {
372             return ret;
373         }
374 
375         iov = &bounce.buf;
376         iovcnt = 1;
377     }
378 
379     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
380         blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
381         blkio_submit_io(bs);
382     }
383 
384     qemu_coroutine_yield();
385 
386     if (use_bounce_buffer) {
387         if (cod.ret == 0) {
388             qemu_iovec_from_buf(qiov, 0,
389                                 bounce.buf.iov_base,
390                                 bounce.buf.iov_len);
391         }
392 
393         blkio_free_bounce_buffer(s, &bounce);
394     }
395 
396     return cod.ret;
397 }
398 
399 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
400         int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
401 {
402     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
403     BlkioCoData cod = {
404         .coroutine = qemu_coroutine_self(),
405     };
406     BDRVBlkioState *s = bs->opaque;
407     bool use_bounce_buffer =
408         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
409     BlkioBounceBuf bounce;
410     struct iovec *iov = qiov->iov;
411     int iovcnt = qiov->niov;
412 
413     if (use_bounce_buffer) {
414         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
415         if (ret < 0) {
416             return ret;
417         }
418 
419         qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
420         iov = &bounce.buf;
421         iovcnt = 1;
422     }
423 
424     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
425         blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
426         blkio_submit_io(bs);
427     }
428 
429     qemu_coroutine_yield();
430 
431     if (use_bounce_buffer) {
432         blkio_free_bounce_buffer(s, &bounce);
433     }
434 
435     return cod.ret;
436 }
437 
438 static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
439 {
440     BDRVBlkioState *s = bs->opaque;
441     BlkioCoData cod = {
442         .coroutine = qemu_coroutine_self(),
443     };
444 
445     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
446         blkioq_flush(s->blkioq, &cod, 0);
447         blkio_submit_io(bs);
448     }
449 
450     qemu_coroutine_yield();
451     return cod.ret;
452 }
453 
454 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
455     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
456 {
457     BDRVBlkioState *s = bs->opaque;
458     BlkioCoData cod = {
459         .coroutine = qemu_coroutine_self(),
460     };
461     uint32_t blkio_flags = 0;
462 
463     if (flags & BDRV_REQ_FUA) {
464         blkio_flags |= BLKIO_REQ_FUA;
465     }
466     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
467         blkio_flags |= BLKIO_REQ_NO_UNMAP;
468     }
469     if (flags & BDRV_REQ_NO_FALLBACK) {
470         blkio_flags |= BLKIO_REQ_NO_FALLBACK;
471     }
472 
473     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
474         blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
475         blkio_submit_io(bs);
476     }
477 
478     qemu_coroutine_yield();
479     return cod.ret;
480 }
481 
482 static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
483 {
484     BDRVBlkioState *s = bs->opaque;
485 
486     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
487         blkio_submit_io(bs);
488     }
489 }
490 
/* Outcome of blkio_mem_region_from_host() */
typedef enum {
    BMRR_OK,   /* region was filled in and can be mapped/unmapped */
    BMRR_SKIP, /* no fd backs the memory; silently ignore this buffer */
    BMRR_FAIL, /* buffer is unusable; an error has been set */
} BlkioMemRegionResult;
496 
497 /*
498  * Produce a struct blkio_mem_region for a given address and size.
499  *
500  * This function produces identical results when called multiple times with the
501  * same arguments. This property is necessary because blkio_unmap_mem_region()
502  * must receive the same struct blkio_mem_region field values that were passed
503  * to blkio_map_mem_region().
504  */
505 static BlkioMemRegionResult
506 blkio_mem_region_from_host(BlockDriverState *bs,
507                            void *host, size_t size,
508                            struct blkio_mem_region *region,
509                            Error **errp)
510 {
511     BDRVBlkioState *s = bs->opaque;
512     int fd = -1;
513     ram_addr_t fd_offset = 0;
514 
515     if (((uintptr_t)host | size) % s->mem_region_alignment) {
516         error_setg(errp, "unaligned buf %p with size %zu", host, size);
517         return BMRR_FAIL;
518     }
519 
520     /* Attempt to find the fd for the underlying memory */
521     if (s->needs_mem_region_fd) {
522         RAMBlock *ram_block;
523         RAMBlock *end_block;
524         ram_addr_t offset;
525 
526         /*
527          * bdrv_register_buf() is called with the BQL held so mr lives at least
528          * until this function returns.
529          */
530         ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
531         if (ram_block) {
532             fd = qemu_ram_get_fd(ram_block);
533         }
534         if (fd == -1) {
535             /*
536              * Ideally every RAMBlock would have an fd. pc-bios and other
537              * things don't. Luckily they are usually not I/O buffers and we
538              * can just ignore them.
539              */
540             return BMRR_SKIP;
541         }
542 
543         /* Make sure the fd covers the entire range */
544         end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
545         if (ram_block != end_block) {
546             error_setg(errp, "registered buffer at %p with size %zu extends "
547                        "beyond RAMBlock", host, size);
548             return BMRR_FAIL;
549         }
550     }
551 
552     *region = (struct blkio_mem_region){
553         .addr = host,
554         .len = size,
555         .fd = fd,
556         .fd_offset = fd_offset,
557     };
558     return BMRR_OK;
559 }
560 
561 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
562                                Error **errp)
563 {
564     BDRVBlkioState *s = bs->opaque;
565     struct blkio_mem_region region;
566     BlkioMemRegionResult region_result;
567     int ret;
568 
569     /*
570      * Mapping memory regions conflicts with RAM discard (virtio-mem) when
571      * there is pinning, so only do it when necessary.
572      */
573     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
574         return true;
575     }
576 
577     region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
578     if (region_result == BMRR_SKIP) {
579         return true;
580     } else if (region_result != BMRR_OK) {
581         return false;
582     }
583 
584     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
585         ret = blkio_map_mem_region(s->blkio, &region);
586     }
587 
588     if (ret < 0) {
589         error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
590                    host, size, blkio_get_error_msg());
591         return false;
592     }
593     return true;
594 }
595 
596 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
597 {
598     BDRVBlkioState *s = bs->opaque;
599     struct blkio_mem_region region;
600 
601     /* See blkio_register_buf() */
602     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
603         return;
604     }
605 
606     if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
607         return;
608     }
609 
610     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
611         blkio_unmap_mem_region(s->blkio, &region);
612     }
613 }
614 
615 static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
616                                Error **errp)
617 {
618     const char *filename = qdict_get_str(options, "filename");
619     BDRVBlkioState *s = bs->opaque;
620     int ret;
621 
622     ret = blkio_set_str(s->blkio, "path", filename);
623     qdict_del(options, "filename");
624     if (ret < 0) {
625         error_setg_errno(errp, -ret, "failed to set path: %s",
626                          blkio_get_error_msg());
627         return ret;
628     }
629 
630     if (flags & BDRV_O_NOCACHE) {
631         ret = blkio_set_bool(s->blkio, "direct", true);
632         if (ret < 0) {
633             error_setg_errno(errp, -ret, "failed to set direct: %s",
634                              blkio_get_error_msg());
635             return ret;
636         }
637     }
638 
639     return 0;
640 }
641 
642 static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
643                                Error **errp)
644 {
645     const char *path = qdict_get_try_str(options, "path");
646     BDRVBlkioState *s = bs->opaque;
647     int ret;
648 
649     if (!path) {
650         error_setg(errp, "missing 'path' option");
651         return -EINVAL;
652     }
653 
654     ret = blkio_set_str(s->blkio, "path", path);
655     qdict_del(options, "path");
656     if (ret < 0) {
657         error_setg_errno(errp, -ret, "failed to set path: %s",
658                          blkio_get_error_msg());
659         return ret;
660     }
661 
662     if (!(flags & BDRV_O_NOCACHE)) {
663         error_setg(errp, "cache.direct=off is not supported");
664         return -EINVAL;
665     }
666 
667     return 0;
668 }
669 
670 static int blkio_virtio_blk_common_open(BlockDriverState *bs,
671         QDict *options, int flags, Error **errp)
672 {
673     const char *path = qdict_get_try_str(options, "path");
674     BDRVBlkioState *s = bs->opaque;
675     int ret;
676 
677     if (!path) {
678         error_setg(errp, "missing 'path' option");
679         return -EINVAL;
680     }
681 
682     ret = blkio_set_str(s->blkio, "path", path);
683     qdict_del(options, "path");
684     if (ret < 0) {
685         error_setg_errno(errp, -ret, "failed to set path: %s",
686                          blkio_get_error_msg());
687         return ret;
688     }
689 
690     if (!(flags & BDRV_O_NOCACHE)) {
691         error_setg(errp, "cache.direct=off is not supported");
692         return -EINVAL;
693     }
694     return 0;
695 }
696 
697 static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
698                            Error **errp)
699 {
700     const char *blkio_driver = bs->drv->protocol_name;
701     BDRVBlkioState *s = bs->opaque;
702     int ret;
703 
704     ret = blkio_create(blkio_driver, &s->blkio);
705     if (ret < 0) {
706         error_setg_errno(errp, -ret, "blkio_create failed: %s",
707                          blkio_get_error_msg());
708         return ret;
709     }
710 
711     if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
712         ret = blkio_io_uring_open(bs, options, flags, errp);
713     } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
714         ret = blkio_nvme_io_uring(bs, options, flags, errp);
715     } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
716         ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
717     } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
718         ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
719     } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
720         ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
721     } else {
722         g_assert_not_reached();
723     }
724     if (ret < 0) {
725         blkio_destroy(&s->blkio);
726         return ret;
727     }
728 
729     if (!(flags & BDRV_O_RDWR)) {
730         ret = blkio_set_bool(s->blkio, "read-only", true);
731         if (ret < 0) {
732             error_setg_errno(errp, -ret, "failed to set read-only: %s",
733                              blkio_get_error_msg());
734             blkio_destroy(&s->blkio);
735             return ret;
736         }
737     }
738 
739     ret = blkio_connect(s->blkio);
740     if (ret < 0) {
741         error_setg_errno(errp, -ret, "blkio_connect failed: %s",
742                          blkio_get_error_msg());
743         blkio_destroy(&s->blkio);
744         return ret;
745     }
746 
747     ret = blkio_get_bool(s->blkio,
748                          "needs-mem-regions",
749                          &s->needs_mem_regions);
750     if (ret < 0) {
751         error_setg_errno(errp, -ret,
752                          "failed to get needs-mem-regions: %s",
753                          blkio_get_error_msg());
754         blkio_destroy(&s->blkio);
755         return ret;
756     }
757 
758     ret = blkio_get_bool(s->blkio,
759                          "needs-mem-region-fd",
760                          &s->needs_mem_region_fd);
761     if (ret < 0) {
762         error_setg_errno(errp, -ret,
763                          "failed to get needs-mem-region-fd: %s",
764                          blkio_get_error_msg());
765         blkio_destroy(&s->blkio);
766         return ret;
767     }
768 
769     ret = blkio_get_uint64(s->blkio,
770                            "mem-region-alignment",
771                            &s->mem_region_alignment);
772     if (ret < 0) {
773         error_setg_errno(errp, -ret,
774                          "failed to get mem-region-alignment: %s",
775                          blkio_get_error_msg());
776         blkio_destroy(&s->blkio);
777         return ret;
778     }
779 
780     ret = blkio_get_bool(s->blkio,
781                          "may-pin-mem-regions",
782                          &s->may_pin_mem_regions);
783     if (ret < 0) {
784         /* Be conservative (assume pinning) if the property is not supported */
785         s->may_pin_mem_regions = s->needs_mem_regions;
786     }
787 
788     /*
789      * Notify if libblkio drivers pin memory and prevent features like
790      * virtio-mem from working.
791      */
792     if (s->may_pin_mem_regions) {
793         ret = ram_block_discard_disable(true);
794         if (ret < 0) {
795             error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
796             blkio_destroy(&s->blkio);
797             return ret;
798         }
799     }
800 
801     ret = blkio_start(s->blkio);
802     if (ret < 0) {
803         error_setg_errno(errp, -ret, "blkio_start failed: %s",
804                          blkio_get_error_msg());
805         blkio_destroy(&s->blkio);
806         if (s->may_pin_mem_regions) {
807             ram_block_discard_disable(false);
808         }
809         return ret;
810     }
811 
812     bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
813     bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
814                                BDRV_REQ_NO_FALLBACK;
815 
816     qemu_mutex_init(&s->blkio_lock);
817     qemu_co_mutex_init(&s->bounce_lock);
818     qemu_co_queue_init(&s->bounce_available);
819     QLIST_INIT(&s->bounce_bufs);
820     s->blkioq = blkio_get_queue(s->blkio, 0);
821     s->completion_fd = blkioq_get_completion_fd(s->blkioq);
822 
823     blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
824     return 0;
825 }
826 
/*
 * Tear down the state set up by blkio_file_open(): destroy the libblkio
 * lock, remove the completion fd handlers, destroy the blkio instance and
 * re-enable RAM block discard if it was disabled at open time.
 */
static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    /* Balances the ram_block_discard_disable(true) done at open time */
    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}
841 
842 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
843 {
844     BDRVBlkioState *s = bs->opaque;
845     uint64_t capacity;
846     int ret;
847 
848     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
849         ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
850     }
851     if (ret < 0) {
852         return -ret;
853     }
854 
855     return capacity;
856 }
857 
858 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
859                                        bool exact, PreallocMode prealloc,
860                                        BdrvRequestFlags flags, Error **errp)
861 {
862     int64_t current_length;
863 
864     if (prealloc != PREALLOC_MODE_OFF) {
865         error_setg(errp, "Unsupported preallocation mode '%s'",
866                    PreallocMode_str(prealloc));
867         return -ENOTSUP;
868     }
869 
870     current_length = blkio_co_getlength(bs);
871 
872     if (offset > current_length) {
873         error_setg(errp, "Cannot grow device");
874         return -EINVAL;
875     } else if (exact && offset != current_length) {
876         error_setg(errp, "Cannot resize device");
877         return -ENOTSUP;
878     }
879 
880     return 0;
881 }
882 
/*
 * No driver-specific information is reported: @bdi is left untouched and
 * success is returned.
 */
static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}
888 
889 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
890 {
891     BDRVBlkioState *s = bs->opaque;
892     QEMU_LOCK_GUARD(&s->blkio_lock);
893     int value;
894     int ret;
895 
896     ret = blkio_get_int(s->blkio, "request-alignment", &value);
897     if (ret < 0) {
898         error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
899                          blkio_get_error_msg());
900         return;
901     }
902     bs->bl.request_alignment = value;
903     if (bs->bl.request_alignment < 1 ||
904         bs->bl.request_alignment >= INT_MAX ||
905         !is_power_of_2(bs->bl.request_alignment)) {
906         error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
907                    "must be a power of 2 less than INT_MAX",
908                    bs->bl.request_alignment);
909         return;
910     }
911 
912     ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
913     if (ret < 0) {
914         error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
915                          blkio_get_error_msg());
916         return;
917     }
918     bs->bl.opt_transfer = value;
919     if (bs->bl.opt_transfer > INT_MAX ||
920         (bs->bl.opt_transfer % bs->bl.request_alignment)) {
921         error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
922                    "be a multiple of %" PRIu32, bs->bl.opt_transfer,
923                    bs->bl.request_alignment);
924         return;
925     }
926 
927     ret = blkio_get_int(s->blkio, "max-transfer", &value);
928     if (ret < 0) {
929         error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
930                          blkio_get_error_msg());
931         return;
932     }
933     bs->bl.max_transfer = value;
934     if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
935         (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
936         error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
937                    "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
938                    bs->bl.max_transfer, bs->bl.request_alignment,
939                    bs->bl.opt_transfer);
940         return;
941     }
942 
943     ret = blkio_get_int(s->blkio, "buf-alignment", &value);
944     if (ret < 0) {
945         error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
946                          blkio_get_error_msg());
947         return;
948     }
949     if (value < 1) {
950         error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
951                    "positive", value);
952         return;
953     }
954     bs->bl.min_mem_alignment = value;
955 
956     ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
957     if (ret < 0) {
958         error_setg_errno(errp, -ret,
959                          "failed to get \"optimal-buf-alignment\": %s",
960                          blkio_get_error_msg());
961         return;
962     }
963     if (value < 1) {
964         error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
965                    "must be positive", value);
966         return;
967     }
968     bs->bl.opt_mem_alignment = value;
969 
970     ret = blkio_get_int(s->blkio, "max-segments", &value);
971     if (ret < 0) {
972         error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
973                          blkio_get_error_msg());
974         return;
975     }
976     if (value < 1) {
977         error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
978                    value);
979         return;
980     }
981     bs->bl.max_iov = value;
982 }
983 
984 /*
985  * TODO
986  * Missing libblkio APIs:
987  * - block_status
988  * - co_invalidate_cache
989  *
990  * Out of scope?
991  * - create
992  * - truncate
993  */
994 
/*
 * Initializer for a BlockDriver backed by libblkio.
 *
 * @name is used as both the format name and the protocol name. Every driver
 * shares the same callbacks; additional designated initializers can be
 * appended through the variadic arguments.
 */
#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name             = name, \
        .protocol_name           = name, \
        .instance_size           = sizeof(BDRVBlkioState), \
        .bdrv_file_open          = blkio_file_open, \
        .bdrv_close              = blkio_close, \
        .bdrv_co_getlength       = blkio_co_getlength, \
        .bdrv_co_truncate        = blkio_truncate, \
        .bdrv_co_get_info        = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard        = blkio_co_pdiscard, \
        .bdrv_co_preadv          = blkio_co_preadv, \
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
        .bdrv_co_io_unplug       = blkio_co_io_unplug, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
        __VA_ARGS__ \
    }
1018 
/* "io_uring": the only driver here that sets .bdrv_needs_filename */
static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

/* "nvme-io_uring": opened through blkio_nvme_io_uring() */
static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

/* The virtio-blk drivers below all open via blkio_virtio_blk_common_open() */
static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);
1039 
1040 static void bdrv_blkio_init(void)
1041 {
1042     bdrv_register(&bdrv_io_uring);
1043     bdrv_register(&bdrv_nvme_io_uring);
1044     bdrv_register(&bdrv_virtio_blk_vfio_pci);
1045     bdrv_register(&bdrv_virtio_blk_vhost_user);
1046     bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
1047 }
1048 
1049 block_init(bdrv_blkio_init);
1050