xref: /openbmc/qemu/block/blkio.c (revision 7653b1ea)
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 /*
3  * libblkio BlockDriver
4  *
5  * Copyright Red Hat, Inc.
6  *
7  * Author:
8  *   Stefan Hajnoczi <stefanha@redhat.com>
9  */
10 
11 #include "qemu/osdep.h"
12 #include <blkio.h>
13 #include "block/block_int.h"
14 #include "exec/memory.h"
15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
16 #include "qemu/defer-call.h"
17 #include "qapi/error.h"
18 #include "qemu/error-report.h"
19 #include "qapi/qmp/qdict.h"
20 #include "qemu/module.h"
21 #include "sysemu/block-backend.h"
22 #include "exec/memory.h" /* for ram_block_discard_disable() */
23 
24 #include "block/block-io.h"
25 
26 /*
27  * Allocated bounce buffers are kept in a list sorted by buffer address.
28  */
29 typedef struct BlkioBounceBuf {
30     QLIST_ENTRY(BlkioBounceBuf) next;
31 
32     /* The bounce buffer */
33     struct iovec buf;
34 } BlkioBounceBuf;
35 
36 typedef struct {
37     /*
38      * libblkio is not thread-safe so this lock protects ->blkio and
39      * ->blkioq.
40      */
41     QemuMutex blkio_lock;
42     struct blkio *blkio;
43     struct blkioq *blkioq; /* make this multi-queue in the future... */
44     int completion_fd;
45 
46     /*
47      * Polling fetches the next completion into this field.
48      *
49      * No lock is necessary since only one thread calls aio_poll() and invokes
50      * fd and poll handlers.
51      */
52     struct blkio_completion poll_completion;
53 
54     /*
55      * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
56      *
57      * Lock ordering: ->bounce_lock before ->blkio_lock.
58      */
59     CoMutex bounce_lock;
60 
61     /* Bounce buffer pool */
62     struct blkio_mem_region bounce_pool;
63 
64     /* Sorted list of allocated bounce buffers */
65     QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;
66 
67     /* Queue for coroutines waiting for bounce buffer space */
68     CoQueue bounce_available;
69 
70     /* The value of the "mem-region-alignment" property */
71     uint64_t mem_region_alignment;
72 
73     /* Can we skip adding/deleting blkio_mem_regions? */
74     bool needs_mem_regions;
75 
76     /* Are file descriptors necessary for blkio_mem_regions? */
77     bool needs_mem_region_fd;
78 
79     /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
80     bool may_pin_mem_regions;
81 } BDRVBlkioState;
82 
83 /* Called with s->bounce_lock held */
84 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
85 {
86     /* There can be no allocated bounce buffers during resize */
87     assert(QLIST_EMPTY(&s->bounce_bufs));
88 
89     /* Pad size to reduce frequency of resize calls */
90     bytes += 128 * 1024;
91 
92     /* Align the pool size to avoid blkio_alloc_mem_region() failure */
93     bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment);
94 
95     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
96         int ret;
97 
98         if (s->bounce_pool.addr) {
99             blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
100             blkio_free_mem_region(s->blkio, &s->bounce_pool);
101             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
102         }
103 
104         /* Automatically freed when s->blkio is destroyed */
105         ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
106         if (ret < 0) {
107             return ret;
108         }
109 
110         ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
111         if (ret < 0) {
112             blkio_free_mem_region(s->blkio, &s->bounce_pool);
113             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
114             return ret;
115         }
116     }
117 
118     return 0;
119 }
120 
121 /* Called with s->bounce_lock held */
122 static bool
123 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
124                              int64_t bytes)
125 {
126     void *addr = s->bounce_pool.addr;
127     BlkioBounceBuf *cur = NULL;
128     BlkioBounceBuf *prev = NULL;
129     ptrdiff_t space;
130 
131     /*
132      * This is just a linear search over the holes between requests. An
133      * efficient allocator would be nice.
134      */
135     QLIST_FOREACH(cur, &s->bounce_bufs, next) {
136         space = cur->buf.iov_base - addr;
137         if (bytes <= space) {
138             QLIST_INSERT_BEFORE(cur, bounce, next);
139             bounce->buf.iov_base = addr;
140             bounce->buf.iov_len = bytes;
141             return true;
142         }
143 
144         addr = cur->buf.iov_base + cur->buf.iov_len;
145         prev = cur;
146     }
147 
148     /* Is there space after the last request? */
149     space = s->bounce_pool.addr + s->bounce_pool.len - addr;
150     if (bytes > space) {
151         return false;
152     }
153     if (prev) {
154         QLIST_INSERT_AFTER(prev, bounce, next);
155     } else {
156         QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
157     }
158     bounce->buf.iov_base = addr;
159     bounce->buf.iov_len = bytes;
160     return true;
161 }
162 
163 static int coroutine_fn
164 blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
165                           int64_t bytes)
166 {
167     /*
168      * Ensure fairness: first time around we join the back of the queue,
169      * subsequently we join the front so we don't lose our place.
170      */
171     CoQueueWaitFlags wait_flags = 0;
172 
173     QEMU_LOCK_GUARD(&s->bounce_lock);
174 
175     /* Ensure fairness: don't even try if other requests are already waiting */
176     if (!qemu_co_queue_empty(&s->bounce_available)) {
177         qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
178                                  wait_flags);
179         wait_flags = CO_QUEUE_WAIT_FRONT;
180     }
181 
182     while (true) {
183         if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
184             /* Kick the next queued request since there may be space */
185             qemu_co_queue_next(&s->bounce_available);
186             return 0;
187         }
188 
189         /*
190          * If there are no in-flight requests then the pool was simply too
191          * small.
192          */
193         if (QLIST_EMPTY(&s->bounce_bufs)) {
194             bool ok;
195             int ret;
196 
197             ret = blkio_resize_bounce_pool(s, bytes);
198             if (ret < 0) {
199                 /* Kick the next queued request since that may fail too */
200                 qemu_co_queue_next(&s->bounce_available);
201                 return ret;
202             }
203 
204             ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
205             assert(ok); /* must have space this time */
206             return 0;
207         }
208 
209         qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
210                                  wait_flags);
211         wait_flags = CO_QUEUE_WAIT_FRONT;
212     }
213 }
214 
215 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
216                                                   BlkioBounceBuf *bounce)
217 {
218     QEMU_LOCK_GUARD(&s->bounce_lock);
219 
220     QLIST_REMOVE(bounce, next);
221 
222     /* Wake up waiting coroutines since space may now be available */
223     qemu_co_queue_next(&s->bounce_available);
224 }
225 
226 /* For async to .bdrv_co_*() conversion */
227 typedef struct {
228     Coroutine *coroutine;
229     int ret;
230 } BlkioCoData;
231 
232 static void blkio_completion_fd_read(void *opaque)
233 {
234     BlockDriverState *bs = opaque;
235     BDRVBlkioState *s = bs->opaque;
236     uint64_t val;
237     int ret;
238 
239     /* Polling may have already fetched a completion */
240     if (s->poll_completion.user_data != NULL) {
241         BlkioCoData *cod = s->poll_completion.user_data;
242         cod->ret = s->poll_completion.ret;
243 
244         /* Clear it in case aio_co_wake() enters a nested event loop */
245         s->poll_completion.user_data = NULL;
246 
247         aio_co_wake(cod->coroutine);
248     }
249 
250     /* Reset completion fd status */
251     ret = read(s->completion_fd, &val, sizeof(val));
252 
253     /* Ignore errors, there's nothing we can do */
254     (void)ret;
255 
256     /*
257      * Reading one completion at a time makes nested event loop re-entrancy
258      * simple. Change this loop to get multiple completions in one go if it
259      * becomes a performance bottleneck.
260      */
261     while (true) {
262         struct blkio_completion completion;
263 
264         WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
265             ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
266         }
267         if (ret != 1) {
268             break;
269         }
270 
271         BlkioCoData *cod = completion.user_data;
272         cod->ret = completion.ret;
273         aio_co_wake(cod->coroutine);
274     }
275 }
276 
277 static bool blkio_completion_fd_poll(void *opaque)
278 {
279     BlockDriverState *bs = opaque;
280     BDRVBlkioState *s = bs->opaque;
281     int ret;
282 
283     /* Just in case we already fetched a completion */
284     if (s->poll_completion.user_data != NULL) {
285         return true;
286     }
287 
288     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
289         ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
290     }
291     return ret == 1;
292 }
293 
/*
 * Poll-ready handler: process completions exactly like the fd handler does.
 * @opaque is the BlockDriverState.
 */
static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
298 
299 static void blkio_attach_aio_context(BlockDriverState *bs,
300                                      AioContext *new_context)
301 {
302     BDRVBlkioState *s = bs->opaque;
303 
304     aio_set_fd_handler(new_context, s->completion_fd,
305                        blkio_completion_fd_read, NULL,
306                        blkio_completion_fd_poll,
307                        blkio_completion_fd_poll_ready, bs);
308 }
309 
310 static void blkio_detach_aio_context(BlockDriverState *bs)
311 {
312     BDRVBlkioState *s = bs->opaque;
313 
314     aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
315                        NULL, NULL, NULL);
316 }
317 
318 /*
319  * Called by defer_call_end() or immediately if not in a deferred section.
320  * Called without blkio_lock.
321  */
322 static void blkio_deferred_fn(void *opaque)
323 {
324     BDRVBlkioState *s = opaque;
325 
326     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
327         blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
328     }
329 }
330 
331 /*
332  * Schedule I/O submission after enqueuing a new request. Called without
333  * blkio_lock.
334  */
335 static void blkio_submit_io(BlockDriverState *bs)
336 {
337     BDRVBlkioState *s = bs->opaque;
338 
339     defer_call(blkio_deferred_fn, s);
340 }
341 
342 static int coroutine_fn
343 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
344 {
345     BDRVBlkioState *s = bs->opaque;
346     BlkioCoData cod = {
347         .coroutine = qemu_coroutine_self(),
348     };
349 
350     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
351         blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
352     }
353 
354     blkio_submit_io(bs);
355     qemu_coroutine_yield();
356     return cod.ret;
357 }
358 
359 static int coroutine_fn
360 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
361                 QEMUIOVector *qiov, BdrvRequestFlags flags)
362 {
363     BlkioCoData cod = {
364         .coroutine = qemu_coroutine_self(),
365     };
366     BDRVBlkioState *s = bs->opaque;
367     bool use_bounce_buffer =
368         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
369     BlkioBounceBuf bounce;
370     struct iovec *iov = qiov->iov;
371     int iovcnt = qiov->niov;
372 
373     if (use_bounce_buffer) {
374         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
375         if (ret < 0) {
376             return ret;
377         }
378 
379         iov = &bounce.buf;
380         iovcnt = 1;
381     }
382 
383     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
384         blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
385     }
386 
387     blkio_submit_io(bs);
388     qemu_coroutine_yield();
389 
390     if (use_bounce_buffer) {
391         if (cod.ret == 0) {
392             qemu_iovec_from_buf(qiov, 0,
393                                 bounce.buf.iov_base,
394                                 bounce.buf.iov_len);
395         }
396 
397         blkio_free_bounce_buffer(s, &bounce);
398     }
399 
400     return cod.ret;
401 }
402 
403 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
404         int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
405 {
406     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
407     BlkioCoData cod = {
408         .coroutine = qemu_coroutine_self(),
409     };
410     BDRVBlkioState *s = bs->opaque;
411     bool use_bounce_buffer =
412         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
413     BlkioBounceBuf bounce;
414     struct iovec *iov = qiov->iov;
415     int iovcnt = qiov->niov;
416 
417     if (use_bounce_buffer) {
418         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
419         if (ret < 0) {
420             return ret;
421         }
422 
423         qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
424         iov = &bounce.buf;
425         iovcnt = 1;
426     }
427 
428     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
429         blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
430     }
431 
432     blkio_submit_io(bs);
433     qemu_coroutine_yield();
434 
435     if (use_bounce_buffer) {
436         blkio_free_bounce_buffer(s, &bounce);
437     }
438 
439     return cod.ret;
440 }
441 
442 static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
443 {
444     BDRVBlkioState *s = bs->opaque;
445     BlkioCoData cod = {
446         .coroutine = qemu_coroutine_self(),
447     };
448 
449     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
450         blkioq_flush(s->blkioq, &cod, 0);
451     }
452 
453     blkio_submit_io(bs);
454     qemu_coroutine_yield();
455     return cod.ret;
456 }
457 
458 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
459     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
460 {
461     BDRVBlkioState *s = bs->opaque;
462     BlkioCoData cod = {
463         .coroutine = qemu_coroutine_self(),
464     };
465     uint32_t blkio_flags = 0;
466 
467     if (flags & BDRV_REQ_FUA) {
468         blkio_flags |= BLKIO_REQ_FUA;
469     }
470     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
471         blkio_flags |= BLKIO_REQ_NO_UNMAP;
472     }
473     if (flags & BDRV_REQ_NO_FALLBACK) {
474         blkio_flags |= BLKIO_REQ_NO_FALLBACK;
475     }
476 
477     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
478         blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
479     }
480 
481     blkio_submit_io(bs);
482     qemu_coroutine_yield();
483     return cod.ret;
484 }
485 
/* Outcome of blkio_mem_region_from_host() */
typedef enum {
    BMRR_OK,    /* the region was filled in */
    BMRR_SKIP,  /* no fd backs the memory; the buffer is ignored */
    BMRR_FAIL,  /* the buffer is unusable and an error was set */
} BlkioMemRegionResult;
491 
492 /*
493  * Produce a struct blkio_mem_region for a given address and size.
494  *
495  * This function produces identical results when called multiple times with the
496  * same arguments. This property is necessary because blkio_unmap_mem_region()
497  * must receive the same struct blkio_mem_region field values that were passed
498  * to blkio_map_mem_region().
499  */
500 static BlkioMemRegionResult
501 blkio_mem_region_from_host(BlockDriverState *bs,
502                            void *host, size_t size,
503                            struct blkio_mem_region *region,
504                            Error **errp)
505 {
506     BDRVBlkioState *s = bs->opaque;
507     int fd = -1;
508     ram_addr_t fd_offset = 0;
509 
510     if (((uintptr_t)host | size) % s->mem_region_alignment) {
511         error_setg(errp, "unaligned buf %p with size %zu", host, size);
512         return BMRR_FAIL;
513     }
514 
515     /* Attempt to find the fd for the underlying memory */
516     if (s->needs_mem_region_fd) {
517         RAMBlock *ram_block;
518         RAMBlock *end_block;
519         ram_addr_t offset;
520 
521         /*
522          * bdrv_register_buf() is called with the BQL held so mr lives at least
523          * until this function returns.
524          */
525         ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
526         if (ram_block) {
527             fd = qemu_ram_get_fd(ram_block);
528         }
529         if (fd == -1) {
530             /*
531              * Ideally every RAMBlock would have an fd. pc-bios and other
532              * things don't. Luckily they are usually not I/O buffers and we
533              * can just ignore them.
534              */
535             return BMRR_SKIP;
536         }
537 
538         /* Make sure the fd covers the entire range */
539         end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
540         if (ram_block != end_block) {
541             error_setg(errp, "registered buffer at %p with size %zu extends "
542                        "beyond RAMBlock", host, size);
543             return BMRR_FAIL;
544         }
545     }
546 
547     *region = (struct blkio_mem_region){
548         .addr = host,
549         .len = size,
550         .fd = fd,
551         .fd_offset = fd_offset,
552     };
553     return BMRR_OK;
554 }
555 
556 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
557                                Error **errp)
558 {
559     BDRVBlkioState *s = bs->opaque;
560     struct blkio_mem_region region;
561     BlkioMemRegionResult region_result;
562     int ret;
563 
564     /*
565      * Mapping memory regions conflicts with RAM discard (virtio-mem) when
566      * there is pinning, so only do it when necessary.
567      */
568     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
569         return true;
570     }
571 
572     region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
573     if (region_result == BMRR_SKIP) {
574         return true;
575     } else if (region_result != BMRR_OK) {
576         return false;
577     }
578 
579     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
580         ret = blkio_map_mem_region(s->blkio, &region);
581     }
582 
583     if (ret < 0) {
584         error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
585                    host, size, blkio_get_error_msg());
586         return false;
587     }
588     return true;
589 }
590 
591 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
592 {
593     BDRVBlkioState *s = bs->opaque;
594     struct blkio_mem_region region;
595 
596     /* See blkio_register_buf() */
597     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
598         return;
599     }
600 
601     if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
602         return;
603     }
604 
605     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
606         blkio_unmap_mem_region(s->blkio, &region);
607     }
608 }
609 
610 static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options,
611                                   int flags, Error **errp)
612 {
613     const char *filename = qdict_get_str(options, "filename");
614     BDRVBlkioState *s = bs->opaque;
615     int ret;
616 
617     ret = blkio_set_str(s->blkio, "path", filename);
618     qdict_del(options, "filename");
619     if (ret < 0) {
620         error_setg_errno(errp, -ret, "failed to set path: %s",
621                          blkio_get_error_msg());
622         return ret;
623     }
624 
625     if (flags & BDRV_O_NOCACHE) {
626         ret = blkio_set_bool(s->blkio, "direct", true);
627         if (ret < 0) {
628             error_setg_errno(errp, -ret, "failed to set direct: %s",
629                              blkio_get_error_msg());
630             return ret;
631         }
632     }
633 
634     ret = blkio_connect(s->blkio);
635     if (ret < 0) {
636         error_setg_errno(errp, -ret, "blkio_connect failed: %s",
637                          blkio_get_error_msg());
638         return ret;
639     }
640 
641     return 0;
642 }
643 
644 static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options,
645                                        int flags, Error **errp)
646 {
647     const char *path = qdict_get_try_str(options, "path");
648     BDRVBlkioState *s = bs->opaque;
649     int ret;
650 
651     if (!path) {
652         error_setg(errp, "missing 'path' option");
653         return -EINVAL;
654     }
655 
656     ret = blkio_set_str(s->blkio, "path", path);
657     qdict_del(options, "path");
658     if (ret < 0) {
659         error_setg_errno(errp, -ret, "failed to set path: %s",
660                          blkio_get_error_msg());
661         return ret;
662     }
663 
664     if (!(flags & BDRV_O_NOCACHE)) {
665         error_setg(errp, "cache.direct=off is not supported");
666         return -EINVAL;
667     }
668 
669     ret = blkio_connect(s->blkio);
670     if (ret < 0) {
671         error_setg_errno(errp, -ret, "blkio_connect failed: %s",
672                          blkio_get_error_msg());
673         return ret;
674     }
675 
676     return 0;
677 }
678 
679 static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options,
680                                     int flags, Error **errp)
681 {
682     const char *path = qdict_get_try_str(options, "path");
683     BDRVBlkioState *s = bs->opaque;
684     bool fd_supported = false;
685     int fd = -1, ret;
686 
687     if (!path) {
688         error_setg(errp, "missing 'path' option");
689         return -EINVAL;
690     }
691 
692     if (!(flags & BDRV_O_NOCACHE)) {
693         error_setg(errp, "cache.direct=off is not supported");
694         return -EINVAL;
695     }
696 
697     if (blkio_set_int(s->blkio, "fd", -1) == 0) {
698         fd_supported = true;
699     }
700 
701     /*
702      * If the libblkio driver supports fd passing, let's always use qemu_open()
703      * to open the `path`, so we can handle fd passing from the management
704      * layer through the "/dev/fdset/N" special path.
705      */
706     if (fd_supported) {
707         /*
708          * `path` can contain the path of a character device
709          * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket.
710          *
711          * So, we should always open it with O_RDWR flag, also if BDRV_O_RDWR
712          * is not set in the open flags, because the exchange of IOCTL commands
713          * for example will fail.
714          *
715          * In order to open the device read-only, we are using the `read-only`
716          * property of the libblkio driver in blkio_file_open().
717          */
718         fd = qemu_open(path, O_RDWR, NULL);
719         if (fd < 0) {
720             /*
721              * qemu_open() can fail if the user specifies a path that is not
722              * a file or device, for example in the case of Unix Domain Socket
723              * for the virtio-blk-vhost-user driver. In such cases let's have
724              * libblkio open the path directly.
725              */
726             fd_supported = false;
727         } else {
728             ret = blkio_set_int(s->blkio, "fd", fd);
729             if (ret < 0) {
730                 fd_supported = false;
731                 qemu_close(fd);
732                 fd = -1;
733             }
734         }
735     }
736 
737     if (!fd_supported) {
738         ret = blkio_set_str(s->blkio, "path", path);
739         if (ret < 0) {
740             error_setg_errno(errp, -ret, "failed to set path: %s",
741                              blkio_get_error_msg());
742             return ret;
743         }
744     }
745 
746     ret = blkio_connect(s->blkio);
747     if (ret < 0 && fd >= 0) {
748         /* Failed to give the FD to libblkio, close it */
749         qemu_close(fd);
750         fd = -1;
751     }
752 
753     /*
754      * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208
755      * (libblkio <= v1.3.0), setting the `fd` property is not enough to check
756      * whether the driver supports the `fd` property or not. In that case,
757      * blkio_connect() will fail with -EINVAL.
758      * So let's try calling blkio_connect() again by directly setting `path`
759      * to cover this scenario.
760      */
761     if (fd_supported && ret == -EINVAL) {
762         /*
763          * We need to clear the `fd` property we set previously by setting
764          * it to -1.
765          */
766         ret = blkio_set_int(s->blkio, "fd", -1);
767         if (ret < 0) {
768             error_setg_errno(errp, -ret, "failed to set fd: %s",
769                              blkio_get_error_msg());
770             return ret;
771         }
772 
773         ret = blkio_set_str(s->blkio, "path", path);
774         if (ret < 0) {
775             error_setg_errno(errp, -ret, "failed to set path: %s",
776                              blkio_get_error_msg());
777             return ret;
778         }
779 
780         ret = blkio_connect(s->blkio);
781     }
782 
783     if (ret < 0) {
784         error_setg_errno(errp, -ret, "blkio_connect failed: %s",
785                          blkio_get_error_msg());
786         return ret;
787     }
788 
789     qdict_del(options, "path");
790 
791     return 0;
792 }
793 
794 static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
795                            Error **errp)
796 {
797     const char *blkio_driver = bs->drv->protocol_name;
798     BDRVBlkioState *s = bs->opaque;
799     int ret;
800 
801     ret = blkio_create(blkio_driver, &s->blkio);
802     if (ret < 0) {
803         error_setg_errno(errp, -ret, "blkio_create failed: %s",
804                          blkio_get_error_msg());
805         return ret;
806     }
807 
808     if (!(flags & BDRV_O_RDWR)) {
809         ret = blkio_set_bool(s->blkio, "read-only", true);
810         if (ret < 0) {
811             error_setg_errno(errp, -ret, "failed to set read-only: %s",
812                              blkio_get_error_msg());
813             blkio_destroy(&s->blkio);
814             return ret;
815         }
816     }
817 
818     if (strcmp(blkio_driver, "io_uring") == 0) {
819         ret = blkio_io_uring_connect(bs, options, flags, errp);
820     } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) {
821         ret = blkio_nvme_io_uring_connect(bs, options, flags, errp);
822     } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) {
823         ret = blkio_virtio_blk_connect(bs, options, flags, errp);
824     } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) {
825         ret = blkio_virtio_blk_connect(bs, options, flags, errp);
826     } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) {
827         ret = blkio_virtio_blk_connect(bs, options, flags, errp);
828     } else {
829         g_assert_not_reached();
830     }
831     if (ret < 0) {
832         blkio_destroy(&s->blkio);
833         return ret;
834     }
835 
836     ret = blkio_get_bool(s->blkio,
837                          "needs-mem-regions",
838                          &s->needs_mem_regions);
839     if (ret < 0) {
840         error_setg_errno(errp, -ret,
841                          "failed to get needs-mem-regions: %s",
842                          blkio_get_error_msg());
843         blkio_destroy(&s->blkio);
844         return ret;
845     }
846 
847     ret = blkio_get_bool(s->blkio,
848                          "needs-mem-region-fd",
849                          &s->needs_mem_region_fd);
850     if (ret < 0) {
851         error_setg_errno(errp, -ret,
852                          "failed to get needs-mem-region-fd: %s",
853                          blkio_get_error_msg());
854         blkio_destroy(&s->blkio);
855         return ret;
856     }
857 
858     ret = blkio_get_uint64(s->blkio,
859                            "mem-region-alignment",
860                            &s->mem_region_alignment);
861     if (ret < 0) {
862         error_setg_errno(errp, -ret,
863                          "failed to get mem-region-alignment: %s",
864                          blkio_get_error_msg());
865         blkio_destroy(&s->blkio);
866         return ret;
867     }
868 
869     ret = blkio_get_bool(s->blkio,
870                          "may-pin-mem-regions",
871                          &s->may_pin_mem_regions);
872     if (ret < 0) {
873         /* Be conservative (assume pinning) if the property is not supported */
874         s->may_pin_mem_regions = s->needs_mem_regions;
875     }
876 
877     /*
878      * Notify if libblkio drivers pin memory and prevent features like
879      * virtio-mem from working.
880      */
881     if (s->may_pin_mem_regions) {
882         ret = ram_block_discard_disable(true);
883         if (ret < 0) {
884             error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
885             blkio_destroy(&s->blkio);
886             return ret;
887         }
888     }
889 
890     ret = blkio_start(s->blkio);
891     if (ret < 0) {
892         error_setg_errno(errp, -ret, "blkio_start failed: %s",
893                          blkio_get_error_msg());
894         blkio_destroy(&s->blkio);
895         if (s->may_pin_mem_regions) {
896             ram_block_discard_disable(false);
897         }
898         return ret;
899     }
900 
901     bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
902     bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
903                                BDRV_REQ_NO_FALLBACK;
904 
905     qemu_mutex_init(&s->blkio_lock);
906     qemu_co_mutex_init(&s->bounce_lock);
907     qemu_co_queue_init(&s->bounce_available);
908     QLIST_INIT(&s->bounce_bufs);
909     s->blkioq = blkio_get_queue(s->blkio, 0);
910     s->completion_fd = blkioq_get_completion_fd(s->blkioq);
911     blkioq_set_completion_fd_enabled(s->blkioq, true);
912 
913     blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
914     return 0;
915 }
916 
917 static void blkio_close(BlockDriverState *bs)
918 {
919     BDRVBlkioState *s = bs->opaque;
920 
921     /* There is no destroy() API for s->bounce_lock */
922 
923     qemu_mutex_destroy(&s->blkio_lock);
924     blkio_detach_aio_context(bs);
925     blkio_destroy(&s->blkio);
926 
927     if (s->may_pin_mem_regions) {
928         ram_block_discard_disable(false);
929     }
930 }
931 
932 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
933 {
934     BDRVBlkioState *s = bs->opaque;
935     uint64_t capacity;
936     int ret;
937 
938     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
939         ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
940     }
941     if (ret < 0) {
942         return -ret;
943     }
944 
945     return capacity;
946 }
947 
948 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
949                                        bool exact, PreallocMode prealloc,
950                                        BdrvRequestFlags flags, Error **errp)
951 {
952     int64_t current_length;
953 
954     if (prealloc != PREALLOC_MODE_OFF) {
955         error_setg(errp, "Unsupported preallocation mode '%s'",
956                    PreallocMode_str(prealloc));
957         return -ENOTSUP;
958     }
959 
960     current_length = blkio_co_getlength(bs);
961 
962     if (offset > current_length) {
963         error_setg(errp, "Cannot grow device");
964         return -EINVAL;
965     } else if (exact && offset != current_length) {
966         error_setg(errp, "Cannot resize device");
967         return -ENOTSUP;
968     }
969 
970     return 0;
971 }
972 
973 static int coroutine_fn
974 blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
975 {
976     return 0;
977 }
978 
979 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
980 {
981     BDRVBlkioState *s = bs->opaque;
982     QEMU_LOCK_GUARD(&s->blkio_lock);
983     int value;
984     int ret;
985 
986     ret = blkio_get_int(s->blkio, "request-alignment", &value);
987     if (ret < 0) {
988         error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
989                          blkio_get_error_msg());
990         return;
991     }
992     bs->bl.request_alignment = value;
993     if (bs->bl.request_alignment < 1 ||
994         bs->bl.request_alignment >= INT_MAX ||
995         !is_power_of_2(bs->bl.request_alignment)) {
996         error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
997                    "must be a power of 2 less than INT_MAX",
998                    bs->bl.request_alignment);
999         return;
1000     }
1001 
1002     ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
1003     if (ret < 0) {
1004         error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
1005                          blkio_get_error_msg());
1006         return;
1007     }
1008     bs->bl.opt_transfer = value;
1009     if (bs->bl.opt_transfer > INT_MAX ||
1010         (bs->bl.opt_transfer % bs->bl.request_alignment)) {
1011         error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
1012                    "be a multiple of %" PRIu32, bs->bl.opt_transfer,
1013                    bs->bl.request_alignment);
1014         return;
1015     }
1016 
1017     ret = blkio_get_int(s->blkio, "max-transfer", &value);
1018     if (ret < 0) {
1019         error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
1020                          blkio_get_error_msg());
1021         return;
1022     }
1023     bs->bl.max_transfer = value;
1024     if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
1025         (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
1026         error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
1027                    "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
1028                    bs->bl.max_transfer, bs->bl.request_alignment,
1029                    bs->bl.opt_transfer);
1030         return;
1031     }
1032 
1033     ret = blkio_get_int(s->blkio, "buf-alignment", &value);
1034     if (ret < 0) {
1035         error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
1036                          blkio_get_error_msg());
1037         return;
1038     }
1039     if (value < 1) {
1040         error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
1041                    "positive", value);
1042         return;
1043     }
1044     bs->bl.min_mem_alignment = value;
1045 
1046     ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
1047     if (ret < 0) {
1048         error_setg_errno(errp, -ret,
1049                          "failed to get \"optimal-buf-alignment\": %s",
1050                          blkio_get_error_msg());
1051         return;
1052     }
1053     if (value < 1) {
1054         error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
1055                    "must be positive", value);
1056         return;
1057     }
1058     bs->bl.opt_mem_alignment = value;
1059 
1060     ret = blkio_get_int(s->blkio, "max-segments", &value);
1061     if (ret < 0) {
1062         error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
1063                          blkio_get_error_msg());
1064         return;
1065     }
1066     if (value < 1) {
1067         error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
1068                    value);
1069         return;
1070     }
1071     bs->bl.max_iov = value;
1072 }
1073 
1074 /*
1075  * TODO
1076  * Missing libblkio APIs:
1077  * - block_status
1078  * - co_invalidate_cache
1079  *
1080  * Out of scope?
1081  * - create
1082  * - truncate
1083  */
1084 
/*
 * Instance size and callbacks shared by every BlockDriver defined in this
 * file. Expand it inside a BlockDriver initializer; the expansion ends with
 * a trailing comma.
 *
 * Do not include .format_name and .protocol_name because module_block.py
 * does not parse macros in the source code.
 */
#define BLKIO_DRIVER_COMMON \
    .instance_size           = sizeof(BDRVBlkioState), \
    .bdrv_file_open          = blkio_file_open, \
    .bdrv_close              = blkio_close, \
    .bdrv_attach_aio_context = blkio_attach_aio_context, \
    .bdrv_detach_aio_context = blkio_detach_aio_context, \
    .bdrv_co_getlength       = blkio_co_getlength, \
    .bdrv_co_truncate        = blkio_truncate, \
    .bdrv_co_get_info        = blkio_co_get_info, \
    .bdrv_refresh_limits     = blkio_refresh_limits, \
    .bdrv_co_preadv          = blkio_co_preadv, \
    .bdrv_co_pwritev         = blkio_co_pwritev, \
    .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
    .bdrv_co_pdiscard        = blkio_co_pdiscard, \
    .bdrv_co_flush_to_disk   = blkio_co_flush, \
    .bdrv_register_buf       = blkio_register_buf, \
    .bdrv_unregister_buf     = blkio_unregister_buf,
1106 
1107 /*
1108  * Use the same .format_name and .protocol_name as the libblkio driver name for
1109  * consistency.
1110  */
1111 
1112 static BlockDriver bdrv_io_uring = {
1113     .format_name         = "io_uring",
1114     .protocol_name       = "io_uring",
1115     .bdrv_needs_filename = true,
1116     BLKIO_DRIVER_COMMON
1117 };
1118 
1119 static BlockDriver bdrv_nvme_io_uring = {
1120     .format_name         = "nvme-io_uring",
1121     .protocol_name       = "nvme-io_uring",
1122     BLKIO_DRIVER_COMMON
1123 };
1124 
1125 static BlockDriver bdrv_virtio_blk_vfio_pci = {
1126     .format_name         = "virtio-blk-vfio-pci",
1127     .protocol_name       = "virtio-blk-vfio-pci",
1128     BLKIO_DRIVER_COMMON
1129 };
1130 
1131 static BlockDriver bdrv_virtio_blk_vhost_user = {
1132     .format_name         = "virtio-blk-vhost-user",
1133     .protocol_name       = "virtio-blk-vhost-user",
1134     BLKIO_DRIVER_COMMON
1135 };
1136 
1137 static BlockDriver bdrv_virtio_blk_vhost_vdpa = {
1138     .format_name         = "virtio-blk-vhost-vdpa",
1139     .protocol_name       = "virtio-blk-vhost-vdpa",
1140     BLKIO_DRIVER_COMMON
1141 };
1142 
1143 static void bdrv_blkio_init(void)
1144 {
1145     bdrv_register(&bdrv_io_uring);
1146     bdrv_register(&bdrv_nvme_io_uring);
1147     bdrv_register(&bdrv_virtio_blk_vfio_pci);
1148     bdrv_register(&bdrv_virtio_blk_vhost_user);
1149     bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
1150 }
1151 
1152 block_init(bdrv_blkio_init);
1153