xref: /openbmc/qemu/block/blkio.c (revision e4a4edc10ab6a621e1c18eb73fc3e6f5d3f7c2e1)
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 /*
3  * libblkio BlockDriver
4  *
5  * Copyright Red Hat, Inc.
6  *
7  * Author:
8  *   Stefan Hajnoczi <stefanha@redhat.com>
9  */
10 
11 #include "qemu/osdep.h"
12 #include <blkio.h>
13 #include "block/block_int.h"
14 #include "exec/memory.h"
15 #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
16 #include "qemu/defer-call.h"
17 #include "qapi/error.h"
18 #include "qemu/error-report.h"
19 #include "qapi/qmp/qdict.h"
20 #include "qemu/module.h"
21 #include "sysemu/block-backend.h"
22 #include "exec/memory.h" /* for ram_block_discard_disable() */
23 
24 #include "block/block-io.h"
25 
/*
 * Descriptor for one allocated bounce buffer.
 *
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    /* List linkage; the list head is BDRVBlkioState->bounce_bufs */
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;
35 
/* Driver state reached through bs->opaque by every function in this file */
typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */

    /* Completion fd of ->blkioq; registered with the AioContext */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    uint64_t mem_region_alignment;

    /* Can we skip adding/deleting blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;
82 
83 /* Called with s->bounce_lock held */
84 static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
85 {
86     /* There can be no allocated bounce buffers during resize */
87     assert(QLIST_EMPTY(&s->bounce_bufs));
88 
89     /* Pad size to reduce frequency of resize calls */
90     bytes += 128 * 1024;
91 
92     /* Align the pool size to avoid blkio_alloc_mem_region() failure */
93     bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment);
94 
95     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
96         int ret;
97 
98         if (s->bounce_pool.addr) {
99             blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
100             blkio_free_mem_region(s->blkio, &s->bounce_pool);
101             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
102         }
103 
104         /* Automatically freed when s->blkio is destroyed */
105         ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
106         if (ret < 0) {
107             return ret;
108         }
109 
110         ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
111         if (ret < 0) {
112             blkio_free_mem_region(s->blkio, &s->bounce_pool);
113             memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
114             return ret;
115         }
116     }
117 
118     return 0;
119 }
120 
121 /* Called with s->bounce_lock held */
122 static bool
123 blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
124                              int64_t bytes)
125 {
126     void *addr = s->bounce_pool.addr;
127     BlkioBounceBuf *cur = NULL;
128     BlkioBounceBuf *prev = NULL;
129     ptrdiff_t space;
130 
131     /*
132      * This is just a linear search over the holes between requests. An
133      * efficient allocator would be nice.
134      */
135     QLIST_FOREACH(cur, &s->bounce_bufs, next) {
136         space = cur->buf.iov_base - addr;
137         if (bytes <= space) {
138             QLIST_INSERT_BEFORE(cur, bounce, next);
139             bounce->buf.iov_base = addr;
140             bounce->buf.iov_len = bytes;
141             return true;
142         }
143 
144         addr = cur->buf.iov_base + cur->buf.iov_len;
145         prev = cur;
146     }
147 
148     /* Is there space after the last request? */
149     space = s->bounce_pool.addr + s->bounce_pool.len - addr;
150     if (bytes > space) {
151         return false;
152     }
153     if (prev) {
154         QLIST_INSERT_AFTER(prev, bounce, next);
155     } else {
156         QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
157     }
158     bounce->buf.iov_base = addr;
159     bounce->buf.iov_len = bytes;
160     return true;
161 }
162 
163 static int coroutine_fn
164 blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
165                           int64_t bytes)
166 {
167     /*
168      * Ensure fairness: first time around we join the back of the queue,
169      * subsequently we join the front so we don't lose our place.
170      */
171     CoQueueWaitFlags wait_flags = 0;
172 
173     QEMU_LOCK_GUARD(&s->bounce_lock);
174 
175     /* Ensure fairness: don't even try if other requests are already waiting */
176     if (!qemu_co_queue_empty(&s->bounce_available)) {
177         qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
178                                  wait_flags);
179         wait_flags = CO_QUEUE_WAIT_FRONT;
180     }
181 
182     while (true) {
183         if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
184             /* Kick the next queued request since there may be space */
185             qemu_co_queue_next(&s->bounce_available);
186             return 0;
187         }
188 
189         /*
190          * If there are no in-flight requests then the pool was simply too
191          * small.
192          */
193         if (QLIST_EMPTY(&s->bounce_bufs)) {
194             bool ok;
195             int ret;
196 
197             ret = blkio_resize_bounce_pool(s, bytes);
198             if (ret < 0) {
199                 /* Kick the next queued request since that may fail too */
200                 qemu_co_queue_next(&s->bounce_available);
201                 return ret;
202             }
203 
204             ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
205             assert(ok); /* must have space this time */
206             return 0;
207         }
208 
209         qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
210                                  wait_flags);
211         wait_flags = CO_QUEUE_WAIT_FRONT;
212     }
213 }
214 
215 static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
216                                                   BlkioBounceBuf *bounce)
217 {
218     QEMU_LOCK_GUARD(&s->bounce_lock);
219 
220     QLIST_REMOVE(bounce, next);
221 
222     /* Wake up waiting coroutines since space may now be available */
223     qemu_co_queue_next(&s->bounce_available);
224 }
225 
/*
 * For async to .bdrv_co_*() conversion.
 *
 * A pointer to this struct is passed to libblkio as the request's user_data.
 * The completion handler stores the result in ->ret and wakes ->coroutine.
 */
typedef struct {
    Coroutine *coroutine; /* coroutine that issued the request */
    int ret;              /* completion status copied from libblkio */
} BlkioCoData;
231 
/*
 * Completion fd handler: deliver finished requests to their coroutines.
 *
 * @opaque is the BlockDriverState. A completion already fetched by
 * blkio_completion_fd_poll() is delivered first, then the completion fd is
 * read to reset it, and finally remaining completions are fetched and
 * delivered one at a time.
 */
static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        /* The lock is dropped again before the coroutine is woken */
        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            /* No completion was returned (or an error occurred) */
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}
276 
277 static bool blkio_completion_fd_poll(void *opaque)
278 {
279     BlockDriverState *bs = opaque;
280     BDRVBlkioState *s = bs->opaque;
281     int ret;
282 
283     /* Just in case we already fetched a completion */
284     if (s->poll_completion.user_data != NULL) {
285         return true;
286     }
287 
288     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
289         ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
290     }
291     return ret == 1;
292 }
293 
/*
 * Poll-ready handler: process completions exactly as the fd read handler
 * does. @opaque is the BlockDriverState.
 */
static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
298 
299 static void blkio_attach_aio_context(BlockDriverState *bs,
300                                      AioContext *new_context)
301 {
302     BDRVBlkioState *s = bs->opaque;
303 
304     aio_set_fd_handler(new_context, s->completion_fd,
305                        blkio_completion_fd_read, NULL,
306                        blkio_completion_fd_poll,
307                        blkio_completion_fd_poll_ready, bs);
308 }
309 
310 static void blkio_detach_aio_context(BlockDriverState *bs)
311 {
312     BDRVBlkioState *s = bs->opaque;
313 
314     aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
315                        NULL, NULL, NULL);
316 }
317 
318 /*
319  * Called by defer_call_end() or immediately if not in a deferred section.
320  * Called without blkio_lock.
321  */
322 static void blkio_deferred_fn(void *opaque)
323 {
324     BDRVBlkioState *s = opaque;
325 
326     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
327         blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
328     }
329 }
330 
331 /*
332  * Schedule I/O submission after enqueuing a new request. Called without
333  * blkio_lock.
334  */
335 static void blkio_submit_io(BlockDriverState *bs)
336 {
337     BDRVBlkioState *s = bs->opaque;
338 
339     defer_call(blkio_deferred_fn, s);
340 }
341 
342 static int coroutine_fn
343 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
344 {
345     BDRVBlkioState *s = bs->opaque;
346     BlkioCoData cod = {
347         .coroutine = qemu_coroutine_self(),
348     };
349 
350     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
351         blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
352     }
353 
354     blkio_submit_io(bs);
355     qemu_coroutine_yield();
356     return cod.ret;
357 }
358 
359 static int coroutine_fn
360 blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
361                 QEMUIOVector *qiov, BdrvRequestFlags flags)
362 {
363     BlkioCoData cod = {
364         .coroutine = qemu_coroutine_self(),
365     };
366     BDRVBlkioState *s = bs->opaque;
367     bool use_bounce_buffer =
368         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
369     BlkioBounceBuf bounce;
370     struct iovec *iov = qiov->iov;
371     int iovcnt = qiov->niov;
372 
373     if (use_bounce_buffer) {
374         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
375         if (ret < 0) {
376             return ret;
377         }
378 
379         iov = &bounce.buf;
380         iovcnt = 1;
381     }
382 
383     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
384         blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
385     }
386 
387     blkio_submit_io(bs);
388     qemu_coroutine_yield();
389 
390     if (use_bounce_buffer) {
391         if (cod.ret == 0) {
392             qemu_iovec_from_buf(qiov, 0,
393                                 bounce.buf.iov_base,
394                                 bounce.buf.iov_len);
395         }
396 
397         blkio_free_bounce_buffer(s, &bounce);
398     }
399 
400     return cod.ret;
401 }
402 
403 static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
404         int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
405 {
406     uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
407     BlkioCoData cod = {
408         .coroutine = qemu_coroutine_self(),
409     };
410     BDRVBlkioState *s = bs->opaque;
411     bool use_bounce_buffer =
412         s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
413     BlkioBounceBuf bounce;
414     struct iovec *iov = qiov->iov;
415     int iovcnt = qiov->niov;
416 
417     if (use_bounce_buffer) {
418         int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
419         if (ret < 0) {
420             return ret;
421         }
422 
423         qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
424         iov = &bounce.buf;
425         iovcnt = 1;
426     }
427 
428     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
429         blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
430     }
431 
432     blkio_submit_io(bs);
433     qemu_coroutine_yield();
434 
435     if (use_bounce_buffer) {
436         blkio_free_bounce_buffer(s, &bounce);
437     }
438 
439     return cod.ret;
440 }
441 
442 static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
443 {
444     BDRVBlkioState *s = bs->opaque;
445     BlkioCoData cod = {
446         .coroutine = qemu_coroutine_self(),
447     };
448 
449     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
450         blkioq_flush(s->blkioq, &cod, 0);
451     }
452 
453     blkio_submit_io(bs);
454     qemu_coroutine_yield();
455     return cod.ret;
456 }
457 
458 static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
459     int64_t offset, int64_t bytes, BdrvRequestFlags flags)
460 {
461     BDRVBlkioState *s = bs->opaque;
462     BlkioCoData cod = {
463         .coroutine = qemu_coroutine_self(),
464     };
465     uint32_t blkio_flags = 0;
466 
467     if (flags & BDRV_REQ_FUA) {
468         blkio_flags |= BLKIO_REQ_FUA;
469     }
470     if (!(flags & BDRV_REQ_MAY_UNMAP)) {
471         blkio_flags |= BLKIO_REQ_NO_UNMAP;
472     }
473     if (flags & BDRV_REQ_NO_FALLBACK) {
474         blkio_flags |= BLKIO_REQ_NO_FALLBACK;
475     }
476 
477     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
478         blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
479     }
480 
481     blkio_submit_io(bs);
482     qemu_coroutine_yield();
483     return cod.ret;
484 }
485 
/* Outcome of blkio_mem_region_from_host() */
typedef enum {
    BMRR_OK,   /* the region was filled in and can be mapped/unmapped */
    BMRR_SKIP, /* no fd backs the memory; ignore it without reporting error */
    BMRR_FAIL, /* invalid buffer; an error has been set */
} BlkioMemRegionResult;
491 
492 /*
493  * Produce a struct blkio_mem_region for a given address and size.
494  *
495  * This function produces identical results when called multiple times with the
496  * same arguments. This property is necessary because blkio_unmap_mem_region()
497  * must receive the same struct blkio_mem_region field values that were passed
498  * to blkio_map_mem_region().
499  */
500 static BlkioMemRegionResult
501 blkio_mem_region_from_host(BlockDriverState *bs,
502                            void *host, size_t size,
503                            struct blkio_mem_region *region,
504                            Error **errp)
505 {
506     BDRVBlkioState *s = bs->opaque;
507     int fd = -1;
508     ram_addr_t fd_offset = 0;
509 
510     if (((uintptr_t)host | size) % s->mem_region_alignment) {
511         error_setg(errp, "unaligned buf %p with size %zu", host, size);
512         return BMRR_FAIL;
513     }
514 
515     /* Attempt to find the fd for the underlying memory */
516     if (s->needs_mem_region_fd) {
517         RAMBlock *ram_block;
518         RAMBlock *end_block;
519         ram_addr_t offset;
520 
521         /*
522          * bdrv_register_buf() is called with the BQL held so mr lives at least
523          * until this function returns.
524          */
525         ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
526         if (ram_block) {
527             fd = qemu_ram_get_fd(ram_block);
528         }
529         if (fd == -1) {
530             /*
531              * Ideally every RAMBlock would have an fd. pc-bios and other
532              * things don't. Luckily they are usually not I/O buffers and we
533              * can just ignore them.
534              */
535             return BMRR_SKIP;
536         }
537 
538         /* Make sure the fd covers the entire range */
539         end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
540         if (ram_block != end_block) {
541             error_setg(errp, "registered buffer at %p with size %zu extends "
542                        "beyond RAMBlock", host, size);
543             return BMRR_FAIL;
544         }
545     }
546 
547     *region = (struct blkio_mem_region){
548         .addr = host,
549         .len = size,
550         .fd = fd,
551         .fd_offset = fd_offset,
552     };
553     return BMRR_OK;
554 }
555 
556 static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
557                                Error **errp)
558 {
559     BDRVBlkioState *s = bs->opaque;
560     struct blkio_mem_region region;
561     BlkioMemRegionResult region_result;
562     int ret;
563 
564     /*
565      * Mapping memory regions conflicts with RAM discard (virtio-mem) when
566      * there is pinning, so only do it when necessary.
567      */
568     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
569         return true;
570     }
571 
572     region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
573     if (region_result == BMRR_SKIP) {
574         return true;
575     } else if (region_result != BMRR_OK) {
576         return false;
577     }
578 
579     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
580         ret = blkio_map_mem_region(s->blkio, &region);
581     }
582 
583     if (ret < 0) {
584         error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
585                    host, size, blkio_get_error_msg());
586         return false;
587     }
588     return true;
589 }
590 
591 static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
592 {
593     BDRVBlkioState *s = bs->opaque;
594     struct blkio_mem_region region;
595 
596     /* See blkio_register_buf() */
597     if (!s->needs_mem_regions && s->may_pin_mem_regions) {
598         return;
599     }
600 
601     if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
602         return;
603     }
604 
605     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
606         blkio_unmap_mem_region(s->blkio, &region);
607     }
608 }
609 
610 static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options,
611                                   int flags, Error **errp)
612 {
613     const char *filename = qdict_get_str(options, "filename");
614     BDRVBlkioState *s = bs->opaque;
615     int ret;
616 
617     ret = blkio_set_str(s->blkio, "path", filename);
618     qdict_del(options, "filename");
619     if (ret < 0) {
620         error_setg_errno(errp, -ret, "failed to set path: %s",
621                          blkio_get_error_msg());
622         return ret;
623     }
624 
625     if (flags & BDRV_O_NOCACHE) {
626         ret = blkio_set_bool(s->blkio, "direct", true);
627         if (ret < 0) {
628             error_setg_errno(errp, -ret, "failed to set direct: %s",
629                              blkio_get_error_msg());
630             return ret;
631         }
632     }
633 
634     ret = blkio_connect(s->blkio);
635     if (ret < 0) {
636         error_setg_errno(errp, -ret, "blkio_connect failed: %s",
637                          blkio_get_error_msg());
638         return ret;
639     }
640 
641     return 0;
642 }
643 
644 static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options,
645                                        int flags, Error **errp)
646 {
647     const char *path = qdict_get_try_str(options, "path");
648     BDRVBlkioState *s = bs->opaque;
649     int ret;
650 
651     if (!path) {
652         error_setg(errp, "missing 'path' option");
653         return -EINVAL;
654     }
655 
656     ret = blkio_set_str(s->blkio, "path", path);
657     qdict_del(options, "path");
658     if (ret < 0) {
659         error_setg_errno(errp, -ret, "failed to set path: %s",
660                          blkio_get_error_msg());
661         return ret;
662     }
663 
664     if (!(flags & BDRV_O_NOCACHE)) {
665         error_setg(errp, "cache.direct=off is not supported");
666         return -EINVAL;
667     }
668 
669     ret = blkio_connect(s->blkio);
670     if (ret < 0) {
671         error_setg_errno(errp, -ret, "blkio_connect failed: %s",
672                          blkio_get_error_msg());
673         return ret;
674     }
675 
676     return 0;
677 }
678 
/*
 * Configure and connect one of the virtio-blk libblkio drivers.
 *
 * Requires the "path" option and BDRV_O_NOCACHE. If the libblkio driver
 * accepts the "fd" property, the path is opened with qemu_open() and the fd
 * is passed instead of the path. Otherwise, or if that attempt fails, the
 * "path" property is used.
 *
 * The "path" option is consumed only on success.
 *
 * Returns 0 on success or a negative errno with @errp set.
 */
static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options,
                                    int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    bool fd_supported = false;
    int fd = -1, ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    /* Probe for "fd" support by setting the property to -1 */
    if (blkio_set_int(s->blkio, "fd", -1) == 0) {
        fd_supported = true;
    }

    /*
     * If the libblkio driver supports fd passing, let's always use qemu_open()
     * to open the `path`, so we can handle fd passing from the management
     * layer through the "/dev/fdset/N" special path.
     */
    if (fd_supported) {
        /*
         * `path` can contain the path of a character device
         * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket.
         *
         * So, we should always open it with O_RDWR flag, also if BDRV_O_RDWR
         * is not set in the open flags, because the exchange of IOCTL commands
         * for example will fail.
         *
         * In order to open the device read-only, we are using the `read-only`
         * property of the libblkio driver in blkio_open().
         */
        fd = qemu_open(path, O_RDWR, NULL);
        if (fd < 0) {
            /*
             * qemu_open() can fail if the user specifies a path that is not
             * a file or device, for example in the case of Unix Domain Socket
             * for the virtio-blk-vhost-user driver. In such cases let's have
             * libblkio open the path directly.
             */
            fd_supported = false;
        } else {
            ret = blkio_set_int(s->blkio, "fd", fd);
            if (ret < 0) {
                /* libblkio rejected the fd: fall back to the path */
                fd_supported = false;
                qemu_close(fd);
                fd = -1;
            }
        }
    }

    if (!fd_supported) {
        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0 && fd >= 0) {
        /* Failed to give the FD to libblkio, close it */
        qemu_close(fd);
        fd = -1;
    }

    /*
     * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208
     * (libblkio <= v1.3.0), setting the `fd` property is not enough to check
     * whether the driver supports the `fd` property or not. In that case,
     * blkio_connect() will fail with -EINVAL.
     * So let's try calling blkio_connect() again by directly setting `path`
     * to cover this scenario.
     */
    if (fd_supported && ret == -EINVAL) {
        /*
         * We need to clear the `fd` property we set previously by setting
         * it to -1.
         */
        ret = blkio_set_int(s->blkio, "fd", -1);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set fd: %s",
                             blkio_get_error_msg());
            return ret;
        }

        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }

        ret = blkio_connect(s->blkio);
    }

    /* Covers a failed first attempt as well as a failed retry */
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    qdict_del(options, "path");

    return 0;
}
793 
794 static int blkio_open(BlockDriverState *bs, QDict *options, int flags,
795                       Error **errp)
796 {
797     const char *blkio_driver = bs->drv->protocol_name;
798     BDRVBlkioState *s = bs->opaque;
799     int ret;
800 
801     ret = blkio_create(blkio_driver, &s->blkio);
802     if (ret < 0) {
803         error_setg_errno(errp, -ret, "blkio_create failed: %s",
804                          blkio_get_error_msg());
805         return ret;
806     }
807 
808     if (!(flags & BDRV_O_RDWR)) {
809         ret = blkio_set_bool(s->blkio, "read-only", true);
810         if (ret < 0) {
811             error_setg_errno(errp, -ret, "failed to set read-only: %s",
812                              blkio_get_error_msg());
813             blkio_destroy(&s->blkio);
814             return ret;
815         }
816     }
817 
818     if (strcmp(blkio_driver, "io_uring") == 0) {
819         ret = blkio_io_uring_connect(bs, options, flags, errp);
820     } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) {
821         ret = blkio_nvme_io_uring_connect(bs, options, flags, errp);
822     } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) {
823         ret = blkio_virtio_blk_connect(bs, options, flags, errp);
824     } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) {
825         ret = blkio_virtio_blk_connect(bs, options, flags, errp);
826     } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) {
827         ret = blkio_virtio_blk_connect(bs, options, flags, errp);
828     } else {
829         g_assert_not_reached();
830     }
831     if (ret < 0) {
832         blkio_destroy(&s->blkio);
833         return ret;
834     }
835 
836     ret = blkio_get_bool(s->blkio,
837                          "needs-mem-regions",
838                          &s->needs_mem_regions);
839     if (ret < 0) {
840         error_setg_errno(errp, -ret,
841                          "failed to get needs-mem-regions: %s",
842                          blkio_get_error_msg());
843         blkio_destroy(&s->blkio);
844         return ret;
845     }
846 
847     ret = blkio_get_bool(s->blkio,
848                          "needs-mem-region-fd",
849                          &s->needs_mem_region_fd);
850     if (ret < 0) {
851         error_setg_errno(errp, -ret,
852                          "failed to get needs-mem-region-fd: %s",
853                          blkio_get_error_msg());
854         blkio_destroy(&s->blkio);
855         return ret;
856     }
857 
858     ret = blkio_get_uint64(s->blkio,
859                            "mem-region-alignment",
860                            &s->mem_region_alignment);
861     if (ret < 0) {
862         error_setg_errno(errp, -ret,
863                          "failed to get mem-region-alignment: %s",
864                          blkio_get_error_msg());
865         blkio_destroy(&s->blkio);
866         return ret;
867     }
868 
869     ret = blkio_get_bool(s->blkio,
870                          "may-pin-mem-regions",
871                          &s->may_pin_mem_regions);
872     if (ret < 0) {
873         /* Be conservative (assume pinning) if the property is not supported */
874         s->may_pin_mem_regions = s->needs_mem_regions;
875     }
876 
877     /*
878      * Notify if libblkio drivers pin memory and prevent features like
879      * virtio-mem from working.
880      */
881     if (s->may_pin_mem_regions) {
882         ret = ram_block_discard_disable(true);
883         if (ret < 0) {
884             error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
885             blkio_destroy(&s->blkio);
886             return ret;
887         }
888     }
889 
890     ret = blkio_start(s->blkio);
891     if (ret < 0) {
892         error_setg_errno(errp, -ret, "blkio_start failed: %s",
893                          blkio_get_error_msg());
894         blkio_destroy(&s->blkio);
895         if (s->may_pin_mem_regions) {
896             ram_block_discard_disable(false);
897         }
898         return ret;
899     }
900 
901     bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
902     bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
903 #ifdef CONFIG_BLKIO_WRITE_ZEROS_FUA
904     bs->supported_zero_flags |= BDRV_REQ_FUA;
905 #endif
906 
907     qemu_mutex_init(&s->blkio_lock);
908     qemu_co_mutex_init(&s->bounce_lock);
909     qemu_co_queue_init(&s->bounce_available);
910     QLIST_INIT(&s->bounce_bufs);
911     s->blkioq = blkio_get_queue(s->blkio, 0);
912     s->completion_fd = blkioq_get_completion_fd(s->blkioq);
913     blkioq_set_completion_fd_enabled(s->blkioq, true);
914 
915     blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
916     return 0;
917 }
918 
919 static void blkio_close(BlockDriverState *bs)
920 {
921     BDRVBlkioState *s = bs->opaque;
922 
923     /* There is no destroy() API for s->bounce_lock */
924 
925     qemu_mutex_destroy(&s->blkio_lock);
926     blkio_detach_aio_context(bs);
927     blkio_destroy(&s->blkio);
928 
929     if (s->may_pin_mem_regions) {
930         ram_block_discard_disable(false);
931     }
932 }
933 
934 static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
935 {
936     BDRVBlkioState *s = bs->opaque;
937     uint64_t capacity;
938     int ret;
939 
940     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
941         ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
942     }
943     if (ret < 0) {
944         return -ret;
945     }
946 
947     return capacity;
948 }
949 
950 static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
951                                        bool exact, PreallocMode prealloc,
952                                        BdrvRequestFlags flags, Error **errp)
953 {
954     int64_t current_length;
955 
956     if (prealloc != PREALLOC_MODE_OFF) {
957         error_setg(errp, "Unsupported preallocation mode '%s'",
958                    PreallocMode_str(prealloc));
959         return -ENOTSUP;
960     }
961 
962     current_length = blkio_co_getlength(bs);
963 
964     if (offset > current_length) {
965         error_setg(errp, "Cannot grow device");
966         return -EINVAL;
967     } else if (exact && offset != current_length) {
968         error_setg(errp, "Cannot resize device");
969         return -ENOTSUP;
970     }
971 
972     return 0;
973 }
974 
/*
 * .bdrv_co_get_info callback.  Nothing is filled in: @bdi is left exactly as
 * the caller passed it and 0 is returned unconditionally.
 */
static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}
980 
981 static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
982 {
983     BDRVBlkioState *s = bs->opaque;
984     QEMU_LOCK_GUARD(&s->blkio_lock);
985     int value;
986     int ret;
987 
988     ret = blkio_get_int(s->blkio, "request-alignment", &value);
989     if (ret < 0) {
990         error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
991                          blkio_get_error_msg());
992         return;
993     }
994     bs->bl.request_alignment = value;
995     if (bs->bl.request_alignment < 1 ||
996         bs->bl.request_alignment >= INT_MAX ||
997         !is_power_of_2(bs->bl.request_alignment)) {
998         error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
999                    "must be a power of 2 less than INT_MAX",
1000                    bs->bl.request_alignment);
1001         return;
1002     }
1003 
1004     ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
1005     if (ret < 0) {
1006         error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
1007                          blkio_get_error_msg());
1008         return;
1009     }
1010     bs->bl.opt_transfer = value;
1011     if (bs->bl.opt_transfer > INT_MAX ||
1012         (bs->bl.opt_transfer % bs->bl.request_alignment)) {
1013         error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
1014                    "be a multiple of %" PRIu32, bs->bl.opt_transfer,
1015                    bs->bl.request_alignment);
1016         return;
1017     }
1018 
1019     ret = blkio_get_int(s->blkio, "max-transfer", &value);
1020     if (ret < 0) {
1021         error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
1022                          blkio_get_error_msg());
1023         return;
1024     }
1025     bs->bl.max_transfer = value;
1026     if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
1027         (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
1028         error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
1029                    "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
1030                    bs->bl.max_transfer, bs->bl.request_alignment,
1031                    bs->bl.opt_transfer);
1032         return;
1033     }
1034 
1035     ret = blkio_get_int(s->blkio, "buf-alignment", &value);
1036     if (ret < 0) {
1037         error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
1038                          blkio_get_error_msg());
1039         return;
1040     }
1041     if (value < 1) {
1042         error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
1043                    "positive", value);
1044         return;
1045     }
1046     bs->bl.min_mem_alignment = value;
1047 
1048     ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
1049     if (ret < 0) {
1050         error_setg_errno(errp, -ret,
1051                          "failed to get \"optimal-buf-alignment\": %s",
1052                          blkio_get_error_msg());
1053         return;
1054     }
1055     if (value < 1) {
1056         error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
1057                    "must be positive", value);
1058         return;
1059     }
1060     bs->bl.opt_mem_alignment = value;
1061 
1062     ret = blkio_get_int(s->blkio, "max-segments", &value);
1063     if (ret < 0) {
1064         error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
1065                          blkio_get_error_msg());
1066         return;
1067     }
1068     if (value < 1) {
1069         error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
1070                    value);
1071         return;
1072     }
1073     bs->bl.max_iov = value;
1074 }
1075 
1076 /*
1077  * TODO
1078  * Missing libblkio APIs:
1079  * - block_status
1080  * - co_invalidate_cache
1081  *
1082  * Out of scope?
1083  * - create
1084  * - truncate
1085  */
1086 
/*
 * BlockDriver fields shared by all the drivers defined below.  The macro
 * expands to a run of designated initializers ending in a comma, so it is
 * dropped into a BlockDriver initializer without a separator after it.
 *
 * Do not include .format_name and .protocol_name because module_block.py
 * does not parse macros in the source code.
 */
#define BLKIO_DRIVER_COMMON \
    .instance_size           = sizeof(BDRVBlkioState), \
    .bdrv_open               = blkio_open, \
    .bdrv_close              = blkio_close, \
    .bdrv_co_getlength       = blkio_co_getlength, \
    .bdrv_co_truncate        = blkio_truncate, \
    .bdrv_co_get_info        = blkio_co_get_info, \
    .bdrv_attach_aio_context = blkio_attach_aio_context, \
    .bdrv_detach_aio_context = blkio_detach_aio_context, \
    .bdrv_co_pdiscard        = blkio_co_pdiscard, \
    .bdrv_co_preadv          = blkio_co_preadv, \
    .bdrv_co_pwritev         = blkio_co_pwritev, \
    .bdrv_co_flush_to_disk   = blkio_co_flush, \
    .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
    .bdrv_refresh_limits     = blkio_refresh_limits, \
    .bdrv_register_buf       = blkio_register_buf, \
    .bdrv_unregister_buf     = blkio_unregister_buf,
1108 
1109 /*
1110  * Use the same .format_name and .protocol_name as the libblkio driver name for
1111  * consistency.
1112  */
1113 
/*
 * "io_uring" driver.  Unlike the other drivers in this file it sets
 * .bdrv_needs_filename.
 */
static BlockDriver bdrv_io_uring = {
    .format_name         = "io_uring",
    .protocol_name       = "io_uring",
    .bdrv_needs_filename = true,
    BLKIO_DRIVER_COMMON
};
1120 
/* "nvme-io_uring" driver: only the names differ from the common callbacks */
static BlockDriver bdrv_nvme_io_uring = {
    .format_name         = "nvme-io_uring",
    .protocol_name       = "nvme-io_uring",
    BLKIO_DRIVER_COMMON
};
1126 
/* "virtio-blk-vfio-pci" driver: names plus the common callbacks */
static BlockDriver bdrv_virtio_blk_vfio_pci = {
    .format_name         = "virtio-blk-vfio-pci",
    .protocol_name       = "virtio-blk-vfio-pci",
    BLKIO_DRIVER_COMMON
};
1132 
/* "virtio-blk-vhost-user" driver: names plus the common callbacks */
static BlockDriver bdrv_virtio_blk_vhost_user = {
    .format_name         = "virtio-blk-vhost-user",
    .protocol_name       = "virtio-blk-vhost-user",
    BLKIO_DRIVER_COMMON
};
1138 
/* "virtio-blk-vhost-vdpa" driver: names plus the common callbacks */
static BlockDriver bdrv_virtio_blk_vhost_vdpa = {
    .format_name         = "virtio-blk-vhost-vdpa",
    .protocol_name       = "virtio-blk-vhost-vdpa",
    BLKIO_DRIVER_COMMON
};
1144 
1145 static void bdrv_blkio_init(void)
1146 {
1147     bdrv_register(&bdrv_io_uring);
1148     bdrv_register(&bdrv_nvme_io_uring);
1149     bdrv_register(&bdrv_virtio_blk_vfio_pci);
1150     bdrv_register(&bdrv_virtio_blk_vhost_user);
1151     bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
1152 }
1153 
1154 block_init(bdrv_blkio_init);
1155