/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
#include "sysemu/block-backend.h"

#include "block/block-io.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"
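
/*
 * Illustrative example (not from this file): these names double as QEMU
 * protocol names, so a hypothetical command line using the io_uring driver
 * could look like:
 *
 *   --blockdev driver=io_uring,node-name=drive0,filename=/dev/sdb,cache.direct=on
 */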

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /* Must I/O buffers reside within registered blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /*
     * May the driver pin memory regions, making madvise(MADV_DONTNEED)-style
     * discard operations unavailable?
     */
    bool may_pin_mem_regions;
} BDRVBlkioState;
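
/*
 * Lock ordering example (a restatement of the rule above, derived from the
 * code below): blkio_alloc_bounce_buffer() holds ->bounce_lock while
 * blkio_resize_bounce_pool() acquires ->blkio_lock, so ->bounce_lock is
 * always taken first.
 */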

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
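    /*
     * Worked example (illustrative): if the pool starts at base and buffers
     * are already allocated at [base, base + 64k) and [base + 128k,
     * base + 160k), then a 64k request fits the hole at base + 64k and is
     * inserted between the two existing list entries.
     */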
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}
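
/*
 * Usage pattern (a sketch of how the I/O functions below use the helpers
 * above, not a separate API):
 *
 *   BlkioBounceBuf bounce;
 *
 *   if (blkio_alloc_bounce_buffer(s, &bounce, bytes) < 0) {
 *       ... propagate the error ...
 *   }
 *   ... submit I/O with bounce.buf as the single iovec ...
 *   blkio_free_bounce_buffer(s, &bounce);
 */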

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
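
/*
 * Note on the three callbacks above: blkio_completion_fd_poll()
 * opportunistically fetches one completion into s->poll_completion, and when
 * it returns true the event loop invokes blkio_completion_fd_poll_ready()
 * instead of waiting for the completion fd to become readable.
 */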

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context, s->completion_fd,
                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready, bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
                       NULL, NULL, NULL);
}

/*
 * Called by blk_io_unplug() or immediately if not plugged. Called without
 * blkio_lock.
 */
static void blkio_unplug_fn(void *opaque)
{
    BDRVBlkioState *s = opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

/*
 * Schedule I/O submission after enqueuing a new request. Called without
 * blkio_lock.
 */
static void blkio_submit_io(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    blk_io_plug_call(blkio_unplug_fn, s);
}
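
/*
 * Request lifecycle of the coroutine functions below (a summary of the code,
 * assuming the single queue set up in blkio_file_open()):
 *
 *   1. A blkioq_*() call enqueues the request under blkio_lock, passing a
 *      BlkioCoData pointer as user_data.
 *   2. blkio_submit_io() schedules submission; when requests are plugged,
 *      blk_io_plug_call() defers blkio_unplug_fn() so that a batch of
 *      requests is submitted with one blkioq_do_io() call.
 *   3. The coroutine yields until the completion handler fills in cod->ret
 *      and wakes it via aio_co_wake().
 */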

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so the RAMBlock is
         * guaranteed to live at least until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

static int blkio_virtio_blk_common_open(BlockDriverState *bs,
        QDict *options, int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    bool fd_supported = false;
    int fd, ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
        fd_supported = true;
    }

    /*
     * If the libblkio driver supports fd passing, let's always use qemu_open()
     * to open the `path`, so we can handle fd passing from the management
     * layer through the "/dev/fdset/N" special path.
     */
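    /*
     * Illustrative example (not from this file): a management layer can pass
     * a pre-opened fd with the QMP add-fd command and then set
     * path=/dev/fdset/<N>, which qemu_open() resolves to that fd.
     */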
    if (fd_supported) {
        int open_flags;

        if (flags & BDRV_O_RDWR) {
            open_flags = O_RDWR;
        } else {
            open_flags = O_RDONLY;
        }

        fd = qemu_open(path, open_flags, errp);
        if (fd < 0) {
            return -EINVAL;
        }

        ret = blkio_set_int(s->blkio, "fd", fd);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set fd: %s",
                             blkio_get_error_msg());
            qemu_close(fd);
            return ret;
        }
    } else {
        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    qdict_del(options, "path");

    return 0;
}

static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Pinned memory is incompatible with RAM discard, so disable RAM discard
     * if the libblkio driver may pin memory. Features that rely on RAM
     * discard, like virtio-mem, will refuse to work in this configuration.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        return ret; /* negative errno, as required by .bdrv_co_getlength() */
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name             = name, \
        .protocol_name           = name, \
        .instance_size           = sizeof(BDRVBlkioState), \
        .bdrv_file_open          = blkio_file_open, \
        .bdrv_close              = blkio_close, \
        .bdrv_co_getlength       = blkio_co_getlength, \
        .bdrv_co_truncate        = blkio_truncate, \
        .bdrv_co_get_info        = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard        = blkio_co_pdiscard, \
        .bdrv_co_preadv          = blkio_co_preadv, \
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
        __VA_ARGS__ \
    }
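
/*
 * Illustrative example (not from this file): a hypothetical command line for
 * the vhost-vdpa driver, given a vDPA block device at /dev/vhost-vdpa-0.
 * Note that cache.direct=on is required by blkio_virtio_blk_common_open():
 *
 *   --blockdev driver=virtio-blk-vhost-vdpa,node-name=drive0,path=/dev/vhost-vdpa-0,cache.direct=on
 */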

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);