xref: /openbmc/qemu/block/block-backend.c (revision 2f44bea9)
1 /*
2  * QEMU Block backends
3  *
4  * Copyright (C) 2014-2016 Red Hat, Inc.
5  *
6  * Authors:
7  *  Markus Armbruster <armbru@redhat.com>,
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1
10  * or later.  See the COPYING.LIB file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "sysemu/block-backend.h"
15 #include "block/block_int.h"
16 #include "block/blockjob.h"
17 #include "block/throttle-groups.h"
18 #include "hw/qdev-core.h"
19 #include "sysemu/blockdev.h"
20 #include "sysemu/runstate.h"
21 #include "sysemu/replay.h"
22 #include "qapi/error.h"
23 #include "qapi/qapi-events-block.h"
24 #include "qemu/id.h"
25 #include "qemu/main-loop.h"
26 #include "qemu/option.h"
27 #include "trace.h"
28 #include "migration/misc.h"
29 
/* Number of coroutines to reserve per attached device model */
#define COROUTINE_POOL_RESERVATION 64

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Forward declaration; needed by block_backend_aiocb_info below. */
static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
36 
/*
 * One registered AioContext notifier of a BlockBackend.  blk_root_attach()
 * and blk_root_detach() forward the three values below to
 * bdrv_add_aio_context_notifier() / bdrv_remove_aio_context_notifier() for
 * the node that is being attached or detached.
 */
typedef struct BlockBackendAioNotifier {
    /* Callback taking the new AioContext and @opaque */
    void (*attached_aio_context)(AioContext *new_context, void *opaque);
    /* Callback taking only @opaque */
    void (*detach_aio_context)(void *opaque);
    /* Argument handed to both callbacks */
    void *opaque;
    /* Link in BlockBackend.aio_notifiers */
    QLIST_ENTRY(BlockBackendAioNotifier) list;
} BlockBackendAioNotifier;
43 
/*
 * State of one block backend.  Instances are created by blk_new(), are
 * reference counted through blk_ref()/blk_unref() and are all linked into
 * the block_backends list.
 */
struct BlockBackend {
    char *name;                 /* monitor handle set by monitor_add_blk();
                                 * null while not referenced by the monitor */
    int refcnt;                 /* see blk_ref() and blk_unref() */
    BdrvChild *root;            /* link to the attached node, null if none */
    AioContext *ctx;            /* AioContext passed to blk_new() */
    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
    BlockBackendPublic public;  /* exposed through blk_get_public() */

    DeviceState *dev;           /* attached device model, if any */
    const BlockDevOps *dev_ops; /* device model callbacks, may be null */
    void *dev_opaque;           /* argument passed to the dev_ops callbacks */

    /* the block size for which the guest device expects atomicity */
    int guest_block_size;

    /* If the BDS tree is removed, some of its options are stored here (which
     * can be used to restore those options in the new BDS on insert) */
    BlockBackendRootState root_state;

    bool enable_write_cache;

    /* I/O stats (display with "info blockstats"). */
    BlockAcctStats stats;

    BlockdevOnError on_read_error, on_write_error;
    bool iostatus_enabled;
    BlockDeviceIoStatus iostatus;

    /* Permissions requested by the user of this BlockBackend, and the
     * permissions it allows other users of the node to hold; see
     * blk_set_perm(). */
    uint64_t perm;
    uint64_t shared_perm;
    /* While true, blk_set_perm() only records @perm/@shared_perm and does
     * not apply them to @root. */
    bool disable_perm;

    bool allow_aio_context_change;
    bool allow_write_beyond_eof;

    NotifierList remove_bs_notifiers, insert_bs_notifiers;
    QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;

    /* Non-zero while quiesced (checked by blk_set_dev_ops()) */
    int quiesce_counter;
    CoQueue queued_requests;
    bool disable_request_queuing;

    /* VM state change handler registered by blk_root_activate() while an
     * incoming migration is still running; null when none is registered. */
    VMChangeStateEntry *vmsh;
    /* Set by blk_set_force_allow_inactivate(), read by blk_can_inactivate() */
    bool force_allow_inactivate;

    /* Number of in-flight aio requests.  BlockDriverState also counts
     * in-flight requests but aio requests can exist even when blk->root is
     * NULL, so we cannot rely on its counter for that case.
     * Accessed with atomic ops.
     */
    unsigned int in_flight;
};
98 
/*
 * AIOCB type described by block_backend_aiocb_info: the generic BlockAIOCB
 * header followed by the owning BlockBackend and a result code.
 */
typedef struct BlockBackendAIOCB {
    BlockAIOCB common;
    BlockBackend *blk;
    int ret;
} BlockBackendAIOCB;
104 
/* AIOCB class description for BlockBackendAIOCB objects */
static const AIOCBInfo block_backend_aiocb_info = {
    .get_aio_context = blk_aiocb_get_aio_context,
    .aiocb_size = sizeof(BlockBackendAIOCB),
};
109 
/* Forward declarations of helpers defined further down in this file */
static void drive_info_del(DriveInfo *dinfo);
static BlockBackend *bdrv_first_blk(BlockDriverState *bs);

/* All BlockBackends (linked through BlockBackend.link, see blk_all_next()) */
static QTAILQ_HEAD(, BlockBackend) block_backends =
    QTAILQ_HEAD_INITIALIZER(block_backends);

/* All BlockBackends referenced by the monitor and which are iterated through by
 * blk_next() (linked through BlockBackend.monitor_link) */
static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
121 
122 static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
123                                      int *child_flags, QDict *child_options,
124                                      int parent_flags, QDict *parent_options)
125 {
126     /* We're not supposed to call this function for root nodes */
127     abort();
128 }
/*
 * Forward declarations of the remaining child_root callbacks; they are
 * referenced by the child_root table below.
 */
static void blk_root_drained_begin(BdrvChild *child);
static bool blk_root_drained_poll(BdrvChild *child);
static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);

static void blk_root_change_media(BdrvChild *child, bool load);
static void blk_root_resize(BdrvChild *child);

static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
                                     GSList **ignore, Error **errp);
static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
                                 GSList **ignore);
140 
141 static char *blk_root_get_parent_desc(BdrvChild *child)
142 {
143     BlockBackend *blk = child->opaque;
144     g_autofree char *dev_id = NULL;
145 
146     if (blk->name) {
147         return g_strdup_printf("block device '%s'", blk->name);
148     }
149 
150     dev_id = blk_get_attached_dev_id(blk);
151     if (*dev_id) {
152         return g_strdup_printf("block device '%s'", dev_id);
153     } else {
154         /* TODO Callback into the BB owner for something more detailed */
155         return g_strdup("an unnamed block device");
156     }
157 }
158 
159 static const char *blk_root_get_name(BdrvChild *child)
160 {
161     return blk_name(child->opaque);
162 }
163 
164 static void blk_vm_state_changed(void *opaque, bool running, RunState state)
165 {
166     Error *local_err = NULL;
167     BlockBackend *blk = opaque;
168 
169     if (state == RUN_STATE_INMIGRATE) {
170         return;
171     }
172 
173     qemu_del_vm_change_state_handler(blk->vmsh);
174     blk->vmsh = NULL;
175     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
176     if (local_err) {
177         error_report_err(local_err);
178     }
179 }
180 
181 /*
182  * Notifies the user of the BlockBackend that migration has completed. qdev
183  * devices can tighten their permissions in response (specifically revoke
184  * shared write permissions that we needed for storage migration).
185  *
186  * If an error is returned, the VM cannot be allowed to be resumed.
187  */
188 static void blk_root_activate(BdrvChild *child, Error **errp)
189 {
190     BlockBackend *blk = child->opaque;
191     Error *local_err = NULL;
192 
193     if (!blk->disable_perm) {
194         return;
195     }
196 
197     blk->disable_perm = false;
198 
199     blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
200     if (local_err) {
201         error_propagate(errp, local_err);
202         blk->disable_perm = true;
203         return;
204     }
205 
206     if (runstate_check(RUN_STATE_INMIGRATE)) {
207         /* Activation can happen when migration process is still active, for
208          * example when nbd_server_add is called during non-shared storage
209          * migration. Defer the shared_perm update to migration completion. */
210         if (!blk->vmsh) {
211             blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
212                                                          blk);
213         }
214         return;
215     }
216 
217     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
218     if (local_err) {
219         error_propagate(errp, local_err);
220         blk->disable_perm = true;
221         return;
222     }
223 }
224 
225 void blk_set_force_allow_inactivate(BlockBackend *blk)
226 {
227     blk->force_allow_inactivate = true;
228 }
229 
230 static bool blk_can_inactivate(BlockBackend *blk)
231 {
232     /* If it is a guest device, inactivate is ok. */
233     if (blk->dev || blk_name(blk)[0]) {
234         return true;
235     }
236 
237     /* Inactivating means no more writes to the image can be done,
238      * even if those writes would be changes invisible to the
239      * guest.  For block job BBs that satisfy this, we can just allow
240      * it.  This is the case for mirror job source, which is required
241      * by libvirt non-shared block migration. */
242     if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
243         return true;
244     }
245 
246     return blk->force_allow_inactivate;
247 }
248 
249 static int blk_root_inactivate(BdrvChild *child)
250 {
251     BlockBackend *blk = child->opaque;
252 
253     if (blk->disable_perm) {
254         return 0;
255     }
256 
257     if (!blk_can_inactivate(blk)) {
258         return -EPERM;
259     }
260 
261     blk->disable_perm = true;
262     if (blk->root) {
263         bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
264     }
265 
266     return 0;
267 }
268 
269 static void blk_root_attach(BdrvChild *child)
270 {
271     BlockBackend *blk = child->opaque;
272     BlockBackendAioNotifier *notifier;
273 
274     trace_blk_root_attach(child, blk, child->bs);
275 
276     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
277         bdrv_add_aio_context_notifier(child->bs,
278                 notifier->attached_aio_context,
279                 notifier->detach_aio_context,
280                 notifier->opaque);
281     }
282 }
283 
284 static void blk_root_detach(BdrvChild *child)
285 {
286     BlockBackend *blk = child->opaque;
287     BlockBackendAioNotifier *notifier;
288 
289     trace_blk_root_detach(child, blk, child->bs);
290 
291     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
292         bdrv_remove_aio_context_notifier(child->bs,
293                 notifier->attached_aio_context,
294                 notifier->detach_aio_context,
295                 notifier->opaque);
296     }
297 }
298 
299 static AioContext *blk_root_get_parent_aio_context(BdrvChild *c)
300 {
301     BlockBackend *blk = c->opaque;
302 
303     return blk_get_aio_context(blk);
304 }
305 
/*
 * Child class of the link between a BlockBackend and its root node.
 * BdrvChild objects of this class carry the BlockBackend in ->opaque;
 * bdrv_first_blk() and bdrv_is_root_node() recognize BlockBackend parents
 * by comparing a child's class against this object.
 */
static const BdrvChildClass child_root = {
    /* Aborts: must never be called for root nodes */
    .inherit_options    = blk_root_inherit_options,

    .change_media       = blk_root_change_media,
    .resize             = blk_root_resize,
    .get_name           = blk_root_get_name,
    .get_parent_desc    = blk_root_get_parent_desc,

    .drained_begin      = blk_root_drained_begin,
    .drained_poll       = blk_root_drained_poll,
    .drained_end        = blk_root_drained_end,

    .activate           = blk_root_activate,
    .inactivate         = blk_root_inactivate,

    .attach             = blk_root_attach,
    .detach             = blk_root_detach,

    .can_set_aio_ctx    = blk_root_can_set_aio_ctx,
    .set_aio_ctx        = blk_root_set_aio_ctx,

    .get_parent_aio_context = blk_root_get_parent_aio_context,
};
329 
330 /*
331  * Create a new BlockBackend with a reference count of one.
332  *
333  * @perm is a bitmasks of BLK_PERM_* constants which describes the permissions
334  * to request for a block driver node that is attached to this BlockBackend.
335  * @shared_perm is a bitmask which describes which permissions may be granted
336  * to other users of the attached node.
337  * Both sets of permissions can be changed later using blk_set_perm().
338  *
339  * Return the new BlockBackend on success, null on failure.
340  */
341 BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
342 {
343     BlockBackend *blk;
344 
345     blk = g_new0(BlockBackend, 1);
346     blk->refcnt = 1;
347     blk->ctx = ctx;
348     blk->perm = perm;
349     blk->shared_perm = shared_perm;
350     blk_set_enable_write_cache(blk, true);
351 
352     blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
353     blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
354 
355     block_acct_init(&blk->stats);
356 
357     qemu_co_queue_init(&blk->queued_requests);
358     notifier_list_init(&blk->remove_bs_notifiers);
359     notifier_list_init(&blk->insert_bs_notifiers);
360     QLIST_INIT(&blk->aio_notifiers);
361 
362     QTAILQ_INSERT_TAIL(&block_backends, blk, link);
363     return blk;
364 }
365 
366 /*
367  * Create a new BlockBackend connected to an existing BlockDriverState.
368  *
369  * @perm is a bitmasks of BLK_PERM_* constants which describes the
370  * permissions to request for @bs that is attached to this
371  * BlockBackend.  @shared_perm is a bitmask which describes which
372  * permissions may be granted to other users of the attached node.
373  * Both sets of permissions can be changed later using blk_set_perm().
374  *
375  * Return the new BlockBackend on success, null on failure.
376  */
377 BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
378                               uint64_t shared_perm, Error **errp)
379 {
380     BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
381 
382     if (blk_insert_bs(blk, bs, errp) < 0) {
383         blk_unref(blk);
384         return NULL;
385     }
386     return blk;
387 }
388 
389 /*
390  * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
391  * The new BlockBackend is in the main AioContext.
392  *
393  * Just as with bdrv_open(), after having called this function the reference to
394  * @options belongs to the block layer (even on failure).
395  *
396  * TODO: Remove @filename and @flags; it should be possible to specify a whole
397  * BDS tree just by specifying the @options QDict (or @reference,
398  * alternatively). At the time of adding this function, this is not possible,
399  * though, so callers of this function have to be able to specify @filename and
400  * @flags.
401  */
402 BlockBackend *blk_new_open(const char *filename, const char *reference,
403                            QDict *options, int flags, Error **errp)
404 {
405     BlockBackend *blk;
406     BlockDriverState *bs;
407     uint64_t perm = 0;
408     uint64_t shared = BLK_PERM_ALL;
409 
410     /*
411      * blk_new_open() is mainly used in .bdrv_create implementations and the
412      * tools where sharing isn't a major concern because the BDS stays private
413      * and the file is generally not supposed to be used by a second process,
414      * so we just request permission according to the flags.
415      *
416      * The exceptions are xen_disk and blockdev_init(); in these cases, the
417      * caller of blk_new_open() doesn't make use of the permissions, but they
418      * shouldn't hurt either. We can still share everything here because the
419      * guest devices will add their own blockers if they can't share.
420      */
421     if ((flags & BDRV_O_NO_IO) == 0) {
422         perm |= BLK_PERM_CONSISTENT_READ;
423         if (flags & BDRV_O_RDWR) {
424             perm |= BLK_PERM_WRITE;
425         }
426     }
427     if (flags & BDRV_O_RESIZE) {
428         perm |= BLK_PERM_RESIZE;
429     }
430     if (flags & BDRV_O_NO_SHARE) {
431         shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
432     }
433 
434     blk = blk_new(qemu_get_aio_context(), perm, shared);
435     bs = bdrv_open(filename, reference, options, flags, errp);
436     if (!bs) {
437         blk_unref(blk);
438         return NULL;
439     }
440 
441     blk->root = bdrv_root_attach_child(bs, "root", &child_root,
442                                        BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
443                                        perm, shared, blk, errp);
444     if (!blk->root) {
445         blk_unref(blk);
446         return NULL;
447     }
448 
449     return blk;
450 }
451 
452 static void blk_delete(BlockBackend *blk)
453 {
454     assert(!blk->refcnt);
455     assert(!blk->name);
456     assert(!blk->dev);
457     if (blk->public.throttle_group_member.throttle_state) {
458         blk_io_limits_disable(blk);
459     }
460     if (blk->root) {
461         blk_remove_bs(blk);
462     }
463     if (blk->vmsh) {
464         qemu_del_vm_change_state_handler(blk->vmsh);
465         blk->vmsh = NULL;
466     }
467     assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
468     assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
469     assert(QLIST_EMPTY(&blk->aio_notifiers));
470     QTAILQ_REMOVE(&block_backends, blk, link);
471     drive_info_del(blk->legacy_dinfo);
472     block_acct_cleanup(&blk->stats);
473     g_free(blk);
474 }
475 
476 static void drive_info_del(DriveInfo *dinfo)
477 {
478     if (!dinfo) {
479         return;
480     }
481     qemu_opts_del(dinfo->opts);
482     g_free(dinfo);
483 }
484 
485 int blk_get_refcnt(BlockBackend *blk)
486 {
487     return blk ? blk->refcnt : 0;
488 }
489 
490 /*
491  * Increment @blk's reference count.
492  * @blk must not be null.
493  */
494 void blk_ref(BlockBackend *blk)
495 {
496     assert(blk->refcnt > 0);
497     blk->refcnt++;
498 }
499 
500 /*
501  * Decrement @blk's reference count.
502  * If this drops it to zero, destroy @blk.
503  * For convenience, do nothing if @blk is null.
504  */
505 void blk_unref(BlockBackend *blk)
506 {
507     if (blk) {
508         assert(blk->refcnt > 0);
509         if (blk->refcnt > 1) {
510             blk->refcnt--;
511         } else {
512             blk_drain(blk);
513             /* blk_drain() cannot resurrect blk, nobody held a reference */
514             assert(blk->refcnt == 1);
515             blk->refcnt = 0;
516             blk_delete(blk);
517         }
518     }
519 }
520 
521 /*
522  * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
523  * ones which are hidden (i.e. are not referenced by the monitor).
524  */
525 BlockBackend *blk_all_next(BlockBackend *blk)
526 {
527     return blk ? QTAILQ_NEXT(blk, link)
528                : QTAILQ_FIRST(&block_backends);
529 }
530 
531 void blk_remove_all_bs(void)
532 {
533     BlockBackend *blk = NULL;
534 
535     while ((blk = blk_all_next(blk)) != NULL) {
536         AioContext *ctx = blk_get_aio_context(blk);
537 
538         aio_context_acquire(ctx);
539         if (blk->root) {
540             blk_remove_bs(blk);
541         }
542         aio_context_release(ctx);
543     }
544 }
545 
546 /*
547  * Return the monitor-owned BlockBackend after @blk.
548  * If @blk is null, return the first one.
549  * Else, return @blk's next sibling, which may be null.
550  *
551  * To iterate over all BlockBackends, do
552  * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
553  *     ...
554  * }
555  */
556 BlockBackend *blk_next(BlockBackend *blk)
557 {
558     return blk ? QTAILQ_NEXT(blk, monitor_link)
559                : QTAILQ_FIRST(&monitor_block_backends);
560 }
561 
562 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
563  * the monitor or attached to a BlockBackend */
564 BlockDriverState *bdrv_next(BdrvNextIterator *it)
565 {
566     BlockDriverState *bs, *old_bs;
567 
568     /* Must be called from the main loop */
569     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
570 
571     /* First, return all root nodes of BlockBackends. In order to avoid
572      * returning a BDS twice when multiple BBs refer to it, we only return it
573      * if the BB is the first one in the parent list of the BDS. */
574     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
575         BlockBackend *old_blk = it->blk;
576 
577         old_bs = old_blk ? blk_bs(old_blk) : NULL;
578 
579         do {
580             it->blk = blk_all_next(it->blk);
581             bs = it->blk ? blk_bs(it->blk) : NULL;
582         } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
583 
584         if (it->blk) {
585             blk_ref(it->blk);
586         }
587         blk_unref(old_blk);
588 
589         if (bs) {
590             bdrv_ref(bs);
591             bdrv_unref(old_bs);
592             return bs;
593         }
594         it->phase = BDRV_NEXT_MONITOR_OWNED;
595     } else {
596         old_bs = it->bs;
597     }
598 
599     /* Then return the monitor-owned BDSes without a BB attached. Ignore all
600      * BDSes that are attached to a BlockBackend here; they have been handled
601      * by the above block already */
602     do {
603         it->bs = bdrv_next_monitor_owned(it->bs);
604         bs = it->bs;
605     } while (bs && bdrv_has_blk(bs));
606 
607     if (bs) {
608         bdrv_ref(bs);
609     }
610     bdrv_unref(old_bs);
611 
612     return bs;
613 }
614 
615 static void bdrv_next_reset(BdrvNextIterator *it)
616 {
617     *it = (BdrvNextIterator) {
618         .phase = BDRV_NEXT_BACKEND_ROOTS,
619     };
620 }
621 
622 BlockDriverState *bdrv_first(BdrvNextIterator *it)
623 {
624     bdrv_next_reset(it);
625     return bdrv_next(it);
626 }
627 
628 /* Must be called when aborting a bdrv_next() iteration before
629  * bdrv_next() returns NULL */
630 void bdrv_next_cleanup(BdrvNextIterator *it)
631 {
632     /* Must be called from the main loop */
633     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
634 
635     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
636         if (it->blk) {
637             bdrv_unref(blk_bs(it->blk));
638             blk_unref(it->blk);
639         }
640     } else {
641         bdrv_unref(it->bs);
642     }
643 
644     bdrv_next_reset(it);
645 }
646 
647 /*
648  * Add a BlockBackend into the list of backends referenced by the monitor, with
649  * the given @name acting as the handle for the monitor.
650  * Strictly for use by blockdev.c.
651  *
652  * @name must not be null or empty.
653  *
654  * Returns true on success and false on failure. In the latter case, an Error
655  * object is returned through @errp.
656  */
657 bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
658 {
659     assert(!blk->name);
660     assert(name && name[0]);
661 
662     if (!id_wellformed(name)) {
663         error_setg(errp, "Invalid device name");
664         return false;
665     }
666     if (blk_by_name(name)) {
667         error_setg(errp, "Device with id '%s' already exists", name);
668         return false;
669     }
670     if (bdrv_find_node(name)) {
671         error_setg(errp,
672                    "Device name '%s' conflicts with an existing node name",
673                    name);
674         return false;
675     }
676 
677     blk->name = g_strdup(name);
678     QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
679     return true;
680 }
681 
682 /*
683  * Remove a BlockBackend from the list of backends referenced by the monitor.
684  * Strictly for use by blockdev.c.
685  */
686 void monitor_remove_blk(BlockBackend *blk)
687 {
688     if (!blk->name) {
689         return;
690     }
691 
692     QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
693     g_free(blk->name);
694     blk->name = NULL;
695 }
696 
697 /*
698  * Return @blk's name, a non-null string.
699  * Returns an empty string iff @blk is not referenced by the monitor.
700  */
701 const char *blk_name(const BlockBackend *blk)
702 {
703     return blk->name ?: "";
704 }
705 
706 /*
707  * Return the BlockBackend with name @name if it exists, else null.
708  * @name must not be null.
709  */
710 BlockBackend *blk_by_name(const char *name)
711 {
712     BlockBackend *blk = NULL;
713 
714     assert(name);
715     while ((blk = blk_next(blk)) != NULL) {
716         if (!strcmp(name, blk->name)) {
717             return blk;
718         }
719     }
720     return NULL;
721 }
722 
723 /*
724  * Return the BlockDriverState attached to @blk if any, else null.
725  */
726 BlockDriverState *blk_bs(BlockBackend *blk)
727 {
728     return blk->root ? blk->root->bs : NULL;
729 }
730 
731 static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
732 {
733     BdrvChild *child;
734     QLIST_FOREACH(child, &bs->parents, next_parent) {
735         if (child->klass == &child_root) {
736             return child->opaque;
737         }
738     }
739 
740     return NULL;
741 }
742 
743 /*
744  * Returns true if @bs has an associated BlockBackend.
745  */
746 bool bdrv_has_blk(BlockDriverState *bs)
747 {
748     return bdrv_first_blk(bs) != NULL;
749 }
750 
751 /*
752  * Returns true if @bs has only BlockBackends as parents.
753  */
754 bool bdrv_is_root_node(BlockDriverState *bs)
755 {
756     BdrvChild *c;
757 
758     QLIST_FOREACH(c, &bs->parents, next_parent) {
759         if (c->klass != &child_root) {
760             return false;
761         }
762     }
763 
764     return true;
765 }
766 
767 /*
768  * Return @blk's DriveInfo if any, else null.
769  */
770 DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
771 {
772     return blk->legacy_dinfo;
773 }
774 
775 /*
776  * Set @blk's DriveInfo to @dinfo, and return it.
777  * @blk must not have a DriveInfo set already.
778  * No other BlockBackend may have the same DriveInfo set.
779  */
780 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
781 {
782     assert(!blk->legacy_dinfo);
783     return blk->legacy_dinfo = dinfo;
784 }
785 
786 /*
787  * Return the BlockBackend with DriveInfo @dinfo.
788  * It must exist.
789  */
790 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
791 {
792     BlockBackend *blk = NULL;
793 
794     while ((blk = blk_next(blk)) != NULL) {
795         if (blk->legacy_dinfo == dinfo) {
796             return blk;
797         }
798     }
799     abort();
800 }
801 
802 /*
803  * Returns a pointer to the publicly accessible fields of @blk.
804  */
805 BlockBackendPublic *blk_get_public(BlockBackend *blk)
806 {
807     return &blk->public;
808 }
809 
810 /*
811  * Returns a BlockBackend given the associated @public fields.
812  */
813 BlockBackend *blk_by_public(BlockBackendPublic *public)
814 {
815     return container_of(public, BlockBackend, public);
816 }
817 
818 /*
819  * Disassociates the currently associated BlockDriverState from @blk.
820  */
821 void blk_remove_bs(BlockBackend *blk)
822 {
823     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
824     BlockDriverState *bs;
825     BdrvChild *root;
826 
827     notifier_list_notify(&blk->remove_bs_notifiers, blk);
828     if (tgm->throttle_state) {
829         bs = blk_bs(blk);
830         bdrv_drained_begin(bs);
831         throttle_group_detach_aio_context(tgm);
832         throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
833         bdrv_drained_end(bs);
834     }
835 
836     blk_update_root_state(blk);
837 
838     /* bdrv_root_unref_child() will cause blk->root to become stale and may
839      * switch to a completion coroutine later on. Let's drain all I/O here
840      * to avoid that and a potential QEMU crash.
841      */
842     blk_drain(blk);
843     root = blk->root;
844     blk->root = NULL;
845     bdrv_root_unref_child(root);
846 }
847 
848 /*
849  * Associates a new BlockDriverState with @blk.
850  */
851 int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
852 {
853     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
854     bdrv_ref(bs);
855     blk->root = bdrv_root_attach_child(bs, "root", &child_root,
856                                        BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
857                                        blk->perm, blk->shared_perm,
858                                        blk, errp);
859     if (blk->root == NULL) {
860         return -EPERM;
861     }
862 
863     notifier_list_notify(&blk->insert_bs_notifiers, blk);
864     if (tgm->throttle_state) {
865         throttle_group_detach_aio_context(tgm);
866         throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
867     }
868 
869     return 0;
870 }
871 
872 /*
873  * Change BlockDriverState associated with @blk.
874  */
875 int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp)
876 {
877     return bdrv_replace_child_bs(blk->root, new_bs, errp);
878 }
879 
880 /*
881  * Sets the permission bitmasks that the user of the BlockBackend needs.
882  */
883 int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
884                  Error **errp)
885 {
886     int ret;
887 
888     if (blk->root && !blk->disable_perm) {
889         ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
890         if (ret < 0) {
891             return ret;
892         }
893     }
894 
895     blk->perm = perm;
896     blk->shared_perm = shared_perm;
897 
898     return 0;
899 }
900 
901 void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
902 {
903     *perm = blk->perm;
904     *shared_perm = blk->shared_perm;
905 }
906 
907 /*
908  * Attach device model @dev to @blk.
909  * Return 0 on success, -EBUSY when a device model is attached already.
910  */
911 int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
912 {
913     if (blk->dev) {
914         return -EBUSY;
915     }
916 
917     /* While migration is still incoming, we don't need to apply the
918      * permissions of guest device BlockBackends. We might still have a block
919      * job or NBD server writing to the image for storage migration. */
920     if (runstate_check(RUN_STATE_INMIGRATE)) {
921         blk->disable_perm = true;
922     }
923 
924     blk_ref(blk);
925     blk->dev = dev;
926     blk_iostatus_reset(blk);
927 
928     return 0;
929 }
930 
931 /*
932  * Detach device model @dev from @blk.
933  * @dev must be currently attached to @blk.
934  */
935 void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
936 {
937     assert(blk->dev == dev);
938     blk->dev = NULL;
939     blk->dev_ops = NULL;
940     blk->dev_opaque = NULL;
941     blk->guest_block_size = 512;
942     blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
943     blk_unref(blk);
944 }
945 
946 /*
947  * Return the device model attached to @blk if any, else null.
948  */
949 DeviceState *blk_get_attached_dev(BlockBackend *blk)
950 {
951     return blk->dev;
952 }
953 
954 /* Return the qdev ID, or if no ID is assigned the QOM path, of the block
955  * device attached to the BlockBackend. */
956 char *blk_get_attached_dev_id(BlockBackend *blk)
957 {
958     DeviceState *dev = blk->dev;
959 
960     if (!dev) {
961         return g_strdup("");
962     } else if (dev->id) {
963         return g_strdup(dev->id);
964     }
965 
966     return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
967 }
968 
969 /*
970  * Return the BlockBackend which has the device model @dev attached if it
971  * exists, else null.
972  *
973  * @dev must not be null.
974  */
975 BlockBackend *blk_by_dev(void *dev)
976 {
977     BlockBackend *blk = NULL;
978 
979     assert(dev != NULL);
980     while ((blk = blk_all_next(blk)) != NULL) {
981         if (blk->dev == dev) {
982             return blk;
983         }
984     }
985     return NULL;
986 }
987 
988 /*
989  * Set @blk's device model callbacks to @ops.
990  * @opaque is the opaque argument to pass to the callbacks.
991  * This is for use by device models.
992  */
993 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
994                      void *opaque)
995 {
996     blk->dev_ops = ops;
997     blk->dev_opaque = opaque;
998 
999     /* Are we currently quiesced? Should we enforce this right now? */
1000     if (blk->quiesce_counter && ops->drained_begin) {
1001         ops->drained_begin(opaque);
1002     }
1003 }
1004 
1005 /*
1006  * Notify @blk's attached device model of media change.
1007  *
1008  * If @load is true, notify of media load. This action can fail, meaning that
1009  * the medium cannot be loaded. @errp is set then.
1010  *
1011  * If @load is false, notify of media eject. This can never fail.
1012  *
1013  * Also send DEVICE_TRAY_MOVED events as appropriate.
1014  */
1015 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
1016 {
1017     if (blk->dev_ops && blk->dev_ops->change_media_cb) {
1018         bool tray_was_open, tray_is_open;
1019         Error *local_err = NULL;
1020 
1021         tray_was_open = blk_dev_is_tray_open(blk);
1022         blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
1023         if (local_err) {
1024             assert(load == true);
1025             error_propagate(errp, local_err);
1026             return;
1027         }
1028         tray_is_open = blk_dev_is_tray_open(blk);
1029 
1030         if (tray_was_open != tray_is_open) {
1031             char *id = blk_get_attached_dev_id(blk);
1032             qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
1033             g_free(id);
1034         }
1035     }
1036 }
1037 
1038 static void blk_root_change_media(BdrvChild *child, bool load)
1039 {
1040     blk_dev_change_media_cb(child->opaque, load, NULL);
1041 }
1042 
1043 /*
1044  * Does @blk's attached device model have removable media?
1045  * %true if no device model is attached.
1046  */
1047 bool blk_dev_has_removable_media(BlockBackend *blk)
1048 {
1049     return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
1050 }
1051 
1052 /*
1053  * Does @blk's attached device model have a tray?
1054  */
1055 bool blk_dev_has_tray(BlockBackend *blk)
1056 {
1057     return blk->dev_ops && blk->dev_ops->is_tray_open;
1058 }
1059 
1060 /*
1061  * Notify @blk's attached device model of a media eject request.
1062  * If @force is true, the medium is about to be yanked out forcefully.
1063  */
1064 void blk_dev_eject_request(BlockBackend *blk, bool force)
1065 {
1066     if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
1067         blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
1068     }
1069 }
1070 
1071 /*
1072  * Does @blk's attached device model have a tray, and is it open?
1073  */
1074 bool blk_dev_is_tray_open(BlockBackend *blk)
1075 {
1076     if (blk_dev_has_tray(blk)) {
1077         return blk->dev_ops->is_tray_open(blk->dev_opaque);
1078     }
1079     return false;
1080 }
1081 
1082 /*
1083  * Does @blk's attached device model have the medium locked?
1084  * %false if the device model has no such lock.
1085  */
1086 bool blk_dev_is_medium_locked(BlockBackend *blk)
1087 {
1088     if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
1089         return blk->dev_ops->is_medium_locked(blk->dev_opaque);
1090     }
1091     return false;
1092 }
1093 
1094 /*
1095  * Notify @blk's attached device model of a backend size change.
1096  */
1097 static void blk_root_resize(BdrvChild *child)
1098 {
1099     BlockBackend *blk = child->opaque;
1100 
1101     if (blk->dev_ops && blk->dev_ops->resize_cb) {
1102         blk->dev_ops->resize_cb(blk->dev_opaque);
1103     }
1104 }
1105 
1106 void blk_iostatus_enable(BlockBackend *blk)
1107 {
1108     blk->iostatus_enabled = true;
1109     blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1110 }
1111 
1112 /* The I/O status is only enabled if the drive explicitly
1113  * enables it _and_ the VM is configured to stop on errors */
1114 bool blk_iostatus_is_enabled(const BlockBackend *blk)
1115 {
1116     return (blk->iostatus_enabled &&
1117            (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1118             blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1119             blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1120 }
1121 
1122 BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1123 {
1124     return blk->iostatus;
1125 }
1126 
1127 void blk_iostatus_disable(BlockBackend *blk)
1128 {
1129     blk->iostatus_enabled = false;
1130 }
1131 
1132 void blk_iostatus_reset(BlockBackend *blk)
1133 {
1134     if (blk_iostatus_is_enabled(blk)) {
1135         blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1136     }
1137 }
1138 
1139 void blk_iostatus_set_err(BlockBackend *blk, int error)
1140 {
1141     assert(blk_iostatus_is_enabled(blk));
1142     if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1143         blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1144                                           BLOCK_DEVICE_IO_STATUS_FAILED;
1145     }
1146 }
1147 
1148 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1149 {
1150     blk->allow_write_beyond_eof = allow;
1151 }
1152 
1153 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
1154 {
1155     blk->allow_aio_context_change = allow;
1156 }
1157 
1158 void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
1159 {
1160     blk->disable_request_queuing = disable;
1161 }
1162 
1163 static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
1164                                   size_t size)
1165 {
1166     int64_t len;
1167 
1168     if (size > INT_MAX) {
1169         return -EIO;
1170     }
1171 
1172     if (!blk_is_available(blk)) {
1173         return -ENOMEDIUM;
1174     }
1175 
1176     if (offset < 0) {
1177         return -EIO;
1178     }
1179 
1180     if (!blk->allow_write_beyond_eof) {
1181         len = blk_getlength(blk);
1182         if (len < 0) {
1183             return len;
1184         }
1185 
1186         if (offset > len || len - offset < size) {
1187             return -EIO;
1188         }
1189     }
1190 
1191     return 0;
1192 }
1193 
1194 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1195 static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
1196 {
1197     assert(blk->in_flight > 0);
1198 
1199     if (blk->quiesce_counter && !blk->disable_request_queuing) {
1200         blk_dec_in_flight(blk);
1201         qemu_co_queue_wait(&blk->queued_requests, NULL);
1202         blk_inc_in_flight(blk);
1203     }
1204 }
1205 
1206 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1207 static int coroutine_fn
1208 blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes,
1209               QEMUIOVector *qiov, BdrvRequestFlags flags)
1210 {
1211     int ret;
1212     BlockDriverState *bs;
1213 
1214     blk_wait_while_drained(blk);
1215 
1216     /* Call blk_bs() only after waiting, the graph may have changed */
1217     bs = blk_bs(blk);
1218     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1219 
1220     ret = blk_check_byte_request(blk, offset, bytes);
1221     if (ret < 0) {
1222         return ret;
1223     }
1224 
1225     bdrv_inc_in_flight(bs);
1226 
1227     /* throttling disk I/O */
1228     if (blk->public.throttle_group_member.throttle_state) {
1229         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1230                 bytes, false);
1231     }
1232 
1233     ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
1234     bdrv_dec_in_flight(bs);
1235     return ret;
1236 }
1237 
1238 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1239                                unsigned int bytes, QEMUIOVector *qiov,
1240                                BdrvRequestFlags flags)
1241 {
1242     int ret;
1243 
1244     blk_inc_in_flight(blk);
1245     ret = blk_do_preadv(blk, offset, bytes, qiov, flags);
1246     blk_dec_in_flight(blk);
1247 
1248     return ret;
1249 }
1250 
1251 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1252 static int coroutine_fn
1253 blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes,
1254                     QEMUIOVector *qiov, size_t qiov_offset,
1255                     BdrvRequestFlags flags)
1256 {
1257     int ret;
1258     BlockDriverState *bs;
1259 
1260     blk_wait_while_drained(blk);
1261 
1262     /* Call blk_bs() only after waiting, the graph may have changed */
1263     bs = blk_bs(blk);
1264     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1265 
1266     ret = blk_check_byte_request(blk, offset, bytes);
1267     if (ret < 0) {
1268         return ret;
1269     }
1270 
1271     bdrv_inc_in_flight(bs);
1272     /* throttling disk I/O */
1273     if (blk->public.throttle_group_member.throttle_state) {
1274         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1275                 bytes, true);
1276     }
1277 
1278     if (!blk->enable_write_cache) {
1279         flags |= BDRV_REQ_FUA;
1280     }
1281 
1282     ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
1283                                flags);
1284     bdrv_dec_in_flight(bs);
1285     return ret;
1286 }
1287 
1288 int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
1289                                      unsigned int bytes,
1290                                      QEMUIOVector *qiov, size_t qiov_offset,
1291                                      BdrvRequestFlags flags)
1292 {
1293     int ret;
1294 
1295     blk_inc_in_flight(blk);
1296     ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
1297     blk_dec_in_flight(blk);
1298 
1299     return ret;
1300 }
1301 
1302 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1303                                 unsigned int bytes, QEMUIOVector *qiov,
1304                                 BdrvRequestFlags flags)
1305 {
1306     return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
1307 }
1308 
/*
 * Argument/result bundle handed to the coroutine entry points that
 * emulate synchronous and AIO requests on a BlockBackend.
 */
typedef struct BlkRwCo {
    BlockBackend *blk;      /* backend the request is issued on */
    int64_t offset;         /* byte offset; the ioctl paths carry the
                             * request code here instead */
    void *iobuf;            /* QEMUIOVector * for the read/write/discard
                             * entries; the raw ioctl buffer for
                             * blk_aio_ioctl_entry() */
    int ret;                /* result; NOT_DONE until the entry point has
                             * stored the request's return value */
    BdrvRequestFlags flags; /* request flags passed to read/write */
} BlkRwCo;
1316 
1317 static void blk_read_entry(void *opaque)
1318 {
1319     BlkRwCo *rwco = opaque;
1320     QEMUIOVector *qiov = rwco->iobuf;
1321 
1322     rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size,
1323                               qiov, rwco->flags);
1324     aio_wait_kick();
1325 }
1326 
1327 static void blk_write_entry(void *opaque)
1328 {
1329     BlkRwCo *rwco = opaque;
1330     QEMUIOVector *qiov = rwco->iobuf;
1331 
1332     rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size,
1333                                     qiov, 0, rwco->flags);
1334     aio_wait_kick();
1335 }
1336 
1337 static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
1338                    int64_t bytes, CoroutineEntry co_entry,
1339                    BdrvRequestFlags flags)
1340 {
1341     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1342     BlkRwCo rwco = {
1343         .blk    = blk,
1344         .offset = offset,
1345         .iobuf  = &qiov,
1346         .flags  = flags,
1347         .ret    = NOT_DONE,
1348     };
1349 
1350     blk_inc_in_flight(blk);
1351     if (qemu_in_coroutine()) {
1352         /* Fast-path if already in coroutine context */
1353         co_entry(&rwco);
1354     } else {
1355         Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
1356         bdrv_coroutine_enter(blk_bs(blk), co);
1357         BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
1358     }
1359     blk_dec_in_flight(blk);
1360 
1361     return rwco.ret;
1362 }
1363 
1364 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1365                       int bytes, BdrvRequestFlags flags)
1366 {
1367     return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
1368                    flags | BDRV_REQ_ZERO_WRITE);
1369 }
1370 
1371 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1372 {
1373     return bdrv_make_zero(blk->root, flags);
1374 }
1375 
1376 void blk_inc_in_flight(BlockBackend *blk)
1377 {
1378     qatomic_inc(&blk->in_flight);
1379 }
1380 
1381 void blk_dec_in_flight(BlockBackend *blk)
1382 {
1383     qatomic_dec(&blk->in_flight);
1384     aio_wait_kick();
1385 }
1386 
1387 static void error_callback_bh(void *opaque)
1388 {
1389     struct BlockBackendAIOCB *acb = opaque;
1390 
1391     blk_dec_in_flight(acb->blk);
1392     acb->common.cb(acb->common.opaque, acb->ret);
1393     qemu_aio_unref(acb);
1394 }
1395 
1396 BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1397                                   BlockCompletionFunc *cb,
1398                                   void *opaque, int ret)
1399 {
1400     struct BlockBackendAIOCB *acb;
1401 
1402     blk_inc_in_flight(blk);
1403     acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1404     acb->blk = blk;
1405     acb->ret = ret;
1406 
1407     replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1408                                      error_callback_bh, acb);
1409     return &acb->common;
1410 }
1411 
/*
 * AIOCB used by blk_aio_prwv() to emulate AIO requests on top of the
 * coroutine-based request functions.
 */
typedef struct BlkAioEmAIOCB {
    BlockAIOCB common;  /* generic AIOCB; carries cb and its opaque */
    BlkRwCo rwco;       /* request parameters and result */
    int bytes;          /* request length in bytes, as passed to
                         * blk_aio_prwv() */
    bool has_returned;  /* set once blk_aio_prwv() has come back from
                         * entering the coroutine; blk_aio_complete()
                         * calls cb only when this is true */
} BlkAioEmAIOCB;
1418 
1419 static AioContext *blk_aio_em_aiocb_get_aio_context(BlockAIOCB *acb_)
1420 {
1421     BlkAioEmAIOCB *acb = container_of(acb_, BlkAioEmAIOCB, common);
1422 
1423     return blk_get_aio_context(acb->rwco.blk);
1424 }
1425 
1426 static const AIOCBInfo blk_aio_em_aiocb_info = {
1427     .aiocb_size         = sizeof(BlkAioEmAIOCB),
1428     .get_aio_context    = blk_aio_em_aiocb_get_aio_context,
1429 };
1430 
1431 static void blk_aio_complete(BlkAioEmAIOCB *acb)
1432 {
1433     if (acb->has_returned) {
1434         acb->common.cb(acb->common.opaque, acb->rwco.ret);
1435         blk_dec_in_flight(acb->rwco.blk);
1436         qemu_aio_unref(acb);
1437     }
1438 }
1439 
1440 static void blk_aio_complete_bh(void *opaque)
1441 {
1442     BlkAioEmAIOCB *acb = opaque;
1443     assert(acb->has_returned);
1444     blk_aio_complete(acb);
1445 }
1446 
1447 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
1448                                 void *iobuf, CoroutineEntry co_entry,
1449                                 BdrvRequestFlags flags,
1450                                 BlockCompletionFunc *cb, void *opaque)
1451 {
1452     BlkAioEmAIOCB *acb;
1453     Coroutine *co;
1454 
1455     blk_inc_in_flight(blk);
1456     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1457     acb->rwco = (BlkRwCo) {
1458         .blk    = blk,
1459         .offset = offset,
1460         .iobuf  = iobuf,
1461         .flags  = flags,
1462         .ret    = NOT_DONE,
1463     };
1464     acb->bytes = bytes;
1465     acb->has_returned = false;
1466 
1467     co = qemu_coroutine_create(co_entry, acb);
1468     bdrv_coroutine_enter(blk_bs(blk), co);
1469 
1470     acb->has_returned = true;
1471     if (acb->rwco.ret != NOT_DONE) {
1472         replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1473                                          blk_aio_complete_bh, acb);
1474     }
1475 
1476     return &acb->common;
1477 }
1478 
1479 static void blk_aio_read_entry(void *opaque)
1480 {
1481     BlkAioEmAIOCB *acb = opaque;
1482     BlkRwCo *rwco = &acb->rwco;
1483     QEMUIOVector *qiov = rwco->iobuf;
1484 
1485     assert(qiov->size == acb->bytes);
1486     rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes,
1487                               qiov, rwco->flags);
1488     blk_aio_complete(acb);
1489 }
1490 
1491 static void blk_aio_write_entry(void *opaque)
1492 {
1493     BlkAioEmAIOCB *acb = opaque;
1494     BlkRwCo *rwco = &acb->rwco;
1495     QEMUIOVector *qiov = rwco->iobuf;
1496 
1497     assert(!qiov || qiov->size == acb->bytes);
1498     rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
1499                                     qiov, 0, rwco->flags);
1500     blk_aio_complete(acb);
1501 }
1502 
1503 BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1504                                   int count, BdrvRequestFlags flags,
1505                                   BlockCompletionFunc *cb, void *opaque)
1506 {
1507     return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1508                         flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1509 }
1510 
1511 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
1512 {
1513     int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
1514     if (ret < 0) {
1515         return ret;
1516     }
1517     return count;
1518 }
1519 
1520 int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
1521                BdrvRequestFlags flags)
1522 {
1523     int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1524                       flags);
1525     if (ret < 0) {
1526         return ret;
1527     }
1528     return count;
1529 }
1530 
1531 int64_t blk_getlength(BlockBackend *blk)
1532 {
1533     if (!blk_is_available(blk)) {
1534         return -ENOMEDIUM;
1535     }
1536 
1537     return bdrv_getlength(blk_bs(blk));
1538 }
1539 
1540 void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
1541 {
1542     if (!blk_bs(blk)) {
1543         *nb_sectors_ptr = 0;
1544     } else {
1545         bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
1546     }
1547 }
1548 
1549 int64_t blk_nb_sectors(BlockBackend *blk)
1550 {
1551     if (!blk_is_available(blk)) {
1552         return -ENOMEDIUM;
1553     }
1554 
1555     return bdrv_nb_sectors(blk_bs(blk));
1556 }
1557 
1558 BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1559                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1560                            BlockCompletionFunc *cb, void *opaque)
1561 {
1562     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1563                         blk_aio_read_entry, flags, cb, opaque);
1564 }
1565 
1566 BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1567                             QEMUIOVector *qiov, BdrvRequestFlags flags,
1568                             BlockCompletionFunc *cb, void *opaque)
1569 {
1570     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1571                         blk_aio_write_entry, flags, cb, opaque);
1572 }
1573 
1574 void blk_aio_cancel(BlockAIOCB *acb)
1575 {
1576     bdrv_aio_cancel(acb);
1577 }
1578 
1579 void blk_aio_cancel_async(BlockAIOCB *acb)
1580 {
1581     bdrv_aio_cancel_async(acb);
1582 }
1583 
1584 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1585 static int coroutine_fn
1586 blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1587 {
1588     blk_wait_while_drained(blk);
1589 
1590     if (!blk_is_available(blk)) {
1591         return -ENOMEDIUM;
1592     }
1593 
1594     return bdrv_co_ioctl(blk_bs(blk), req, buf);
1595 }
1596 
1597 static void blk_ioctl_entry(void *opaque)
1598 {
1599     BlkRwCo *rwco = opaque;
1600     QEMUIOVector *qiov = rwco->iobuf;
1601 
1602     rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base);
1603     aio_wait_kick();
1604 }
1605 
1606 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1607 {
1608     return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
1609 }
1610 
1611 static void blk_aio_ioctl_entry(void *opaque)
1612 {
1613     BlkAioEmAIOCB *acb = opaque;
1614     BlkRwCo *rwco = &acb->rwco;
1615 
1616     rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
1617 
1618     blk_aio_complete(acb);
1619 }
1620 
1621 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1622                           BlockCompletionFunc *cb, void *opaque)
1623 {
1624     return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
1625 }
1626 
1627 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1628 static int coroutine_fn
1629 blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1630 {
1631     int ret;
1632 
1633     blk_wait_while_drained(blk);
1634 
1635     ret = blk_check_byte_request(blk, offset, bytes);
1636     if (ret < 0) {
1637         return ret;
1638     }
1639 
1640     return bdrv_co_pdiscard(blk->root, offset, bytes);
1641 }
1642 
1643 static void blk_aio_pdiscard_entry(void *opaque)
1644 {
1645     BlkAioEmAIOCB *acb = opaque;
1646     BlkRwCo *rwco = &acb->rwco;
1647 
1648     rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1649     blk_aio_complete(acb);
1650 }
1651 
1652 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1653                              int64_t offset, int bytes,
1654                              BlockCompletionFunc *cb, void *opaque)
1655 {
1656     return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1657                         cb, opaque);
1658 }
1659 
1660 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1661 {
1662     int ret;
1663 
1664     blk_inc_in_flight(blk);
1665     ret = blk_do_pdiscard(blk, offset, bytes);
1666     blk_dec_in_flight(blk);
1667 
1668     return ret;
1669 }
1670 
1671 static void blk_pdiscard_entry(void *opaque)
1672 {
1673     BlkRwCo *rwco = opaque;
1674     QEMUIOVector *qiov = rwco->iobuf;
1675 
1676     rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size);
1677     aio_wait_kick();
1678 }
1679 
1680 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1681 {
1682     return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
1683 }
1684 
1685 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1686 static int coroutine_fn blk_do_flush(BlockBackend *blk)
1687 {
1688     blk_wait_while_drained(blk);
1689 
1690     if (!blk_is_available(blk)) {
1691         return -ENOMEDIUM;
1692     }
1693 
1694     return bdrv_co_flush(blk_bs(blk));
1695 }
1696 
1697 static void blk_aio_flush_entry(void *opaque)
1698 {
1699     BlkAioEmAIOCB *acb = opaque;
1700     BlkRwCo *rwco = &acb->rwco;
1701 
1702     rwco->ret = blk_do_flush(rwco->blk);
1703     blk_aio_complete(acb);
1704 }
1705 
1706 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1707                           BlockCompletionFunc *cb, void *opaque)
1708 {
1709     return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1710 }
1711 
1712 int coroutine_fn blk_co_flush(BlockBackend *blk)
1713 {
1714     int ret;
1715 
1716     blk_inc_in_flight(blk);
1717     ret = blk_do_flush(blk);
1718     blk_dec_in_flight(blk);
1719 
1720     return ret;
1721 }
1722 
1723 static void blk_flush_entry(void *opaque)
1724 {
1725     BlkRwCo *rwco = opaque;
1726     rwco->ret = blk_do_flush(rwco->blk);
1727     aio_wait_kick();
1728 }
1729 
1730 int blk_flush(BlockBackend *blk)
1731 {
1732     return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
1733 }
1734 
1735 void blk_drain(BlockBackend *blk)
1736 {
1737     BlockDriverState *bs = blk_bs(blk);
1738 
1739     if (bs) {
1740         bdrv_drained_begin(bs);
1741     }
1742 
1743     /* We may have -ENOMEDIUM completions in flight */
1744     AIO_WAIT_WHILE(blk_get_aio_context(blk),
1745                    qatomic_mb_read(&blk->in_flight) > 0);
1746 
1747     if (bs) {
1748         bdrv_drained_end(bs);
1749     }
1750 }
1751 
1752 void blk_drain_all(void)
1753 {
1754     BlockBackend *blk = NULL;
1755 
1756     bdrv_drain_all_begin();
1757 
1758     while ((blk = blk_all_next(blk)) != NULL) {
1759         AioContext *ctx = blk_get_aio_context(blk);
1760 
1761         aio_context_acquire(ctx);
1762 
1763         /* We may have -ENOMEDIUM completions in flight */
1764         AIO_WAIT_WHILE(ctx, qatomic_mb_read(&blk->in_flight) > 0);
1765 
1766         aio_context_release(ctx);
1767     }
1768 
1769     bdrv_drain_all_end();
1770 }
1771 
1772 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
1773                       BlockdevOnError on_write_error)
1774 {
1775     blk->on_read_error = on_read_error;
1776     blk->on_write_error = on_write_error;
1777 }
1778 
1779 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
1780 {
1781     return is_read ? blk->on_read_error : blk->on_write_error;
1782 }
1783 
1784 BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
1785                                       int error)
1786 {
1787     BlockdevOnError on_err = blk_get_on_error(blk, is_read);
1788 
1789     switch (on_err) {
1790     case BLOCKDEV_ON_ERROR_ENOSPC:
1791         return (error == ENOSPC) ?
1792                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
1793     case BLOCKDEV_ON_ERROR_STOP:
1794         return BLOCK_ERROR_ACTION_STOP;
1795     case BLOCKDEV_ON_ERROR_REPORT:
1796         return BLOCK_ERROR_ACTION_REPORT;
1797     case BLOCKDEV_ON_ERROR_IGNORE:
1798         return BLOCK_ERROR_ACTION_IGNORE;
1799     case BLOCKDEV_ON_ERROR_AUTO:
1800     default:
1801         abort();
1802     }
1803 }
1804 
1805 static void send_qmp_error_event(BlockBackend *blk,
1806                                  BlockErrorAction action,
1807                                  bool is_read, int error)
1808 {
1809     IoOperationType optype;
1810     BlockDriverState *bs = blk_bs(blk);
1811 
1812     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
1813     qapi_event_send_block_io_error(blk_name(blk), !!bs,
1814                                    bs ? bdrv_get_node_name(bs) : NULL, optype,
1815                                    action, blk_iostatus_is_enabled(blk),
1816                                    error == ENOSPC, strerror(error));
1817 }
1818 
1819 /* This is done by device models because, while the block layer knows
1820  * about the error, it does not know whether an operation comes from
1821  * the device or the block layer (from a job, for example).
1822  */
1823 void blk_error_action(BlockBackend *blk, BlockErrorAction action,
1824                       bool is_read, int error)
1825 {
1826     assert(error >= 0);
1827 
1828     if (action == BLOCK_ERROR_ACTION_STOP) {
1829         /* First set the iostatus, so that "info block" returns an iostatus
1830          * that matches the events raised so far (an additional error iostatus
1831          * is fine, but not a lost one).
1832          */
1833         blk_iostatus_set_err(blk, error);
1834 
1835         /* Then raise the request to stop the VM and the event.
1836          * qemu_system_vmstop_request_prepare has two effects.  First,
1837          * it ensures that the STOP event always comes after the
1838          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
1839          * can observe the STOP event and do a "cont" before the STOP
1840          * event is issued, the VM will not stop.  In this case, vm_start()
1841          * also ensures that the STOP/RESUME pair of events is emitted.
1842          */
1843         qemu_system_vmstop_request_prepare();
1844         send_qmp_error_event(blk, action, is_read, error);
1845         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
1846     } else {
1847         send_qmp_error_event(blk, action, is_read, error);
1848     }
1849 }
1850 
1851 /*
1852  * Returns true if the BlockBackend can support taking write permissions
1853  * (because its root node is not read-only).
1854  */
1855 bool blk_supports_write_perm(BlockBackend *blk)
1856 {
1857     BlockDriverState *bs = blk_bs(blk);
1858 
1859     if (bs) {
1860         return !bdrv_is_read_only(bs);
1861     } else {
1862         return blk->root_state.open_flags & BDRV_O_RDWR;
1863     }
1864 }
1865 
1866 /*
1867  * Returns true if the BlockBackend can be written to in its current
1868  * configuration (i.e. if write permission have been requested)
1869  */
1870 bool blk_is_writable(BlockBackend *blk)
1871 {
1872     return blk->perm & BLK_PERM_WRITE;
1873 }
1874 
1875 bool blk_is_sg(BlockBackend *blk)
1876 {
1877     BlockDriverState *bs = blk_bs(blk);
1878 
1879     if (!bs) {
1880         return false;
1881     }
1882 
1883     return bdrv_is_sg(bs);
1884 }
1885 
1886 bool blk_enable_write_cache(BlockBackend *blk)
1887 {
1888     return blk->enable_write_cache;
1889 }
1890 
1891 void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1892 {
1893     blk->enable_write_cache = wce;
1894 }
1895 
1896 void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1897 {
1898     BlockDriverState *bs = blk_bs(blk);
1899 
1900     if (!bs) {
1901         error_setg(errp, "Device '%s' has no medium", blk->name);
1902         return;
1903     }
1904 
1905     bdrv_invalidate_cache(bs, errp);
1906 }
1907 
1908 bool blk_is_inserted(BlockBackend *blk)
1909 {
1910     BlockDriverState *bs = blk_bs(blk);
1911 
1912     return bs && bdrv_is_inserted(bs);
1913 }
1914 
1915 bool blk_is_available(BlockBackend *blk)
1916 {
1917     return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1918 }
1919 
1920 void blk_lock_medium(BlockBackend *blk, bool locked)
1921 {
1922     BlockDriverState *bs = blk_bs(blk);
1923 
1924     if (bs) {
1925         bdrv_lock_medium(bs, locked);
1926     }
1927 }
1928 
1929 void blk_eject(BlockBackend *blk, bool eject_flag)
1930 {
1931     BlockDriverState *bs = blk_bs(blk);
1932     char *id;
1933 
1934     if (bs) {
1935         bdrv_eject(bs, eject_flag);
1936     }
1937 
1938     /* Whether or not we ejected on the backend,
1939      * the frontend experienced a tray event. */
1940     id = blk_get_attached_dev_id(blk);
1941     qapi_event_send_device_tray_moved(blk_name(blk), id,
1942                                       eject_flag);
1943     g_free(id);
1944 }
1945 
1946 int blk_get_flags(BlockBackend *blk)
1947 {
1948     BlockDriverState *bs = blk_bs(blk);
1949 
1950     if (bs) {
1951         return bdrv_get_flags(bs);
1952     } else {
1953         return blk->root_state.open_flags;
1954     }
1955 }
1956 
1957 /* Returns the minimum request alignment, in bytes; guaranteed nonzero */
1958 uint32_t blk_get_request_alignment(BlockBackend *blk)
1959 {
1960     BlockDriverState *bs = blk_bs(blk);
1961     return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
1962 }
1963 
1964 /* Returns the maximum hardware transfer length, in bytes; guaranteed nonzero */
1965 uint64_t blk_get_max_hw_transfer(BlockBackend *blk)
1966 {
1967     BlockDriverState *bs = blk_bs(blk);
1968     uint64_t max = INT_MAX;
1969 
1970     if (bs) {
1971         max = MIN_NON_ZERO(max, bs->bl.max_hw_transfer);
1972         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
1973     }
1974     return ROUND_DOWN(max, blk_get_request_alignment(blk));
1975 }
1976 
1977 /* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1978 uint32_t blk_get_max_transfer(BlockBackend *blk)
1979 {
1980     BlockDriverState *bs = blk_bs(blk);
1981     uint32_t max = INT_MAX;
1982 
1983     if (bs) {
1984         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
1985     }
1986     return ROUND_DOWN(max, blk_get_request_alignment(blk));
1987 }
1988 
1989 int blk_get_max_iov(BlockBackend *blk)
1990 {
1991     return blk->root->bs->bl.max_iov;
1992 }
1993 
1994 void blk_set_guest_block_size(BlockBackend *blk, int align)
1995 {
1996     blk->guest_block_size = align;
1997 }
1998 
1999 void *blk_try_blockalign(BlockBackend *blk, size_t size)
2000 {
2001     return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
2002 }
2003 
2004 void *blk_blockalign(BlockBackend *blk, size_t size)
2005 {
2006     return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
2007 }
2008 
2009 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
2010 {
2011     BlockDriverState *bs = blk_bs(blk);
2012 
2013     if (!bs) {
2014         return false;
2015     }
2016 
2017     return bdrv_op_is_blocked(bs, op, errp);
2018 }
2019 
2020 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
2021 {
2022     BlockDriverState *bs = blk_bs(blk);
2023 
2024     if (bs) {
2025         bdrv_op_unblock(bs, op, reason);
2026     }
2027 }
2028 
2029 void blk_op_block_all(BlockBackend *blk, Error *reason)
2030 {
2031     BlockDriverState *bs = blk_bs(blk);
2032 
2033     if (bs) {
2034         bdrv_op_block_all(bs, reason);
2035     }
2036 }
2037 
2038 void blk_op_unblock_all(BlockBackend *blk, Error *reason)
2039 {
2040     BlockDriverState *bs = blk_bs(blk);
2041 
2042     if (bs) {
2043         bdrv_op_unblock_all(bs, reason);
2044     }
2045 }
2046 
2047 AioContext *blk_get_aio_context(BlockBackend *blk)
2048 {
2049     BlockDriverState *bs = blk_bs(blk);
2050 
2051     if (bs) {
2052         AioContext *ctx = bdrv_get_aio_context(blk_bs(blk));
2053         assert(ctx == blk->ctx);
2054     }
2055 
2056     return blk->ctx;
2057 }
2058 
2059 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
2060 {
2061     BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
2062     return blk_get_aio_context(blk_acb->blk);
2063 }
2064 
2065 static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
2066                                   bool update_root_node, Error **errp)
2067 {
2068     BlockDriverState *bs = blk_bs(blk);
2069     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2070     int ret;
2071 
2072     if (bs) {
2073         if (update_root_node) {
2074             ret = bdrv_child_try_set_aio_context(bs, new_context, blk->root,
2075                                                  errp);
2076             if (ret < 0) {
2077                 return ret;
2078             }
2079         }
2080         if (tgm->throttle_state) {
2081             bdrv_drained_begin(bs);
2082             throttle_group_detach_aio_context(tgm);
2083             throttle_group_attach_aio_context(tgm, new_context);
2084             bdrv_drained_end(bs);
2085         }
2086     }
2087 
2088     blk->ctx = new_context;
2089     return 0;
2090 }
2091 
2092 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2093                         Error **errp)
2094 {
2095     return blk_do_set_aio_context(blk, new_context, true, errp);
2096 }
2097 
2098 static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2099                                      GSList **ignore, Error **errp)
2100 {
2101     BlockBackend *blk = child->opaque;
2102 
2103     if (blk->allow_aio_context_change) {
2104         return true;
2105     }
2106 
2107     /* Only manually created BlockBackends that are not attached to anything
2108      * can change their AioContext without updating their user. */
2109     if (!blk->name || blk->dev) {
2110         /* TODO Add BB name/QOM path */
2111         error_setg(errp, "Cannot change iothread of active block backend");
2112         return false;
2113     }
2114 
2115     return true;
2116 }
2117 
2118 static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2119                                  GSList **ignore)
2120 {
2121     BlockBackend *blk = child->opaque;
2122     blk_do_set_aio_context(blk, ctx, false, &error_abort);
2123 }
2124 
2125 void blk_add_aio_context_notifier(BlockBackend *blk,
2126         void (*attached_aio_context)(AioContext *new_context, void *opaque),
2127         void (*detach_aio_context)(void *opaque), void *opaque)
2128 {
2129     BlockBackendAioNotifier *notifier;
2130     BlockDriverState *bs = blk_bs(blk);
2131 
2132     notifier = g_new(BlockBackendAioNotifier, 1);
2133     notifier->attached_aio_context = attached_aio_context;
2134     notifier->detach_aio_context = detach_aio_context;
2135     notifier->opaque = opaque;
2136     QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2137 
2138     if (bs) {
2139         bdrv_add_aio_context_notifier(bs, attached_aio_context,
2140                                       detach_aio_context, opaque);
2141     }
2142 }
2143 
2144 void blk_remove_aio_context_notifier(BlockBackend *blk,
2145                                      void (*attached_aio_context)(AioContext *,
2146                                                                   void *),
2147                                      void (*detach_aio_context)(void *),
2148                                      void *opaque)
2149 {
2150     BlockBackendAioNotifier *notifier;
2151     BlockDriverState *bs = blk_bs(blk);
2152 
2153     if (bs) {
2154         bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2155                                          detach_aio_context, opaque);
2156     }
2157 
2158     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2159         if (notifier->attached_aio_context == attached_aio_context &&
2160             notifier->detach_aio_context == detach_aio_context &&
2161             notifier->opaque == opaque) {
2162             QLIST_REMOVE(notifier, list);
2163             g_free(notifier);
2164             return;
2165         }
2166     }
2167 
2168     abort();
2169 }
2170 
2171 void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2172 {
2173     notifier_list_add(&blk->remove_bs_notifiers, notify);
2174 }
2175 
2176 void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2177 {
2178     notifier_list_add(&blk->insert_bs_notifiers, notify);
2179 }
2180 
2181 void blk_io_plug(BlockBackend *blk)
2182 {
2183     BlockDriverState *bs = blk_bs(blk);
2184 
2185     if (bs) {
2186         bdrv_io_plug(bs);
2187     }
2188 }
2189 
2190 void blk_io_unplug(BlockBackend *blk)
2191 {
2192     BlockDriverState *bs = blk_bs(blk);
2193 
2194     if (bs) {
2195         bdrv_io_unplug(bs);
2196     }
2197 }
2198 
2199 BlockAcctStats *blk_get_stats(BlockBackend *blk)
2200 {
2201     return &blk->stats;
2202 }
2203 
2204 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2205                   BlockCompletionFunc *cb, void *opaque)
2206 {
2207     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2208 }
2209 
2210 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2211                                       int bytes, BdrvRequestFlags flags)
2212 {
2213     return blk_co_pwritev(blk, offset, bytes, NULL,
2214                           flags | BDRV_REQ_ZERO_WRITE);
2215 }
2216 
2217 int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
2218                           int count)
2219 {
2220     return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
2221                    BDRV_REQ_WRITE_COMPRESSED);
2222 }
2223 
2224 int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
2225                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
2226 {
2227     if (!blk_is_available(blk)) {
2228         error_setg(errp, "No medium inserted");
2229         return -ENOMEDIUM;
2230     }
2231 
2232     return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp);
2233 }
2234 
2235 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2236                      int64_t pos, int size)
2237 {
2238     int ret;
2239 
2240     if (!blk_is_available(blk)) {
2241         return -ENOMEDIUM;
2242     }
2243 
2244     ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2245     if (ret < 0) {
2246         return ret;
2247     }
2248 
2249     if (ret == size && !blk->enable_write_cache) {
2250         ret = bdrv_flush(blk_bs(blk));
2251     }
2252 
2253     return ret < 0 ? ret : size;
2254 }
2255 
2256 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2257 {
2258     if (!blk_is_available(blk)) {
2259         return -ENOMEDIUM;
2260     }
2261 
2262     return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2263 }
2264 
2265 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2266 {
2267     if (!blk_is_available(blk)) {
2268         return -ENOMEDIUM;
2269     }
2270 
2271     return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2272 }
2273 
2274 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2275 {
2276     if (!blk_is_available(blk)) {
2277         return -ENOMEDIUM;
2278     }
2279 
2280     return bdrv_probe_geometry(blk_bs(blk), geo);
2281 }
2282 
2283 /*
2284  * Updates the BlockBackendRootState object with data from the currently
2285  * attached BlockDriverState.
2286  */
2287 void blk_update_root_state(BlockBackend *blk)
2288 {
2289     assert(blk->root);
2290 
2291     blk->root_state.open_flags    = blk->root->bs->open_flags;
2292     blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2293 }
2294 
2295 /*
2296  * Returns the detect-zeroes setting to be used for bdrv_open() of a
2297  * BlockDriverState which is supposed to inherit the root state.
2298  */
2299 bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2300 {
2301     return blk->root_state.detect_zeroes;
2302 }
2303 
2304 /*
2305  * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2306  * supposed to inherit the root state.
2307  */
2308 int blk_get_open_flags_from_root_state(BlockBackend *blk)
2309 {
2310     return blk->root_state.open_flags;
2311 }
2312 
2313 BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2314 {
2315     return &blk->root_state;
2316 }
2317 
2318 int blk_commit_all(void)
2319 {
2320     BlockBackend *blk = NULL;
2321 
2322     while ((blk = blk_all_next(blk)) != NULL) {
2323         AioContext *aio_context = blk_get_aio_context(blk);
2324         BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
2325 
2326         aio_context_acquire(aio_context);
2327         if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
2328             int ret;
2329 
2330             ret = bdrv_commit(unfiltered_bs);
2331             if (ret < 0) {
2332                 aio_context_release(aio_context);
2333                 return ret;
2334             }
2335         }
2336         aio_context_release(aio_context);
2337     }
2338     return 0;
2339 }
2340 
2341 
2342 /* throttling disk I/O limits */
2343 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2344 {
2345     throttle_group_config(&blk->public.throttle_group_member, cfg);
2346 }
2347 
2348 void blk_io_limits_disable(BlockBackend *blk)
2349 {
2350     BlockDriverState *bs = blk_bs(blk);
2351     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2352     assert(tgm->throttle_state);
2353     if (bs) {
2354         bdrv_drained_begin(bs);
2355     }
2356     throttle_group_unregister_tgm(tgm);
2357     if (bs) {
2358         bdrv_drained_end(bs);
2359     }
2360 }
2361 
2362 /* should be called before blk_set_io_limits if a limit is set */
2363 void blk_io_limits_enable(BlockBackend *blk, const char *group)
2364 {
2365     assert(!blk->public.throttle_group_member.throttle_state);
2366     throttle_group_register_tgm(&blk->public.throttle_group_member,
2367                                 group, blk_get_aio_context(blk));
2368 }
2369 
2370 void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2371 {
2372     /* this BB is not part of any group */
2373     if (!blk->public.throttle_group_member.throttle_state) {
2374         return;
2375     }
2376 
2377     /* this BB is a part of the same group than the one we want */
2378     if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2379                 group)) {
2380         return;
2381     }
2382 
2383     /* need to change the group this bs belong to */
2384     blk_io_limits_disable(blk);
2385     blk_io_limits_enable(blk, group);
2386 }
2387 
2388 static void blk_root_drained_begin(BdrvChild *child)
2389 {
2390     BlockBackend *blk = child->opaque;
2391     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2392 
2393     if (++blk->quiesce_counter == 1) {
2394         if (blk->dev_ops && blk->dev_ops->drained_begin) {
2395             blk->dev_ops->drained_begin(blk->dev_opaque);
2396         }
2397     }
2398 
2399     /* Note that blk->root may not be accessible here yet if we are just
2400      * attaching to a BlockDriverState that is drained. Use child instead. */
2401 
2402     if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
2403         throttle_group_restart_tgm(tgm);
2404     }
2405 }
2406 
2407 static bool blk_root_drained_poll(BdrvChild *child)
2408 {
2409     BlockBackend *blk = child->opaque;
2410     bool busy = false;
2411     assert(blk->quiesce_counter);
2412 
2413     if (blk->dev_ops && blk->dev_ops->drained_poll) {
2414         busy = blk->dev_ops->drained_poll(blk->dev_opaque);
2415     }
2416     return busy || !!blk->in_flight;
2417 }
2418 
2419 static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
2420 {
2421     BlockBackend *blk = child->opaque;
2422     assert(blk->quiesce_counter);
2423 
2424     assert(blk->public.throttle_group_member.io_limits_disabled);
2425     qatomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2426 
2427     if (--blk->quiesce_counter == 0) {
2428         if (blk->dev_ops && blk->dev_ops->drained_end) {
2429             blk->dev_ops->drained_end(blk->dev_opaque);
2430         }
2431         while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
2432             /* Resume all queued requests */
2433         }
2434     }
2435 }
2436 
2437 void blk_register_buf(BlockBackend *blk, void *host, size_t size)
2438 {
2439     bdrv_register_buf(blk_bs(blk), host, size);
2440 }
2441 
2442 void blk_unregister_buf(BlockBackend *blk, void *host)
2443 {
2444     bdrv_unregister_buf(blk_bs(blk), host);
2445 }
2446 
2447 int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2448                                    BlockBackend *blk_out, int64_t off_out,
2449                                    int bytes, BdrvRequestFlags read_flags,
2450                                    BdrvRequestFlags write_flags)
2451 {
2452     int r;
2453     r = blk_check_byte_request(blk_in, off_in, bytes);
2454     if (r) {
2455         return r;
2456     }
2457     r = blk_check_byte_request(blk_out, off_out, bytes);
2458     if (r) {
2459         return r;
2460     }
2461     return bdrv_co_copy_range(blk_in->root, off_in,
2462                               blk_out->root, off_out,
2463                               bytes, read_flags, write_flags);
2464 }
2465 
2466 const BdrvChild *blk_root(BlockBackend *blk)
2467 {
2468     return blk->root;
2469 }
2470 
2471 int blk_make_empty(BlockBackend *blk, Error **errp)
2472 {
2473     if (!blk_is_available(blk)) {
2474         error_setg(errp, "No medium inserted");
2475         return -ENOMEDIUM;
2476     }
2477 
2478     return bdrv_make_empty(blk->root, errp);
2479 }
2480