xref: /openbmc/qemu/block/block-backend.c (revision b49f4755c7fa35ea6e17e5b52c1cdaef6b4aa21c)
1 /*
2  * QEMU Block backends
3  *
4  * Copyright (C) 2014-2016 Red Hat, Inc.
5  *
6  * Authors:
7  *  Markus Armbruster <armbru@redhat.com>,
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1
10  * or later.  See the COPYING.LIB file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "sysemu/block-backend.h"
15 #include "block/block_int.h"
16 #include "block/blockjob.h"
17 #include "block/coroutines.h"
18 #include "block/throttle-groups.h"
19 #include "hw/qdev-core.h"
20 #include "sysemu/blockdev.h"
21 #include "sysemu/runstate.h"
22 #include "sysemu/replay.h"
23 #include "qapi/error.h"
24 #include "qapi/qapi-events-block.h"
25 #include "qemu/id.h"
26 #include "qemu/main-loop.h"
27 #include "qemu/option.h"
28 #include "trace.h"
29 #include "migration/misc.h"
30 
31 /* Number of coroutines to reserve per attached device model */
32 #define COROUTINE_POOL_RESERVATION 64
33 
34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
35 
36 typedef struct BlockBackendAioNotifier {
37     void (*attached_aio_context)(AioContext *new_context, void *opaque);
38     void (*detach_aio_context)(void *opaque);
39     void *opaque;
40     QLIST_ENTRY(BlockBackendAioNotifier) list;
41 } BlockBackendAioNotifier;
42 
43 struct BlockBackend {
44     char *name;
45     int refcnt;
46     BdrvChild *root;
47     AioContext *ctx;
48     DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
49     QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
50     QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
51     BlockBackendPublic public;
52 
53     DeviceState *dev;           /* attached device model, if any */
54     const BlockDevOps *dev_ops;
55     void *dev_opaque;
56 
57     /* If the BDS tree is removed, some of its options are stored here (which
58      * can be used to restore those options in the new BDS on insert) */
59     BlockBackendRootState root_state;
60 
61     bool enable_write_cache;
62 
63     /* I/O stats (display with "info blockstats"). */
64     BlockAcctStats stats;
65 
66     BlockdevOnError on_read_error, on_write_error;
67     bool iostatus_enabled;
68     BlockDeviceIoStatus iostatus;
69 
70     uint64_t perm;
71     uint64_t shared_perm;
72     bool disable_perm;
73 
74     bool allow_aio_context_change;
75     bool allow_write_beyond_eof;
76 
77     /* Protected by BQL */
78     NotifierList remove_bs_notifiers, insert_bs_notifiers;
79     QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
80 
81     int quiesce_counter; /* atomic: written under BQL, read by other threads */
82     QemuMutex queued_requests_lock; /* protects queued_requests */
83     CoQueue queued_requests;
84     bool disable_request_queuing; /* atomic */
85 
86     VMChangeStateEntry *vmsh;
87     bool force_allow_inactivate;
88 
89     /* Number of in-flight aio requests.  BlockDriverState also counts
90      * in-flight requests but aio requests can exist even when blk->root is
91      * NULL, so we cannot rely on its counter for that case.
92      * Accessed with atomic ops.
93      */
94     unsigned int in_flight;
95 };
96 
97 typedef struct BlockBackendAIOCB {
98     BlockAIOCB common;
99     BlockBackend *blk;
100     int ret;
101 } BlockBackendAIOCB;
102 
103 static const AIOCBInfo block_backend_aiocb_info = {
104     .aiocb_size = sizeof(BlockBackendAIOCB),
105 };
106 
107 static void drive_info_del(DriveInfo *dinfo);
108 static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
109 
110 /* All BlockBackends. Protected by BQL. */
111 static QTAILQ_HEAD(, BlockBackend) block_backends =
112     QTAILQ_HEAD_INITIALIZER(block_backends);
113 
114 /*
115  * All BlockBackends referenced by the monitor and which are iterated through by
116  * blk_next(). Protected by BQL.
117  */
118 static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
119     QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
120 
121 static int coroutine_mixed_fn GRAPH_RDLOCK
122 blk_set_perm_locked(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
123                     Error **errp);
124 
125 static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
126                                      int *child_flags, QDict *child_options,
127                                      int parent_flags, QDict *parent_options)
128 {
129     /* We're not supposed to call this function for root nodes */
130     abort();
131 }
132 static void blk_root_drained_begin(BdrvChild *child);
133 static bool blk_root_drained_poll(BdrvChild *child);
134 static void blk_root_drained_end(BdrvChild *child);
135 
136 static void blk_root_change_media(BdrvChild *child, bool load);
137 static void blk_root_resize(BdrvChild *child);
138 
139 static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx,
140                                     GHashTable *visited, Transaction *tran,
141                                     Error **errp);
142 
143 static char *blk_root_get_parent_desc(BdrvChild *child)
144 {
145     BlockBackend *blk = child->opaque;
146     g_autofree char *dev_id = NULL;
147 
148     if (blk->name) {
149         return g_strdup_printf("block device '%s'", blk->name);
150     }
151 
152     dev_id = blk_get_attached_dev_id(blk);
153     if (*dev_id) {
154         return g_strdup_printf("block device '%s'", dev_id);
155     } else {
156         /* TODO Callback into the BB owner for something more detailed */
157         return g_strdup("an unnamed block device");
158     }
159 }
160 
161 static const char *blk_root_get_name(BdrvChild *child)
162 {
163     return blk_name(child->opaque);
164 }
165 
166 static void blk_vm_state_changed(void *opaque, bool running, RunState state)
167 {
168     Error *local_err = NULL;
169     BlockBackend *blk = opaque;
170 
171     if (state == RUN_STATE_INMIGRATE) {
172         return;
173     }
174 
175     qemu_del_vm_change_state_handler(blk->vmsh);
176     blk->vmsh = NULL;
177     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
178     if (local_err) {
179         error_report_err(local_err);
180     }
181 }
182 
183 /*
184  * Notifies the user of the BlockBackend that migration has completed. qdev
185  * devices can tighten their permissions in response (specifically revoke
186  * shared write permissions that we needed for storage migration).
187  *
188  * If an error is returned, the VM must not be allowed to resume.
189  */
190 static void GRAPH_RDLOCK blk_root_activate(BdrvChild *child, Error **errp)
191 {
192     BlockBackend *blk = child->opaque;
193     Error *local_err = NULL;
194     uint64_t saved_shared_perm;
195 
196     if (!blk->disable_perm) {
197         return;
198     }
199 
200     blk->disable_perm = false;
201 
202     /*
203      * blk->shared_perm contains the permissions we want to share once
204      * migration is really completely done.  For now, we need to share
205      * all; but we also need to retain blk->shared_perm, which is
206      * overwritten by a successful blk_set_perm() call.  Save it and
207      * restore it below.
208      */
209     saved_shared_perm = blk->shared_perm;
210 
211     blk_set_perm_locked(blk, blk->perm, BLK_PERM_ALL, &local_err);
212     if (local_err) {
213         error_propagate(errp, local_err);
214         blk->disable_perm = true;
215         return;
216     }
217     blk->shared_perm = saved_shared_perm;
218 
219     if (runstate_check(RUN_STATE_INMIGRATE)) {
220         /* Activation can happen when the migration process is still active,
221          * for example when nbd_server_add is called during non-shared storage
222          * migration. Defer the shared_perm update to migration completion. */
223         if (!blk->vmsh) {
224             blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
225                                                          blk);
226         }
227         return;
228     }
229 
230     blk_set_perm_locked(blk, blk->perm, blk->shared_perm, &local_err);
231     if (local_err) {
232         error_propagate(errp, local_err);
233         blk->disable_perm = true;
234         return;
235     }
236 }
237 
238 void blk_set_force_allow_inactivate(BlockBackend *blk)
239 {
240     GLOBAL_STATE_CODE();
241     blk->force_allow_inactivate = true;
242 }
243 
244 static bool blk_can_inactivate(BlockBackend *blk)
245 {
246     /* If it is a guest device, inactivate is ok. */
247     if (blk->dev || blk_name(blk)[0]) {
248         return true;
249     }
250 
251     /* Inactivating means no more writes to the image can be done,
252      * even if those writes would be invisible to the
253      * guest.  For block job BBs that satisfy this, we can just allow
254      * it.  This is the case for the mirror job's source, which is
255      * required by libvirt's non-shared block migration. */
256     if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
257         return true;
258     }
259 
260     return blk->force_allow_inactivate;
261 }
262 
263 static int GRAPH_RDLOCK blk_root_inactivate(BdrvChild *child)
264 {
265     BlockBackend *blk = child->opaque;
266 
267     if (blk->disable_perm) {
268         return 0;
269     }
270 
271     if (!blk_can_inactivate(blk)) {
272         return -EPERM;
273     }
274 
275     blk->disable_perm = true;
276     if (blk->root) {
277         bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
278     }
279 
280     return 0;
281 }
282 
283 static void blk_root_attach(BdrvChild *child)
284 {
285     BlockBackend *blk = child->opaque;
286     BlockBackendAioNotifier *notifier;
287 
288     trace_blk_root_attach(child, blk, child->bs);
289 
290     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
291         bdrv_add_aio_context_notifier(child->bs,
292                 notifier->attached_aio_context,
293                 notifier->detach_aio_context,
294                 notifier->opaque);
295     }
296 }
297 
298 static void blk_root_detach(BdrvChild *child)
299 {
300     BlockBackend *blk = child->opaque;
301     BlockBackendAioNotifier *notifier;
302 
303     trace_blk_root_detach(child, blk, child->bs);
304 
305     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
306         bdrv_remove_aio_context_notifier(child->bs,
307                 notifier->attached_aio_context,
308                 notifier->detach_aio_context,
309                 notifier->opaque);
310     }
311 }
312 
313 static AioContext *blk_root_get_parent_aio_context(BdrvChild *c)
314 {
315     BlockBackend *blk = c->opaque;
316     IO_CODE();
317 
318     return blk_get_aio_context(blk);
319 }
320 
321 static const BdrvChildClass child_root = {
322     .inherit_options    = blk_root_inherit_options,
323 
324     .change_media       = blk_root_change_media,
325     .resize             = blk_root_resize,
326     .get_name           = blk_root_get_name,
327     .get_parent_desc    = blk_root_get_parent_desc,
328 
329     .drained_begin      = blk_root_drained_begin,
330     .drained_poll       = blk_root_drained_poll,
331     .drained_end        = blk_root_drained_end,
332 
333     .activate           = blk_root_activate,
334     .inactivate         = blk_root_inactivate,
335 
336     .attach             = blk_root_attach,
337     .detach             = blk_root_detach,
338 
339     .change_aio_ctx     = blk_root_change_aio_ctx,
340 
341     .get_parent_aio_context = blk_root_get_parent_aio_context,
342 };
343 
344 /*
345  * Create a new BlockBackend with a reference count of one.
346  *
347  * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
348  * to request for a block driver node that is attached to this BlockBackend.
349  * @shared_perm is a bitmask which describes which permissions may be granted
350  * to other users of the attached node.
351  * Both sets of permissions can be changed later using blk_set_perm().
352  *
353  * Return the new BlockBackend on success, null on failure.
354  */
355 BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
356 {
357     BlockBackend *blk;
358 
359     GLOBAL_STATE_CODE();
360 
361     blk = g_new0(BlockBackend, 1);
362     blk->refcnt = 1;
363     blk->ctx = ctx;
364     blk->perm = perm;
365     blk->shared_perm = shared_perm;
366     blk_set_enable_write_cache(blk, true);
367 
368     blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
369     blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
370 
371     block_acct_init(&blk->stats);
372 
373     qemu_mutex_init(&blk->queued_requests_lock);
374     qemu_co_queue_init(&blk->queued_requests);
375     notifier_list_init(&blk->remove_bs_notifiers);
376     notifier_list_init(&blk->insert_bs_notifiers);
377     QLIST_INIT(&blk->aio_notifiers);
378 
379     QTAILQ_INSERT_TAIL(&block_backends, blk, link);
380     return blk;
381 }
382 
383 /*
384  * Create a new BlockBackend connected to an existing BlockDriverState.
385  *
386  * @perm is a bitmask of BLK_PERM_* constants which describes the
387  * permissions to request for @bs that is attached to this
388  * BlockBackend.  @shared_perm is a bitmask which describes which
389  * permissions may be granted to other users of the attached node.
390  * Both sets of permissions can be changed later using blk_set_perm().
391  *
392  * Return the new BlockBackend on success, null on failure.
393  *
394  * Callers must hold the AioContext lock of @bs.
395  */
396 BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
397                               uint64_t shared_perm, Error **errp)
398 {
399     BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
400 
401     GLOBAL_STATE_CODE();
402 
403     if (blk_insert_bs(blk, bs, errp) < 0) {
404         blk_unref(blk);
405         return NULL;
406     }
407     return blk;
408 }
409 
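/*
 * Example (illustrative sketch, not part of the original file): a parent that
 * only needs to read and can tolerate any other user of the node could create
 * its backend like this, assuming @bs is an existing node whose AioContext
 * lock the caller holds as documented above:
 *
 *     BlockBackend *blk;
 *
 *     blk = blk_new_with_bs(bs, BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL, errp);
 *     if (!blk) {
 *         return -EPERM;
 *     }
 */
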
410 /*
411  * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
412  * By default, the new BlockBackend is in the main AioContext, but if the
413  * parameters connect it with any existing node in a different AioContext, it
414  * may end up there instead.
415  *
416  * Just as with bdrv_open(), after having called this function the reference to
417  * @options belongs to the block layer (even on failure).
418  *
419  * Called without holding an AioContext lock.
420  *
421  * TODO: Remove @filename and @flags; it should be possible to specify a whole
422  * BDS tree just by specifying the @options QDict (or @reference,
423  * alternatively). At the time of adding this function, this is not possible,
424  * though, so callers of this function have to be able to specify @filename and
425  * @flags.
426  */
427 BlockBackend *blk_new_open(const char *filename, const char *reference,
428                            QDict *options, int flags, Error **errp)
429 {
430     BlockBackend *blk;
431     BlockDriverState *bs;
432     uint64_t perm = 0;
433     uint64_t shared = BLK_PERM_ALL;
434 
435     GLOBAL_STATE_CODE();
436 
437     /*
438      * blk_new_open() is mainly used in .bdrv_create implementations and the
439      * tools where sharing isn't a major concern because the BDS stays private
440      * and the file is generally not supposed to be used by a second process,
441      * so we just request permission according to the flags.
442      *
443      * The exceptions are xen_disk and blockdev_init(); in these cases, the
444      * caller of blk_new_open() doesn't make use of the permissions, but they
445      * shouldn't hurt either. We can still share everything here because the
446      * guest devices will add their own blockers if they can't share.
447      */
448     if ((flags & BDRV_O_NO_IO) == 0) {
449         perm |= BLK_PERM_CONSISTENT_READ;
450         if (flags & BDRV_O_RDWR) {
451             perm |= BLK_PERM_WRITE;
452         }
453     }
454     if (flags & BDRV_O_RESIZE) {
455         perm |= BLK_PERM_RESIZE;
456     }
457     if (flags & BDRV_O_NO_SHARE) {
458         shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
459     }
460 
461     bs = bdrv_open(filename, reference, options, flags, errp);
462     if (!bs) {
463         return NULL;
464     }
465 
466     /* bdrv_open() could have moved bs to a different AioContext */
467     blk = blk_new(bdrv_get_aio_context(bs), perm, shared);
468     blk->perm = perm;
469     blk->shared_perm = shared;
470 
471     blk_insert_bs(blk, bs, errp);
472     bdrv_unref(bs);
473 
474     if (!blk->root) {
475         blk_unref(blk);
476         return NULL;
477     }
478 
479     return blk;
480 }
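
/*
 * Example (illustrative sketch, not part of the original file): a tool-style
 * caller that wants a writable backend and selects the driver through
 * @options might do something along these lines ("test.qcow2" is only a
 * placeholder filename):
 *
 *     QDict *options = qdict_new();
 *     qdict_put_str(options, "driver", "qcow2");
 *     blk = blk_new_open("test.qcow2", NULL, options, BDRV_O_RDWR,
 *                        &error_fatal);
 *
 * On success the BlockBackend's root child holds the reference to the new BDS
 * tree, so a plain blk_unref() later tears it down again.
 */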
481 
482 static void blk_delete(BlockBackend *blk)
483 {
484     assert(!blk->refcnt);
485     assert(!blk->name);
486     assert(!blk->dev);
487     if (blk->public.throttle_group_member.throttle_state) {
488         blk_io_limits_disable(blk);
489     }
490     if (blk->root) {
491         blk_remove_bs(blk);
492     }
493     if (blk->vmsh) {
494         qemu_del_vm_change_state_handler(blk->vmsh);
495         blk->vmsh = NULL;
496     }
497     assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
498     assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
499     assert(QLIST_EMPTY(&blk->aio_notifiers));
500     assert(qemu_co_queue_empty(&blk->queued_requests));
501     qemu_mutex_destroy(&blk->queued_requests_lock);
502     QTAILQ_REMOVE(&block_backends, blk, link);
503     drive_info_del(blk->legacy_dinfo);
504     block_acct_cleanup(&blk->stats);
505     g_free(blk);
506 }
507 
508 static void drive_info_del(DriveInfo *dinfo)
509 {
510     if (!dinfo) {
511         return;
512     }
513     qemu_opts_del(dinfo->opts);
514     g_free(dinfo);
515 }
516 
517 int blk_get_refcnt(BlockBackend *blk)
518 {
519     GLOBAL_STATE_CODE();
520     return blk ? blk->refcnt : 0;
521 }
522 
523 /*
524  * Increment @blk's reference count.
525  * @blk must not be null.
526  */
527 void blk_ref(BlockBackend *blk)
528 {
529     assert(blk->refcnt > 0);
530     GLOBAL_STATE_CODE();
531     blk->refcnt++;
532 }
533 
534 /*
535  * Decrement @blk's reference count.
536  * If this drops it to zero, destroy @blk.
537  * For convenience, do nothing if @blk is null.
538  */
539 void blk_unref(BlockBackend *blk)
540 {
541     GLOBAL_STATE_CODE();
542     if (blk) {
543         assert(blk->refcnt > 0);
544         if (blk->refcnt > 1) {
545             blk->refcnt--;
546         } else {
547             blk_drain(blk);
548             /* blk_drain() cannot resurrect blk, nobody held a reference */
549             assert(blk->refcnt == 1);
550             blk->refcnt = 0;
551             blk_delete(blk);
552         }
553     }
554 }
555 
556 /*
557  * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
558  * ones which are hidden (i.e. are not referenced by the monitor).
559  */
560 BlockBackend *blk_all_next(BlockBackend *blk)
561 {
562     GLOBAL_STATE_CODE();
563     return blk ? QTAILQ_NEXT(blk, link)
564                : QTAILQ_FIRST(&block_backends);
565 }
566 
567 void blk_remove_all_bs(void)
568 {
569     BlockBackend *blk = NULL;
570 
571     GLOBAL_STATE_CODE();
572 
573     while ((blk = blk_all_next(blk)) != NULL) {
574         if (blk->root) {
575             blk_remove_bs(blk);
576         }
577     }
578 }
579 
580 /*
581  * Return the monitor-owned BlockBackend after @blk.
582  * If @blk is null, return the first one.
583  * Else, return @blk's next sibling, which may be null.
584  *
585  * To iterate over all BlockBackends, do
586  * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
587  *     ...
588  * }
589  */
590 BlockBackend *blk_next(BlockBackend *blk)
591 {
592     GLOBAL_STATE_CODE();
593     return blk ? QTAILQ_NEXT(blk, monitor_link)
594                : QTAILQ_FIRST(&monitor_block_backends);
595 }
596 
597 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
598  * the monitor or attached to a BlockBackend */
599 BlockDriverState *bdrv_next(BdrvNextIterator *it)
600 {
601     BlockDriverState *bs, *old_bs;
602 
603     /* Must be called from the main loop */
604     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
605 
606     /* First, return all root nodes of BlockBackends. In order to avoid
607      * returning a BDS twice when multiple BBs refer to it, we only return it
608      * if the BB is the first one in the parent list of the BDS. */
609     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
610         BlockBackend *old_blk = it->blk;
611 
612         old_bs = old_blk ? blk_bs(old_blk) : NULL;
613 
614         do {
615             it->blk = blk_all_next(it->blk);
616             bs = it->blk ? blk_bs(it->blk) : NULL;
617         } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
618 
619         if (it->blk) {
620             blk_ref(it->blk);
621         }
622         blk_unref(old_blk);
623 
624         if (bs) {
625             bdrv_ref(bs);
626             bdrv_unref(old_bs);
627             return bs;
628         }
629         it->phase = BDRV_NEXT_MONITOR_OWNED;
630     } else {
631         old_bs = it->bs;
632     }
633 
634     /* Then return the monitor-owned BDSes without a BB attached. Ignore all
635      * BDSes that are attached to a BlockBackend here; they have been handled
636      * by the above block already */
637     do {
638         it->bs = bdrv_next_monitor_owned(it->bs);
639         bs = it->bs;
640     } while (bs && bdrv_has_blk(bs));
641 
642     if (bs) {
643         bdrv_ref(bs);
644     }
645     bdrv_unref(old_bs);
646 
647     return bs;
648 }
649 
650 static void bdrv_next_reset(BdrvNextIterator *it)
651 {
652     *it = (BdrvNextIterator) {
653         .phase = BDRV_NEXT_BACKEND_ROOTS,
654     };
655 }
656 
657 BlockDriverState *bdrv_first(BdrvNextIterator *it)
658 {
659     GLOBAL_STATE_CODE();
660     bdrv_next_reset(it);
661     return bdrv_next(it);
662 }
663 
664 /* Must be called when aborting a bdrv_next() iteration before
665  * bdrv_next() returns NULL */
666 void bdrv_next_cleanup(BdrvNextIterator *it)
667 {
668     /* Must be called from the main loop */
669     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
670 
671     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
672         if (it->blk) {
673             bdrv_unref(blk_bs(it->blk));
674             blk_unref(it->blk);
675         }
676     } else {
677         bdrv_unref(it->bs);
678     }
679 
680     bdrv_next_reset(it);
681 }
682 
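/*
 * Example (illustrative sketch, not part of the original file): the usual
 * iteration over all top-level nodes, including the early-exit case that
 * makes bdrv_next_cleanup() necessary (should_stop() is a hypothetical
 * predicate):
 *
 *     BdrvNextIterator it;
 *     BlockDriverState *bs;
 *
 *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 *         if (should_stop(bs)) {
 *             bdrv_next_cleanup(&it);
 *             break;
 *         }
 *     }
 */
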
683 /*
684  * Add a BlockBackend into the list of backends referenced by the monitor, with
685  * the given @name acting as the handle for the monitor.
686  * Strictly for use by blockdev.c.
687  *
688  * @name must not be null or empty.
689  *
690  * Returns true on success and false on failure. In the latter case, an Error
691  * object is returned through @errp.
692  */
693 bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
694 {
695     assert(!blk->name);
696     assert(name && name[0]);
697     GLOBAL_STATE_CODE();
698 
699     if (!id_wellformed(name)) {
700         error_setg(errp, "Invalid device name");
701         return false;
702     }
703     if (blk_by_name(name)) {
704         error_setg(errp, "Device with id '%s' already exists", name);
705         return false;
706     }
707     if (bdrv_find_node(name)) {
708         error_setg(errp,
709                    "Device name '%s' conflicts with an existing node name",
710                    name);
711         return false;
712     }
713 
714     blk->name = g_strdup(name);
715     QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
716     return true;
717 }
718 
719 /*
720  * Remove a BlockBackend from the list of backends referenced by the monitor.
721  * Strictly for use by blockdev.c.
722  */
723 void monitor_remove_blk(BlockBackend *blk)
724 {
725     GLOBAL_STATE_CODE();
726 
727     if (!blk->name) {
728         return;
729     }
730 
731     QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
732     g_free(blk->name);
733     blk->name = NULL;
734 }
735 
736 /*
737  * Return @blk's name, a non-null string.
738  * Returns an empty string iff @blk is not referenced by the monitor.
739  */
740 const char *blk_name(const BlockBackend *blk)
741 {
742     IO_CODE();
743     return blk->name ?: "";
744 }
745 
746 /*
747  * Return the BlockBackend with name @name if it exists, else null.
748  * @name must not be null.
749  */
750 BlockBackend *blk_by_name(const char *name)
751 {
752     BlockBackend *blk = NULL;
753 
754     GLOBAL_STATE_CODE();
755     assert(name);
756     while ((blk = blk_next(blk)) != NULL) {
757         if (!strcmp(name, blk->name)) {
758             return blk;
759         }
760     }
761     return NULL;
762 }
763 
764 /*
765  * Return the BlockDriverState attached to @blk if any, else null.
766  */
767 BlockDriverState *blk_bs(BlockBackend *blk)
768 {
769     IO_CODE();
770     return blk->root ? blk->root->bs : NULL;
771 }
772 
773 static BlockBackend * GRAPH_RDLOCK bdrv_first_blk(BlockDriverState *bs)
774 {
775     BdrvChild *child;
776 
777     GLOBAL_STATE_CODE();
778     assert_bdrv_graph_readable();
779 
780     QLIST_FOREACH(child, &bs->parents, next_parent) {
781         if (child->klass == &child_root) {
782             return child->opaque;
783         }
784     }
785 
786     return NULL;
787 }
788 
789 /*
790  * Returns true if @bs has an associated BlockBackend.
791  */
792 bool bdrv_has_blk(BlockDriverState *bs)
793 {
794     GLOBAL_STATE_CODE();
795     return bdrv_first_blk(bs) != NULL;
796 }
797 
798 /*
799  * Returns true if @bs has only BlockBackends as parents.
800  */
801 bool bdrv_is_root_node(BlockDriverState *bs)
802 {
803     BdrvChild *c;
804 
805     GLOBAL_STATE_CODE();
806     assert_bdrv_graph_readable();
807 
808     QLIST_FOREACH(c, &bs->parents, next_parent) {
809         if (c->klass != &child_root) {
810             return false;
811         }
812     }
813 
814     return true;
815 }
816 
817 /*
818  * Return @blk's DriveInfo if any, else null.
819  */
820 DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
821 {
822     GLOBAL_STATE_CODE();
823     return blk->legacy_dinfo;
824 }
825 
826 /*
827  * Set @blk's DriveInfo to @dinfo, and return it.
828  * @blk must not have a DriveInfo set already.
829  * No other BlockBackend may have the same DriveInfo set.
830  */
831 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
832 {
833     assert(!blk->legacy_dinfo);
834     GLOBAL_STATE_CODE();
835     return blk->legacy_dinfo = dinfo;
836 }
837 
838 /*
839  * Return the BlockBackend with DriveInfo @dinfo.
840  * It must exist.
841  */
842 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
843 {
844     BlockBackend *blk = NULL;
845     GLOBAL_STATE_CODE();
846 
847     while ((blk = blk_next(blk)) != NULL) {
848         if (blk->legacy_dinfo == dinfo) {
849             return blk;
850         }
851     }
852     abort();
853 }
854 
855 /*
856  * Returns a pointer to the publicly accessible fields of @blk.
857  */
858 BlockBackendPublic *blk_get_public(BlockBackend *blk)
859 {
860     GLOBAL_STATE_CODE();
861     return &blk->public;
862 }
863 
864 /*
865  * Returns a BlockBackend given the associated @public fields.
866  */
867 BlockBackend *blk_by_public(BlockBackendPublic *public)
868 {
869     GLOBAL_STATE_CODE();
870     return container_of(public, BlockBackend, public);
871 }
872 
873 /*
874  * Disassociates the currently associated BlockDriverState from @blk.
875  *
876  * The caller must hold the AioContext lock for the BlockBackend.
877  */
878 void blk_remove_bs(BlockBackend *blk)
879 {
880     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
881     BdrvChild *root;
882 
883     GLOBAL_STATE_CODE();
884 
885     notifier_list_notify(&blk->remove_bs_notifiers, blk);
886     if (tgm->throttle_state) {
887         BlockDriverState *bs = blk_bs(blk);
888 
889         /*
890          * Take a ref in case blk_bs() changes across bdrv_drained_begin(), for
891          * example, if a temporary filter node is removed by a blockjob.
892          */
893         bdrv_ref(bs);
894         bdrv_drained_begin(bs);
895         throttle_group_detach_aio_context(tgm);
896         throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
897         bdrv_drained_end(bs);
898         bdrv_unref(bs);
899     }
900 
901     blk_update_root_state(blk);
902 
903     /* bdrv_root_unref_child() will cause blk->root to become stale and may
904      * switch to a completion coroutine later on. Let's drain all I/O here
905      * to avoid that and a potential QEMU crash.
906      */
907     blk_drain(blk);
908     root = blk->root;
909     blk->root = NULL;
910 
911     bdrv_graph_wrlock();
912     bdrv_root_unref_child(root);
913     bdrv_graph_wrunlock();
914 }
915 
916 /*
917  * Associates a new BlockDriverState with @blk.
918  *
919  * Callers must hold the AioContext lock of @bs.
920  */
921 int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
922 {
923     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
924 
925     GLOBAL_STATE_CODE();
926     bdrv_ref(bs);
927     bdrv_graph_wrlock();
928     blk->root = bdrv_root_attach_child(bs, "root", &child_root,
929                                        BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
930                                        blk->perm, blk->shared_perm,
931                                        blk, errp);
932     bdrv_graph_wrunlock();
933     if (blk->root == NULL) {
934         return -EPERM;
935     }
936 
937     notifier_list_notify(&blk->insert_bs_notifiers, blk);
938     if (tgm->throttle_state) {
939         throttle_group_detach_aio_context(tgm);
940         throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
941     }
942 
943     return 0;
944 }
945 
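/*
 * Example (illustrative sketch, not part of the original file): a medium
 * change can be modelled as detaching the old node and attaching a new one;
 * @new_bs is assumed to be an already opened node:
 *
 *     if (blk_bs(blk)) {
 *         blk_remove_bs(blk);
 *     }
 *     if (blk_insert_bs(blk, new_bs, errp) < 0) {
 *         return;
 *     }
 */
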
946 /*
947  * Change BlockDriverState associated with @blk.
948  */
949 int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp)
950 {
951     GLOBAL_STATE_CODE();
952     return bdrv_replace_child_bs(blk->root, new_bs, errp);
953 }
954 
955 /*
956  * Sets the permission bitmasks that the user of the BlockBackend needs.
957  */
958 static int coroutine_mixed_fn GRAPH_RDLOCK
959 blk_set_perm_locked(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
960                     Error **errp)
961 {
962     int ret;
963     GLOBAL_STATE_CODE();
964 
965     if (blk->root && !blk->disable_perm) {
966         ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
967         if (ret < 0) {
968             return ret;
969         }
970     }
971 
972     blk->perm = perm;
973     blk->shared_perm = shared_perm;
974 
975     return 0;
976 }
977 
978 int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
979                  Error **errp)
980 {
981     GLOBAL_STATE_CODE();
982     GRAPH_RDLOCK_GUARD_MAINLOOP();
983 
984     return blk_set_perm_locked(blk, perm, shared_perm, errp);
985 }
986 
987 void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
988 {
989     GLOBAL_STATE_CODE();
990     *perm = blk->perm;
991     *shared_perm = blk->shared_perm;
992 }
993 
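/*
 * Example (illustrative sketch, not part of the original file): a user that
 * temporarily shared writes (e.g. during storage migration) can tighten its
 * permissions again once that is no longer needed:
 *
 *     uint64_t perm, shared;
 *
 *     blk_get_perm(blk, &perm, &shared);
 *     if (blk_set_perm(blk, perm, shared & ~BLK_PERM_WRITE, errp) < 0) {
 *         return;
 *     }
 *
 * A negative return value means the stricter masks conflict with another
 * user of the node; @errp is set in that case.
 */
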
994 /*
995  * Attach device model @dev to @blk.
996  * Return 0 on success, -EBUSY when a device model is attached already.
997  */
998 int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
999 {
1000     GLOBAL_STATE_CODE();
1001     if (blk->dev) {
1002         return -EBUSY;
1003     }
1004 
1005     /* While migration is still incoming, we don't need to apply the
1006      * permissions of guest device BlockBackends. We might still have a block
1007      * job or NBD server writing to the image for storage migration. */
1008     if (runstate_check(RUN_STATE_INMIGRATE)) {
1009         blk->disable_perm = true;
1010     }
1011 
1012     blk_ref(blk);
1013     blk->dev = dev;
1014     blk_iostatus_reset(blk);
1015 
1016     return 0;
1017 }
1018 
1019 /*
1020  * Detach device model @dev from @blk.
1021  * @dev must be currently attached to @blk.
1022  */
1023 void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
1024 {
1025     assert(blk->dev == dev);
1026     GLOBAL_STATE_CODE();
1027     blk->dev = NULL;
1028     blk->dev_ops = NULL;
1029     blk->dev_opaque = NULL;
1030     blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
1031     blk_unref(blk);
1032 }
1033 
1034 /*
1035  * Return the device model attached to @blk if any, else null.
1036  */
1037 DeviceState *blk_get_attached_dev(BlockBackend *blk)
1038 {
1039     GLOBAL_STATE_CODE();
1040     return blk->dev;
1041 }
1042 
1043 /* Return the qdev ID of the block device attached to the BlockBackend, or,
1044  * if no ID is assigned, its QOM path. */
1045 char *blk_get_attached_dev_id(BlockBackend *blk)
1046 {
1047     DeviceState *dev = blk->dev;
1048     IO_CODE();
1049 
1050     if (!dev) {
1051         return g_strdup("");
1052     } else if (dev->id) {
1053         return g_strdup(dev->id);
1054     }
1055 
1056     return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
1057 }
1058 
1059 /*
1060  * Return the BlockBackend which has the device model @dev attached if it
1061  * exists, else null.
1062  *
1063  * @dev must not be null.
1064  */
1065 BlockBackend *blk_by_dev(void *dev)
1066 {
1067     BlockBackend *blk = NULL;
1068 
1069     GLOBAL_STATE_CODE();
1070 
1071     assert(dev != NULL);
1072     while ((blk = blk_all_next(blk)) != NULL) {
1073         if (blk->dev == dev) {
1074             return blk;
1075         }
1076     }
1077     return NULL;
1078 }
1079 
1080 /*
1081  * Set @blk's device model callbacks to @ops.
1082  * @opaque is the opaque argument to pass to the callbacks.
1083  * This is for use by device models.
1084  */
1085 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
1086                      void *opaque)
1087 {
1088     GLOBAL_STATE_CODE();
1089     blk->dev_ops = ops;
1090     blk->dev_opaque = opaque;
1091 
1092     /* Are we currently quiesced? Should we enforce this right now? */
1093     if (qatomic_read(&blk->quiesce_counter) && ops && ops->drained_begin) {
1094         ops->drained_begin(opaque);
1095     }
1096 }
1097 
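/*
 * Example (illustrative sketch, not part of the original file): a device model
 * typically attaches itself and then registers its callbacks; the callback
 * names below are placeholders for the device's own functions:
 *
 *     static const BlockDevOps my_dev_block_ops = {
 *         .resize_cb     = my_dev_resize_cb,
 *         .drained_begin = my_dev_drained_begin,
 *     };
 *
 *     if (blk_attach_dev(blk, dev) < 0) {
 *         return -EBUSY;
 *     }
 *     blk_set_dev_ops(blk, &my_dev_block_ops, dev);
 */
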
1098 /*
1099  * Notify @blk's attached device model of media change.
1100  *
1101  * If @load is true, notify of media load. This action can fail, in which
1102  * case the medium is not loaded and @errp is set.
1103  *
1104  * If @load is false, notify of media eject. This can never fail.
1105  *
1106  * Also send DEVICE_TRAY_MOVED events as appropriate.
1107  */
1108 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
1109 {
1110     GLOBAL_STATE_CODE();
1111     if (blk->dev_ops && blk->dev_ops->change_media_cb) {
1112         bool tray_was_open, tray_is_open;
1113         Error *local_err = NULL;
1114 
1115         tray_was_open = blk_dev_is_tray_open(blk);
1116         blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
1117         if (local_err) {
1118             assert(load == true);
1119             error_propagate(errp, local_err);
1120             return;
1121         }
1122         tray_is_open = blk_dev_is_tray_open(blk);
1123 
1124         if (tray_was_open != tray_is_open) {
1125             char *id = blk_get_attached_dev_id(blk);
1126             qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
1127             g_free(id);
1128         }
1129     }
1130 }
1131 
1132 static void blk_root_change_media(BdrvChild *child, bool load)
1133 {
1134     blk_dev_change_media_cb(child->opaque, load, NULL);
1135 }
1136 
1137 /*
1138  * Does @blk's attached device model have removable media?
1139  * %true if no device model is attached.
1140  */
1141 bool blk_dev_has_removable_media(BlockBackend *blk)
1142 {
1143     GLOBAL_STATE_CODE();
1144     return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
1145 }
1146 
1147 /*
1148  * Does @blk's attached device model have a tray?
1149  */
1150 bool blk_dev_has_tray(BlockBackend *blk)
1151 {
1152     IO_CODE();
1153     return blk->dev_ops && blk->dev_ops->is_tray_open;
1154 }
1155 
1156 /*
1157  * Notify @blk's attached device model of a media eject request.
1158  * If @force is true, the medium is about to be yanked out forcefully.
1159  */
1160 void blk_dev_eject_request(BlockBackend *blk, bool force)
1161 {
1162     GLOBAL_STATE_CODE();
1163     if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
1164         blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
1165     }
1166 }
1167 
1168 /*
1169  * Does @blk's attached device model have a tray, and is it open?
1170  */
1171 bool blk_dev_is_tray_open(BlockBackend *blk)
1172 {
1173     IO_CODE();
1174     if (blk_dev_has_tray(blk)) {
1175         return blk->dev_ops->is_tray_open(blk->dev_opaque);
1176     }
1177     return false;
1178 }
1179 
1180 /*
1181  * Does @blk's attached device model have the medium locked?
1182  * %false if the device model has no such lock.
1183  */
1184 bool blk_dev_is_medium_locked(BlockBackend *blk)
1185 {
1186     GLOBAL_STATE_CODE();
1187     if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
1188         return blk->dev_ops->is_medium_locked(blk->dev_opaque);
1189     }
1190     return false;
1191 }
1192 
1193 /*
1194  * Notify @blk's attached device model of a backend size change.
1195  */
1196 static void blk_root_resize(BdrvChild *child)
1197 {
1198     BlockBackend *blk = child->opaque;
1199 
1200     if (blk->dev_ops && blk->dev_ops->resize_cb) {
1201         blk->dev_ops->resize_cb(blk->dev_opaque);
1202     }
1203 }
1204 
1205 void blk_iostatus_enable(BlockBackend *blk)
1206 {
1207     GLOBAL_STATE_CODE();
1208     blk->iostatus_enabled = true;
1209     blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1210 }
1211 
1212 /* The I/O status is only enabled if the drive explicitly
1213  * enables it _and_ the VM is configured to stop on errors */
1214 bool blk_iostatus_is_enabled(const BlockBackend *blk)
1215 {
1216     IO_CODE();
1217     return (blk->iostatus_enabled &&
1218            (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1219             blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1220             blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1221 }
1222 
1223 BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1224 {
1225     GLOBAL_STATE_CODE();
1226     return blk->iostatus;
1227 }
1228 
1229 void blk_iostatus_disable(BlockBackend *blk)
1230 {
1231     GLOBAL_STATE_CODE();
1232     blk->iostatus_enabled = false;
1233 }
1234 
1235 void blk_iostatus_reset(BlockBackend *blk)
1236 {
1237     GLOBAL_STATE_CODE();
1238     if (blk_iostatus_is_enabled(blk)) {
1239         blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1240     }
1241 }
1242 
1243 void blk_iostatus_set_err(BlockBackend *blk, int error)
1244 {
1245     IO_CODE();
1246     assert(blk_iostatus_is_enabled(blk));
1247     if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1248         blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1249                                           BLOCK_DEVICE_IO_STATUS_FAILED;
1250     }
1251 }
1252 
1253 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1254 {
1255     IO_CODE();
1256     blk->allow_write_beyond_eof = allow;
1257 }
1258 
1259 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
1260 {
1261     IO_CODE();
1262     blk->allow_aio_context_change = allow;
1263 }
1264 
1265 void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
1266 {
1267     IO_CODE();
1268     qatomic_set(&blk->disable_request_queuing, disable);
1269 }
1270 
1271 static int coroutine_fn GRAPH_RDLOCK
1272 blk_check_byte_request(BlockBackend *blk, int64_t offset, int64_t bytes)
1273 {
1274     int64_t len;
1275 
1276     if (bytes < 0) {
1277         return -EIO;
1278     }
1279 
1280     if (!blk_co_is_available(blk)) {
1281         return -ENOMEDIUM;
1282     }
1283 
1284     if (offset < 0) {
1285         return -EIO;
1286     }
1287 
1288     if (!blk->allow_write_beyond_eof) {
1289         len = bdrv_co_getlength(blk_bs(blk));
1290         if (len < 0) {
1291             return len;
1292         }
1293 
1294         if (offset > len || len - offset < bytes) {
1295             return -EIO;
1296         }
1297     }
1298 
1299     return 0;
1300 }
1301 
1302 /* Are we currently in a drained section? */
1303 bool blk_in_drain(BlockBackend *blk)
1304 {
1305     GLOBAL_STATE_CODE(); /* change to IO_OR_GS_CODE(), if necessary */
1306     return qatomic_read(&blk->quiesce_counter);
1307 }
1308 
1309 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1310 static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
1311 {
1312     assert(blk->in_flight > 0);
1313 
1314     if (qatomic_read(&blk->quiesce_counter) &&
1315         !qatomic_read(&blk->disable_request_queuing)) {
1316         /*
1317          * Take lock before decrementing in flight counter so main loop thread
1318          * waits for us to enqueue ourselves before it can leave the drained
1319          * section.
1320          */
1321         qemu_mutex_lock(&blk->queued_requests_lock);
1322         blk_dec_in_flight(blk);
1323         qemu_co_queue_wait(&blk->queued_requests, &blk->queued_requests_lock);
1324         blk_inc_in_flight(blk);
1325         qemu_mutex_unlock(&blk->queued_requests_lock);
1326     }
1327 }
1328 
1329 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1330 static int coroutine_fn
1331 blk_co_do_preadv_part(BlockBackend *blk, int64_t offset, int64_t bytes,
1332                       QEMUIOVector *qiov, size_t qiov_offset,
1333                       BdrvRequestFlags flags)
1334 {
1335     int ret;
1336     BlockDriverState *bs;
1337     IO_CODE();
1338 
1339     blk_wait_while_drained(blk);
1340     GRAPH_RDLOCK_GUARD();
1341 
1342     /* Call blk_bs() only after waiting, the graph may have changed */
1343     bs = blk_bs(blk);
1344     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1345 
1346     ret = blk_check_byte_request(blk, offset, bytes);
1347     if (ret < 0) {
1348         return ret;
1349     }
1350 
1351     bdrv_inc_in_flight(bs);
1352 
1353     /* throttling disk I/O */
1354     if (blk->public.throttle_group_member.throttle_state) {
1355         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1356                 bytes, THROTTLE_READ);
1357     }
1358 
1359     ret = bdrv_co_preadv_part(blk->root, offset, bytes, qiov, qiov_offset,
1360                               flags);
1361     bdrv_dec_in_flight(bs);
1362     return ret;
1363 }
1364 
1365 int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, int64_t bytes,
1366                               void *buf, BdrvRequestFlags flags)
1367 {
1368     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1369     IO_OR_GS_CODE();
1370 
1371     assert(bytes <= SIZE_MAX);
1372 
1373     return blk_co_preadv(blk, offset, bytes, &qiov, flags);
1374 }
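
/*
 * Example (illustrative sketch, not part of the original file): from coroutine
 * context a read is a plain synchronous-looking call; errors come back as
 * negative errno values (my_read_header() is a hypothetical helper):
 *
 *     static int coroutine_fn my_read_header(BlockBackend *blk, void *buf)
 *     {
 *         return blk_co_pread(blk, 0, 512, buf, 0);
 *     }
 */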
1375 
1376 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1377                                int64_t bytes, QEMUIOVector *qiov,
1378                                BdrvRequestFlags flags)
1379 {
1380     int ret;
1381     IO_OR_GS_CODE();
1382 
1383     blk_inc_in_flight(blk);
1384     ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, 0, flags);
1385     blk_dec_in_flight(blk);
1386 
1387     return ret;
1388 }
1389 
1390 int coroutine_fn blk_co_preadv_part(BlockBackend *blk, int64_t offset,
1391                                     int64_t bytes, QEMUIOVector *qiov,
1392                                     size_t qiov_offset, BdrvRequestFlags flags)
1393 {
1394     int ret;
1395     IO_OR_GS_CODE();
1396 
1397     blk_inc_in_flight(blk);
1398     ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, qiov_offset, flags);
1399     blk_dec_in_flight(blk);
1400 
1401     return ret;
1402 }
1403 
1404 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1405 static int coroutine_fn
1406 blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, int64_t bytes,
1407                        QEMUIOVector *qiov, size_t qiov_offset,
1408                        BdrvRequestFlags flags)
1409 {
1410     int ret;
1411     BlockDriverState *bs;
1412     IO_CODE();
1413 
1414     blk_wait_while_drained(blk);
1415     GRAPH_RDLOCK_GUARD();
1416 
1417     /* Call blk_bs() only after waiting, the graph may have changed */
1418     bs = blk_bs(blk);
1419     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1420 
1421     ret = blk_check_byte_request(blk, offset, bytes);
1422     if (ret < 0) {
1423         return ret;
1424     }
1425 
1426     bdrv_inc_in_flight(bs);
1427     /* throttling disk I/O */
1428     if (blk->public.throttle_group_member.throttle_state) {
1429         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1430                 bytes, THROTTLE_WRITE);
1431     }
1432 
1433     if (!blk->enable_write_cache) {
1434         flags |= BDRV_REQ_FUA;
1435     }
1436 
1437     ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
1438                                flags);
1439     bdrv_dec_in_flight(bs);
1440     return ret;
1441 }
1442 
1443 int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
1444                                      int64_t bytes,
1445                                      QEMUIOVector *qiov, size_t qiov_offset,
1446                                      BdrvRequestFlags flags)
1447 {
1448     int ret;
1449     IO_OR_GS_CODE();
1450 
1451     blk_inc_in_flight(blk);
1452     ret = blk_co_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
1453     blk_dec_in_flight(blk);
1454 
1455     return ret;
1456 }
1457 
1458 int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, int64_t bytes,
1459                                const void *buf, BdrvRequestFlags flags)
1460 {
1461     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1462     IO_OR_GS_CODE();
1463 
1464     assert(bytes <= SIZE_MAX);
1465 
1466     return blk_co_pwritev(blk, offset, bytes, &qiov, flags);
1467 }
1468 
1469 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1470                                 int64_t bytes, QEMUIOVector *qiov,
1471                                 BdrvRequestFlags flags)
1472 {
1473     IO_OR_GS_CODE();
1474     return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
1475 }
1476 
1477 int coroutine_fn blk_co_block_status_above(BlockBackend *blk,
1478                                            BlockDriverState *base,
1479                                            int64_t offset, int64_t bytes,
1480                                            int64_t *pnum, int64_t *map,
1481                                            BlockDriverState **file)
1482 {
1483     IO_CODE();
1484     GRAPH_RDLOCK_GUARD();
1485     return bdrv_co_block_status_above(blk_bs(blk), base, offset, bytes, pnum,
1486                                       map, file);
1487 }
1488 
1489 int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk,
1490                                            BlockDriverState *base,
1491                                            bool include_base, int64_t offset,
1492                                            int64_t bytes, int64_t *pnum)
1493 {
1494     IO_CODE();
1495     GRAPH_RDLOCK_GUARD();
1496     return bdrv_co_is_allocated_above(blk_bs(blk), base, include_base, offset,
1497                                       bytes, pnum);
1498 }
1499 
1500 typedef struct BlkRwCo {
1501     BlockBackend *blk;
1502     int64_t offset;
1503     void *iobuf;
1504     int ret;
1505     BdrvRequestFlags flags;
1506 } BlkRwCo;
1507 
1508 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1509 {
1510     GLOBAL_STATE_CODE();
1511     return bdrv_make_zero(blk->root, flags);
1512 }
1513 
1514 void blk_inc_in_flight(BlockBackend *blk)
1515 {
1516     IO_CODE();
1517     qatomic_inc(&blk->in_flight);
1518 }
1519 
1520 void blk_dec_in_flight(BlockBackend *blk)
1521 {
1522     IO_CODE();
1523     qatomic_dec(&blk->in_flight);
1524     aio_wait_kick();
1525 }
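
/*
 * Example (illustrative sketch, not part of the original file): the public
 * wrappers in this file bracket the internal blk_co_do_*() helpers with this
 * counter, as blk_co_preadv() above does; the shape of that pattern is:
 *
 *     blk_inc_in_flight(blk);
 *     ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, 0, flags);
 *     blk_dec_in_flight(blk);
 *
 * The drain logic waits for this counter to drop back to zero, hence the
 * "exactly one pair" requirement noted above those helpers.
 */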
1526 
1527 static void error_callback_bh(void *opaque)
1528 {
1529     struct BlockBackendAIOCB *acb = opaque;
1530 
1531     blk_dec_in_flight(acb->blk);
1532     acb->common.cb(acb->common.opaque, acb->ret);
1533     qemu_aio_unref(acb);
1534 }
1535 
1536 BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1537                                   BlockCompletionFunc *cb,
1538                                   void *opaque, int ret)
1539 {
1540     struct BlockBackendAIOCB *acb;
1541     IO_CODE();
1542 
1543     blk_inc_in_flight(blk);
1544     acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1545     acb->blk = blk;
1546     acb->ret = ret;
1547 
1548     replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
1549                                      error_callback_bh, acb);
1550     return &acb->common;
1551 }
1552 
1553 typedef struct BlkAioEmAIOCB {
1554     BlockAIOCB common;
1555     BlkRwCo rwco;
1556     int64_t bytes;
1557     bool has_returned;
1558 } BlkAioEmAIOCB;
1559 
1560 static const AIOCBInfo blk_aio_em_aiocb_info = {
1561     .aiocb_size         = sizeof(BlkAioEmAIOCB),
1562 };
1563 
1564 static void blk_aio_complete(BlkAioEmAIOCB *acb)
1565 {
1566     if (acb->has_returned) {
1567         acb->common.cb(acb->common.opaque, acb->rwco.ret);
1568         blk_dec_in_flight(acb->rwco.blk);
1569         qemu_aio_unref(acb);
1570     }
1571 }
1572 
1573 static void blk_aio_complete_bh(void *opaque)
1574 {
1575     BlkAioEmAIOCB *acb = opaque;
1576     assert(acb->has_returned);
1577     blk_aio_complete(acb);
1578 }
1579 
1580 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset,
1581                                 int64_t bytes,
1582                                 void *iobuf, CoroutineEntry co_entry,
1583                                 BdrvRequestFlags flags,
1584                                 BlockCompletionFunc *cb, void *opaque)
1585 {
1586     BlkAioEmAIOCB *acb;
1587     Coroutine *co;
1588 
1589     blk_inc_in_flight(blk);
1590     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1591     acb->rwco = (BlkRwCo) {
1592         .blk    = blk,
1593         .offset = offset,
1594         .iobuf  = iobuf,
1595         .flags  = flags,
1596         .ret    = NOT_DONE,
1597     };
1598     acb->bytes = bytes;
1599     acb->has_returned = false;
1600 
1601     co = qemu_coroutine_create(co_entry, acb);
1602     aio_co_enter(qemu_get_current_aio_context(), co);
1603 
1604     acb->has_returned = true;
1605     if (acb->rwco.ret != NOT_DONE) {
1606         replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
1607                                          blk_aio_complete_bh, acb);
1608     }
1609 
1610     return &acb->common;
1611 }
1612 
1613 static void coroutine_fn blk_aio_read_entry(void *opaque)
1614 {
1615     BlkAioEmAIOCB *acb = opaque;
1616     BlkRwCo *rwco = &acb->rwco;
1617     QEMUIOVector *qiov = rwco->iobuf;
1618 
1619     assert(qiov->size == acb->bytes);
1620     rwco->ret = blk_co_do_preadv_part(rwco->blk, rwco->offset, acb->bytes, qiov,
1621                                       0, rwco->flags);
1622     blk_aio_complete(acb);
1623 }
1624 
1625 static void coroutine_fn blk_aio_write_entry(void *opaque)
1626 {
1627     BlkAioEmAIOCB *acb = opaque;
1628     BlkRwCo *rwco = &acb->rwco;
1629     QEMUIOVector *qiov = rwco->iobuf;
1630 
1631     assert(!qiov || qiov->size == acb->bytes);
1632     rwco->ret = blk_co_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
1633                                        qiov, 0, rwco->flags);
1634     blk_aio_complete(acb);
1635 }
1636 
1637 BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1638                                   int64_t bytes, BdrvRequestFlags flags,
1639                                   BlockCompletionFunc *cb, void *opaque)
1640 {
1641     IO_CODE();
1642     return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_write_entry,
1643                         flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1644 }
1645 
1646 int64_t coroutine_fn blk_co_getlength(BlockBackend *blk)
1647 {
1648     IO_CODE();
1649     GRAPH_RDLOCK_GUARD();
1650 
1651     if (!blk_co_is_available(blk)) {
1652         return -ENOMEDIUM;
1653     }
1654 
1655     return bdrv_co_getlength(blk_bs(blk));
1656 }
1657 
1658 int64_t coroutine_fn blk_co_nb_sectors(BlockBackend *blk)
1659 {
1660     BlockDriverState *bs = blk_bs(blk);
1661 
1662     IO_CODE();
1663     GRAPH_RDLOCK_GUARD();
1664 
1665     if (!bs) {
1666         return -ENOMEDIUM;
1667     } else {
1668         return bdrv_co_nb_sectors(bs);
1669     }
1670 }
1671 
1672 /*
1673  * This wrapper is written by hand because this function is on the hot I/O
1674  * path, reached via blk_get_geometry().
1675  */
1676 int64_t coroutine_mixed_fn blk_nb_sectors(BlockBackend *blk)
1677 {
1678     BlockDriverState *bs = blk_bs(blk);
1679 
1680     IO_CODE();
1681 
1682     if (!bs) {
1683         return -ENOMEDIUM;
1684     } else {
1685         return bdrv_nb_sectors(bs);
1686     }
1687 }
1688 
1689 /* Return 0 as the number of sectors if no device is present or on error */
1690 void coroutine_fn blk_co_get_geometry(BlockBackend *blk,
1691                                       uint64_t *nb_sectors_ptr)
1692 {
1693     int64_t ret = blk_co_nb_sectors(blk);
1694     *nb_sectors_ptr = ret < 0 ? 0 : ret;
1695 }
1696 
1697 /*
1698  * This wrapper is written by hand because this function is in the hot I/O path.
1699  */
1700 void coroutine_mixed_fn blk_get_geometry(BlockBackend *blk,
1701                                          uint64_t *nb_sectors_ptr)
1702 {
1703     int64_t ret = blk_nb_sectors(blk);
1704     *nb_sectors_ptr = ret < 0 ? 0 : ret;
1705 }
1706 
1707 BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1708                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1709                            BlockCompletionFunc *cb, void *opaque)
1710 {
1711     IO_CODE();
1712     assert((uint64_t)qiov->size <= INT64_MAX);
1713     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1714                         blk_aio_read_entry, flags, cb, opaque);
1715 }
1716 
1717 BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1718                             QEMUIOVector *qiov, BdrvRequestFlags flags,
1719                             BlockCompletionFunc *cb, void *opaque)
1720 {
1721     IO_CODE();
1722     assert((uint64_t)qiov->size <= INT64_MAX);
1723     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1724                         blk_aio_write_entry, flags, cb, opaque);
1725 }
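
/*
 * Example (illustrative sketch, not part of the original file): non-coroutine
 * callers use the AIO variants and get completion through a callback;
 * my_read_done() and my_dev are placeholders for the caller's own callback
 * and opaque pointer:
 *
 *     static void my_read_done(void *opaque, int ret)
 *     {
 *         ... ret is 0 on success or a negative errno ...
 *     }
 *
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *
 *     blk_aio_preadv(blk, 0, &qiov, 0, my_read_done, my_dev);
 */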
1726 
1727 void blk_aio_cancel(BlockAIOCB *acb)
1728 {
1729     GLOBAL_STATE_CODE();
1730     bdrv_aio_cancel(acb);
1731 }
1732 
1733 void blk_aio_cancel_async(BlockAIOCB *acb)
1734 {
1735     IO_CODE();
1736     bdrv_aio_cancel_async(acb);
1737 }
1738 
1739 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1740 static int coroutine_fn
1741 blk_co_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1742 {
1743     IO_CODE();
1744 
1745     blk_wait_while_drained(blk);
1746     GRAPH_RDLOCK_GUARD();
1747 
1748     if (!blk_co_is_available(blk)) {
1749         return -ENOMEDIUM;
1750     }
1751 
1752     return bdrv_co_ioctl(blk_bs(blk), req, buf);
1753 }
1754 
1755 int coroutine_fn blk_co_ioctl(BlockBackend *blk, unsigned long int req,
1756                               void *buf)
1757 {
1758     int ret;
1759     IO_OR_GS_CODE();
1760 
1761     blk_inc_in_flight(blk);
1762     ret = blk_co_do_ioctl(blk, req, buf);
1763     blk_dec_in_flight(blk);
1764 
1765     return ret;
1766 }
1767 
1768 static void coroutine_fn blk_aio_ioctl_entry(void *opaque)
1769 {
1770     BlkAioEmAIOCB *acb = opaque;
1771     BlkRwCo *rwco = &acb->rwco;
1772 
1773     rwco->ret = blk_co_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
1774 
1775     blk_aio_complete(acb);
1776 }
1777 
1778 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1779                           BlockCompletionFunc *cb, void *opaque)
1780 {
1781     IO_CODE();
1782     return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
1783 }
1784 
1785 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1786 static int coroutine_fn
1787 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
1788 {
1789     int ret;
1790     IO_CODE();
1791 
1792     blk_wait_while_drained(blk);
1793     GRAPH_RDLOCK_GUARD();
1794 
1795     ret = blk_check_byte_request(blk, offset, bytes);
1796     if (ret < 0) {
1797         return ret;
1798     }
1799 
1800     return bdrv_co_pdiscard(blk->root, offset, bytes);
1801 }
1802 
1803 static void coroutine_fn blk_aio_pdiscard_entry(void *opaque)
1804 {
1805     BlkAioEmAIOCB *acb = opaque;
1806     BlkRwCo *rwco = &acb->rwco;
1807 
1808     rwco->ret = blk_co_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1809     blk_aio_complete(acb);
1810 }
1811 
1812 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1813                              int64_t offset, int64_t bytes,
1814                              BlockCompletionFunc *cb, void *opaque)
1815 {
1816     IO_CODE();
1817     return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1818                         cb, opaque);
1819 }
1820 
1821 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
1822                                  int64_t bytes)
1823 {
1824     int ret;
1825     IO_OR_GS_CODE();
1826 
1827     blk_inc_in_flight(blk);
1828     ret = blk_co_do_pdiscard(blk, offset, bytes);
1829     blk_dec_in_flight(blk);
1830 
1831     return ret;
1832 }
1833 
1834 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1835 static int coroutine_fn blk_co_do_flush(BlockBackend *blk)
1836 {
1837     IO_CODE();
1838     blk_wait_while_drained(blk);
1839     GRAPH_RDLOCK_GUARD();
1840 
1841     if (!blk_co_is_available(blk)) {
1842         return -ENOMEDIUM;
1843     }
1844 
1845     return bdrv_co_flush(blk_bs(blk));
1846 }
1847 
1848 static void coroutine_fn blk_aio_flush_entry(void *opaque)
1849 {
1850     BlkAioEmAIOCB *acb = opaque;
1851     BlkRwCo *rwco = &acb->rwco;
1852 
1853     rwco->ret = blk_co_do_flush(rwco->blk);
1854     blk_aio_complete(acb);
1855 }
1856 
1857 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1858                           BlockCompletionFunc *cb, void *opaque)
1859 {
1860     IO_CODE();
1861     return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1862 }
1863 
1864 int coroutine_fn blk_co_flush(BlockBackend *blk)
1865 {
1866     int ret;
1867     IO_OR_GS_CODE();
1868 
1869     blk_inc_in_flight(blk);
1870     ret = blk_co_do_flush(blk);
1871     blk_dec_in_flight(blk);
1872 
1873     return ret;
1874 }
1875 
1876 static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
1877 {
1878     BlkAioEmAIOCB *acb = opaque;
1879     BlkRwCo *rwco = &acb->rwco;
1880 
1881     rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
1882                                    (unsigned int*)(uintptr_t)acb->bytes,
1883                                    rwco->iobuf);
1884     blk_aio_complete(acb);
1885 }
1886 
1887 BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
1888                                 unsigned int *nr_zones,
1889                                 BlockZoneDescriptor  *zones,
1890                                 BlockCompletionFunc *cb, void *opaque)
1891 {
1892     BlkAioEmAIOCB *acb;
1893     Coroutine *co;
1894     IO_CODE();
1895 
1896     blk_inc_in_flight(blk);
1897     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1898     acb->rwco = (BlkRwCo) {
1899         .blk    = blk,
1900         .offset = offset,
1901         .iobuf  = zones,
1902         .ret    = NOT_DONE,
1903     };
1904     acb->bytes = (int64_t)(uintptr_t)nr_zones;
1905     acb->has_returned = false;
1906 
1907     co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
1908     aio_co_enter(qemu_get_current_aio_context(), co);
1909 
1910     acb->has_returned = true;
1911     if (acb->rwco.ret != NOT_DONE) {
1912         replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
1913                                          blk_aio_complete_bh, acb);
1914     }
1915 
1916     return &acb->common;
1917 }
1918 
1919 static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
1920 {
1921     BlkAioEmAIOCB *acb = opaque;
1922     BlkRwCo *rwco = &acb->rwco;
1923 
1924     rwco->ret = blk_co_zone_mgmt(rwco->blk,
1925                                  (BlockZoneOp)(uintptr_t)rwco->iobuf,
1926                                  rwco->offset, acb->bytes);
1927     blk_aio_complete(acb);
1928 }
1929 
1930 BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
1931                               int64_t offset, int64_t len,
1932                               BlockCompletionFunc *cb, void *opaque) {
1933     BlkAioEmAIOCB *acb;
1934     Coroutine *co;
1935     IO_CODE();
1936 
1937     blk_inc_in_flight(blk);
1938     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1939     acb->rwco = (BlkRwCo) {
1940         .blk    = blk,
1941         .offset = offset,
1942         .iobuf  = (void *)(uintptr_t)op,
1943         .ret    = NOT_DONE,
1944     };
1945     acb->bytes = len;
1946     acb->has_returned = false;
1947 
1948     co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
1949     aio_co_enter(qemu_get_current_aio_context(), co);
1950 
1951     acb->has_returned = true;
1952     if (acb->rwco.ret != NOT_DONE) {
1953         replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
1954                                          blk_aio_complete_bh, acb);
1955     }
1956 
1957     return &acb->common;
1958 }
1959 
1960 static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
1961 {
1962     BlkAioEmAIOCB *acb = opaque;
1963     BlkRwCo *rwco = &acb->rwco;
1964 
1965     rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
1966                                    rwco->iobuf, rwco->flags);
1967     blk_aio_complete(acb);
1968 }
1969 
1970 BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
1971                                 QEMUIOVector *qiov, BdrvRequestFlags flags,
1972                                 BlockCompletionFunc *cb, void *opaque) {
1973     BlkAioEmAIOCB *acb;
1974     Coroutine *co;
1975     IO_CODE();
1976 
1977     blk_inc_in_flight(blk);
1978     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1979     acb->rwco = (BlkRwCo) {
1980         .blk    = blk,
1981         .ret    = NOT_DONE,
1982         .flags  = flags,
1983         .iobuf  = qiov,
1984     };
1985     acb->bytes = (int64_t)(uintptr_t)offset;
1986     acb->has_returned = false;
1987 
1988     co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
1989     aio_co_enter(qemu_get_current_aio_context(), co);
1990     acb->has_returned = true;
1991     if (acb->rwco.ret != NOT_DONE) {
1992         replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
1993                                          blk_aio_complete_bh, acb);
1994     }
1995 
1996     return &acb->common;
1997 }
1998 
1999 /*
2000  * Send a zone_report command.
2001  * offset is a byte offset from the start of the device. No alignment
2002  * required for offset.
2003  * nr_zones is IN/OUT: in = maximum zones to report, out = zones reported.
2004  */
2005 int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
2006                                     unsigned int *nr_zones,
2007                                     BlockZoneDescriptor *zones)
2008 {
2009     int ret;
2010     IO_CODE();
2011 
2012     blk_inc_in_flight(blk); /* increase before waiting */
2013     blk_wait_while_drained(blk);
2014     GRAPH_RDLOCK_GUARD();
2015     if (!blk_is_available(blk)) {
2016         blk_dec_in_flight(blk);
2017         return -ENOMEDIUM;
2018     }
2019     ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
2020     blk_dec_in_flight(blk);
2021     return ret;
2022 }
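
/*
 * Example (illustrative sketch of a hypothetical coroutine_fn caller, not
 * taken from QEMU code): report up to 16 zones starting at the beginning of
 * the device.  nr_zones is the IN/OUT parameter described above: it goes in
 * as the capacity of zones[] and comes back as the number of descriptors
 * actually filled.
 *
 *     BlockZoneDescriptor zones[16];
 *     unsigned int nr_zones = ARRAY_SIZE(zones);
 *
 *     int ret = blk_co_zone_report(blk, 0, &nr_zones, zones);
 *     if (ret == 0) {
 *         // zones[0 .. nr_zones - 1] are now valid
 *     }
 */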
2023 
2024 /*
2025  * Send a zone_management command.
2026  * op is the zone operation;
2027  * offset is the byte offset from the start of the zoned device;
2028  * len is the maximum number of bytes the command should operate on. It
2029  * should be aligned to the device zone size.
2030  */
2031 int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
2032         int64_t offset, int64_t len)
2033 {
2034     int ret;
2035     IO_CODE();
2036 
2037     blk_inc_in_flight(blk);
2038     blk_wait_while_drained(blk);
2039     GRAPH_RDLOCK_GUARD();
2040 
2041     ret = blk_check_byte_request(blk, offset, len);
2042     if (ret < 0) {
2043         blk_dec_in_flight(blk);
2044         return ret;
2045     }
2046 
2047     ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
2048     blk_dec_in_flight(blk);
2049     return ret;
2050 }
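
/*
 * Example (illustrative sketch of a hypothetical coroutine_fn caller):
 * reset the zone starting at byte offset zone_start.  zone_start and
 * zone_size are placeholders and are assumed to respect the alignment
 * requirement described above.
 *
 *     int ret = blk_co_zone_mgmt(blk, BLK_ZO_RESET, zone_start, zone_size);
 *     if (ret < 0) {
 *         // the reset request failed
 *     }
 */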
2051 
2052 /*
2053  * Send a zone_append command.
2054  */
2055 int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
2056         QEMUIOVector *qiov, BdrvRequestFlags flags)
2057 {
2058     int ret;
2059     IO_CODE();
2060 
2061     blk_inc_in_flight(blk);
2062     blk_wait_while_drained(blk);
2063     GRAPH_RDLOCK_GUARD();
2064     if (!blk_is_available(blk)) {
2065         blk_dec_in_flight(blk);
2066         return -ENOMEDIUM;
2067     }
2068 
2069     ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
2070     blk_dec_in_flight(blk);
2071     return ret;
2072 }
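
/*
 * Example (illustrative sketch of a hypothetical coroutine_fn caller):
 * append one buffer to the zone starting at zone_start.  With zone append
 * the device chooses the exact write position; the assumption here is that
 * the chosen position is passed back through *offset on success.
 *
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *     int64_t offset = zone_start;
 *
 *     int ret = blk_co_zone_append(blk, &offset, &qiov, 0);
 *     if (ret == 0) {
 *         // offset now holds where the data actually landed in the zone
 *     }
 */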
2073 
2074 void blk_drain(BlockBackend *blk)
2075 {
2076     BlockDriverState *bs = blk_bs(blk);
2077     GLOBAL_STATE_CODE();
2078 
2079     if (bs) {
2080         bdrv_ref(bs);
2081         bdrv_drained_begin(bs);
2082     }
2083 
2084     /* We may have -ENOMEDIUM completions in flight */
2085     AIO_WAIT_WHILE(blk_get_aio_context(blk),
2086                    qatomic_read(&blk->in_flight) > 0);
2087 
2088     if (bs) {
2089         bdrv_drained_end(bs);
2090         bdrv_unref(bs);
2091     }
2092 }
2093 
2094 void blk_drain_all(void)
2095 {
2096     BlockBackend *blk = NULL;
2097 
2098     GLOBAL_STATE_CODE();
2099 
2100     bdrv_drain_all_begin();
2101 
2102     while ((blk = blk_all_next(blk)) != NULL) {
2103         /* We may have -ENOMEDIUM completions in flight */
2104         AIO_WAIT_WHILE_UNLOCKED(NULL, qatomic_read(&blk->in_flight) > 0);
2105     }
2106 
2107     bdrv_drain_all_end();
2108 }
2109 
2110 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
2111                       BlockdevOnError on_write_error)
2112 {
2113     GLOBAL_STATE_CODE();
2114     blk->on_read_error = on_read_error;
2115     blk->on_write_error = on_write_error;
2116 }
2117 
2118 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
2119 {
2120     IO_CODE();
2121     return is_read ? blk->on_read_error : blk->on_write_error;
2122 }
2123 
2124 BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
2125                                       int error)
2126 {
2127     BlockdevOnError on_err = blk_get_on_error(blk, is_read);
2128     IO_CODE();
2129 
2130     switch (on_err) {
2131     case BLOCKDEV_ON_ERROR_ENOSPC:
2132         return (error == ENOSPC) ?
2133                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
2134     case BLOCKDEV_ON_ERROR_STOP:
2135         return BLOCK_ERROR_ACTION_STOP;
2136     case BLOCKDEV_ON_ERROR_REPORT:
2137         return BLOCK_ERROR_ACTION_REPORT;
2138     case BLOCKDEV_ON_ERROR_IGNORE:
2139         return BLOCK_ERROR_ACTION_IGNORE;
2140     case BLOCKDEV_ON_ERROR_AUTO:
2141     default:
2142         abort();
2143     }
2144 }
2145 
2146 static void send_qmp_error_event(BlockBackend *blk,
2147                                  BlockErrorAction action,
2148                                  bool is_read, int error)
2149 {
2150     IoOperationType optype;
2151     BlockDriverState *bs = blk_bs(blk);
2152 
2153     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
2154     qapi_event_send_block_io_error(blk_name(blk),
2155                                    bs ? bdrv_get_node_name(bs) : NULL, optype,
2156                                    action, blk_iostatus_is_enabled(blk),
2157                                    error == ENOSPC, strerror(error));
2158 }
2159 
2160 /* This is done by device models because, while the block layer knows
2161  * about the error, it does not know whether an operation comes from
2162  * the device or the block layer (from a job, for example).
2163  */
2164 void blk_error_action(BlockBackend *blk, BlockErrorAction action,
2165                       bool is_read, int error)
2166 {
2167     assert(error >= 0);
2168     IO_CODE();
2169 
2170     if (action == BLOCK_ERROR_ACTION_STOP) {
2171         /* First set the iostatus, so that "info block" returns an iostatus
2172          * that matches the events raised so far (an additional error iostatus
2173          * is fine, but not a lost one).
2174          */
2175         blk_iostatus_set_err(blk, error);
2176 
2177         /* Then raise the request to stop the VM and the event.
2178          * qemu_system_vmstop_request_prepare has two effects.  First,
2179          * it ensures that the STOP event always comes after the
2180          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
2181          * can observe the STOP event and do a "cont" before the STOP
2182          * event is issued, the VM will not stop.  In this case, vm_start()
2183          * also ensures that the STOP/RESUME pair of events is emitted.
2184          */
2185         qemu_system_vmstop_request_prepare();
2186         send_qmp_error_event(blk, action, is_read, error);
2187         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
2188     } else {
2189         send_qmp_error_event(blk, action, is_read, error);
2190     }
2191 }
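
/*
 * Example (illustrative sketch of a hypothetical device model's write
 * completion path, not actual QEMU device code): turn a negative errno into
 * an action and report it as described above.  blk and ret are placeholders.
 *
 *     if (ret < 0) {
 *         BlockErrorAction action = blk_get_error_action(blk, false, -ret);
 *
 *         blk_error_action(blk, action, false, -ret);
 *         if (action == BLOCK_ERROR_ACTION_STOP) {
 *             // keep the request around so it can be retried after "cont"
 *         } else {
 *             // complete the request towards the guest, with or without an
 *             // error depending on the action
 *         }
 *     }
 */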
2192 
2193 /*
2194  * Returns true if the BlockBackend can support taking write permissions
2195  * (because its root node is not read-only).
2196  */
2197 bool blk_supports_write_perm(BlockBackend *blk)
2198 {
2199     BlockDriverState *bs = blk_bs(blk);
2200     GLOBAL_STATE_CODE();
2201 
2202     if (bs) {
2203         return !bdrv_is_read_only(bs);
2204     } else {
2205         return blk->root_state.open_flags & BDRV_O_RDWR;
2206     }
2207 }
2208 
2209 /*
2210  * Returns true if the BlockBackend can be written to in its current
2211  * configuration (i.e. if write permission has been requested)
2212  */
2213 bool blk_is_writable(BlockBackend *blk)
2214 {
2215     IO_CODE();
2216     return blk->perm & BLK_PERM_WRITE;
2217 }
2218 
2219 bool blk_is_sg(BlockBackend *blk)
2220 {
2221     BlockDriverState *bs = blk_bs(blk);
2222     GLOBAL_STATE_CODE();
2223 
2224     if (!bs) {
2225         return false;
2226     }
2227 
2228     return bdrv_is_sg(bs);
2229 }
2230 
2231 bool blk_enable_write_cache(BlockBackend *blk)
2232 {
2233     IO_CODE();
2234     return blk->enable_write_cache;
2235 }
2236 
2237 void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
2238 {
2239     IO_CODE();
2240     blk->enable_write_cache = wce;
2241 }
2242 
2243 void blk_activate(BlockBackend *blk, Error **errp)
2244 {
2245     BlockDriverState *bs = blk_bs(blk);
2246     GLOBAL_STATE_CODE();
2247 
2248     if (!bs) {
2249         error_setg(errp, "Device '%s' has no medium", blk->name);
2250         return;
2251     }
2252 
2253     /*
2254      * Migration code can call this function in coroutine context, so leave
2255      * coroutine context if necessary.
2256      */
2257     if (qemu_in_coroutine()) {
2258         bdrv_co_activate(bs, errp);
2259     } else {
2260         GRAPH_RDLOCK_GUARD_MAINLOOP();
2261         bdrv_activate(bs, errp);
2262     }
2263 }
2264 
2265 bool coroutine_fn blk_co_is_inserted(BlockBackend *blk)
2266 {
2267     BlockDriverState *bs = blk_bs(blk);
2268     IO_CODE();
2269     assert_bdrv_graph_readable();
2270 
2271     return bs && bdrv_co_is_inserted(bs);
2272 }
2273 
2274 bool coroutine_fn blk_co_is_available(BlockBackend *blk)
2275 {
2276     IO_CODE();
2277     return blk_co_is_inserted(blk) && !blk_dev_is_tray_open(blk);
2278 }
2279 
2280 void coroutine_fn blk_co_lock_medium(BlockBackend *blk, bool locked)
2281 {
2282     BlockDriverState *bs = blk_bs(blk);
2283     IO_CODE();
2284     GRAPH_RDLOCK_GUARD();
2285 
2286     if (bs) {
2287         bdrv_co_lock_medium(bs, locked);
2288     }
2289 }
2290 
2291 void coroutine_fn blk_co_eject(BlockBackend *blk, bool eject_flag)
2292 {
2293     BlockDriverState *bs = blk_bs(blk);
2294     char *id;
2295     IO_CODE();
2296     GRAPH_RDLOCK_GUARD();
2297 
2298     if (bs) {
2299         bdrv_co_eject(bs, eject_flag);
2300     }
2301 
2302     /* Whether or not we ejected on the backend,
2303      * the frontend experienced a tray event. */
2304     id = blk_get_attached_dev_id(blk);
2305     qapi_event_send_device_tray_moved(blk_name(blk), id,
2306                                       eject_flag);
2307     g_free(id);
2308 }
2309 
2310 int blk_get_flags(BlockBackend *blk)
2311 {
2312     BlockDriverState *bs = blk_bs(blk);
2313     GLOBAL_STATE_CODE();
2314 
2315     if (bs) {
2316         return bdrv_get_flags(bs);
2317     } else {
2318         return blk->root_state.open_flags;
2319     }
2320 }
2321 
2322 /* Returns the minimum request alignment, in bytes; guaranteed nonzero */
2323 uint32_t blk_get_request_alignment(BlockBackend *blk)
2324 {
2325     BlockDriverState *bs = blk_bs(blk);
2326     IO_CODE();
2327     return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
2328 }
2329 
2330 /* Returns the maximum hardware transfer length, in bytes; guaranteed nonzero */
2331 uint64_t blk_get_max_hw_transfer(BlockBackend *blk)
2332 {
2333     BlockDriverState *bs = blk_bs(blk);
2334     uint64_t max = INT_MAX;
2335     IO_CODE();
2336 
2337     if (bs) {
2338         max = MIN_NON_ZERO(max, bs->bl.max_hw_transfer);
2339         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
2340     }
2341     return ROUND_DOWN(max, blk_get_request_alignment(blk));
2342 }
2343 
2344 /* Returns the maximum transfer length, in bytes; guaranteed nonzero */
2345 uint32_t blk_get_max_transfer(BlockBackend *blk)
2346 {
2347     BlockDriverState *bs = blk_bs(blk);
2348     uint32_t max = INT_MAX;
2349     IO_CODE();
2350 
2351     if (bs) {
2352         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
2353     }
2354     return ROUND_DOWN(max, blk_get_request_alignment(blk));
2355 }
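
/*
 * Example (illustrative sketch, hypothetical device model): clamp a guest
 * transfer to the limits reported above before submitting it.  remaining_len
 * is a placeholder for however much the guest still wants to transfer.
 *
 *     uint32_t align = blk_get_request_alignment(blk);
 *     uint32_t max = blk_get_max_transfer(blk);
 *     uint64_t this_len = QEMU_ALIGN_DOWN(MIN(remaining_len, max), align);
 *     // this_len stays nonzero as long as remaining_len >= align
 */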
2356 
2357 int blk_get_max_hw_iov(BlockBackend *blk)
2358 {
2359     IO_CODE();
2360     return MIN_NON_ZERO(blk->root->bs->bl.max_hw_iov,
2361                         blk->root->bs->bl.max_iov);
2362 }
2363 
2364 int blk_get_max_iov(BlockBackend *blk)
2365 {
2366     IO_CODE();
2367     return blk->root->bs->bl.max_iov;
2368 }
2369 
2370 void *blk_try_blockalign(BlockBackend *blk, size_t size)
2371 {
2372     IO_CODE();
2373     return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
2374 }
2375 
2376 void *blk_blockalign(BlockBackend *blk, size_t size)
2377 {
2378     IO_CODE();
2379     return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
2380 }
2381 
2382 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
2383 {
2384     BlockDriverState *bs = blk_bs(blk);
2385     GLOBAL_STATE_CODE();
2386     GRAPH_RDLOCK_GUARD_MAINLOOP();
2387 
2388     if (!bs) {
2389         return false;
2390     }
2391 
2392     return bdrv_op_is_blocked(bs, op, errp);
2393 }
2394 
2395 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
2396 {
2397     BlockDriverState *bs = blk_bs(blk);
2398     GLOBAL_STATE_CODE();
2399 
2400     if (bs) {
2401         bdrv_op_unblock(bs, op, reason);
2402     }
2403 }
2404 
2405 void blk_op_block_all(BlockBackend *blk, Error *reason)
2406 {
2407     BlockDriverState *bs = blk_bs(blk);
2408     GLOBAL_STATE_CODE();
2409 
2410     if (bs) {
2411         bdrv_op_block_all(bs, reason);
2412     }
2413 }
2414 
2415 void blk_op_unblock_all(BlockBackend *blk, Error *reason)
2416 {
2417     BlockDriverState *bs = blk_bs(blk);
2418     GLOBAL_STATE_CODE();
2419 
2420     if (bs) {
2421         bdrv_op_unblock_all(bs, reason);
2422     }
2423 }
2424 
2425 AioContext *blk_get_aio_context(BlockBackend *blk)
2426 {
2427     BlockDriverState *bs;
2428     IO_CODE();
2429 
2430     if (!blk) {
2431         return qemu_get_aio_context();
2432     }
2433 
2434     bs = blk_bs(blk);
2435     if (bs) {
2436         AioContext *ctx = bdrv_get_aio_context(bs);
2437         assert(ctx == blk->ctx);
2438     }
2439 
2440     return blk->ctx;
2441 }
2442 
2443 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2444                         Error **errp)
2445 {
2446     bool old_allow_change;
2447     BlockDriverState *bs = blk_bs(blk);
2448     int ret;
2449 
2450     GLOBAL_STATE_CODE();
2451 
2452     if (!bs) {
2453         blk->ctx = new_context;
2454         return 0;
2455     }
2456 
2457     bdrv_ref(bs);
2458 
2459     old_allow_change = blk->allow_aio_context_change;
2460     blk->allow_aio_context_change = true;
2461 
2462     ret = bdrv_try_change_aio_context(bs, new_context, NULL, errp);
2463 
2464     blk->allow_aio_context_change = old_allow_change;
2465 
2466     bdrv_unref(bs);
2467     return ret;
2468 }
2469 
2470 typedef struct BdrvStateBlkRootContext {
2471     AioContext *new_ctx;
2472     BlockBackend *blk;
2473 } BdrvStateBlkRootContext;
2474 
2475 static void blk_root_set_aio_ctx_commit(void *opaque)
2476 {
2477     BdrvStateBlkRootContext *s = opaque;
2478     BlockBackend *blk = s->blk;
2479     AioContext *new_context = s->new_ctx;
2480     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2481 
2482     blk->ctx = new_context;
2483     if (tgm->throttle_state) {
2484         throttle_group_detach_aio_context(tgm);
2485         throttle_group_attach_aio_context(tgm, new_context);
2486     }
2487 }
2488 
2489 static TransactionActionDrv set_blk_root_context = {
2490     .commit = blk_root_set_aio_ctx_commit,
2491     .clean = g_free,
2492 };
2493 
2494 static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx,
2495                                     GHashTable *visited, Transaction *tran,
2496                                     Error **errp)
2497 {
2498     BlockBackend *blk = child->opaque;
2499     BdrvStateBlkRootContext *s;
2500 
2501     if (!blk->allow_aio_context_change) {
2502         /*
2503          * Manually created BlockBackends (those with a name) that are not
2504          * attached to anything can change their AioContext without updating
2505          * their user; return an error for others.
2506          */
2507         if (!blk->name || blk->dev) {
2508             /* TODO Add BB name/QOM path */
2509             error_setg(errp, "Cannot change iothread of active block backend");
2510             return false;
2511         }
2512     }
2513 
2514     s = g_new(BdrvStateBlkRootContext, 1);
2515     *s = (BdrvStateBlkRootContext) {
2516         .new_ctx = ctx,
2517         .blk = blk,
2518     };
2519 
2520     tran_add(tran, &set_blk_root_context, s);
2521     return true;
2522 }
2523 
2524 void blk_add_aio_context_notifier(BlockBackend *blk,
2525         void (*attached_aio_context)(AioContext *new_context, void *opaque),
2526         void (*detach_aio_context)(void *opaque), void *opaque)
2527 {
2528     BlockBackendAioNotifier *notifier;
2529     BlockDriverState *bs = blk_bs(blk);
2530     GLOBAL_STATE_CODE();
2531 
2532     notifier = g_new(BlockBackendAioNotifier, 1);
2533     notifier->attached_aio_context = attached_aio_context;
2534     notifier->detach_aio_context = detach_aio_context;
2535     notifier->opaque = opaque;
2536     QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2537 
2538     if (bs) {
2539         bdrv_add_aio_context_notifier(bs, attached_aio_context,
2540                                       detach_aio_context, opaque);
2541     }
2542 }
2543 
2544 void blk_remove_aio_context_notifier(BlockBackend *blk,
2545                                      void (*attached_aio_context)(AioContext *,
2546                                                                   void *),
2547                                      void (*detach_aio_context)(void *),
2548                                      void *opaque)
2549 {
2550     BlockBackendAioNotifier *notifier;
2551     BlockDriverState *bs = blk_bs(blk);
2552 
2553     GLOBAL_STATE_CODE();
2554 
2555     if (bs) {
2556         bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2557                                          detach_aio_context, opaque);
2558     }
2559 
2560     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2561         if (notifier->attached_aio_context == attached_aio_context &&
2562             notifier->detach_aio_context == detach_aio_context &&
2563             notifier->opaque == opaque) {
2564             QLIST_REMOVE(notifier, list);
2565             g_free(notifier);
2566             return;
2567         }
2568     }
2569 
2570     abort();
2571 }
2572 
2573 void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2574 {
2575     GLOBAL_STATE_CODE();
2576     notifier_list_add(&blk->remove_bs_notifiers, notify);
2577 }
2578 
2579 void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2580 {
2581     GLOBAL_STATE_CODE();
2582     notifier_list_add(&blk->insert_bs_notifiers, notify);
2583 }
2584 
2585 BlockAcctStats *blk_get_stats(BlockBackend *blk)
2586 {
2587     IO_CODE();
2588     return &blk->stats;
2589 }
2590 
2591 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2592                   BlockCompletionFunc *cb, void *opaque)
2593 {
2594     IO_CODE();
2595     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2596 }
2597 
2598 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2599                                       int64_t bytes, BdrvRequestFlags flags)
2600 {
2601     IO_OR_GS_CODE();
2602     return blk_co_pwritev(blk, offset, bytes, NULL,
2603                           flags | BDRV_REQ_ZERO_WRITE);
2604 }
2605 
2606 int coroutine_fn blk_co_pwrite_compressed(BlockBackend *blk, int64_t offset,
2607                                           int64_t bytes, const void *buf)
2608 {
2609     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
2610     IO_OR_GS_CODE();
2611     return blk_co_pwritev_part(blk, offset, bytes, &qiov, 0,
2612                                BDRV_REQ_WRITE_COMPRESSED);
2613 }
2614 
2615 int coroutine_fn blk_co_truncate(BlockBackend *blk, int64_t offset, bool exact,
2616                                  PreallocMode prealloc, BdrvRequestFlags flags,
2617                                  Error **errp)
2618 {
2619     IO_OR_GS_CODE();
2620     GRAPH_RDLOCK_GUARD();
2621     if (!blk_co_is_available(blk)) {
2622         error_setg(errp, "No medium inserted");
2623         return -ENOMEDIUM;
2624     }
2625 
2626     return bdrv_co_truncate(blk->root, offset, exact, prealloc, flags, errp);
2627 }
2628 
2629 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2630                      int64_t pos, int size)
2631 {
2632     int ret;
2633     GLOBAL_STATE_CODE();
2634 
2635     if (!blk_is_available(blk)) {
2636         return -ENOMEDIUM;
2637     }
2638 
2639     ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2640     if (ret < 0) {
2641         return ret;
2642     }
2643 
2644     if (ret == size && !blk->enable_write_cache) {
2645         ret = bdrv_flush(blk_bs(blk));
2646     }
2647 
2648     return ret < 0 ? ret : size;
2649 }
2650 
2651 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2652 {
2653     GLOBAL_STATE_CODE();
2654     if (!blk_is_available(blk)) {
2655         return -ENOMEDIUM;
2656     }
2657 
2658     return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2659 }
2660 
2661 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2662 {
2663     GLOBAL_STATE_CODE();
2664     GRAPH_RDLOCK_GUARD_MAINLOOP();
2665 
2666     if (!blk_is_available(blk)) {
2667         return -ENOMEDIUM;
2668     }
2669 
2670     return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2671 }
2672 
2673 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2674 {
2675     GLOBAL_STATE_CODE();
2676     if (!blk_is_available(blk)) {
2677         return -ENOMEDIUM;
2678     }
2679 
2680     return bdrv_probe_geometry(blk_bs(blk), geo);
2681 }
2682 
2683 /*
2684  * Updates the BlockBackendRootState object with data from the currently
2685  * attached BlockDriverState.
2686  */
2687 void blk_update_root_state(BlockBackend *blk)
2688 {
2689     GLOBAL_STATE_CODE();
2690     assert(blk->root);
2691 
2692     blk->root_state.open_flags    = blk->root->bs->open_flags;
2693     blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2694 }
2695 
2696 /*
2697  * Returns the detect-zeroes setting to be used for bdrv_open() of a
2698  * BlockDriverState which is supposed to inherit the root state.
2699  */
2700 bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2701 {
2702     GLOBAL_STATE_CODE();
2703     return blk->root_state.detect_zeroes;
2704 }
2705 
2706 /*
2707  * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2708  * supposed to inherit the root state.
2709  */
2710 int blk_get_open_flags_from_root_state(BlockBackend *blk)
2711 {
2712     GLOBAL_STATE_CODE();
2713     return blk->root_state.open_flags;
2714 }
2715 
2716 BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2717 {
2718     GLOBAL_STATE_CODE();
2719     return &blk->root_state;
2720 }
2721 
2722 int blk_commit_all(void)
2723 {
2724     BlockBackend *blk = NULL;
2725     GLOBAL_STATE_CODE();
2726     GRAPH_RDLOCK_GUARD_MAINLOOP();
2727 
2728     while ((blk = blk_all_next(blk)) != NULL) {
2729         BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
2730 
2731         if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
2732             int ret;
2733 
2734             ret = bdrv_commit(unfiltered_bs);
2735             if (ret < 0) {
2736                 return ret;
2737             }
2738         }
2739     }
2740     return 0;
2741 }
2742 
2743 
2744 /* throttling disk I/O limits */
2745 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2746 {
2747     GLOBAL_STATE_CODE();
2748     throttle_group_config(&blk->public.throttle_group_member, cfg);
2749 }
2750 
2751 void blk_io_limits_disable(BlockBackend *blk)
2752 {
2753     BlockDriverState *bs = blk_bs(blk);
2754     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2755     assert(tgm->throttle_state);
2756     GLOBAL_STATE_CODE();
2757     if (bs) {
2758         bdrv_ref(bs);
2759         bdrv_drained_begin(bs);
2760     }
2761     throttle_group_unregister_tgm(tgm);
2762     if (bs) {
2763         bdrv_drained_end(bs);
2764         bdrv_unref(bs);
2765     }
2766 }
2767 
2768 /* should be called before blk_set_io_limits() if a limit is to be set */
2769 void blk_io_limits_enable(BlockBackend *blk, const char *group)
2770 {
2771     assert(!blk->public.throttle_group_member.throttle_state);
2772     GLOBAL_STATE_CODE();
2773     throttle_group_register_tgm(&blk->public.throttle_group_member,
2774                                 group, blk_get_aio_context(blk));
2775 }
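
/*
 * Example (illustrative sketch, hypothetical caller): join a throttle group
 * first and only then apply limits, matching the ordering required above.
 * The group name and the 100 MiB/s figure are placeholders; callers are
 * expected to validate the configuration (see throttle_is_valid()) before
 * applying it.
 *
 *     ThrottleConfig cfg;
 *
 *     blk_io_limits_enable(blk, "group0");
 *     throttle_config_init(&cfg);
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 100 * 1024 * 1024;  // 100 MiB/s
 *     blk_set_io_limits(blk, &cfg);
 */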
2776 
2777 void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2778 {
2779     GLOBAL_STATE_CODE();
2780     /* this BB is not part of any group */
2781     if (!blk->public.throttle_group_member.throttle_state) {
2782         return;
2783     }
2784 
2785     /* this BB is already part of the same group as the one we want */
2786     if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2787                 group)) {
2788         return;
2789     }
2790 
2791     /* need to change the group this blk belongs to */
2792     blk_io_limits_disable(blk);
2793     blk_io_limits_enable(blk, group);
2794 }
2795 
2796 static void blk_root_drained_begin(BdrvChild *child)
2797 {
2798     BlockBackend *blk = child->opaque;
2799     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2800 
2801     if (qatomic_fetch_inc(&blk->quiesce_counter) == 0) {
2802         if (blk->dev_ops && blk->dev_ops->drained_begin) {
2803             blk->dev_ops->drained_begin(blk->dev_opaque);
2804         }
2805     }
2806 
2807     /* Note that blk->root may not be accessible here yet if we are just
2808      * attaching to a BlockDriverState that is drained. Use child instead. */
2809 
2810     if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
2811         throttle_group_restart_tgm(tgm);
2812     }
2813 }
2814 
2815 static bool blk_root_drained_poll(BdrvChild *child)
2816 {
2817     BlockBackend *blk = child->opaque;
2818     bool busy = false;
2819     assert(qatomic_read(&blk->quiesce_counter));
2820 
2821     if (blk->dev_ops && blk->dev_ops->drained_poll) {
2822         busy = blk->dev_ops->drained_poll(blk->dev_opaque);
2823     }
2824     return busy || !!blk->in_flight;
2825 }
2826 
2827 static void blk_root_drained_end(BdrvChild *child)
2828 {
2829     BlockBackend *blk = child->opaque;
2830     assert(qatomic_read(&blk->quiesce_counter));
2831 
2832     assert(blk->public.throttle_group_member.io_limits_disabled);
2833     qatomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2834 
2835     if (qatomic_fetch_dec(&blk->quiesce_counter) == 1) {
2836         if (blk->dev_ops && blk->dev_ops->drained_end) {
2837             blk->dev_ops->drained_end(blk->dev_opaque);
2838         }
2839         qemu_mutex_lock(&blk->queued_requests_lock);
2840         while (qemu_co_enter_next(&blk->queued_requests,
2841                                   &blk->queued_requests_lock)) {
2842             /* Resume all queued requests */
2843         }
2844         qemu_mutex_unlock(&blk->queued_requests_lock);
2845     }
2846 }
2847 
2848 bool blk_register_buf(BlockBackend *blk, void *host, size_t size, Error **errp)
2849 {
2850     BlockDriverState *bs = blk_bs(blk);
2851 
2852     GLOBAL_STATE_CODE();
2853 
2854     if (bs) {
2855         return bdrv_register_buf(bs, host, size, errp);
2856     }
2857     return true;
2858 }
2859 
2860 void blk_unregister_buf(BlockBackend *blk, void *host, size_t size)
2861 {
2862     BlockDriverState *bs = blk_bs(blk);
2863 
2864     GLOBAL_STATE_CODE();
2865 
2866     if (bs) {
2867         bdrv_unregister_buf(bs, host, size);
2868     }
2869 }
2870 
2871 int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2872                                    BlockBackend *blk_out, int64_t off_out,
2873                                    int64_t bytes, BdrvRequestFlags read_flags,
2874                                    BdrvRequestFlags write_flags)
2875 {
2876     int r;
2877     IO_CODE();
2878     GRAPH_RDLOCK_GUARD();
2879 
2880     r = blk_check_byte_request(blk_in, off_in, bytes);
2881     if (r) {
2882         return r;
2883     }
2884     r = blk_check_byte_request(blk_out, off_out, bytes);
2885     if (r) {
2886         return r;
2887     }
2888 
2889     return bdrv_co_copy_range(blk_in->root, off_in,
2890                               blk_out->root, off_out,
2891                               bytes, read_flags, write_flags);
2892 }
2893 
2894 const BdrvChild *blk_root(BlockBackend *blk)
2895 {
2896     GLOBAL_STATE_CODE();
2897     return blk->root;
2898 }
2899 
2900 int blk_make_empty(BlockBackend *blk, Error **errp)
2901 {
2902     GLOBAL_STATE_CODE();
2903     GRAPH_RDLOCK_GUARD_MAINLOOP();
2904 
2905     if (!blk_is_available(blk)) {
2906         error_setg(errp, "No medium inserted");
2907         return -ENOMEDIUM;
2908     }
2909 
2910     return bdrv_make_empty(blk->root, errp);
2911 }
2912