xref: /openbmc/qemu/block/block-backend.c (revision 7f750efc)
1 /*
2  * QEMU Block backends
3  *
4  * Copyright (C) 2014-2016 Red Hat, Inc.
5  *
6  * Authors:
7  *  Markus Armbruster <armbru@redhat.com>,
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1
10  * or later.  See the COPYING.LIB file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "sysemu/block-backend.h"
15 #include "block/block_int.h"
16 #include "block/blockjob.h"
17 #include "block/coroutines.h"
18 #include "block/throttle-groups.h"
19 #include "hw/qdev-core.h"
20 #include "sysemu/blockdev.h"
21 #include "sysemu/runstate.h"
22 #include "sysemu/replay.h"
23 #include "qapi/error.h"
24 #include "qapi/qapi-events-block.h"
25 #include "qemu/id.h"
26 #include "qemu/main-loop.h"
27 #include "qemu/option.h"
28 #include "trace.h"
29 #include "migration/misc.h"
30 
31 /* Number of coroutines to reserve per attached device model */
32 #define COROUTINE_POOL_RESERVATION 64
33 
34 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
35 
36 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
37 
38 typedef struct BlockBackendAioNotifier {
39     void (*attached_aio_context)(AioContext *new_context, void *opaque);
40     void (*detach_aio_context)(void *opaque);
41     void *opaque;
42     QLIST_ENTRY(BlockBackendAioNotifier) list;
43 } BlockBackendAioNotifier;
44 
45 struct BlockBackend {
46     char *name;
47     int refcnt;
48     BdrvChild *root;
49     AioContext *ctx;
50     DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
51     QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
52     QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
53     BlockBackendPublic public;
54 
55     DeviceState *dev;           /* attached device model, if any */
56     const BlockDevOps *dev_ops;
57     void *dev_opaque;
58 
59     /* If the BDS tree is removed, some of its options are stored here (which
60      * can be used to restore those options in the new BDS on insert) */
61     BlockBackendRootState root_state;
62 
63     bool enable_write_cache;
64 
65     /* I/O stats (display with "info blockstats"). */
66     BlockAcctStats stats;
67 
68     BlockdevOnError on_read_error, on_write_error;
69     bool iostatus_enabled;
70     BlockDeviceIoStatus iostatus;
71 
72     uint64_t perm;
73     uint64_t shared_perm;
74     bool disable_perm;
75 
76     bool allow_aio_context_change;
77     bool allow_write_beyond_eof;
78 
79     /* Protected by BQL */
80     NotifierList remove_bs_notifiers, insert_bs_notifiers;
81     QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
82 
83     int quiesce_counter; /* atomic: written under BQL, read by other threads */
84     QemuMutex queued_requests_lock; /* protects queued_requests */
85     CoQueue queued_requests;
86     bool disable_request_queuing; /* atomic */
87 
88     VMChangeStateEntry *vmsh;
89     bool force_allow_inactivate;
90 
91     /* Number of in-flight aio requests.  BlockDriverState also counts
92      * in-flight requests but aio requests can exist even when blk->root is
93      * NULL, so we cannot rely on its counter for that case.
94      * Accessed with atomic ops.
95      */
96     unsigned int in_flight;
97 };
98 
99 typedef struct BlockBackendAIOCB {
100     BlockAIOCB common;
101     BlockBackend *blk;
102     int ret;
103 } BlockBackendAIOCB;
104 
105 static const AIOCBInfo block_backend_aiocb_info = {
106     .get_aio_context = blk_aiocb_get_aio_context,
107     .aiocb_size = sizeof(BlockBackendAIOCB),
108 };
109 
110 static void drive_info_del(DriveInfo *dinfo);
111 static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
112 
113 /* All BlockBackends. Protected by BQL. */
114 static QTAILQ_HEAD(, BlockBackend) block_backends =
115     QTAILQ_HEAD_INITIALIZER(block_backends);
116 
117 /*
118  * All BlockBackends referenced by the monitor, iterated through by
119  * blk_next(). Protected by BQL.
120  */
121 static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
122     QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
123 
124 static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
125                                      int *child_flags, QDict *child_options,
126                                      int parent_flags, QDict *parent_options)
127 {
128     /* We're not supposed to call this function for root nodes */
129     abort();
130 }
131 static void blk_root_drained_begin(BdrvChild *child);
132 static bool blk_root_drained_poll(BdrvChild *child);
133 static void blk_root_drained_end(BdrvChild *child);
134 
135 static void blk_root_change_media(BdrvChild *child, bool load);
136 static void blk_root_resize(BdrvChild *child);
137 
138 static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx,
139                                     GHashTable *visited, Transaction *tran,
140                                     Error **errp);
141 
142 static char *blk_root_get_parent_desc(BdrvChild *child)
143 {
144     BlockBackend *blk = child->opaque;
145     g_autofree char *dev_id = NULL;
146 
147     if (blk->name) {
148         return g_strdup_printf("block device '%s'", blk->name);
149     }
150 
151     dev_id = blk_get_attached_dev_id(blk);
152     if (*dev_id) {
153         return g_strdup_printf("block device '%s'", dev_id);
154     } else {
155         /* TODO Callback into the BB owner for something more detailed */
156         return g_strdup("an unnamed block device");
157     }
158 }
159 
160 static const char *blk_root_get_name(BdrvChild *child)
161 {
162     return blk_name(child->opaque);
163 }
164 
165 static void blk_vm_state_changed(void *opaque, bool running, RunState state)
166 {
167     Error *local_err = NULL;
168     BlockBackend *blk = opaque;
169 
170     if (state == RUN_STATE_INMIGRATE) {
171         return;
172     }
173 
174     qemu_del_vm_change_state_handler(blk->vmsh);
175     blk->vmsh = NULL;
176     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
177     if (local_err) {
178         error_report_err(local_err);
179     }
180 }
181 
182 /*
183  * Notifies the user of the BlockBackend that migration has completed. qdev
184  * devices can tighten their permissions in response (specifically revoke
185  * shared write permissions that we needed for storage migration).
186  *
187  * If an error is returned, the VM must not be resumed.
188  */
189 static void blk_root_activate(BdrvChild *child, Error **errp)
190 {
191     BlockBackend *blk = child->opaque;
192     Error *local_err = NULL;
193     uint64_t saved_shared_perm;
194 
195     if (!blk->disable_perm) {
196         return;
197     }
198 
199     blk->disable_perm = false;
200 
201     /*
202      * blk->shared_perm contains the permissions we want to share once
203      * migration is really completely done.  For now, we need to share
204      * all; but we also need to retain blk->shared_perm, which is
205      * overwritten by a successful blk_set_perm() call.  Save it and
206      * restore it below.
207      */
208     saved_shared_perm = blk->shared_perm;
209 
210     blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
211     if (local_err) {
212         error_propagate(errp, local_err);
213         blk->disable_perm = true;
214         return;
215     }
216     blk->shared_perm = saved_shared_perm;
217 
218     if (runstate_check(RUN_STATE_INMIGRATE)) {
219         /* Activation can happen while migration is still active, for
220          * example when nbd_server_add is called during non-shared storage
221          * migration. Defer the shared_perm update to migration completion. */
222         if (!blk->vmsh) {
223             blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
224                                                          blk);
225         }
226         return;
227     }
228 
229     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
230     if (local_err) {
231         error_propagate(errp, local_err);
232         blk->disable_perm = true;
233         return;
234     }
235 }
236 
237 void blk_set_force_allow_inactivate(BlockBackend *blk)
238 {
239     GLOBAL_STATE_CODE();
240     blk->force_allow_inactivate = true;
241 }
242 
243 static bool blk_can_inactivate(BlockBackend *blk)
244 {
245     /* If it is a guest device, inactivate is ok. */
246     if (blk->dev || blk_name(blk)[0]) {
247         return true;
248     }
249 
250     /* Inactivating means no more writes to the image can be done,
251      * even if those writes would be changes invisible to the
252      * guest.  For block job BBs that satisfy this, we can just allow
253      * it.  This is the case for the mirror job source, which is required
254      * by libvirt's non-shared block migration. */
255     if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
256         return true;
257     }
258 
259     return blk->force_allow_inactivate;
260 }
261 
262 static int blk_root_inactivate(BdrvChild *child)
263 {
264     BlockBackend *blk = child->opaque;
265 
266     if (blk->disable_perm) {
267         return 0;
268     }
269 
270     if (!blk_can_inactivate(blk)) {
271         return -EPERM;
272     }
273 
274     blk->disable_perm = true;
275     if (blk->root) {
276         bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
277     }
278 
279     return 0;
280 }
281 
282 static void blk_root_attach(BdrvChild *child)
283 {
284     BlockBackend *blk = child->opaque;
285     BlockBackendAioNotifier *notifier;
286 
287     trace_blk_root_attach(child, blk, child->bs);
288 
289     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
290         bdrv_add_aio_context_notifier(child->bs,
291                 notifier->attached_aio_context,
292                 notifier->detach_aio_context,
293                 notifier->opaque);
294     }
295 }
296 
297 static void blk_root_detach(BdrvChild *child)
298 {
299     BlockBackend *blk = child->opaque;
300     BlockBackendAioNotifier *notifier;
301 
302     trace_blk_root_detach(child, blk, child->bs);
303 
304     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
305         bdrv_remove_aio_context_notifier(child->bs,
306                 notifier->attached_aio_context,
307                 notifier->detach_aio_context,
308                 notifier->opaque);
309     }
310 }
311 
312 static AioContext *blk_root_get_parent_aio_context(BdrvChild *c)
313 {
314     BlockBackend *blk = c->opaque;
315     IO_CODE();
316 
317     return blk_get_aio_context(blk);
318 }
319 
320 static const BdrvChildClass child_root = {
321     .inherit_options    = blk_root_inherit_options,
322 
323     .change_media       = blk_root_change_media,
324     .resize             = blk_root_resize,
325     .get_name           = blk_root_get_name,
326     .get_parent_desc    = blk_root_get_parent_desc,
327 
328     .drained_begin      = blk_root_drained_begin,
329     .drained_poll       = blk_root_drained_poll,
330     .drained_end        = blk_root_drained_end,
331 
332     .activate           = blk_root_activate,
333     .inactivate         = blk_root_inactivate,
334 
335     .attach             = blk_root_attach,
336     .detach             = blk_root_detach,
337 
338     .change_aio_ctx     = blk_root_change_aio_ctx,
339 
340     .get_parent_aio_context = blk_root_get_parent_aio_context,
341 };
342 
343 /*
344  * Create a new BlockBackend with a reference count of one.
345  *
346  * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
347  * to request for a block driver node that is attached to this BlockBackend.
348  * @shared_perm is a bitmask which describes which permissions may be granted
349  * to other users of the attached node.
350  * Both sets of permissions can be changed later using blk_set_perm().
351  *
352  * Return the new BlockBackend on success, null on failure.
353  */
354 BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
355 {
356     BlockBackend *blk;
357 
358     GLOBAL_STATE_CODE();
359 
360     blk = g_new0(BlockBackend, 1);
361     blk->refcnt = 1;
362     blk->ctx = ctx;
363     blk->perm = perm;
364     blk->shared_perm = shared_perm;
365     blk_set_enable_write_cache(blk, true);
366 
367     blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
368     blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
369 
370     block_acct_init(&blk->stats);
371 
372     qemu_mutex_init(&blk->queued_requests_lock);
373     qemu_co_queue_init(&blk->queued_requests);
374     notifier_list_init(&blk->remove_bs_notifiers);
375     notifier_list_init(&blk->insert_bs_notifiers);
376     QLIST_INIT(&blk->aio_notifiers);
377 
378     QTAILQ_INSERT_TAIL(&block_backends, blk, link);
379     return blk;
380 }
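
/*
 * Illustrative sketch (hypothetical caller, not code from this file): a user
 * such as a block job would typically pair blk_new() with blk_insert_bs(),
 * requesting only the permissions it needs and sharing the rest:
 *
 *     BlockBackend *example_blk;
 *
 *     example_blk = blk_new(bdrv_get_aio_context(bs),
 *                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
 *                           BLK_PERM_ALL & ~BLK_PERM_RESIZE);
 *     if (blk_insert_bs(example_blk, bs, errp) < 0) {
 *         blk_unref(example_blk);
 *         return NULL;
 *     }
 *
 * blk_new_with_bs() below is a convenience wrapper for this pattern.
 */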
381 
382 /*
383  * Create a new BlockBackend connected to an existing BlockDriverState.
384  *
385  * @perm is a bitmask of BLK_PERM_* constants which describes the
386  * permissions to request for @bs that is attached to this
387  * BlockBackend.  @shared_perm is a bitmask which describes which
388  * permissions may be granted to other users of the attached node.
389  * Both sets of permissions can be changed later using blk_set_perm().
390  *
391  * Return the new BlockBackend on success, null on failure.
392  *
393  * Callers must hold the AioContext lock of @bs.
394  */
395 BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
396                               uint64_t shared_perm, Error **errp)
397 {
398     BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
399 
400     GLOBAL_STATE_CODE();
401 
402     if (blk_insert_bs(blk, bs, errp) < 0) {
403         blk_unref(blk);
404         return NULL;
405     }
406     return blk;
407 }
408 
409 /*
410  * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
411  * By default, the new BlockBackend is in the main AioContext, but if the
412  * parameters connect it with any existing node in a different AioContext, it
413  * may end up there instead.
414  *
415  * Just as with bdrv_open(), after having called this function the reference to
416  * @options belongs to the block layer (even on failure).
417  *
418  * Called without holding an AioContext lock.
419  *
420  * TODO: Remove @filename and @flags; it should be possible to specify a whole
421  * BDS tree just by specifying the @options QDict (or @reference,
422  * alternatively). At the time of adding this function, this is not possible,
423  * though, so callers of this function have to be able to specify @filename and
424  * @flags.
425  */
426 BlockBackend *blk_new_open(const char *filename, const char *reference,
427                            QDict *options, int flags, Error **errp)
428 {
429     BlockBackend *blk;
430     BlockDriverState *bs;
431     AioContext *ctx;
432     uint64_t perm = 0;
433     uint64_t shared = BLK_PERM_ALL;
434 
435     GLOBAL_STATE_CODE();
436 
437     /*
438      * blk_new_open() is mainly used in .bdrv_create implementations and the
439      * tools where sharing isn't a major concern because the BDS stays private
440      * and the file is generally not supposed to be used by a second process,
441      * so we just request permission according to the flags.
442      *
443      * The exceptions are xen_disk and blockdev_init(); in these cases, the
444      * caller of blk_new_open() doesn't make use of the permissions, but they
445      * shouldn't hurt either. We can still share everything here because the
446      * guest devices will add their own blockers if they can't share.
447      */
448     if ((flags & BDRV_O_NO_IO) == 0) {
449         perm |= BLK_PERM_CONSISTENT_READ;
450         if (flags & BDRV_O_RDWR) {
451             perm |= BLK_PERM_WRITE;
452         }
453     }
454     if (flags & BDRV_O_RESIZE) {
455         perm |= BLK_PERM_RESIZE;
456     }
457     if (flags & BDRV_O_NO_SHARE) {
458         shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
459     }
460 
461     aio_context_acquire(qemu_get_aio_context());
462     bs = bdrv_open(filename, reference, options, flags, errp);
463     aio_context_release(qemu_get_aio_context());
464     if (!bs) {
465         return NULL;
466     }
467 
468     /* bdrv_open() could have moved bs to a different AioContext */
469     ctx = bdrv_get_aio_context(bs);
470     blk = blk_new(bdrv_get_aio_context(bs), perm, shared);
471     blk->perm = perm;
472     blk->shared_perm = shared;
473 
474     aio_context_acquire(ctx);
475     blk_insert_bs(blk, bs, errp);
476     bdrv_unref(bs);
477     aio_context_release(ctx);
478 
479     if (!blk->root) {
480         blk_unref(blk);
481         return NULL;
482     }
483 
484     return blk;
485 }
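
/*
 * Illustrative sketch (hypothetical caller, not code from this file): image
 * creation code and the command line tools typically open an image with
 * blk_new_open() and rely on the flag-based permission logic above; the
 * filename here is made up:
 *
 *     Error *local_err = NULL;
 *     BlockBackend *example_blk;
 *
 *     example_blk = blk_new_open("test.qcow2", NULL, NULL,
 *                                BDRV_O_RDWR | BDRV_O_RESIZE, &local_err);
 *     if (!example_blk) {
 *         error_report_err(local_err);
 *         return -EINVAL;
 *     }
 */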
486 
487 static void blk_delete(BlockBackend *blk)
488 {
489     assert(!blk->refcnt);
490     assert(!blk->name);
491     assert(!blk->dev);
492     if (blk->public.throttle_group_member.throttle_state) {
493         blk_io_limits_disable(blk);
494     }
495     if (blk->root) {
496         blk_remove_bs(blk);
497     }
498     if (blk->vmsh) {
499         qemu_del_vm_change_state_handler(blk->vmsh);
500         blk->vmsh = NULL;
501     }
502     assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
503     assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
504     assert(QLIST_EMPTY(&blk->aio_notifiers));
505     assert(qemu_co_queue_empty(&blk->queued_requests));
506     qemu_mutex_destroy(&blk->queued_requests_lock);
507     QTAILQ_REMOVE(&block_backends, blk, link);
508     drive_info_del(blk->legacy_dinfo);
509     block_acct_cleanup(&blk->stats);
510     g_free(blk);
511 }
512 
513 static void drive_info_del(DriveInfo *dinfo)
514 {
515     if (!dinfo) {
516         return;
517     }
518     qemu_opts_del(dinfo->opts);
519     g_free(dinfo);
520 }
521 
522 int blk_get_refcnt(BlockBackend *blk)
523 {
524     GLOBAL_STATE_CODE();
525     return blk ? blk->refcnt : 0;
526 }
527 
528 /*
529  * Increment @blk's reference count.
530  * @blk must not be null.
531  */
532 void blk_ref(BlockBackend *blk)
533 {
534     assert(blk->refcnt > 0);
535     GLOBAL_STATE_CODE();
536     blk->refcnt++;
537 }
538 
539 /*
540  * Decrement @blk's reference count.
541  * If this drops it to zero, destroy @blk.
542  * For convenience, do nothing if @blk is null.
543  */
544 void blk_unref(BlockBackend *blk)
545 {
546     GLOBAL_STATE_CODE();
547     if (blk) {
548         assert(blk->refcnt > 0);
549         if (blk->refcnt > 1) {
550             blk->refcnt--;
551         } else {
552             blk_drain(blk);
553             /* blk_drain() cannot resurrect blk, nobody held a reference */
554             assert(blk->refcnt == 1);
555             blk->refcnt = 0;
556             blk_delete(blk);
557         }
558     }
559 }
560 
561 /*
562  * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
563  * ones which are hidden (i.e. are not referenced by the monitor).
564  */
565 BlockBackend *blk_all_next(BlockBackend *blk)
566 {
567     GLOBAL_STATE_CODE();
568     return blk ? QTAILQ_NEXT(blk, link)
569                : QTAILQ_FIRST(&block_backends);
570 }
571 
572 void blk_remove_all_bs(void)
573 {
574     BlockBackend *blk = NULL;
575 
576     GLOBAL_STATE_CODE();
577 
578     while ((blk = blk_all_next(blk)) != NULL) {
579         AioContext *ctx = blk_get_aio_context(blk);
580 
581         aio_context_acquire(ctx);
582         if (blk->root) {
583             blk_remove_bs(blk);
584         }
585         aio_context_release(ctx);
586     }
587 }
588 
589 /*
590  * Return the monitor-owned BlockBackend after @blk.
591  * If @blk is null, return the first one.
592  * Else, return @blk's next sibling, which may be null.
593  *
594  * To iterate over all BlockBackends, do
595  * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
596  *     ...
597  * }
598  */
599 BlockBackend *blk_next(BlockBackend *blk)
600 {
601     GLOBAL_STATE_CODE();
602     return blk ? QTAILQ_NEXT(blk, monitor_link)
603                : QTAILQ_FIRST(&monitor_block_backends);
604 }
605 
606 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
607  * the monitor or attached to a BlockBackend */
608 BlockDriverState *bdrv_next(BdrvNextIterator *it)
609 {
610     BlockDriverState *bs, *old_bs;
611 
612     /* Must be called from the main loop */
613     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
614 
615     /* First, return all root nodes of BlockBackends. In order to avoid
616      * returning a BDS twice when multiple BBs refer to it, we only return it
617      * if the BB is the first one in the parent list of the BDS. */
618     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
619         BlockBackend *old_blk = it->blk;
620 
621         old_bs = old_blk ? blk_bs(old_blk) : NULL;
622 
623         do {
624             it->blk = blk_all_next(it->blk);
625             bs = it->blk ? blk_bs(it->blk) : NULL;
626         } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
627 
628         if (it->blk) {
629             blk_ref(it->blk);
630         }
631         blk_unref(old_blk);
632 
633         if (bs) {
634             bdrv_ref(bs);
635             bdrv_unref(old_bs);
636             return bs;
637         }
638         it->phase = BDRV_NEXT_MONITOR_OWNED;
639     } else {
640         old_bs = it->bs;
641     }
642 
643     /* Then return the monitor-owned BDSes without a BB attached. Ignore all
644      * BDSes that are attached to a BlockBackend here; they have been handled
645      * by the above block already */
646     do {
647         it->bs = bdrv_next_monitor_owned(it->bs);
648         bs = it->bs;
649     } while (bs && bdrv_has_blk(bs));
650 
651     if (bs) {
652         bdrv_ref(bs);
653     }
654     bdrv_unref(old_bs);
655 
656     return bs;
657 }
658 
659 static void bdrv_next_reset(BdrvNextIterator *it)
660 {
661     *it = (BdrvNextIterator) {
662         .phase = BDRV_NEXT_BACKEND_ROOTS,
663     };
664 }
665 
666 BlockDriverState *bdrv_first(BdrvNextIterator *it)
667 {
668     GLOBAL_STATE_CODE();
669     bdrv_next_reset(it);
670     return bdrv_next(it);
671 }
672 
673 /* Must be called when aborting a bdrv_next() iteration before
674  * bdrv_next() returns NULL */
675 void bdrv_next_cleanup(BdrvNextIterator *it)
676 {
677     /* Must be called from the main loop */
678     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
679 
680     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
681         if (it->blk) {
682             bdrv_unref(blk_bs(it->blk));
683             blk_unref(it->blk);
684         }
685     } else {
686         bdrv_unref(it->bs);
687     }
688 
689     bdrv_next_reset(it);
690 }
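
/*
 * Illustrative sketch of the intended iteration pattern (hypothetical caller,
 * not code from this file); some_condition() is a stand-in.
 * bdrv_next_cleanup() is only needed when the loop is left before
 * bdrv_next() has returned NULL:
 *
 *     BdrvNextIterator it;
 *     BlockDriverState *bs;
 *
 *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 *         if (some_condition(bs)) {
 *             bdrv_next_cleanup(&it);
 *             break;
 *         }
 *     }
 */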
691 
692 /*
693  * Add a BlockBackend into the list of backends referenced by the monitor, with
694  * the given @name acting as the handle for the monitor.
695  * Strictly for use by blockdev.c.
696  *
697  * @name must not be null or empty.
698  *
699  * Returns true on success and false on failure. In the latter case, an Error
700  * object is returned through @errp.
701  */
702 bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
703 {
704     assert(!blk->name);
705     assert(name && name[0]);
706     GLOBAL_STATE_CODE();
707 
708     if (!id_wellformed(name)) {
709         error_setg(errp, "Invalid device name");
710         return false;
711     }
712     if (blk_by_name(name)) {
713         error_setg(errp, "Device with id '%s' already exists", name);
714         return false;
715     }
716     if (bdrv_find_node(name)) {
717         error_setg(errp,
718                    "Device name '%s' conflicts with an existing node name",
719                    name);
720         return false;
721     }
722 
723     blk->name = g_strdup(name);
724     QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
725     return true;
726 }
727 
728 /*
729  * Remove a BlockBackend from the list of backends referenced by the monitor.
730  * Strictly for use by blockdev.c.
731  */
732 void monitor_remove_blk(BlockBackend *blk)
733 {
734     GLOBAL_STATE_CODE();
735 
736     if (!blk->name) {
737         return;
738     }
739 
740     QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
741     g_free(blk->name);
742     blk->name = NULL;
743 }
744 
745 /*
746  * Return @blk's name, a non-null string.
747  * Returns an empty string iff @blk is not referenced by the monitor.
748  */
749 const char *blk_name(const BlockBackend *blk)
750 {
751     IO_CODE();
752     return blk->name ?: "";
753 }
754 
755 /*
756  * Return the BlockBackend with name @name if it exists, else null.
757  * @name must not be null.
758  */
759 BlockBackend *blk_by_name(const char *name)
760 {
761     BlockBackend *blk = NULL;
762 
763     GLOBAL_STATE_CODE();
764     assert(name);
765     while ((blk = blk_next(blk)) != NULL) {
766         if (!strcmp(name, blk->name)) {
767             return blk;
768         }
769     }
770     return NULL;
771 }
772 
773 /*
774  * Return the BlockDriverState attached to @blk if any, else null.
775  */
776 BlockDriverState *blk_bs(BlockBackend *blk)
777 {
778     IO_CODE();
779     return blk->root ? blk->root->bs : NULL;
780 }
781 
782 static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
783 {
784     BdrvChild *child;
785 
786     GLOBAL_STATE_CODE();
787 
788     QLIST_FOREACH(child, &bs->parents, next_parent) {
789         if (child->klass == &child_root) {
790             return child->opaque;
791         }
792     }
793 
794     return NULL;
795 }
796 
797 /*
798  * Returns true if @bs has an associated BlockBackend.
799  */
800 bool bdrv_has_blk(BlockDriverState *bs)
801 {
802     GLOBAL_STATE_CODE();
803     return bdrv_first_blk(bs) != NULL;
804 }
805 
806 /*
807  * Returns true if @bs has only BlockBackends as parents.
808  */
809 bool bdrv_is_root_node(BlockDriverState *bs)
810 {
811     BdrvChild *c;
812 
813     GLOBAL_STATE_CODE();
814     QLIST_FOREACH(c, &bs->parents, next_parent) {
815         if (c->klass != &child_root) {
816             return false;
817         }
818     }
819 
820     return true;
821 }
822 
823 /*
824  * Return @blk's DriveInfo if any, else null.
825  */
826 DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
827 {
828     GLOBAL_STATE_CODE();
829     return blk->legacy_dinfo;
830 }
831 
832 /*
833  * Set @blk's DriveInfo to @dinfo, and return it.
834  * @blk must not have a DriveInfo set already.
835  * No other BlockBackend may have the same DriveInfo set.
836  */
837 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
838 {
839     assert(!blk->legacy_dinfo);
840     GLOBAL_STATE_CODE();
841     return blk->legacy_dinfo = dinfo;
842 }
843 
844 /*
845  * Return the BlockBackend with DriveInfo @dinfo.
846  * It must exist.
847  */
848 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
849 {
850     BlockBackend *blk = NULL;
851     GLOBAL_STATE_CODE();
852 
853     while ((blk = blk_next(blk)) != NULL) {
854         if (blk->legacy_dinfo == dinfo) {
855             return blk;
856         }
857     }
858     abort();
859 }
860 
861 /*
862  * Returns a pointer to the publicly accessible fields of @blk.
863  */
864 BlockBackendPublic *blk_get_public(BlockBackend *blk)
865 {
866     GLOBAL_STATE_CODE();
867     return &blk->public;
868 }
869 
870 /*
871  * Returns a BlockBackend given the associated @public fields.
872  */
873 BlockBackend *blk_by_public(BlockBackendPublic *public)
874 {
875     GLOBAL_STATE_CODE();
876     return container_of(public, BlockBackend, public);
877 }
878 
879 /*
880  * Disassociates the currently associated BlockDriverState from @blk.
881  */
882 void blk_remove_bs(BlockBackend *blk)
883 {
884     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
885     BdrvChild *root;
886 
887     GLOBAL_STATE_CODE();
888 
889     notifier_list_notify(&blk->remove_bs_notifiers, blk);
890     if (tgm->throttle_state) {
891         BlockDriverState *bs = blk_bs(blk);
892 
893         /*
894          * Take a ref in case blk_bs() changes across bdrv_drained_begin(), for
895          * example, if a temporary filter node is removed by a blockjob.
896          */
897         bdrv_ref(bs);
898         bdrv_drained_begin(bs);
899         throttle_group_detach_aio_context(tgm);
900         throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
901         bdrv_drained_end(bs);
902         bdrv_unref(bs);
903     }
904 
905     blk_update_root_state(blk);
906 
907     /* bdrv_root_unref_child() will cause blk->root to become stale and may
908      * switch to a completion coroutine later on. Let's drain all I/O here
909      * to avoid that and a potential QEMU crash.
910      */
911     blk_drain(blk);
912     root = blk->root;
913     blk->root = NULL;
914     bdrv_root_unref_child(root);
915 }
916 
917 /*
918  * Associates a new BlockDriverState with @blk.
919  *
920  * Callers must hold the AioContext lock of @bs.
921  */
922 int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
923 {
924     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
925     GLOBAL_STATE_CODE();
926     bdrv_ref(bs);
927     blk->root = bdrv_root_attach_child(bs, "root", &child_root,
928                                        BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
929                                        blk->perm, blk->shared_perm,
930                                        blk, errp);
931     if (blk->root == NULL) {
932         return -EPERM;
933     }
934 
935     notifier_list_notify(&blk->insert_bs_notifiers, blk);
936     if (tgm->throttle_state) {
937         throttle_group_detach_aio_context(tgm);
938         throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
939     }
940 
941     return 0;
942 }
943 
944 /*
945  * Change BlockDriverState associated with @blk.
946  */
947 int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp)
948 {
949     GLOBAL_STATE_CODE();
950     return bdrv_replace_child_bs(blk->root, new_bs, errp);
951 }
952 
953 /*
954  * Sets the permission bitmasks that the user of the BlockBackend needs.
955  */
956 int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
957                  Error **errp)
958 {
959     int ret;
960     GLOBAL_STATE_CODE();
961 
962     if (blk->root && !blk->disable_perm) {
963         ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
964         if (ret < 0) {
965             return ret;
966         }
967     }
968 
969     blk->perm = perm;
970     blk->shared_perm = shared_perm;
971 
972     return 0;
973 }
974 
975 void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
976 {
977     GLOBAL_STATE_CODE();
978     *perm = blk->perm;
979     *shared_perm = blk->shared_perm;
980 }
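
/*
 * Illustrative sketch (hypothetical device code, not code from this file): a
 * guest device that shared writes during storage migration can tighten its
 * permissions again after migration, which is what blk_root_activate() above
 * makes possible:
 *
 *     Error *local_err = NULL;
 *     uint64_t example_perm, example_shared;
 *
 *     blk_get_perm(blk, &example_perm, &example_shared);
 *     if (blk_set_perm(blk, example_perm,
 *                      example_shared & ~BLK_PERM_WRITE, &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */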
981 
982 /*
983  * Attach device model @dev to @blk.
984  * Return 0 on success, -EBUSY when a device model is attached already.
985  */
986 int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
987 {
988     GLOBAL_STATE_CODE();
989     if (blk->dev) {
990         return -EBUSY;
991     }
992 
993     /* While migration is still incoming, we don't need to apply the
994      * permissions of guest device BlockBackends. We might still have a block
995      * job or NBD server writing to the image for storage migration. */
996     if (runstate_check(RUN_STATE_INMIGRATE)) {
997         blk->disable_perm = true;
998     }
999 
1000     blk_ref(blk);
1001     blk->dev = dev;
1002     blk_iostatus_reset(blk);
1003 
1004     return 0;
1005 }
1006 
1007 /*
1008  * Detach device model @dev from @blk.
1009  * @dev must be currently attached to @blk.
1010  */
1011 void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
1012 {
1013     assert(blk->dev == dev);
1014     GLOBAL_STATE_CODE();
1015     blk->dev = NULL;
1016     blk->dev_ops = NULL;
1017     blk->dev_opaque = NULL;
1018     blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
1019     blk_unref(blk);
1020 }
1021 
1022 /*
1023  * Return the device model attached to @blk if any, else null.
1024  */
1025 DeviceState *blk_get_attached_dev(BlockBackend *blk)
1026 {
1027     GLOBAL_STATE_CODE();
1028     return blk->dev;
1029 }
1030 
1031 /* Return the qdev ID of the block device attached to the BlockBackend or, if
1032  * no ID is assigned, its QOM path. */
1033 char *blk_get_attached_dev_id(BlockBackend *blk)
1034 {
1035     DeviceState *dev = blk->dev;
1036     IO_CODE();
1037 
1038     if (!dev) {
1039         return g_strdup("");
1040     } else if (dev->id) {
1041         return g_strdup(dev->id);
1042     }
1043 
1044     return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
1045 }
1046 
1047 /*
1048  * Return the BlockBackend which has the device model @dev attached if it
1049  * exists, else null.
1050  *
1051  * @dev must not be null.
1052  */
1053 BlockBackend *blk_by_dev(void *dev)
1054 {
1055     BlockBackend *blk = NULL;
1056 
1057     GLOBAL_STATE_CODE();
1058 
1059     assert(dev != NULL);
1060     while ((blk = blk_all_next(blk)) != NULL) {
1061         if (blk->dev == dev) {
1062             return blk;
1063         }
1064     }
1065     return NULL;
1066 }
1067 
1068 /*
1069  * Set @blk's device model callbacks to @ops.
1070  * @opaque is the opaque argument to pass to the callbacks.
1071  * This is for use by device models.
1072  */
1073 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
1074                      void *opaque)
1075 {
1076     GLOBAL_STATE_CODE();
1077     blk->dev_ops = ops;
1078     blk->dev_opaque = opaque;
1079 
1080     /* Are we currently quiesced? Should we enforce this right now? */
1081     if (qatomic_read(&blk->quiesce_counter) && ops && ops->drained_begin) {
1082         ops->drained_begin(opaque);
1083     }
1084 }
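
/*
 * Illustrative sketch (hypothetical device model, not code from this file):
 * a device typically attaches itself and then registers its callbacks, e.g.
 * to be notified of resizes and drained sections:
 *
 *     static const BlockDevOps example_block_ops = {
 *         .resize_cb     = example_resize_cb,
 *         .drained_begin = example_drained_begin,
 *         .drained_end   = example_drained_end,
 *     };
 *
 *     if (blk_attach_dev(blk, DEVICE(example_dev)) < 0) {
 *         return;    // -EBUSY: a device model is attached already
 *     }
 *     blk_set_dev_ops(blk, &example_block_ops, example_dev);
 */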
1085 
1086 /*
1087  * Notify @blk's attached device model of media change.
1088  *
1089  * If @load is true, notify of media load. This action can fail, meaning that
1090  * the medium cannot be loaded; @errp is set in that case.
1091  *
1092  * If @load is false, notify of media eject. This can never fail.
1093  *
1094  * Also send DEVICE_TRAY_MOVED events as appropriate.
1095  */
1096 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
1097 {
1098     GLOBAL_STATE_CODE();
1099     if (blk->dev_ops && blk->dev_ops->change_media_cb) {
1100         bool tray_was_open, tray_is_open;
1101         Error *local_err = NULL;
1102 
1103         tray_was_open = blk_dev_is_tray_open(blk);
1104         blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
1105         if (local_err) {
1106             assert(load == true);
1107             error_propagate(errp, local_err);
1108             return;
1109         }
1110         tray_is_open = blk_dev_is_tray_open(blk);
1111 
1112         if (tray_was_open != tray_is_open) {
1113             char *id = blk_get_attached_dev_id(blk);
1114             qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
1115             g_free(id);
1116         }
1117     }
1118 }
1119 
1120 static void blk_root_change_media(BdrvChild *child, bool load)
1121 {
1122     blk_dev_change_media_cb(child->opaque, load, NULL);
1123 }
1124 
1125 /*
1126  * Does @blk's attached device model have removable media?
1127  * %true if no device model is attached.
1128  */
1129 bool blk_dev_has_removable_media(BlockBackend *blk)
1130 {
1131     GLOBAL_STATE_CODE();
1132     return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
1133 }
1134 
1135 /*
1136  * Does @blk's attached device model have a tray?
1137  */
1138 bool blk_dev_has_tray(BlockBackend *blk)
1139 {
1140     IO_CODE();
1141     return blk->dev_ops && blk->dev_ops->is_tray_open;
1142 }
1143 
1144 /*
1145  * Notify @blk's attached device model of a media eject request.
1146  * If @force is true, the medium is about to be yanked out forcefully.
1147  */
1148 void blk_dev_eject_request(BlockBackend *blk, bool force)
1149 {
1150     GLOBAL_STATE_CODE();
1151     if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
1152         blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
1153     }
1154 }
1155 
1156 /*
1157  * Does @blk's attached device model have a tray, and is it open?
1158  */
1159 bool blk_dev_is_tray_open(BlockBackend *blk)
1160 {
1161     IO_CODE();
1162     if (blk_dev_has_tray(blk)) {
1163         return blk->dev_ops->is_tray_open(blk->dev_opaque);
1164     }
1165     return false;
1166 }
1167 
1168 /*
1169  * Does @blk's attached device model have the medium locked?
1170  * %false if the device model has no such lock.
1171  */
1172 bool blk_dev_is_medium_locked(BlockBackend *blk)
1173 {
1174     GLOBAL_STATE_CODE();
1175     if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
1176         return blk->dev_ops->is_medium_locked(blk->dev_opaque);
1177     }
1178     return false;
1179 }
1180 
1181 /*
1182  * Notify @blk's attached device model of a backend size change.
1183  */
1184 static void blk_root_resize(BdrvChild *child)
1185 {
1186     BlockBackend *blk = child->opaque;
1187 
1188     if (blk->dev_ops && blk->dev_ops->resize_cb) {
1189         blk->dev_ops->resize_cb(blk->dev_opaque);
1190     }
1191 }
1192 
1193 void blk_iostatus_enable(BlockBackend *blk)
1194 {
1195     GLOBAL_STATE_CODE();
1196     blk->iostatus_enabled = true;
1197     blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1198 }
1199 
1200 /* The I/O status is only enabled if the drive explicitly
1201  * enables it _and_ the VM is configured to stop on errors */
1202 bool blk_iostatus_is_enabled(const BlockBackend *blk)
1203 {
1204     IO_CODE();
1205     return (blk->iostatus_enabled &&
1206            (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1207             blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1208             blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1209 }
1210 
1211 BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1212 {
1213     GLOBAL_STATE_CODE();
1214     return blk->iostatus;
1215 }
1216 
1217 void blk_iostatus_disable(BlockBackend *blk)
1218 {
1219     GLOBAL_STATE_CODE();
1220     blk->iostatus_enabled = false;
1221 }
1222 
1223 void blk_iostatus_reset(BlockBackend *blk)
1224 {
1225     GLOBAL_STATE_CODE();
1226     if (blk_iostatus_is_enabled(blk)) {
1227         blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1228     }
1229 }
1230 
1231 void blk_iostatus_set_err(BlockBackend *blk, int error)
1232 {
1233     IO_CODE();
1234     assert(blk_iostatus_is_enabled(blk));
1235     if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1236         blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1237                                           BLOCK_DEVICE_IO_STATUS_FAILED;
1238     }
1239 }
1240 
1241 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1242 {
1243     IO_CODE();
1244     blk->allow_write_beyond_eof = allow;
1245 }
1246 
1247 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
1248 {
1249     IO_CODE();
1250     blk->allow_aio_context_change = allow;
1251 }
1252 
1253 void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
1254 {
1255     IO_CODE();
1256     qatomic_set(&blk->disable_request_queuing, disable);
1257 }
1258 
1259 static int coroutine_fn GRAPH_RDLOCK
1260 blk_check_byte_request(BlockBackend *blk, int64_t offset, int64_t bytes)
1261 {
1262     int64_t len;
1263 
1264     if (bytes < 0) {
1265         return -EIO;
1266     }
1267 
1268     if (!blk_co_is_available(blk)) {
1269         return -ENOMEDIUM;
1270     }
1271 
1272     if (offset < 0) {
1273         return -EIO;
1274     }
1275 
1276     if (!blk->allow_write_beyond_eof) {
1277         len = bdrv_co_getlength(blk_bs(blk));
1278         if (len < 0) {
1279             return len;
1280         }
1281 
1282         if (offset > len || len - offset < bytes) {
1283             return -EIO;
1284         }
1285     }
1286 
1287     return 0;
1288 }
1289 
1290 /* Are we currently in a drained section? */
1291 bool blk_in_drain(BlockBackend *blk)
1292 {
1293     GLOBAL_STATE_CODE(); /* change to IO_OR_GS_CODE(), if necessary */
1294     return qatomic_read(&blk->quiesce_counter);
1295 }
1296 
1297 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1298 static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
1299 {
1300     assert(blk->in_flight > 0);
1301 
1302     if (qatomic_read(&blk->quiesce_counter) &&
1303         !qatomic_read(&blk->disable_request_queuing)) {
1304         /*
1305          * Take lock before decrementing in flight counter so main loop thread
1306          * waits for us to enqueue ourselves before it can leave the drained
1307          * section.
1308          */
1309         qemu_mutex_lock(&blk->queued_requests_lock);
1310         blk_dec_in_flight(blk);
1311         qemu_co_queue_wait(&blk->queued_requests, &blk->queued_requests_lock);
1312         blk_inc_in_flight(blk);
1313         qemu_mutex_unlock(&blk->queued_requests_lock);
1314     }
1315 }
1316 
1317 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1318 static int coroutine_fn
1319 blk_co_do_preadv_part(BlockBackend *blk, int64_t offset, int64_t bytes,
1320                       QEMUIOVector *qiov, size_t qiov_offset,
1321                       BdrvRequestFlags flags)
1322 {
1323     int ret;
1324     BlockDriverState *bs;
1325     IO_CODE();
1326 
1327     blk_wait_while_drained(blk);
1328     GRAPH_RDLOCK_GUARD();
1329 
1330     /* Call blk_bs() only after waiting, the graph may have changed */
1331     bs = blk_bs(blk);
1332     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1333 
1334     ret = blk_check_byte_request(blk, offset, bytes);
1335     if (ret < 0) {
1336         return ret;
1337     }
1338 
1339     bdrv_inc_in_flight(bs);
1340 
1341     /* throttling disk I/O */
1342     if (blk->public.throttle_group_member.throttle_state) {
1343         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1344                 bytes, false);
1345     }
1346 
1347     ret = bdrv_co_preadv_part(blk->root, offset, bytes, qiov, qiov_offset,
1348                               flags);
1349     bdrv_dec_in_flight(bs);
1350     return ret;
1351 }
1352 
1353 int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, int64_t bytes,
1354                               void *buf, BdrvRequestFlags flags)
1355 {
1356     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1357     IO_OR_GS_CODE();
1358 
1359     assert(bytes <= SIZE_MAX);
1360 
1361     return blk_co_preadv(blk, offset, bytes, &qiov, flags);
1362 }
1363 
1364 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1365                                int64_t bytes, QEMUIOVector *qiov,
1366                                BdrvRequestFlags flags)
1367 {
1368     int ret;
1369     IO_OR_GS_CODE();
1370 
1371     blk_inc_in_flight(blk);
1372     ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, 0, flags);
1373     blk_dec_in_flight(blk);
1374 
1375     return ret;
1376 }
1377 
1378 int coroutine_fn blk_co_preadv_part(BlockBackend *blk, int64_t offset,
1379                                     int64_t bytes, QEMUIOVector *qiov,
1380                                     size_t qiov_offset, BdrvRequestFlags flags)
1381 {
1382     int ret;
1383     IO_OR_GS_CODE();
1384 
1385     blk_inc_in_flight(blk);
1386     ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, qiov_offset, flags);
1387     blk_dec_in_flight(blk);
1388 
1389     return ret;
1390 }
1391 
1392 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1393 static int coroutine_fn
1394 blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, int64_t bytes,
1395                        QEMUIOVector *qiov, size_t qiov_offset,
1396                        BdrvRequestFlags flags)
1397 {
1398     int ret;
1399     BlockDriverState *bs;
1400     IO_CODE();
1401 
1402     blk_wait_while_drained(blk);
1403     GRAPH_RDLOCK_GUARD();
1404 
1405     /* Call blk_bs() only after waiting, the graph may have changed */
1406     bs = blk_bs(blk);
1407     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1408 
1409     ret = blk_check_byte_request(blk, offset, bytes);
1410     if (ret < 0) {
1411         return ret;
1412     }
1413 
1414     bdrv_inc_in_flight(bs);
1415     /* throttling disk I/O */
1416     if (blk->public.throttle_group_member.throttle_state) {
1417         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1418                 bytes, true);
1419     }
1420 
1421     if (!blk->enable_write_cache) {
1422         flags |= BDRV_REQ_FUA;
1423     }
1424 
1425     ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
1426                                flags);
1427     bdrv_dec_in_flight(bs);
1428     return ret;
1429 }
1430 
1431 int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
1432                                      int64_t bytes,
1433                                      QEMUIOVector *qiov, size_t qiov_offset,
1434                                      BdrvRequestFlags flags)
1435 {
1436     int ret;
1437     IO_OR_GS_CODE();
1438 
1439     blk_inc_in_flight(blk);
1440     ret = blk_co_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
1441     blk_dec_in_flight(blk);
1442 
1443     return ret;
1444 }
1445 
1446 int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, int64_t bytes,
1447                                const void *buf, BdrvRequestFlags flags)
1448 {
1449     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1450     IO_OR_GS_CODE();
1451 
1452     assert(bytes <= SIZE_MAX);
1453 
1454     return blk_co_pwritev(blk, offset, bytes, &qiov, flags);
1455 }
1456 
1457 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1458                                 int64_t bytes, QEMUIOVector *qiov,
1459                                 BdrvRequestFlags flags)
1460 {
1461     IO_OR_GS_CODE();
1462     return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
1463 }
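
/*
 * Illustrative sketch (hypothetical coroutine_fn caller, not code from this
 * file): from coroutine context the byte-based helpers above can be called
 * directly; in-flight accounting and request queuing while drained are
 * handled internally:
 *
 *     uint8_t example_buf[512];
 *     int ret;
 *
 *     ret = blk_co_pread(blk, 0, sizeof(example_buf), example_buf, 0);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ret = blk_co_pwrite(blk, 0, sizeof(example_buf), example_buf,
 *                         BDRV_REQ_FUA);
 */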
1464 
1465 int coroutine_fn blk_co_block_status_above(BlockBackend *blk,
1466                                            BlockDriverState *base,
1467                                            int64_t offset, int64_t bytes,
1468                                            int64_t *pnum, int64_t *map,
1469                                            BlockDriverState **file)
1470 {
1471     IO_CODE();
1472     GRAPH_RDLOCK_GUARD();
1473     return bdrv_co_block_status_above(blk_bs(blk), base, offset, bytes, pnum,
1474                                       map, file);
1475 }
1476 
1477 int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk,
1478                                            BlockDriverState *base,
1479                                            bool include_base, int64_t offset,
1480                                            int64_t bytes, int64_t *pnum)
1481 {
1482     IO_CODE();
1483     GRAPH_RDLOCK_GUARD();
1484     return bdrv_co_is_allocated_above(blk_bs(blk), base, include_base, offset,
1485                                       bytes, pnum);
1486 }
1487 
1488 typedef struct BlkRwCo {
1489     BlockBackend *blk;
1490     int64_t offset;
1491     void *iobuf;
1492     int ret;
1493     BdrvRequestFlags flags;
1494 } BlkRwCo;
1495 
1496 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1497 {
1498     GLOBAL_STATE_CODE();
1499     return bdrv_make_zero(blk->root, flags);
1500 }
1501 
1502 void blk_inc_in_flight(BlockBackend *blk)
1503 {
1504     IO_CODE();
1505     qatomic_inc(&blk->in_flight);
1506 }
1507 
1508 void blk_dec_in_flight(BlockBackend *blk)
1509 {
1510     IO_CODE();
1511     qatomic_dec(&blk->in_flight);
1512     aio_wait_kick();
1513 }
1514 
1515 static void error_callback_bh(void *opaque)
1516 {
1517     struct BlockBackendAIOCB *acb = opaque;
1518 
1519     blk_dec_in_flight(acb->blk);
1520     acb->common.cb(acb->common.opaque, acb->ret);
1521     qemu_aio_unref(acb);
1522 }
1523 
1524 BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1525                                   BlockCompletionFunc *cb,
1526                                   void *opaque, int ret)
1527 {
1528     struct BlockBackendAIOCB *acb;
1529     IO_CODE();
1530 
1531     blk_inc_in_flight(blk);
1532     acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1533     acb->blk = blk;
1534     acb->ret = ret;
1535 
1536     replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1537                                      error_callback_bh, acb);
1538     return &acb->common;
1539 }
1540 
1541 typedef struct BlkAioEmAIOCB {
1542     BlockAIOCB common;
1543     BlkRwCo rwco;
1544     int64_t bytes;
1545     bool has_returned;
1546 } BlkAioEmAIOCB;
1547 
1548 static AioContext *blk_aio_em_aiocb_get_aio_context(BlockAIOCB *acb_)
1549 {
1550     BlkAioEmAIOCB *acb = container_of(acb_, BlkAioEmAIOCB, common);
1551 
1552     return blk_get_aio_context(acb->rwco.blk);
1553 }
1554 
1555 static const AIOCBInfo blk_aio_em_aiocb_info = {
1556     .aiocb_size         = sizeof(BlkAioEmAIOCB),
1557     .get_aio_context    = blk_aio_em_aiocb_get_aio_context,
1558 };
1559 
1560 static void blk_aio_complete(BlkAioEmAIOCB *acb)
1561 {
1562     if (acb->has_returned) {
1563         acb->common.cb(acb->common.opaque, acb->rwco.ret);
1564         blk_dec_in_flight(acb->rwco.blk);
1565         qemu_aio_unref(acb);
1566     }
1567 }
1568 
1569 static void blk_aio_complete_bh(void *opaque)
1570 {
1571     BlkAioEmAIOCB *acb = opaque;
1572     assert(acb->has_returned);
1573     blk_aio_complete(acb);
1574 }
1575 
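/*
 * Common AIO emulation helper: run @co_entry in a coroutine and complete the
 * request through the user callback.  If the coroutine yields, completion
 * happens later from blk_aio_complete() once has_returned has been set.  If
 * it finishes without yielding (rwco.ret is no longer NOT_DONE when
 * aio_co_enter() returns), completion is deferred to a bottom half so that
 * the callback is never invoked before this function returns.
 */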
1576 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset,
1577                                 int64_t bytes,
1578                                 void *iobuf, CoroutineEntry co_entry,
1579                                 BdrvRequestFlags flags,
1580                                 BlockCompletionFunc *cb, void *opaque)
1581 {
1582     BlkAioEmAIOCB *acb;
1583     Coroutine *co;
1584 
1585     blk_inc_in_flight(blk);
1586     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1587     acb->rwco = (BlkRwCo) {
1588         .blk    = blk,
1589         .offset = offset,
1590         .iobuf  = iobuf,
1591         .flags  = flags,
1592         .ret    = NOT_DONE,
1593     };
1594     acb->bytes = bytes;
1595     acb->has_returned = false;
1596 
1597     co = qemu_coroutine_create(co_entry, acb);
1598     aio_co_enter(blk_get_aio_context(blk), co);
1599 
1600     acb->has_returned = true;
1601     if (acb->rwco.ret != NOT_DONE) {
1602         replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1603                                          blk_aio_complete_bh, acb);
1604     }
1605 
1606     return &acb->common;
1607 }
1608 
1609 static void coroutine_fn blk_aio_read_entry(void *opaque)
1610 {
1611     BlkAioEmAIOCB *acb = opaque;
1612     BlkRwCo *rwco = &acb->rwco;
1613     QEMUIOVector *qiov = rwco->iobuf;
1614 
1615     assert(qiov->size == acb->bytes);
1616     rwco->ret = blk_co_do_preadv_part(rwco->blk, rwco->offset, acb->bytes, qiov,
1617                                       0, rwco->flags);
1618     blk_aio_complete(acb);
1619 }
1620 
1621 static void coroutine_fn blk_aio_write_entry(void *opaque)
1622 {
1623     BlkAioEmAIOCB *acb = opaque;
1624     BlkRwCo *rwco = &acb->rwco;
1625     QEMUIOVector *qiov = rwco->iobuf;
1626 
1627     assert(!qiov || qiov->size == acb->bytes);
1628     rwco->ret = blk_co_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
1629                                        qiov, 0, rwco->flags);
1630     blk_aio_complete(acb);
1631 }
1632 
1633 BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1634                                   int64_t bytes, BdrvRequestFlags flags,
1635                                   BlockCompletionFunc *cb, void *opaque)
1636 {
1637     IO_CODE();
1638     return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_write_entry,
1639                         flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1640 }
1641 
1642 int64_t coroutine_fn blk_co_getlength(BlockBackend *blk)
1643 {
1644     IO_CODE();
1645     GRAPH_RDLOCK_GUARD();
1646 
1647     if (!blk_co_is_available(blk)) {
1648         return -ENOMEDIUM;
1649     }
1650 
1651     return bdrv_co_getlength(blk_bs(blk));
1652 }
1653 
1654 int64_t coroutine_fn blk_co_nb_sectors(BlockBackend *blk)
1655 {
1656     BlockDriverState *bs = blk_bs(blk);
1657 
1658     IO_CODE();
1659     GRAPH_RDLOCK_GUARD();
1660 
1661     if (!bs) {
1662         return -ENOMEDIUM;
1663     } else {
1664         return bdrv_co_nb_sectors(bs);
1665     }
1666 }
1667 
1668 /*
1669  * This wrapper is written by hand because this function is in the hot I/O path,
1670  * via blk_get_geometry.
1671  */
1672 int64_t coroutine_mixed_fn blk_nb_sectors(BlockBackend *blk)
1673 {
1674     BlockDriverState *bs = blk_bs(blk);
1675 
1676     IO_CODE();
1677 
1678     if (!bs) {
1679         return -ENOMEDIUM;
1680     } else {
1681         return bdrv_nb_sectors(bs);
1682     }
1683 }
1684 
1685 /* Return 0 as the number of sectors if no device is present or on error */
1686 void coroutine_fn blk_co_get_geometry(BlockBackend *blk,
1687                                       uint64_t *nb_sectors_ptr)
1688 {
1689     int64_t ret = blk_co_nb_sectors(blk);
1690     *nb_sectors_ptr = ret < 0 ? 0 : ret;
1691 }
1692 
1693 /*
1694  * This wrapper is written by hand because this function is in the hot I/O path.
1695  */
1696 void coroutine_mixed_fn blk_get_geometry(BlockBackend *blk,
1697                                          uint64_t *nb_sectors_ptr)
1698 {
1699     int64_t ret = blk_nb_sectors(blk);
1700     *nb_sectors_ptr = ret < 0 ? 0 : ret;
1701 }
1702 
1703 BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1704                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1705                            BlockCompletionFunc *cb, void *opaque)
1706 {
1707     IO_CODE();
1708     assert((uint64_t)qiov->size <= INT64_MAX);
1709     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1710                         blk_aio_read_entry, flags, cb, opaque);
1711 }
1712 
1713 BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1714                             QEMUIOVector *qiov, BdrvRequestFlags flags,
1715                             BlockCompletionFunc *cb, void *opaque)
1716 {
1717     IO_CODE();
1718     assert((uint64_t)qiov->size <= INT64_MAX);
1719     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1720                         blk_aio_write_entry, flags, cb, opaque);
1721 }
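
/*
 * Illustrative sketch (hypothetical caller, not code from this file): device
 * emulation usually submits asynchronous requests and finishes the guest
 * request from the completion callback, which runs in the BlockBackend's
 * AioContext; ExampleRequest is a made-up per-request structure:
 *
 *     static void example_read_cb(void *opaque, int ret)
 *     {
 *         ExampleRequest *req = opaque;
 *         ... complete the guest request, ret < 0 on error ...
 *     }
 *
 *     qemu_iovec_init(&req->qiov, 1);
 *     qemu_iovec_add(&req->qiov, req->buf, req->len);
 *     blk_aio_preadv(blk, req->offset, &req->qiov, 0, example_read_cb, req);
 */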
1722 
1723 void blk_aio_cancel(BlockAIOCB *acb)
1724 {
1725     GLOBAL_STATE_CODE();
1726     bdrv_aio_cancel(acb);
1727 }
1728 
1729 void blk_aio_cancel_async(BlockAIOCB *acb)
1730 {
1731     IO_CODE();
1732     bdrv_aio_cancel_async(acb);
1733 }
1734 
1735 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1736 static int coroutine_fn
1737 blk_co_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1738 {
1739     IO_CODE();
1740 
1741     blk_wait_while_drained(blk);
1742     GRAPH_RDLOCK_GUARD();
1743 
1744     if (!blk_co_is_available(blk)) {
1745         return -ENOMEDIUM;
1746     }
1747 
1748     return bdrv_co_ioctl(blk_bs(blk), req, buf);
1749 }
1750 
1751 int coroutine_fn blk_co_ioctl(BlockBackend *blk, unsigned long int req,
1752                               void *buf)
1753 {
1754     int ret;
1755     IO_OR_GS_CODE();
1756 
1757     blk_inc_in_flight(blk);
1758     ret = blk_co_do_ioctl(blk, req, buf);
1759     blk_dec_in_flight(blk);
1760 
1761     return ret;
1762 }
1763 
1764 static void coroutine_fn blk_aio_ioctl_entry(void *opaque)
1765 {
1766     BlkAioEmAIOCB *acb = opaque;
1767     BlkRwCo *rwco = &acb->rwco;
1768 
1769     rwco->ret = blk_co_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
1770 
1771     blk_aio_complete(acb);
1772 }
1773 
1774 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1775                           BlockCompletionFunc *cb, void *opaque)
1776 {
1777     IO_CODE();
1778     return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
1779 }
1780 
1781 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1782 static int coroutine_fn
1783 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
1784 {
1785     int ret;
1786     IO_CODE();
1787 
1788     blk_wait_while_drained(blk);
1789     GRAPH_RDLOCK_GUARD();
1790 
1791     ret = blk_check_byte_request(blk, offset, bytes);
1792     if (ret < 0) {
1793         return ret;
1794     }
1795 
1796     return bdrv_co_pdiscard(blk->root, offset, bytes);
1797 }
1798 
1799 static void coroutine_fn blk_aio_pdiscard_entry(void *opaque)
1800 {
1801     BlkAioEmAIOCB *acb = opaque;
1802     BlkRwCo *rwco = &acb->rwco;
1803 
1804     rwco->ret = blk_co_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1805     blk_aio_complete(acb);
1806 }
1807 
1808 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1809                              int64_t offset, int64_t bytes,
1810                              BlockCompletionFunc *cb, void *opaque)
1811 {
1812     IO_CODE();
1813     return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1814                         cb, opaque);
1815 }
1816 
1817 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
1818                                  int64_t bytes)
1819 {
1820     int ret;
1821     IO_OR_GS_CODE();
1822 
1823     blk_inc_in_flight(blk);
1824     ret = blk_co_do_pdiscard(blk, offset, bytes);
1825     blk_dec_in_flight(blk);
1826 
1827     return ret;
1828 }
1829 
1830 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1831 static int coroutine_fn blk_co_do_flush(BlockBackend *blk)
1832 {
1833     IO_CODE();
1834     blk_wait_while_drained(blk);
1835     GRAPH_RDLOCK_GUARD();
1836 
1837     if (!blk_co_is_available(blk)) {
1838         return -ENOMEDIUM;
1839     }
1840 
1841     return bdrv_co_flush(blk_bs(blk));
1842 }
1843 
1844 static void coroutine_fn blk_aio_flush_entry(void *opaque)
1845 {
1846     BlkAioEmAIOCB *acb = opaque;
1847     BlkRwCo *rwco = &acb->rwco;
1848 
1849     rwco->ret = blk_co_do_flush(rwco->blk);
1850     blk_aio_complete(acb);
1851 }
1852 
1853 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1854                           BlockCompletionFunc *cb, void *opaque)
1855 {
1856     IO_CODE();
1857     return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1858 }
1859 
1860 int coroutine_fn blk_co_flush(BlockBackend *blk)
1861 {
1862     int ret;
1863     IO_OR_GS_CODE();
1864 
1865     blk_inc_in_flight(blk);
1866     ret = blk_co_do_flush(blk);
1867     blk_dec_in_flight(blk);
1868 
1869     return ret;
1870 }
1871 
1872 static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
1873 {
1874     BlkAioEmAIOCB *acb = opaque;
1875     BlkRwCo *rwco = &acb->rwco;
1876 
1877     rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
1878                                    (unsigned int*)(uintptr_t)acb->bytes,
1879                                    rwco->iobuf);
1880     blk_aio_complete(acb);
1881 }
1882 
1883 BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
1884                                 unsigned int *nr_zones,
1885                                 BlockZoneDescriptor  *zones,
1886                                 BlockCompletionFunc *cb, void *opaque)
1887 {
1888     BlkAioEmAIOCB *acb;
1889     Coroutine *co;
1890     IO_CODE();
1891 
1892     blk_inc_in_flight(blk);
1893     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1894     acb->rwco = (BlkRwCo) {
1895         .blk    = blk,
1896         .offset = offset,
1897         .iobuf  = zones,
1898         .ret    = NOT_DONE,
1899     };
1900     acb->bytes = (int64_t)(uintptr_t)nr_zones;
1901     acb->has_returned = false;
1902 
1903     co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
1904     aio_co_enter(blk_get_aio_context(blk), co);
1905 
1906     acb->has_returned = true;
1907     if (acb->rwco.ret != NOT_DONE) {
1908         replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1909                                          blk_aio_complete_bh, acb);
1910     }
1911 
1912     return &acb->common;
1913 }
1914 
1915 static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
1916 {
1917     BlkAioEmAIOCB *acb = opaque;
1918     BlkRwCo *rwco = &acb->rwco;
1919 
1920     rwco->ret = blk_co_zone_mgmt(rwco->blk,
1921                                  (BlockZoneOp)(uintptr_t)rwco->iobuf,
1922                                  rwco->offset, acb->bytes);
1923     blk_aio_complete(acb);
1924 }
1925 
1926 BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
1927                               int64_t offset, int64_t len,
1928                               BlockCompletionFunc *cb, void *opaque)
{
1929     BlkAioEmAIOCB *acb;
1930     Coroutine *co;
1931     IO_CODE();
1932 
1933     blk_inc_in_flight(blk);
1934     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1935     acb->rwco = (BlkRwCo) {
1936         .blk    = blk,
1937         .offset = offset,
1938         .iobuf  = (void *)(uintptr_t)op,
1939         .ret    = NOT_DONE,
1940     };
1941     acb->bytes = len;
1942     acb->has_returned = false;
1943 
1944     co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
1945     aio_co_enter(blk_get_aio_context(blk), co);
1946 
1947     acb->has_returned = true;
1948     if (acb->rwco.ret != NOT_DONE) {
1949         replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1950                                          blk_aio_complete_bh, acb);
1951     }
1952 
1953     return &acb->common;
1954 }
1955 
1956 static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
1957 {
1958     BlkAioEmAIOCB *acb = opaque;
1959     BlkRwCo *rwco = &acb->rwco;
1960 
1961     rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
1962                                    rwco->iobuf, rwco->flags);
1963     blk_aio_complete(acb);
1964 }
1965 
1966 BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
1967                                 QEMUIOVector *qiov, BdrvRequestFlags flags,
1968                                 BlockCompletionFunc *cb, void *opaque)
{
1969     BlkAioEmAIOCB *acb;
1970     Coroutine *co;
1971     IO_CODE();
1972 
1973     blk_inc_in_flight(blk);
1974     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1975     acb->rwco = (BlkRwCo) {
1976         .blk    = blk,
1977         .ret    = NOT_DONE,
1978         .flags  = flags,
1979         .iobuf  = qiov,
1980     };
1981     acb->bytes = (int64_t)(uintptr_t)offset;
1982     acb->has_returned = false;
1983 
1984     co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
1985     aio_co_enter(blk_get_aio_context(blk), co);
1986     acb->has_returned = true;
1987     if (acb->rwco.ret != NOT_DONE) {
1988         replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1989                                          blk_aio_complete_bh, acb);
1990     }
1991 
1992     return &acb->common;
1993 }
1994 
1995 /*
1996  * Send a zone_report command.
1997  * offset is a byte offset from the start of the device. No alignment
1998  * required for offset.
1999  * nr_zones: IN the maximum number of zones to report, OUT the number reported.
2000  */
2001 int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
2002                                     unsigned int *nr_zones,
2003                                     BlockZoneDescriptor *zones)
2004 {
2005     int ret;
2006     IO_CODE();
2007 
2008     blk_inc_in_flight(blk); /* increase before waiting */
2009     blk_wait_while_drained(blk);
2010     GRAPH_RDLOCK_GUARD();
2011     if (!blk_is_available(blk)) {
2012         blk_dec_in_flight(blk);
2013         return -ENOMEDIUM;
2014     }
2015     ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
2016     blk_dec_in_flight(blk);
2017     return ret;
2018 }
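
/*
 * Example (hypothetical coroutine sketch, not part of the QEMU tree):
 * reporting up to 16 zones starting at byte offset 0.  On input nr_zones
 * caps the report size; on return it holds how many entries were filled in.
 *
 *     static int coroutine_fn example_report_zones(BlockBackend *blk)
 *     {
 *         BlockZoneDescriptor zones[16];
 *         unsigned int nr_zones = 16;
 *         int ret;
 *
 *         ret = blk_co_zone_report(blk, 0, &nr_zones, zones);
 *         if (ret < 0) {
 *             return ret;
 *         }
 *         // zones[0..nr_zones-1] are now valid
 *         return 0;
 *     }
 */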
2019 
2020 /*
2021  * Send a zone_management command.
2022  * op is the zone operation;
2023  * offset is the byte offset from the start of the zoned device;
2024  * len is the maximum number of bytes the command should operate on. It
2025  * should be aligned with the device zone size.
2026  */
2027 int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
2028         int64_t offset, int64_t len)
2029 {
2030     int ret;
2031     IO_CODE();
2032 
2033     blk_inc_in_flight(blk);
2034     blk_wait_while_drained(blk);
2035     GRAPH_RDLOCK_GUARD();
2036 
2037     ret = blk_check_byte_request(blk, offset, len);
2038     if (ret < 0) {
2039         blk_dec_in_flight(blk);
2040         return ret;
2041     }
2042 
2043     ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
2044     blk_dec_in_flight(blk);
2045     return ret;
2046 }
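
/*
 * Example (hypothetical coroutine sketch, not part of the QEMU tree):
 * resetting the single zone that starts at zone_start.  zone_size is assumed
 * to have been obtained from the device's zone configuration, since len must
 * be aligned to the zone size.
 *
 *     static int coroutine_fn example_reset_zone(BlockBackend *blk,
 *                                                int64_t zone_start,
 *                                                int64_t zone_size)
 *     {
 *         return blk_co_zone_mgmt(blk, BLK_ZO_RESET, zone_start, zone_size);
 *     }
 */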
2047 
2048 /*
2049  * Send a zone_append command.
2050  */
2051 int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
2052         QEMUIOVector *qiov, BdrvRequestFlags flags)
2053 {
2054     int ret;
2055     IO_CODE();
2056 
2057     blk_inc_in_flight(blk);
2058     blk_wait_while_drained(blk);
2059     GRAPH_RDLOCK_GUARD();
2060     if (!blk_is_available(blk)) {
2061         blk_dec_in_flight(blk);
2062         return -ENOMEDIUM;
2063     }
2064 
2065     ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
2066     blk_dec_in_flight(blk);
2067     return ret;
2068 }
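
/*
 * Example (hypothetical coroutine sketch, not part of the QEMU tree):
 * appending one buffer to the zone starting at *offset.  The assumption made
 * here is that on success the backend updates *offset to the position where
 * the data was actually written, which the caller can report to the guest.
 *
 *     static int coroutine_fn example_zone_append(BlockBackend *blk,
 *                                                 int64_t zone_start,
 *                                                 void *buf, size_t len,
 *                                                 int64_t *written_pos)
 *     {
 *         QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *         int64_t offset = zone_start;
 *         int ret;
 *
 *         ret = blk_co_zone_append(blk, &offset, &qiov, 0);
 *         if (ret == 0) {
 *             *written_pos = offset;
 *         }
 *         return ret;
 *     }
 */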
2069 
2070 void blk_drain(BlockBackend *blk)
2071 {
2072     BlockDriverState *bs = blk_bs(blk);
2073     GLOBAL_STATE_CODE();
2074 
2075     if (bs) {
2076         bdrv_ref(bs);
2077         bdrv_drained_begin(bs);
2078     }
2079 
2080     /* We may have -ENOMEDIUM completions in flight */
2081     AIO_WAIT_WHILE(blk_get_aio_context(blk),
2082                    qatomic_read(&blk->in_flight) > 0);
2083 
2084     if (bs) {
2085         bdrv_drained_end(bs);
2086         bdrv_unref(bs);
2087     }
2088 }
2089 
2090 void blk_drain_all(void)
2091 {
2092     BlockBackend *blk = NULL;
2093 
2094     GLOBAL_STATE_CODE();
2095 
2096     bdrv_drain_all_begin();
2097 
2098     while ((blk = blk_all_next(blk)) != NULL) {
2099         /* We may have -ENOMEDIUM completions in flight */
2100         AIO_WAIT_WHILE_UNLOCKED(NULL, qatomic_read(&blk->in_flight) > 0);
2101     }
2102 
2103     bdrv_drain_all_end();
2104 }
2105 
2106 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
2107                       BlockdevOnError on_write_error)
2108 {
2109     GLOBAL_STATE_CODE();
2110     blk->on_read_error = on_read_error;
2111     blk->on_write_error = on_write_error;
2112 }
2113 
2114 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
2115 {
2116     IO_CODE();
2117     return is_read ? blk->on_read_error : blk->on_write_error;
2118 }
2119 
2120 BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
2121                                       int error)
2122 {
2123     BlockdevOnError on_err = blk_get_on_error(blk, is_read);
2124     IO_CODE();
2125 
2126     switch (on_err) {
2127     case BLOCKDEV_ON_ERROR_ENOSPC:
2128         return (error == ENOSPC) ?
2129                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
2130     case BLOCKDEV_ON_ERROR_STOP:
2131         return BLOCK_ERROR_ACTION_STOP;
2132     case BLOCKDEV_ON_ERROR_REPORT:
2133         return BLOCK_ERROR_ACTION_REPORT;
2134     case BLOCKDEV_ON_ERROR_IGNORE:
2135         return BLOCK_ERROR_ACTION_IGNORE;
2136     case BLOCKDEV_ON_ERROR_AUTO:
2137     default:
2138         abort();
2139     }
2140 }
2141 
2142 static void send_qmp_error_event(BlockBackend *blk,
2143                                  BlockErrorAction action,
2144                                  bool is_read, int error)
2145 {
2146     IoOperationType optype;
2147     BlockDriverState *bs = blk_bs(blk);
2148 
2149     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
2150     qapi_event_send_block_io_error(blk_name(blk),
2151                                    bs ? bdrv_get_node_name(bs) : NULL, optype,
2152                                    action, blk_iostatus_is_enabled(blk),
2153                                    error == ENOSPC, strerror(error));
2154 }
2155 
2156 /* This is done by device models because, while the block layer knows
2157  * about the error, it does not know whether an operation comes from
2158  * the device or the block layer (from a job, for example).
2159  */
2160 void blk_error_action(BlockBackend *blk, BlockErrorAction action,
2161                       bool is_read, int error)
2162 {
2163     assert(error >= 0);
2164     IO_CODE();
2165 
2166     if (action == BLOCK_ERROR_ACTION_STOP) {
2167         /* First set the iostatus, so that "info block" returns an iostatus
2168          * that matches the events raised so far (an additional error iostatus
2169          * is fine, but not a lost one).
2170          */
2171         blk_iostatus_set_err(blk, error);
2172 
2173         /* Then raise the request to stop the VM and the event.
2174          * qemu_system_vmstop_request_prepare has two effects.  First,
2175          * it ensures that the STOP event always comes after the
2176          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
2177          * observes the BLOCK_IO_ERROR event and issues a "cont" before the STOP
2178          * event is emitted, the VM will not stop.  In this case, vm_start()
2179          * also ensures that the STOP/RESUME pair of events is emitted.
2180          */
2181         qemu_system_vmstop_request_prepare();
2182         send_qmp_error_event(blk, action, is_read, error);
2183         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
2184     } else {
2185         send_qmp_error_event(blk, action, is_read, error);
2186     }
2187 }
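
/*
 * Example (hypothetical device-model sketch, not part of the QEMU tree): the
 * usual pattern is to map the errno of a failed request to an action with
 * blk_get_error_action(), report it with blk_error_action(), and then either
 * keep the request for a retry (STOP), fail it towards the guest (REPORT), or
 * complete it as if it had succeeded (IGNORE).
 *
 *     static void example_handle_rw_error(BlockBackend *blk, bool is_read,
 *                                         int error)
 *     {
 *         BlockErrorAction action = blk_get_error_action(blk, is_read, error);
 *
 *         blk_error_action(blk, action, is_read, error);
 *         switch (action) {
 *         case BLOCK_ERROR_ACTION_STOP:
 *             // queue the request and retry it when the VM resumes
 *             break;
 *         case BLOCK_ERROR_ACTION_REPORT:
 *             // complete the request towards the guest with an error
 *             break;
 *         case BLOCK_ERROR_ACTION_IGNORE:
 *             // complete the request as if it had succeeded
 *             break;
 *         default:
 *             abort();
 *         }
 *     }
 */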
2188 
2189 /*
2190  * Returns true if the BlockBackend can support taking write permissions
2191  * (because its root node is not read-only).
2192  */
2193 bool blk_supports_write_perm(BlockBackend *blk)
2194 {
2195     BlockDriverState *bs = blk_bs(blk);
2196     GLOBAL_STATE_CODE();
2197 
2198     if (bs) {
2199         return !bdrv_is_read_only(bs);
2200     } else {
2201         return blk->root_state.open_flags & BDRV_O_RDWR;
2202     }
2203 }
2204 
2205 /*
2206  * Returns true if the BlockBackend can be written to in its current
2207  * configuration (i.e. if write permission has been requested)
2208  */
2209 bool blk_is_writable(BlockBackend *blk)
2210 {
2211     IO_CODE();
2212     return blk->perm & BLK_PERM_WRITE;
2213 }
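
/*
 * Example (hypothetical sketch, not part of the QEMU tree): a user that wants
 * to write first checks whether the root node could take write permissions at
 * all, then actually requests them with blk_set_perm().  The shared permission
 * mask chosen here is only an illustration.
 *
 *     static int example_request_write_perm(BlockBackend *blk, Error **errp)
 *     {
 *         if (!blk_supports_write_perm(blk)) {
 *             error_setg(errp, "Device is read-only");
 *             return -EROFS;
 *         }
 *         return blk_set_perm(blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
 *                             BLK_PERM_ALL, errp);
 *     }
 */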
2214 
2215 bool blk_is_sg(BlockBackend *blk)
2216 {
2217     BlockDriverState *bs = blk_bs(blk);
2218     GLOBAL_STATE_CODE();
2219 
2220     if (!bs) {
2221         return false;
2222     }
2223 
2224     return bdrv_is_sg(bs);
2225 }
2226 
2227 bool blk_enable_write_cache(BlockBackend *blk)
2228 {
2229     IO_CODE();
2230     return blk->enable_write_cache;
2231 }
2232 
2233 void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
2234 {
2235     IO_CODE();
2236     blk->enable_write_cache = wce;
2237 }
2238 
2239 void blk_activate(BlockBackend *blk, Error **errp)
2240 {
2241     BlockDriverState *bs = blk_bs(blk);
2242     GLOBAL_STATE_CODE();
2243 
2244     if (!bs) {
2245         error_setg(errp, "Device '%s' has no medium", blk->name);
2246         return;
2247     }
2248 
2249     /*
2250      * Migration code can call this function in coroutine context, so use the
2251      * coroutine variant of bdrv_activate() in that case.
2252      */
2253     if (qemu_in_coroutine()) {
2254         bdrv_co_activate(bs, errp);
2255     } else {
2256         bdrv_activate(bs, errp);
2257     }
2258 }
2259 
2260 bool coroutine_fn blk_co_is_inserted(BlockBackend *blk)
2261 {
2262     BlockDriverState *bs = blk_bs(blk);
2263     IO_CODE();
2264     assert_bdrv_graph_readable();
2265 
2266     return bs && bdrv_co_is_inserted(bs);
2267 }
2268 
2269 bool coroutine_fn blk_co_is_available(BlockBackend *blk)
2270 {
2271     IO_CODE();
2272     return blk_co_is_inserted(blk) && !blk_dev_is_tray_open(blk);
2273 }
2274 
2275 void coroutine_fn blk_co_lock_medium(BlockBackend *blk, bool locked)
2276 {
2277     BlockDriverState *bs = blk_bs(blk);
2278     IO_CODE();
2279     GRAPH_RDLOCK_GUARD();
2280 
2281     if (bs) {
2282         bdrv_co_lock_medium(bs, locked);
2283     }
2284 }
2285 
2286 void coroutine_fn blk_co_eject(BlockBackend *blk, bool eject_flag)
2287 {
2288     BlockDriverState *bs = blk_bs(blk);
2289     char *id;
2290     IO_CODE();
2291     GRAPH_RDLOCK_GUARD();
2292 
2293     if (bs) {
2294         bdrv_co_eject(bs, eject_flag);
2295     }
2296 
2297     /* Whether or not we ejected on the backend,
2298      * the frontend experienced a tray event. */
2299     id = blk_get_attached_dev_id(blk);
2300     qapi_event_send_device_tray_moved(blk_name(blk), id,
2301                                       eject_flag);
2302     g_free(id);
2303 }
2304 
2305 int blk_get_flags(BlockBackend *blk)
2306 {
2307     BlockDriverState *bs = blk_bs(blk);
2308     GLOBAL_STATE_CODE();
2309 
2310     if (bs) {
2311         return bdrv_get_flags(bs);
2312     } else {
2313         return blk->root_state.open_flags;
2314     }
2315 }
2316 
2317 /* Returns the minimum request alignment, in bytes; guaranteed nonzero */
2318 uint32_t blk_get_request_alignment(BlockBackend *blk)
2319 {
2320     BlockDriverState *bs = blk_bs(blk);
2321     IO_CODE();
2322     return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
2323 }
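
/*
 * Example (hypothetical sketch, not part of the QEMU tree): a caller that
 * wants to know whether a request is aligned to the backend's minimum request
 * alignment, e.g. to decide whether a matching logical block size can be
 * advertised to the guest.
 *
 *     static bool example_request_is_aligned(BlockBackend *blk,
 *                                            int64_t offset, int64_t bytes)
 *     {
 *         uint32_t align = blk_get_request_alignment(blk);
 *
 *         return QEMU_IS_ALIGNED(offset, align) &&
 *                QEMU_IS_ALIGNED(bytes, align);
 *     }
 */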
2324 
2325 /* Returns the maximum hardware transfer length, in bytes; guaranteed nonzero */
2326 uint64_t blk_get_max_hw_transfer(BlockBackend *blk)
2327 {
2328     BlockDriverState *bs = blk_bs(blk);
2329     uint64_t max = INT_MAX;
2330     IO_CODE();
2331 
2332     if (bs) {
2333         max = MIN_NON_ZERO(max, bs->bl.max_hw_transfer);
2334         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
2335     }
2336     return ROUND_DOWN(max, blk_get_request_alignment(blk));
2337 }
2338 
2339 /* Returns the maximum transfer length, in bytes; guaranteed nonzero */
2340 uint32_t blk_get_max_transfer(BlockBackend *blk)
2341 {
2342     BlockDriverState *bs = blk_bs(blk);
2343     uint32_t max = INT_MAX;
2344     IO_CODE();
2345 
2346     if (bs) {
2347         max = MIN_NON_ZERO(max, bs->bl.max_transfer);
2348     }
2349     return ROUND_DOWN(max, blk_get_request_alignment(blk));
2350 }
2351 
2352 int blk_get_max_hw_iov(BlockBackend *blk)
2353 {
2354     IO_CODE();
2355     return MIN_NON_ZERO(blk->root->bs->bl.max_hw_iov,
2356                         blk->root->bs->bl.max_iov);
2357 }
2358 
2359 int blk_get_max_iov(BlockBackend *blk)
2360 {
2361     IO_CODE();
2362     return blk->root->bs->bl.max_iov;
2363 }
2364 
2365 void *blk_try_blockalign(BlockBackend *blk, size_t size)
2366 {
2367     IO_CODE();
2368     return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
2369 }
2370 
2371 void *blk_blockalign(BlockBackend *blk, size_t size)
2372 {
2373     IO_CODE();
2374     return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
2375 }
2376 
2377 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
2378 {
2379     BlockDriverState *bs = blk_bs(blk);
2380     GLOBAL_STATE_CODE();
2381 
2382     if (!bs) {
2383         return false;
2384     }
2385 
2386     return bdrv_op_is_blocked(bs, op, errp);
2387 }
2388 
2389 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
2390 {
2391     BlockDriverState *bs = blk_bs(blk);
2392     GLOBAL_STATE_CODE();
2393 
2394     if (bs) {
2395         bdrv_op_unblock(bs, op, reason);
2396     }
2397 }
2398 
2399 void blk_op_block_all(BlockBackend *blk, Error *reason)
2400 {
2401     BlockDriverState *bs = blk_bs(blk);
2402     GLOBAL_STATE_CODE();
2403 
2404     if (bs) {
2405         bdrv_op_block_all(bs, reason);
2406     }
2407 }
2408 
2409 void blk_op_unblock_all(BlockBackend *blk, Error *reason)
2410 {
2411     BlockDriverState *bs = blk_bs(blk);
2412     GLOBAL_STATE_CODE();
2413 
2414     if (bs) {
2415         bdrv_op_unblock_all(bs, reason);
2416     }
2417 }
2418 
2419 AioContext *blk_get_aio_context(BlockBackend *blk)
2420 {
2421     BlockDriverState *bs;
2422     IO_CODE();
2423 
2424     if (!blk) {
2425         return qemu_get_aio_context();
2426     }
2427 
2428     bs = blk_bs(blk);
2429     if (bs) {
2430         AioContext *ctx = bdrv_get_aio_context(bs);
2431         assert(ctx == blk->ctx);
2432     }
2433 
2434     return blk->ctx;
2435 }
2436 
2437 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
2438 {
2439     BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
2440     return blk_get_aio_context(blk_acb->blk);
2441 }
2442 
2443 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2444                         Error **errp)
2445 {
2446     bool old_allow_change;
2447     BlockDriverState *bs = blk_bs(blk);
2448     int ret;
2449 
2450     GLOBAL_STATE_CODE();
2451 
2452     if (!bs) {
2453         blk->ctx = new_context;
2454         return 0;
2455     }
2456 
2457     bdrv_ref(bs);
2458 
2459     old_allow_change = blk->allow_aio_context_change;
2460     blk->allow_aio_context_change = true;
2461 
2462     ret = bdrv_try_change_aio_context(bs, new_context, NULL, errp);
2463 
2464     blk->allow_aio_context_change = old_allow_change;
2465 
2466     bdrv_unref(bs);
2467     return ret;
2468 }
2469 
2470 typedef struct BdrvStateBlkRootContext {
2471     AioContext *new_ctx;
2472     BlockBackend *blk;
2473 } BdrvStateBlkRootContext;
2474 
2475 static void blk_root_set_aio_ctx_commit(void *opaque)
2476 {
2477     BdrvStateBlkRootContext *s = opaque;
2478     BlockBackend *blk = s->blk;
2479     AioContext *new_context = s->new_ctx;
2480     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2481 
2482     blk->ctx = new_context;
2483     if (tgm->throttle_state) {
2484         throttle_group_detach_aio_context(tgm);
2485         throttle_group_attach_aio_context(tgm, new_context);
2486     }
2487 }
2488 
2489 static TransactionActionDrv set_blk_root_context = {
2490     .commit = blk_root_set_aio_ctx_commit,
2491     .clean = g_free,
2492 };
2493 
2494 static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx,
2495                                     GHashTable *visited, Transaction *tran,
2496                                     Error **errp)
2497 {
2498     BlockBackend *blk = child->opaque;
2499     BdrvStateBlkRootContext *s;
2500 
2501     if (!blk->allow_aio_context_change) {
2502         /*
2503          * Manually created BlockBackends (those with a name) that are not
2504          * attached to anything can change their AioContext without updating
2505          * their user; return an error for others.
2506          */
2507         if (!blk->name || blk->dev) {
2508             /* TODO Add BB name/QOM path */
2509             error_setg(errp, "Cannot change iothread of active block backend");
2510             return false;
2511         }
2512     }
2513 
2514     s = g_new(BdrvStateBlkRootContext, 1);
2515     *s = (BdrvStateBlkRootContext) {
2516         .new_ctx = ctx,
2517         .blk = blk,
2518     };
2519 
2520     tran_add(tran, &set_blk_root_context, s);
2521     return true;
2522 }
2523 
2524 void blk_add_aio_context_notifier(BlockBackend *blk,
2525         void (*attached_aio_context)(AioContext *new_context, void *opaque),
2526         void (*detach_aio_context)(void *opaque), void *opaque)
2527 {
2528     BlockBackendAioNotifier *notifier;
2529     BlockDriverState *bs = blk_bs(blk);
2530     GLOBAL_STATE_CODE();
2531 
2532     notifier = g_new(BlockBackendAioNotifier, 1);
2533     notifier->attached_aio_context = attached_aio_context;
2534     notifier->detach_aio_context = detach_aio_context;
2535     notifier->opaque = opaque;
2536     QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2537 
2538     if (bs) {
2539         bdrv_add_aio_context_notifier(bs, attached_aio_context,
2540                                       detach_aio_context, opaque);
2541     }
2542 }
2543 
2544 void blk_remove_aio_context_notifier(BlockBackend *blk,
2545                                      void (*attached_aio_context)(AioContext *,
2546                                                                   void *),
2547                                      void (*detach_aio_context)(void *),
2548                                      void *opaque)
2549 {
2550     BlockBackendAioNotifier *notifier;
2551     BlockDriverState *bs = blk_bs(blk);
2552 
2553     GLOBAL_STATE_CODE();
2554 
2555     if (bs) {
2556         bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2557                                          detach_aio_context, opaque);
2558     }
2559 
2560     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2561         if (notifier->attached_aio_context == attached_aio_context &&
2562             notifier->detach_aio_context == detach_aio_context &&
2563             notifier->opaque == opaque) {
2564             QLIST_REMOVE(notifier, list);
2565             g_free(notifier);
2566             return;
2567         }
2568     }
2569 
2570     abort();
2571 }
2572 
2573 void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2574 {
2575     GLOBAL_STATE_CODE();
2576     notifier_list_add(&blk->remove_bs_notifiers, notify);
2577 }
2578 
2579 void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2580 {
2581     GLOBAL_STATE_CODE();
2582     notifier_list_add(&blk->insert_bs_notifiers, notify);
2583 }
2584 
2585 BlockAcctStats *blk_get_stats(BlockBackend *blk)
2586 {
2587     IO_CODE();
2588     return &blk->stats;
2589 }
2590 
2591 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2592                   BlockCompletionFunc *cb, void *opaque)
2593 {
2594     IO_CODE();
2595     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2596 }
2597 
2598 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2599                                       int64_t bytes, BdrvRequestFlags flags)
2600 {
2601     IO_OR_GS_CODE();
2602     return blk_co_pwritev(blk, offset, bytes, NULL,
2603                           flags | BDRV_REQ_ZERO_WRITE);
2604 }
2605 
2606 int coroutine_fn blk_co_pwrite_compressed(BlockBackend *blk, int64_t offset,
2607                                           int64_t bytes, const void *buf)
2608 {
2609     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
2610     IO_OR_GS_CODE();
2611     return blk_co_pwritev_part(blk, offset, bytes, &qiov, 0,
2612                                BDRV_REQ_WRITE_COMPRESSED);
2613 }
2614 
2615 int coroutine_fn blk_co_truncate(BlockBackend *blk, int64_t offset, bool exact,
2616                                  PreallocMode prealloc, BdrvRequestFlags flags,
2617                                  Error **errp)
2618 {
2619     IO_OR_GS_CODE();
2620     GRAPH_RDLOCK_GUARD();
2621     if (!blk_co_is_available(blk)) {
2622         error_setg(errp, "No medium inserted");
2623         return -ENOMEDIUM;
2624     }
2625 
2626     return bdrv_co_truncate(blk->root, offset, exact, prealloc, flags, errp);
2627 }
2628 
2629 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2630                      int64_t pos, int size)
2631 {
2632     int ret;
2633     GLOBAL_STATE_CODE();
2634 
2635     if (!blk_is_available(blk)) {
2636         return -ENOMEDIUM;
2637     }
2638 
2639     ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2640     if (ret < 0) {
2641         return ret;
2642     }
2643 
2644     if (ret == size && !blk->enable_write_cache) {
2645         ret = bdrv_flush(blk_bs(blk));
2646     }
2647 
2648     return ret < 0 ? ret : size;
2649 }
2650 
2651 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2652 {
2653     GLOBAL_STATE_CODE();
2654     if (!blk_is_available(blk)) {
2655         return -ENOMEDIUM;
2656     }
2657 
2658     return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2659 }
2660 
2661 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2662 {
2663     GLOBAL_STATE_CODE();
2664     if (!blk_is_available(blk)) {
2665         return -ENOMEDIUM;
2666     }
2667 
2668     return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2669 }
2670 
2671 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2672 {
2673     GLOBAL_STATE_CODE();
2674     if (!blk_is_available(blk)) {
2675         return -ENOMEDIUM;
2676     }
2677 
2678     return bdrv_probe_geometry(blk_bs(blk), geo);
2679 }
2680 
2681 /*
2682  * Updates the BlockBackendRootState object with data from the currently
2683  * attached BlockDriverState.
2684  */
2685 void blk_update_root_state(BlockBackend *blk)
2686 {
2687     GLOBAL_STATE_CODE();
2688     assert(blk->root);
2689 
2690     blk->root_state.open_flags    = blk->root->bs->open_flags;
2691     blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2692 }
2693 
2694 /*
2695  * Returns the detect-zeroes setting to be used for bdrv_open() of a
2696  * BlockDriverState which is supposed to inherit the root state.
2697  */
2698 bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2699 {
2700     GLOBAL_STATE_CODE();
2701     return blk->root_state.detect_zeroes;
2702 }
2703 
2704 /*
2705  * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2706  * supposed to inherit the root state.
2707  */
2708 int blk_get_open_flags_from_root_state(BlockBackend *blk)
2709 {
2710     GLOBAL_STATE_CODE();
2711     return blk->root_state.open_flags;
2712 }
2713 
2714 BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2715 {
2716     GLOBAL_STATE_CODE();
2717     return &blk->root_state;
2718 }
2719 
2720 int blk_commit_all(void)
2721 {
2722     BlockBackend *blk = NULL;
2723     GLOBAL_STATE_CODE();
2724 
2725     while ((blk = blk_all_next(blk)) != NULL) {
2726         AioContext *aio_context = blk_get_aio_context(blk);
2727         BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
2728 
2729         aio_context_acquire(aio_context);
2730         if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
2731             int ret;
2732 
2733             ret = bdrv_commit(unfiltered_bs);
2734             if (ret < 0) {
2735                 aio_context_release(aio_context);
2736                 return ret;
2737             }
2738         }
2739         aio_context_release(aio_context);
2740     }
2741     return 0;
2742 }
2743 
2744 
2745 /* throttling disk I/O limits */
2746 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2747 {
2748     GLOBAL_STATE_CODE();
2749     throttle_group_config(&blk->public.throttle_group_member, cfg);
2750 }
2751 
2752 void blk_io_limits_disable(BlockBackend *blk)
2753 {
2754     BlockDriverState *bs = blk_bs(blk);
2755     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2756     assert(tgm->throttle_state);
2757     GLOBAL_STATE_CODE();
2758     if (bs) {
2759         bdrv_ref(bs);
2760         bdrv_drained_begin(bs);
2761     }
2762     throttle_group_unregister_tgm(tgm);
2763     if (bs) {
2764         bdrv_drained_end(bs);
2765         bdrv_unref(bs);
2766     }
2767 }
2768 
2769 /* Should be called before blk_set_io_limits() when a limit is to be set */
2770 void blk_io_limits_enable(BlockBackend *blk, const char *group)
2771 {
2772     assert(!blk->public.throttle_group_member.throttle_state);
2773     GLOBAL_STATE_CODE();
2774     throttle_group_register_tgm(&blk->public.throttle_group_member,
2775                                 group, blk_get_aio_context(blk));
2776 }
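
/*
 * Example (hypothetical sketch, not part of the QEMU tree): the throttle
 * group membership has to exist before a configuration is applied, so the
 * calls are ordered enable -> set.  The group name and the 1 MiB/s total
 * rate are arbitrary illustration values.
 *
 *     static void example_limit_to_1mbps(BlockBackend *blk)
 *     {
 *         ThrottleConfig cfg;
 *
 *         throttle_config_init(&cfg);
 *         cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;
 *
 *         blk_io_limits_enable(blk, "example-group");
 *         blk_set_io_limits(blk, &cfg);
 *     }
 */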
2777 
2778 void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2779 {
2780     GLOBAL_STATE_CODE();
2781     /* this BB is not part of any group */
2782     if (!blk->public.throttle_group_member.throttle_state) {
2783         return;
2784     }
2785 
2786     /* this BB is already part of the group we want */
2787     if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2788                 group)) {
2789         return;
2790     }
2791 
2792     /* need to change the group this BB belongs to */
2793     blk_io_limits_disable(blk);
2794     blk_io_limits_enable(blk, group);
2795 }
2796 
2797 static void blk_root_drained_begin(BdrvChild *child)
2798 {
2799     BlockBackend *blk = child->opaque;
2800     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2801 
2802     if (qatomic_fetch_inc(&blk->quiesce_counter) == 0) {
2803         if (blk->dev_ops && blk->dev_ops->drained_begin) {
2804             blk->dev_ops->drained_begin(blk->dev_opaque);
2805         }
2806     }
2807 
2808     /* Note that blk->root may not be accessible here yet if we are just
2809      * attaching to a BlockDriverState that is drained. Use child instead. */
2810 
2811     if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
2812         throttle_group_restart_tgm(tgm);
2813     }
2814 }
2815 
2816 static bool blk_root_drained_poll(BdrvChild *child)
2817 {
2818     BlockBackend *blk = child->opaque;
2819     bool busy = false;
2820     assert(qatomic_read(&blk->quiesce_counter));
2821 
2822     if (blk->dev_ops && blk->dev_ops->drained_poll) {
2823         busy = blk->dev_ops->drained_poll(blk->dev_opaque);
2824     }
2825     return busy || qatomic_read(&blk->in_flight) > 0;
2826 }
2827 
2828 static void blk_root_drained_end(BdrvChild *child)
2829 {
2830     BlockBackend *blk = child->opaque;
2831     assert(qatomic_read(&blk->quiesce_counter));
2832 
2833     assert(blk->public.throttle_group_member.io_limits_disabled);
2834     qatomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2835 
2836     if (qatomic_fetch_dec(&blk->quiesce_counter) == 1) {
2837         if (blk->dev_ops && blk->dev_ops->drained_end) {
2838             blk->dev_ops->drained_end(blk->dev_opaque);
2839         }
2840         qemu_mutex_lock(&blk->queued_requests_lock);
2841         while (qemu_co_enter_next(&blk->queued_requests,
2842                                   &blk->queued_requests_lock)) {
2843             /* Resume all queued requests */
2844         }
2845         qemu_mutex_unlock(&blk->queued_requests_lock);
2846     }
2847 }
2848 
2849 bool blk_register_buf(BlockBackend *blk, void *host, size_t size, Error **errp)
2850 {
2851     BlockDriverState *bs = blk_bs(blk);
2852 
2853     GLOBAL_STATE_CODE();
2854 
2855     if (bs) {
2856         return bdrv_register_buf(bs, host, size, errp);
2857     }
2858     return true;
2859 }
2860 
2861 void blk_unregister_buf(BlockBackend *blk, void *host, size_t size)
2862 {
2863     BlockDriverState *bs = blk_bs(blk);
2864 
2865     GLOBAL_STATE_CODE();
2866 
2867     if (bs) {
2868         bdrv_unregister_buf(bs, host, size);
2869     }
2870 }
2871 
2872 int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2873                                    BlockBackend *blk_out, int64_t off_out,
2874                                    int64_t bytes, BdrvRequestFlags read_flags,
2875                                    BdrvRequestFlags write_flags)
2876 {
2877     int r;
2878     IO_CODE();
2879     GRAPH_RDLOCK_GUARD();
2880 
2881     r = blk_check_byte_request(blk_in, off_in, bytes);
2882     if (r) {
2883         return r;
2884     }
2885     r = blk_check_byte_request(blk_out, off_out, bytes);
2886     if (r) {
2887         return r;
2888     }
2889 
2890     return bdrv_co_copy_range(blk_in->root, off_in,
2891                               blk_out->root, off_out,
2892                               bytes, read_flags, write_flags);
2893 }
2894 
2895 const BdrvChild *blk_root(BlockBackend *blk)
2896 {
2897     GLOBAL_STATE_CODE();
2898     return blk->root;
2899 }
2900 
2901 int blk_make_empty(BlockBackend *blk, Error **errp)
2902 {
2903     GLOBAL_STATE_CODE();
2904     if (!blk_is_available(blk)) {
2905         error_setg(errp, "No medium inserted");
2906         return -ENOMEDIUM;
2907     }
2908 
2909     return bdrv_make_empty(blk->root, errp);
2910 }
2911