xref: /openbmc/qemu/block/block-backend.c (revision 2df9f571)
1 /*
2  * QEMU Block backends
3  *
4  * Copyright (C) 2014-2016 Red Hat, Inc.
5  *
6  * Authors:
7  *  Markus Armbruster <armbru@redhat.com>,
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1
10  * or later.  See the COPYING.LIB file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "sysemu/block-backend.h"
15 #include "block/block_int.h"
16 #include "block/blockjob.h"
17 #include "block/throttle-groups.h"
18 #include "hw/qdev-core.h"
19 #include "sysemu/blockdev.h"
20 #include "sysemu/runstate.h"
21 #include "sysemu/sysemu.h"
22 #include "sysemu/replay.h"
23 #include "qapi/error.h"
24 #include "qapi/qapi-events-block.h"
25 #include "qemu/id.h"
26 #include "qemu/main-loop.h"
27 #include "qemu/option.h"
28 #include "trace.h"
29 #include "migration/misc.h"
30 
31 /* Number of coroutines to reserve per attached device model */
32 #define COROUTINE_POOL_RESERVATION 64
33 
34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
35 
36 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
37 
38 typedef struct BlockBackendAioNotifier {
39     void (*attached_aio_context)(AioContext *new_context, void *opaque);
40     void (*detach_aio_context)(void *opaque);
41     void *opaque;
42     QLIST_ENTRY(BlockBackendAioNotifier) list;
43 } BlockBackendAioNotifier;
44 
45 struct BlockBackend {
46     char *name;
47     int refcnt;
48     BdrvChild *root;
49     AioContext *ctx;
50     DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
51     QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
52     QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
53     BlockBackendPublic public;
54 
55     DeviceState *dev;           /* attached device model, if any */
56     const BlockDevOps *dev_ops;
57     void *dev_opaque;
58 
59     /* the block size for which the guest device expects atomicity */
60     int guest_block_size;
61 
62     /* If the BDS tree is removed, some of its options are stored here (which
63      * can be used to restore those options in the new BDS on insert) */
64     BlockBackendRootState root_state;
65 
66     bool enable_write_cache;
67 
68     /* I/O stats (display with "info blockstats"). */
69     BlockAcctStats stats;
70 
71     BlockdevOnError on_read_error, on_write_error;
72     bool iostatus_enabled;
73     BlockDeviceIoStatus iostatus;
74 
75     uint64_t perm;
76     uint64_t shared_perm;
77     bool disable_perm;
78 
79     bool allow_aio_context_change;
80     bool allow_write_beyond_eof;
81 
82     NotifierList remove_bs_notifiers, insert_bs_notifiers;
83     QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
84 
85     int quiesce_counter;
86     CoQueue queued_requests;
87     bool disable_request_queuing;
88 
89     VMChangeStateEntry *vmsh;
90     bool force_allow_inactivate;
91 
92     /* Number of in-flight aio requests.  BlockDriverState also counts
93      * in-flight requests but aio requests can exist even when blk->root is
94      * NULL, so we cannot rely on its counter for that case.
95      * Accessed with atomic ops.
96      */
97     unsigned int in_flight;
98 };
99 
100 typedef struct BlockBackendAIOCB {
101     BlockAIOCB common;
102     BlockBackend *blk;
103     int ret;
104 } BlockBackendAIOCB;
105 
106 static const AIOCBInfo block_backend_aiocb_info = {
107     .get_aio_context = blk_aiocb_get_aio_context,
108     .aiocb_size = sizeof(BlockBackendAIOCB),
109 };
110 
111 static void drive_info_del(DriveInfo *dinfo);
112 static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
113 
114 /* All BlockBackends */
115 static QTAILQ_HEAD(, BlockBackend) block_backends =
116     QTAILQ_HEAD_INITIALIZER(block_backends);
117 
118 /* All BlockBackends referenced by the monitor, which are iterated over by
119  * blk_next() */
120 static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
121     QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
122 
123 static void blk_root_inherit_options(int *child_flags, QDict *child_options,
124                                      int parent_flags, QDict *parent_options)
125 {
126     /* We're not supposed to call this function for root nodes */
127     abort();
128 }
129 static void blk_root_drained_begin(BdrvChild *child);
130 static bool blk_root_drained_poll(BdrvChild *child);
131 static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);
132 
133 static void blk_root_change_media(BdrvChild *child, bool load);
134 static void blk_root_resize(BdrvChild *child);
135 
136 static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
137                                      GSList **ignore, Error **errp);
138 static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
139                                  GSList **ignore);
140 
141 static char *blk_root_get_parent_desc(BdrvChild *child)
142 {
143     BlockBackend *blk = child->opaque;
144     char *dev_id;
145 
146     if (blk->name) {
147         return g_strdup(blk->name);
148     }
149 
150     dev_id = blk_get_attached_dev_id(blk);
151     if (*dev_id) {
152         return dev_id;
153     } else {
154         /* TODO Callback into the BB owner for something more detailed */
155         g_free(dev_id);
156         return g_strdup("a block device");
157     }
158 }
159 
160 static const char *blk_root_get_name(BdrvChild *child)
161 {
162     return blk_name(child->opaque);
163 }
164 
165 static void blk_vm_state_changed(void *opaque, int running, RunState state)
166 {
167     Error *local_err = NULL;
168     BlockBackend *blk = opaque;
169 
170     if (state == RUN_STATE_INMIGRATE) {
171         return;
172     }
173 
174     qemu_del_vm_change_state_handler(blk->vmsh);
175     blk->vmsh = NULL;
176     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
177     if (local_err) {
178         error_report_err(local_err);
179     }
180 }
181 
182 /*
183  * Notifies the user of the BlockBackend that migration has completed. qdev
184  * devices can tighten their permissions in response (specifically revoke
185  * shared write permissions that we needed for storage migration).
186  *
187  * If an error is returned, the VM must not be resumed.
188  */
189 static void blk_root_activate(BdrvChild *child, Error **errp)
190 {
191     BlockBackend *blk = child->opaque;
192     Error *local_err = NULL;
193 
194     if (!blk->disable_perm) {
195         return;
196     }
197 
198     blk->disable_perm = false;
199 
200     blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
201     if (local_err) {
202         error_propagate(errp, local_err);
203         blk->disable_perm = true;
204         return;
205     }
206 
207     if (runstate_check(RUN_STATE_INMIGRATE)) {
208         /* Activation can happen when the migration process is still active, for
209          * example when nbd_server_add is called during non-shared storage
210          * migration. Defer the shared_perm update to migration completion. */
211         if (!blk->vmsh) {
212             blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
213                                                          blk);
214         }
215         return;
216     }
217 
218     blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
219     if (local_err) {
220         error_propagate(errp, local_err);
221         blk->disable_perm = true;
222         return;
223     }
224 }
225 
226 void blk_set_force_allow_inactivate(BlockBackend *blk)
227 {
228     blk->force_allow_inactivate = true;
229 }
230 
231 static bool blk_can_inactivate(BlockBackend *blk)
232 {
233     /* If it is a guest device, inactivating is OK. */
234     if (blk->dev || blk_name(blk)[0]) {
235         return true;
236     }
237 
238     /* Inactivating means no more writes to the image can be done,
239      * even if those writes would be changes invisible to the
240      * guest.  For block job BBs that satisfy this, we can just allow
241      * it.  This is the case for the mirror job's source, which is
242      * required by libvirt's non-shared block migration. */
243     if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
244         return true;
245     }
246 
247     return blk->force_allow_inactivate;
248 }
249 
250 static int blk_root_inactivate(BdrvChild *child)
251 {
252     BlockBackend *blk = child->opaque;
253 
254     if (blk->disable_perm) {
255         return 0;
256     }
257 
258     if (!blk_can_inactivate(blk)) {
259         return -EPERM;
260     }
261 
262     blk->disable_perm = true;
263     if (blk->root) {
264         bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
265     }
266 
267     return 0;
268 }
269 
270 static void blk_root_attach(BdrvChild *child)
271 {
272     BlockBackend *blk = child->opaque;
273     BlockBackendAioNotifier *notifier;
274 
275     trace_blk_root_attach(child, blk, child->bs);
276 
277     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
278         bdrv_add_aio_context_notifier(child->bs,
279                 notifier->attached_aio_context,
280                 notifier->detach_aio_context,
281                 notifier->opaque);
282     }
283 }
284 
285 static void blk_root_detach(BdrvChild *child)
286 {
287     BlockBackend *blk = child->opaque;
288     BlockBackendAioNotifier *notifier;
289 
290     trace_blk_root_detach(child, blk, child->bs);
291 
292     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
293         bdrv_remove_aio_context_notifier(child->bs,
294                 notifier->attached_aio_context,
295                 notifier->detach_aio_context,
296                 notifier->opaque);
297     }
298 }
299 
300 static const BdrvChildRole child_root = {
301     .inherit_options    = blk_root_inherit_options,
302 
303     .change_media       = blk_root_change_media,
304     .resize             = blk_root_resize,
305     .get_name           = blk_root_get_name,
306     .get_parent_desc    = blk_root_get_parent_desc,
307 
308     .drained_begin      = blk_root_drained_begin,
309     .drained_poll       = blk_root_drained_poll,
310     .drained_end        = blk_root_drained_end,
311 
312     .activate           = blk_root_activate,
313     .inactivate         = blk_root_inactivate,
314 
315     .attach             = blk_root_attach,
316     .detach             = blk_root_detach,
317 
318     .can_set_aio_ctx    = blk_root_can_set_aio_ctx,
319     .set_aio_ctx        = blk_root_set_aio_ctx,
320 };
321 
322 /*
323  * Create a new BlockBackend with a reference count of one.
324  *
325  * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
326  * to request for a block driver node that is attached to this BlockBackend.
327  * @shared_perm is a bitmask which describes which permissions may be granted
328  * to other users of the attached node.
329  * Both sets of permissions can be changed later using blk_set_perm().
330  *
331  * Return the new BlockBackend on success, null on failure.
332  */
333 BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
334 {
335     BlockBackend *blk;
336 
337     blk = g_new0(BlockBackend, 1);
338     blk->refcnt = 1;
339     blk->ctx = ctx;
340     blk->perm = perm;
341     blk->shared_perm = shared_perm;
342     blk_set_enable_write_cache(blk, true);
343 
344     blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
345     blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
346 
347     block_acct_init(&blk->stats);
348 
349     qemu_co_queue_init(&blk->queued_requests);
350     notifier_list_init(&blk->remove_bs_notifiers);
351     notifier_list_init(&blk->insert_bs_notifiers);
352     QLIST_INIT(&blk->aio_notifiers);
353 
354     QTAILQ_INSERT_TAIL(&block_backends, blk, link);
355     return blk;
356 }
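
/*
 * Illustrative sketch (not code from this file): a caller that wants
 * consistent reads and writes on the node it will attach, while letting
 * other users share any permission, might create and later release a
 * backend like this:
 *
 *     BlockBackend *blk = blk_new(qemu_get_aio_context(),
 *                                 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
 *                                 BLK_PERM_ALL);
 *     ...
 *     blk_unref(blk);
 */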
357 
358 /*
359  * Create a new BlockBackend connected to an existing BlockDriverState.
360  *
361  * @perm is a bitmask of BLK_PERM_* constants which describes the
362  * permissions to request for @bs that is attached to this
363  * BlockBackend.  @shared_perm is a bitmask which describes which
364  * permissions may be granted to other users of the attached node.
365  * Both sets of permissions can be changed later using blk_set_perm().
366  *
367  * Return the new BlockBackend on success, null on failure.
368  */
369 BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
370                               uint64_t shared_perm, Error **errp)
371 {
372     BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
373 
374     if (blk_insert_bs(blk, bs, errp) < 0) {
375         blk_unref(blk);
376         return NULL;
377     }
378     return blk;
379 }
380 
381 /*
382  * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
383  * The new BlockBackend is in the main AioContext.
384  *
385  * Just as with bdrv_open(), after having called this function the reference to
386  * @options belongs to the block layer (even on failure).
387  *
388  * TODO: Remove @filename and @flags; it should be possible to specify a whole
389  * BDS tree just by specifying the @options QDict (or @reference,
390  * alternatively). At the time of adding this function, this is not possible,
391  * though, so callers of this function have to be able to specify @filename and
392  * @flags.
393  */
394 BlockBackend *blk_new_open(const char *filename, const char *reference,
395                            QDict *options, int flags, Error **errp)
396 {
397     BlockBackend *blk;
398     BlockDriverState *bs;
399     uint64_t perm = 0;
400 
401     /* blk_new_open() is mainly used in .bdrv_create implementations and the
402      * tools where sharing isn't a concern because the BDS stays private, so we
403      * just request permission according to the flags.
404      *
405      * The exceptions are xen_disk and blockdev_init(); in these cases, the
406      * caller of blk_new_open() doesn't make use of the permissions, but they
407      * shouldn't hurt either. We can still share everything here because the
408      * guest devices will add their own blockers if they can't share. */
409     if ((flags & BDRV_O_NO_IO) == 0) {
410         perm |= BLK_PERM_CONSISTENT_READ;
411         if (flags & BDRV_O_RDWR) {
412             perm |= BLK_PERM_WRITE;
413         }
414     }
415     if (flags & BDRV_O_RESIZE) {
416         perm |= BLK_PERM_RESIZE;
417     }
418 
419     blk = blk_new(qemu_get_aio_context(), perm, BLK_PERM_ALL);
420     bs = bdrv_open(filename, reference, options, flags, errp);
421     if (!bs) {
422         blk_unref(blk);
423         return NULL;
424     }
425 
426     blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
427                                        perm, BLK_PERM_ALL, blk, errp);
428     if (!blk->root) {
429         blk_unref(blk);
430         return NULL;
431     }
432 
433     return blk;
434 }
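
/*
 * Illustrative sketch (assumed usage, similar to what image tools do;
 * "disk.qcow2" is a made-up filename): open an image read-write with a
 * fresh backend.  @options may be NULL, and the BDRV_O_* flags determine
 * the permissions requested above:
 *
 *     Error *local_err = NULL;
 *     BlockBackend *blk = blk_new_open("disk.qcow2", NULL, NULL,
 *                                      BDRV_O_RDWR, &local_err);
 *     if (!blk) {
 *         error_report_err(local_err);
 *     }
 */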
435 
436 static void blk_delete(BlockBackend *blk)
437 {
438     assert(!blk->refcnt);
439     assert(!blk->name);
440     assert(!blk->dev);
441     if (blk->public.throttle_group_member.throttle_state) {
442         blk_io_limits_disable(blk);
443     }
444     if (blk->root) {
445         blk_remove_bs(blk);
446     }
447     if (blk->vmsh) {
448         qemu_del_vm_change_state_handler(blk->vmsh);
449         blk->vmsh = NULL;
450     }
451     assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
452     assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
453     assert(QLIST_EMPTY(&blk->aio_notifiers));
454     QTAILQ_REMOVE(&block_backends, blk, link);
455     drive_info_del(blk->legacy_dinfo);
456     block_acct_cleanup(&blk->stats);
457     g_free(blk);
458 }
459 
460 static void drive_info_del(DriveInfo *dinfo)
461 {
462     if (!dinfo) {
463         return;
464     }
465     qemu_opts_del(dinfo->opts);
466     g_free(dinfo);
467 }
468 
469 int blk_get_refcnt(BlockBackend *blk)
470 {
471     return blk ? blk->refcnt : 0;
472 }
473 
474 /*
475  * Increment @blk's reference count.
476  * @blk must not be null.
477  */
478 void blk_ref(BlockBackend *blk)
479 {
480     assert(blk->refcnt > 0);
481     blk->refcnt++;
482 }
483 
484 /*
485  * Decrement @blk's reference count.
486  * If this drops it to zero, destroy @blk.
487  * For convenience, do nothing if @blk is null.
488  */
489 void blk_unref(BlockBackend *blk)
490 {
491     if (blk) {
492         assert(blk->refcnt > 0);
493         if (blk->refcnt > 1) {
494             blk->refcnt--;
495         } else {
496             blk_drain(blk);
497             /* blk_drain() cannot resurrect blk, nobody held a reference */
498             assert(blk->refcnt == 1);
499             blk->refcnt = 0;
500             blk_delete(blk);
501         }
502     }
503 }
504 
505 /*
506  * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
507  * ones which are hidden (i.e. are not referenced by the monitor).
508  */
509 BlockBackend *blk_all_next(BlockBackend *blk)
510 {
511     return blk ? QTAILQ_NEXT(blk, link)
512                : QTAILQ_FIRST(&block_backends);
513 }
514 
515 void blk_remove_all_bs(void)
516 {
517     BlockBackend *blk = NULL;
518 
519     while ((blk = blk_all_next(blk)) != NULL) {
520         AioContext *ctx = blk_get_aio_context(blk);
521 
522         aio_context_acquire(ctx);
523         if (blk->root) {
524             blk_remove_bs(blk);
525         }
526         aio_context_release(ctx);
527     }
528 }
529 
530 /*
531  * Return the monitor-owned BlockBackend after @blk.
532  * If @blk is null, return the first one.
533  * Else, return @blk's next sibling, which may be null.
534  *
535  * To iterate over all BlockBackends, do
536  * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
537  *     ...
538  * }
539  */
540 BlockBackend *blk_next(BlockBackend *blk)
541 {
542     return blk ? QTAILQ_NEXT(blk, monitor_link)
543                : QTAILQ_FIRST(&monitor_block_backends);
544 }
545 
546 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
547  * the monitor or attached to a BlockBackend */
548 BlockDriverState *bdrv_next(BdrvNextIterator *it)
549 {
550     BlockDriverState *bs, *old_bs;
551 
552     /* Must be called from the main loop */
553     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
554 
555     /* First, return all root nodes of BlockBackends. In order to avoid
556      * returning a BDS twice when multiple BBs refer to it, we only return it
557      * if the BB is the first one in the parent list of the BDS. */
558     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
559         BlockBackend *old_blk = it->blk;
560 
561         old_bs = old_blk ? blk_bs(old_blk) : NULL;
562 
563         do {
564             it->blk = blk_all_next(it->blk);
565             bs = it->blk ? blk_bs(it->blk) : NULL;
566         } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
567 
568         if (it->blk) {
569             blk_ref(it->blk);
570         }
571         blk_unref(old_blk);
572 
573         if (bs) {
574             bdrv_ref(bs);
575             bdrv_unref(old_bs);
576             return bs;
577         }
578         it->phase = BDRV_NEXT_MONITOR_OWNED;
579     } else {
580         old_bs = it->bs;
581     }
582 
583     /* Then return the monitor-owned BDSes without a BB attached. Ignore all
584      * BDSes that are attached to a BlockBackend here; they have been handled
585      * by the above block already */
586     do {
587         it->bs = bdrv_next_monitor_owned(it->bs);
588         bs = it->bs;
589     } while (bs && bdrv_has_blk(bs));
590 
591     if (bs) {
592         bdrv_ref(bs);
593     }
594     bdrv_unref(old_bs);
595 
596     return bs;
597 }
598 
599 static void bdrv_next_reset(BdrvNextIterator *it)
600 {
601     *it = (BdrvNextIterator) {
602         .phase = BDRV_NEXT_BACKEND_ROOTS,
603     };
604 }
605 
606 BlockDriverState *bdrv_first(BdrvNextIterator *it)
607 {
608     bdrv_next_reset(it);
609     return bdrv_next(it);
610 }
611 
612 /* Must be called when aborting a bdrv_next() iteration before
613  * bdrv_next() returns NULL */
614 void bdrv_next_cleanup(BdrvNextIterator *it)
615 {
616     /* Must be called from the main loop */
617     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
618 
619     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
620         if (it->blk) {
621             bdrv_unref(blk_bs(it->blk));
622             blk_unref(it->blk);
623         }
624     } else {
625         bdrv_unref(it->bs);
626     }
627 
628     bdrv_next_reset(it);
629 }
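
/*
 * Illustrative sketch of the iteration protocol above (not code from this
 * file; some_condition() is a hypothetical predicate): walk all top-level
 * nodes and stop early.  bdrv_next_cleanup() is only needed when the loop
 * is aborted before bdrv_next() has returned NULL:
 *
 *     BdrvNextIterator it;
 *     BlockDriverState *bs;
 *
 *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 *         if (some_condition(bs)) {
 *             bdrv_next_cleanup(&it);
 *             break;
 *         }
 *     }
 */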
630 
631 /*
632  * Add a BlockBackend into the list of backends referenced by the monitor, with
633  * the given @name acting as the handle for the monitor.
634  * Strictly for use by blockdev.c.
635  *
636  * @name must not be null or empty.
637  *
638  * Returns true on success and false on failure. In the latter case, an Error
639  * object is returned through @errp.
640  */
641 bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
642 {
643     assert(!blk->name);
644     assert(name && name[0]);
645 
646     if (!id_wellformed(name)) {
647         error_setg(errp, "Invalid device name");
648         return false;
649     }
650     if (blk_by_name(name)) {
651         error_setg(errp, "Device with id '%s' already exists", name);
652         return false;
653     }
654     if (bdrv_find_node(name)) {
655         error_setg(errp,
656                    "Device name '%s' conflicts with an existing node name",
657                    name);
658         return false;
659     }
660 
661     blk->name = g_strdup(name);
662     QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
663     return true;
664 }
665 
666 /*
667  * Remove a BlockBackend from the list of backends referenced by the monitor.
668  * Strictly for use by blockdev.c.
669  */
670 void monitor_remove_blk(BlockBackend *blk)
671 {
672     if (!blk->name) {
673         return;
674     }
675 
676     QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
677     g_free(blk->name);
678     blk->name = NULL;
679 }
680 
681 /*
682  * Return @blk's name, a non-null string.
683  * Returns an empty string iff @blk is not referenced by the monitor.
684  */
685 const char *blk_name(const BlockBackend *blk)
686 {
687     return blk->name ?: "";
688 }
689 
690 /*
691  * Return the BlockBackend with name @name if it exists, else null.
692  * @name must not be null.
693  */
694 BlockBackend *blk_by_name(const char *name)
695 {
696     BlockBackend *blk = NULL;
697 
698     assert(name);
699     while ((blk = blk_next(blk)) != NULL) {
700         if (!strcmp(name, blk->name)) {
701             return blk;
702         }
703     }
704     return NULL;
705 }
706 
707 /*
708  * Return the BlockDriverState attached to @blk if any, else null.
709  */
710 BlockDriverState *blk_bs(BlockBackend *blk)
711 {
712     return blk->root ? blk->root->bs : NULL;
713 }
714 
715 static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
716 {
717     BdrvChild *child;
718     QLIST_FOREACH(child, &bs->parents, next_parent) {
719         if (child->role == &child_root) {
720             return child->opaque;
721         }
722     }
723 
724     return NULL;
725 }
726 
727 /*
728  * Returns true if @bs has an associated BlockBackend.
729  */
730 bool bdrv_has_blk(BlockDriverState *bs)
731 {
732     return bdrv_first_blk(bs) != NULL;
733 }
734 
735 /*
736  * Returns true if @bs has only BlockBackends as parents.
737  */
738 bool bdrv_is_root_node(BlockDriverState *bs)
739 {
740     BdrvChild *c;
741 
742     QLIST_FOREACH(c, &bs->parents, next_parent) {
743         if (c->role != &child_root) {
744             return false;
745         }
746     }
747 
748     return true;
749 }
750 
751 /*
752  * Return @blk's DriveInfo if any, else null.
753  */
754 DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
755 {
756     return blk->legacy_dinfo;
757 }
758 
759 /*
760  * Set @blk's DriveInfo to @dinfo, and return it.
761  * @blk must not have a DriveInfo set already.
762  * No other BlockBackend may have the same DriveInfo set.
763  */
764 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
765 {
766     assert(!blk->legacy_dinfo);
767     return blk->legacy_dinfo = dinfo;
768 }
769 
770 /*
771  * Return the BlockBackend with DriveInfo @dinfo.
772  * It must exist.
773  */
774 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
775 {
776     BlockBackend *blk = NULL;
777 
778     while ((blk = blk_next(blk)) != NULL) {
779         if (blk->legacy_dinfo == dinfo) {
780             return blk;
781         }
782     }
783     abort();
784 }
785 
786 /*
787  * Returns a pointer to the publicly accessible fields of @blk.
788  */
789 BlockBackendPublic *blk_get_public(BlockBackend *blk)
790 {
791     return &blk->public;
792 }
793 
794 /*
795  * Returns a BlockBackend given the associated @public fields.
796  */
797 BlockBackend *blk_by_public(BlockBackendPublic *public)
798 {
799     return container_of(public, BlockBackend, public);
800 }
801 
802 /*
803  * Disassociates the currently associated BlockDriverState from @blk.
804  */
805 void blk_remove_bs(BlockBackend *blk)
806 {
807     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
808     BlockDriverState *bs;
809 
810     notifier_list_notify(&blk->remove_bs_notifiers, blk);
811     if (tgm->throttle_state) {
812         bs = blk_bs(blk);
813         bdrv_drained_begin(bs);
814         throttle_group_detach_aio_context(tgm);
815         throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
816         bdrv_drained_end(bs);
817     }
818 
819     blk_update_root_state(blk);
820 
821     /* bdrv_root_unref_child() will cause blk->root to become stale and may
822      * switch to a completion coroutine later on. Let's drain all I/O here
823      * to avoid that and a potential QEMU crash.
824      */
825     blk_drain(blk);
826     bdrv_root_unref_child(blk->root);
827     blk->root = NULL;
828 }
829 
830 /*
831  * Associates a new BlockDriverState with @blk.
832  */
833 int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
834 {
835     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
836     bdrv_ref(bs);
837     blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
838                                        blk->perm, blk->shared_perm, blk, errp);
839     if (blk->root == NULL) {
840         return -EPERM;
841     }
842 
843     notifier_list_notify(&blk->insert_bs_notifiers, blk);
844     if (tgm->throttle_state) {
845         throttle_group_detach_aio_context(tgm);
846         throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
847     }
848 
849     return 0;
850 }
851 
852 /*
853  * Sets the permission bitmasks that the user of the BlockBackend needs.
854  */
855 int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
856                  Error **errp)
857 {
858     int ret;
859 
860     if (blk->root && !blk->disable_perm) {
861         ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
862         if (ret < 0) {
863             return ret;
864         }
865     }
866 
867     blk->perm = perm;
868     blk->shared_perm = shared_perm;
869 
870     return 0;
871 }
872 
873 void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
874 {
875     *perm = blk->perm;
876     *shared_perm = blk->shared_perm;
877 }
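
/*
 * Illustrative sketch (not code from this file): a user of the backend
 * that wants to stop sharing write access, e.g. once storage migration
 * has completed, could tighten the shared permissions like this:
 *
 *     uint64_t perm, shared_perm;
 *     Error *local_err = NULL;
 *
 *     blk_get_perm(blk, &perm, &shared_perm);
 *     if (blk_set_perm(blk, perm, shared_perm & ~BLK_PERM_WRITE,
 *                      &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */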
878 
879 /*
880  * Attach device model @dev to @blk.
881  * Return 0 on success, -EBUSY when a device model is attached already.
882  */
883 int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
884 {
885     if (blk->dev) {
886         return -EBUSY;
887     }
888 
889     /* While migration is still incoming, we don't need to apply the
890      * permissions of guest device BlockBackends. We might still have a block
891      * job or NBD server writing to the image for storage migration. */
892     if (runstate_check(RUN_STATE_INMIGRATE)) {
893         blk->disable_perm = true;
894     }
895 
896     blk_ref(blk);
897     blk->dev = dev;
898     blk_iostatus_reset(blk);
899 
900     return 0;
901 }
902 
903 /*
904  * Detach device model @dev from @blk.
905  * @dev must be currently attached to @blk.
906  */
907 void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
908 {
909     assert(blk->dev == dev);
910     blk->dev = NULL;
911     blk->dev_ops = NULL;
912     blk->dev_opaque = NULL;
913     blk->guest_block_size = 512;
914     blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
915     blk_unref(blk);
916 }
917 
918 /*
919  * Return the device model attached to @blk if any, else null.
920  */
921 DeviceState *blk_get_attached_dev(BlockBackend *blk)
922 {
923     return blk->dev;
924 }
925 
926 /* Return the qdev ID, or, if no ID is assigned, the QOM path of the block
927  * device attached to the BlockBackend. */
928 char *blk_get_attached_dev_id(BlockBackend *blk)
929 {
930     DeviceState *dev = blk->dev;
931 
932     if (!dev) {
933         return g_strdup("");
934     } else if (dev->id) {
935         return g_strdup(dev->id);
936     }
937 
938     return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
939 }
940 
941 /*
942  * Return the BlockBackend which has the device model @dev attached if it
943  * exists, else null.
944  *
945  * @dev must not be null.
946  */
947 BlockBackend *blk_by_dev(void *dev)
948 {
949     BlockBackend *blk = NULL;
950 
951     assert(dev != NULL);
952     while ((blk = blk_all_next(blk)) != NULL) {
953         if (blk->dev == dev) {
954             return blk;
955         }
956     }
957     return NULL;
958 }
959 
960 /*
961  * Set @blk's device model callbacks to @ops.
962  * @opaque is the opaque argument to pass to the callbacks.
963  * This is for use by device models.
964  */
965 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
966                      void *opaque)
967 {
968     blk->dev_ops = ops;
969     blk->dev_opaque = opaque;
970 
971     /* Are we currently quiesced? Should we enforce this right now? */
972     if (blk->quiesce_counter && ops->drained_begin) {
973         ops->drained_begin(opaque);
974     }
975 }
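
/*
 * Illustrative sketch (hypothetical device model code; my_change_media_cb
 * and my_resize_cb are made-up callbacks): a device typically attaches
 * itself first, which fails with -EBUSY if another device already owns
 * the backend, and then registers its callbacks:
 *
 *     static const BlockDevOps my_block_ops = {
 *         .change_media_cb = my_change_media_cb,
 *         .resize_cb       = my_resize_cb,
 *     };
 *
 *     if (blk_attach_dev(blk, dev) < 0) {
 *         return;
 *     }
 *     blk_set_dev_ops(blk, &my_block_ops, dev);
 */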
976 
977 /*
978  * Notify @blk's attached device model of media change.
979  *
980  * If @load is true, notify of media load. This action can fail, meaning that
981  * the medium cannot be loaded; @errp is set in that case.
982  *
983  * If @load is false, notify of media eject. This can never fail.
984  *
985  * Also send DEVICE_TRAY_MOVED events as appropriate.
986  */
987 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
988 {
989     if (blk->dev_ops && blk->dev_ops->change_media_cb) {
990         bool tray_was_open, tray_is_open;
991         Error *local_err = NULL;
992 
993         tray_was_open = blk_dev_is_tray_open(blk);
994         blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
995         if (local_err) {
996             assert(load == true);
997             error_propagate(errp, local_err);
998             return;
999         }
1000         tray_is_open = blk_dev_is_tray_open(blk);
1001 
1002         if (tray_was_open != tray_is_open) {
1003             char *id = blk_get_attached_dev_id(blk);
1004             qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
1005             g_free(id);
1006         }
1007     }
1008 }
1009 
1010 static void blk_root_change_media(BdrvChild *child, bool load)
1011 {
1012     blk_dev_change_media_cb(child->opaque, load, NULL);
1013 }
1014 
1015 /*
1016  * Does @blk's attached device model have removable media?
1017  * %true if no device model is attached.
1018  */
1019 bool blk_dev_has_removable_media(BlockBackend *blk)
1020 {
1021     return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
1022 }
1023 
1024 /*
1025  * Does @blk's attached device model have a tray?
1026  */
1027 bool blk_dev_has_tray(BlockBackend *blk)
1028 {
1029     return blk->dev_ops && blk->dev_ops->is_tray_open;
1030 }
1031 
1032 /*
1033  * Notify @blk's attached device model of a media eject request.
1034  * If @force is true, the medium is about to be yanked out forcefully.
1035  */
1036 void blk_dev_eject_request(BlockBackend *blk, bool force)
1037 {
1038     if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
1039         blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
1040     }
1041 }
1042 
1043 /*
1044  * Does @blk's attached device model have a tray, and is it open?
1045  */
1046 bool blk_dev_is_tray_open(BlockBackend *blk)
1047 {
1048     if (blk_dev_has_tray(blk)) {
1049         return blk->dev_ops->is_tray_open(blk->dev_opaque);
1050     }
1051     return false;
1052 }
1053 
1054 /*
1055  * Does @blk's attached device model have the medium locked?
1056  * %false if the device model has no such lock.
1057  */
1058 bool blk_dev_is_medium_locked(BlockBackend *blk)
1059 {
1060     if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
1061         return blk->dev_ops->is_medium_locked(blk->dev_opaque);
1062     }
1063     return false;
1064 }
1065 
1066 /*
1067  * Notify @blk's attached device model of a backend size change.
1068  */
1069 static void blk_root_resize(BdrvChild *child)
1070 {
1071     BlockBackend *blk = child->opaque;
1072 
1073     if (blk->dev_ops && blk->dev_ops->resize_cb) {
1074         blk->dev_ops->resize_cb(blk->dev_opaque);
1075     }
1076 }
1077 
1078 void blk_iostatus_enable(BlockBackend *blk)
1079 {
1080     blk->iostatus_enabled = true;
1081     blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1082 }
1083 
1084 /* The I/O status is only enabled if the drive explicitly
1085  * enables it _and_ the VM is configured to stop on errors */
1086 bool blk_iostatus_is_enabled(const BlockBackend *blk)
1087 {
1088     return (blk->iostatus_enabled &&
1089            (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1090             blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1091             blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1092 }
1093 
1094 BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1095 {
1096     return blk->iostatus;
1097 }
1098 
1099 void blk_iostatus_disable(BlockBackend *blk)
1100 {
1101     blk->iostatus_enabled = false;
1102 }
1103 
1104 void blk_iostatus_reset(BlockBackend *blk)
1105 {
1106     if (blk_iostatus_is_enabled(blk)) {
1107         blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1108     }
1109 }
1110 
1111 void blk_iostatus_set_err(BlockBackend *blk, int error)
1112 {
1113     assert(blk_iostatus_is_enabled(blk));
1114     if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1115         blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1116                                           BLOCK_DEVICE_IO_STATUS_FAILED;
1117     }
1118 }
1119 
1120 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1121 {
1122     blk->allow_write_beyond_eof = allow;
1123 }
1124 
1125 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
1126 {
1127     blk->allow_aio_context_change = allow;
1128 }
1129 
1130 void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
1131 {
1132     blk->disable_request_queuing = disable;
1133 }
1134 
1135 static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
1136                                   size_t size)
1137 {
1138     int64_t len;
1139 
1140     if (size > INT_MAX) {
1141         return -EIO;
1142     }
1143 
1144     if (!blk_is_available(blk)) {
1145         return -ENOMEDIUM;
1146     }
1147 
1148     if (offset < 0) {
1149         return -EIO;
1150     }
1151 
1152     if (!blk->allow_write_beyond_eof) {
1153         len = blk_getlength(blk);
1154         if (len < 0) {
1155             return len;
1156         }
1157 
1158         if (offset > len || len - offset < size) {
1159             return -EIO;
1160         }
1161     }
1162 
1163     return 0;
1164 }
1165 
1166 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1167 static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
1168 {
1169     assert(blk->in_flight > 0);
1170 
1171     if (blk->quiesce_counter && !blk->disable_request_queuing) {
1172         blk_dec_in_flight(blk);
1173         qemu_co_queue_wait(&blk->queued_requests, NULL);
1174         blk_inc_in_flight(blk);
1175     }
1176 }
1177 
1178 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1179 static int coroutine_fn
1180 blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes,
1181               QEMUIOVector *qiov, BdrvRequestFlags flags)
1182 {
1183     int ret;
1184     BlockDriverState *bs;
1185 
1186     blk_wait_while_drained(blk);
1187 
1188     /* Call blk_bs() only after waiting, the graph may have changed */
1189     bs = blk_bs(blk);
1190     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1191 
1192     ret = blk_check_byte_request(blk, offset, bytes);
1193     if (ret < 0) {
1194         return ret;
1195     }
1196 
1197     bdrv_inc_in_flight(bs);
1198 
1199     /* throttling disk I/O */
1200     if (blk->public.throttle_group_member.throttle_state) {
1201         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1202                 bytes, false);
1203     }
1204 
1205     ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
1206     bdrv_dec_in_flight(bs);
1207     return ret;
1208 }
1209 
1210 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1211                                unsigned int bytes, QEMUIOVector *qiov,
1212                                BdrvRequestFlags flags)
1213 {
1214     int ret;
1215 
1216     blk_inc_in_flight(blk);
1217     ret = blk_do_preadv(blk, offset, bytes, qiov, flags);
1218     blk_dec_in_flight(blk);
1219 
1220     return ret;
1221 }
1222 
1223 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1224 static int coroutine_fn
1225 blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes,
1226                     QEMUIOVector *qiov, size_t qiov_offset,
1227                     BdrvRequestFlags flags)
1228 {
1229     int ret;
1230     BlockDriverState *bs;
1231 
1232     blk_wait_while_drained(blk);
1233 
1234     /* Call blk_bs() only after waiting, the graph may have changed */
1235     bs = blk_bs(blk);
1236     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1237 
1238     ret = blk_check_byte_request(blk, offset, bytes);
1239     if (ret < 0) {
1240         return ret;
1241     }
1242 
1243     bdrv_inc_in_flight(bs);
1244     /* throttling disk I/O */
1245     if (blk->public.throttle_group_member.throttle_state) {
1246         throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1247                 bytes, true);
1248     }
1249 
1250     if (!blk->enable_write_cache) {
1251         flags |= BDRV_REQ_FUA;
1252     }
1253 
1254     ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
1255                                flags);
1256     bdrv_dec_in_flight(bs);
1257     return ret;
1258 }
1259 
1260 int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
1261                                      unsigned int bytes,
1262                                      QEMUIOVector *qiov, size_t qiov_offset,
1263                                      BdrvRequestFlags flags)
1264 {
1265     int ret;
1266 
1267     blk_inc_in_flight(blk);
1268     ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
1269     blk_dec_in_flight(blk);
1270 
1271     return ret;
1272 }
1273 
1274 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1275                                 unsigned int bytes, QEMUIOVector *qiov,
1276                                 BdrvRequestFlags flags)
1277 {
1278     return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
1279 }
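
/*
 * Illustrative coroutine sketch (not code from this file; buf, len and
 * offset are assumed to come from the caller): read len bytes at offset
 * into a local buffer from coroutine context:
 *
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *     int ret = blk_co_preadv(blk, offset, len, &qiov, 0);
 *     if (ret < 0) {
 *         return ret;
 *     }
 */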
1280 
1281 typedef struct BlkRwCo {
1282     BlockBackend *blk;
1283     int64_t offset;
1284     void *iobuf;
1285     int ret;
1286     BdrvRequestFlags flags;
1287 } BlkRwCo;
1288 
1289 static void blk_read_entry(void *opaque)
1290 {
1291     BlkRwCo *rwco = opaque;
1292     QEMUIOVector *qiov = rwco->iobuf;
1293 
1294     rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size,
1295                               qiov, rwco->flags);
1296     aio_wait_kick();
1297 }
1298 
1299 static void blk_write_entry(void *opaque)
1300 {
1301     BlkRwCo *rwco = opaque;
1302     QEMUIOVector *qiov = rwco->iobuf;
1303 
1304     rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size,
1305                                     qiov, 0, rwco->flags);
1306     aio_wait_kick();
1307 }
1308 
1309 static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
1310                    int64_t bytes, CoroutineEntry co_entry,
1311                    BdrvRequestFlags flags)
1312 {
1313     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1314     BlkRwCo rwco = {
1315         .blk    = blk,
1316         .offset = offset,
1317         .iobuf  = &qiov,
1318         .flags  = flags,
1319         .ret    = NOT_DONE,
1320     };
1321 
1322     blk_inc_in_flight(blk);
1323     if (qemu_in_coroutine()) {
1324         /* Fast-path if already in coroutine context */
1325         co_entry(&rwco);
1326     } else {
1327         Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
1328         bdrv_coroutine_enter(blk_bs(blk), co);
1329         BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
1330     }
1331     blk_dec_in_flight(blk);
1332 
1333     return rwco.ret;
1334 }
1335 
1336 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1337                       int bytes, BdrvRequestFlags flags)
1338 {
1339     return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
1340                    flags | BDRV_REQ_ZERO_WRITE);
1341 }
1342 
1343 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1344 {
1345     return bdrv_make_zero(blk->root, flags);
1346 }
1347 
1348 void blk_inc_in_flight(BlockBackend *blk)
1349 {
1350     atomic_inc(&blk->in_flight);
1351 }
1352 
1353 void blk_dec_in_flight(BlockBackend *blk)
1354 {
1355     atomic_dec(&blk->in_flight);
1356     aio_wait_kick();
1357 }
1358 
1359 static void error_callback_bh(void *opaque)
1360 {
1361     struct BlockBackendAIOCB *acb = opaque;
1362 
1363     blk_dec_in_flight(acb->blk);
1364     acb->common.cb(acb->common.opaque, acb->ret);
1365     qemu_aio_unref(acb);
1366 }
1367 
1368 BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1369                                   BlockCompletionFunc *cb,
1370                                   void *opaque, int ret)
1371 {
1372     struct BlockBackendAIOCB *acb;
1373 
1374     blk_inc_in_flight(blk);
1375     acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1376     acb->blk = blk;
1377     acb->ret = ret;
1378 
1379     replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1380                                      error_callback_bh, acb);
1381     return &acb->common;
1382 }
1383 
1384 typedef struct BlkAioEmAIOCB {
1385     BlockAIOCB common;
1386     BlkRwCo rwco;
1387     int bytes;
1388     bool has_returned;
1389 } BlkAioEmAIOCB;
1390 
1391 static const AIOCBInfo blk_aio_em_aiocb_info = {
1392     .aiocb_size         = sizeof(BlkAioEmAIOCB),
1393 };
1394 
1395 static void blk_aio_complete(BlkAioEmAIOCB *acb)
1396 {
1397     if (acb->has_returned) {
1398         acb->common.cb(acb->common.opaque, acb->rwco.ret);
1399         blk_dec_in_flight(acb->rwco.blk);
1400         qemu_aio_unref(acb);
1401     }
1402 }
1403 
1404 static void blk_aio_complete_bh(void *opaque)
1405 {
1406     BlkAioEmAIOCB *acb = opaque;
1407     assert(acb->has_returned);
1408     blk_aio_complete(acb);
1409 }
1410 
1411 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
1412                                 void *iobuf, CoroutineEntry co_entry,
1413                                 BdrvRequestFlags flags,
1414                                 BlockCompletionFunc *cb, void *opaque)
1415 {
1416     BlkAioEmAIOCB *acb;
1417     Coroutine *co;
1418 
1419     blk_inc_in_flight(blk);
1420     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1421     acb->rwco = (BlkRwCo) {
1422         .blk    = blk,
1423         .offset = offset,
1424         .iobuf  = iobuf,
1425         .flags  = flags,
1426         .ret    = NOT_DONE,
1427     };
1428     acb->bytes = bytes;
1429     acb->has_returned = false;
1430 
1431     co = qemu_coroutine_create(co_entry, acb);
1432     bdrv_coroutine_enter(blk_bs(blk), co);
1433 
1434     acb->has_returned = true;
1435     if (acb->rwco.ret != NOT_DONE) {
1436         replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1437                                          blk_aio_complete_bh, acb);
1438     }
1439 
1440     return &acb->common;
1441 }
1442 
1443 static void blk_aio_read_entry(void *opaque)
1444 {
1445     BlkAioEmAIOCB *acb = opaque;
1446     BlkRwCo *rwco = &acb->rwco;
1447     QEMUIOVector *qiov = rwco->iobuf;
1448 
1449     assert(qiov->size == acb->bytes);
1450     rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes,
1451                               qiov, rwco->flags);
1452     blk_aio_complete(acb);
1453 }
1454 
1455 static void blk_aio_write_entry(void *opaque)
1456 {
1457     BlkAioEmAIOCB *acb = opaque;
1458     BlkRwCo *rwco = &acb->rwco;
1459     QEMUIOVector *qiov = rwco->iobuf;
1460 
1461     assert(!qiov || qiov->size == acb->bytes);
1462     rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
1463                                     qiov, 0, rwco->flags);
1464     blk_aio_complete(acb);
1465 }
1466 
1467 BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1468                                   int count, BdrvRequestFlags flags,
1469                                   BlockCompletionFunc *cb, void *opaque)
1470 {
1471     return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1472                         flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1473 }
1474 
1475 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
1476 {
1477     int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
1478     if (ret < 0) {
1479         return ret;
1480     }
1481     return count;
1482 }
1483 
1484 int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
1485                BdrvRequestFlags flags)
1486 {
1487     int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1488                       flags);
1489     if (ret < 0) {
1490         return ret;
1491     }
1492     return count;
1493 }
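
/*
 * Illustrative sketch (not code from this file): synchronous 512-byte
 * read from the start of the backend.  Note that blk_pread() and
 * blk_pwrite() return the byte count on success, not 0; error_report()
 * is from qemu/error-report.h:
 *
 *     uint8_t buf[512];
 *     int ret = blk_pread(blk, 0, buf, sizeof(buf));
 *     if (ret < 0) {
 *         error_report("read failed: %s", strerror(-ret));
 *     }
 */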
1494 
1495 int64_t blk_getlength(BlockBackend *blk)
1496 {
1497     if (!blk_is_available(blk)) {
1498         return -ENOMEDIUM;
1499     }
1500 
1501     return bdrv_getlength(blk_bs(blk));
1502 }
1503 
1504 void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
1505 {
1506     if (!blk_bs(blk)) {
1507         *nb_sectors_ptr = 0;
1508     } else {
1509         bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
1510     }
1511 }
1512 
1513 int64_t blk_nb_sectors(BlockBackend *blk)
1514 {
1515     if (!blk_is_available(blk)) {
1516         return -ENOMEDIUM;
1517     }
1518 
1519     return bdrv_nb_sectors(blk_bs(blk));
1520 }
1521 
1522 BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1523                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1524                            BlockCompletionFunc *cb, void *opaque)
1525 {
1526     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1527                         blk_aio_read_entry, flags, cb, opaque);
1528 }
1529 
1530 BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1531                             QEMUIOVector *qiov, BdrvRequestFlags flags,
1532                             BlockCompletionFunc *cb, void *opaque)
1533 {
1534     return blk_aio_prwv(blk, offset, qiov->size, qiov,
1535                         blk_aio_write_entry, flags, cb, opaque);
1536 }
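
/*
 * Illustrative sketch (hypothetical caller code; my_write_done(),
 * handle_error() and the opaque are made up): asynchronous write with a
 * completion callback.  The callback runs with the request's result, a
 * negative errno on failure:
 *
 *     static void my_write_done(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             handle_error(opaque, ret);
 *         }
 *     }
 *
 *     blk_aio_pwritev(blk, offset, &qiov, 0, my_write_done, opaque);
 */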
1537 
1538 void blk_aio_cancel(BlockAIOCB *acb)
1539 {
1540     bdrv_aio_cancel(acb);
1541 }
1542 
1543 void blk_aio_cancel_async(BlockAIOCB *acb)
1544 {
1545     bdrv_aio_cancel_async(acb);
1546 }
1547 
1548 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1549 static int coroutine_fn
1550 blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1551 {
1552     blk_wait_while_drained(blk);
1553 
1554     if (!blk_is_available(blk)) {
1555         return -ENOMEDIUM;
1556     }
1557 
1558     return bdrv_co_ioctl(blk_bs(blk), req, buf);
1559 }
1560 
1561 static void blk_ioctl_entry(void *opaque)
1562 {
1563     BlkRwCo *rwco = opaque;
1564     QEMUIOVector *qiov = rwco->iobuf;
1565 
1566     rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base);
1567     aio_wait_kick();
1568 }
1569 
1570 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1571 {
1572     return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
1573 }
1574 
1575 static void blk_aio_ioctl_entry(void *opaque)
1576 {
1577     BlkAioEmAIOCB *acb = opaque;
1578     BlkRwCo *rwco = &acb->rwco;
1579 
1580     rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
1581 
1582     blk_aio_complete(acb);
1583 }
1584 
1585 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1586                           BlockCompletionFunc *cb, void *opaque)
1587 {
1588     return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
1589 }
1590 
1591 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1592 static int coroutine_fn
1593 blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1594 {
1595     int ret;
1596 
1597     blk_wait_while_drained(blk);
1598 
1599     ret = blk_check_byte_request(blk, offset, bytes);
1600     if (ret < 0) {
1601         return ret;
1602     }
1603 
1604     return bdrv_co_pdiscard(blk->root, offset, bytes);
1605 }
1606 
1607 static void blk_aio_pdiscard_entry(void *opaque)
1608 {
1609     BlkAioEmAIOCB *acb = opaque;
1610     BlkRwCo *rwco = &acb->rwco;
1611 
1612     rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1613     blk_aio_complete(acb);
1614 }
1615 
1616 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1617                              int64_t offset, int bytes,
1618                              BlockCompletionFunc *cb, void *opaque)
1619 {
1620     return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1621                         cb, opaque);
1622 }
1623 
1624 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1625 {
1626     int ret;
1627 
1628     blk_inc_in_flight(blk);
1629     ret = blk_do_pdiscard(blk, offset, bytes);
1630     blk_dec_in_flight(blk);
1631 
1632     return ret;
1633 }
1634 
1635 static void blk_pdiscard_entry(void *opaque)
1636 {
1637     BlkRwCo *rwco = opaque;
1638     QEMUIOVector *qiov = rwco->iobuf;
1639 
1640     rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size);
1641     aio_wait_kick();
1642 }
1643 
1644 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1645 {
1646     return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
1647 }
1648 
1649 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
1650 static int coroutine_fn blk_do_flush(BlockBackend *blk)
1651 {
1652     blk_wait_while_drained(blk);
1653 
1654     if (!blk_is_available(blk)) {
1655         return -ENOMEDIUM;
1656     }
1657 
1658     return bdrv_co_flush(blk_bs(blk));
1659 }
1660 
1661 static void blk_aio_flush_entry(void *opaque)
1662 {
1663     BlkAioEmAIOCB *acb = opaque;
1664     BlkRwCo *rwco = &acb->rwco;
1665 
1666     rwco->ret = blk_do_flush(rwco->blk);
1667     blk_aio_complete(acb);
1668 }
1669 
1670 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1671                           BlockCompletionFunc *cb, void *opaque)
1672 {
1673     return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1674 }
1675 
1676 int coroutine_fn blk_co_flush(BlockBackend *blk)
1677 {
1678     int ret;
1679 
1680     blk_inc_in_flight(blk);
1681     ret = blk_do_flush(blk);
1682     blk_dec_in_flight(blk);
1683 
1684     return ret;
1685 }
1686 
1687 static void blk_flush_entry(void *opaque)
1688 {
1689     BlkRwCo *rwco = opaque;
1690     rwco->ret = blk_do_flush(rwco->blk);
1691     aio_wait_kick();
1692 }
1693 
1694 int blk_flush(BlockBackend *blk)
1695 {
1696     return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
1697 }
1698 
1699 void blk_drain(BlockBackend *blk)
1700 {
1701     BlockDriverState *bs = blk_bs(blk);
1702 
1703     if (bs) {
1704         bdrv_drained_begin(bs);
1705     }
1706 
1707     /* We may have -ENOMEDIUM completions in flight */
1708     AIO_WAIT_WHILE(blk_get_aio_context(blk),
1709                    atomic_mb_read(&blk->in_flight) > 0);
1710 
1711     if (bs) {
1712         bdrv_drained_end(bs);
1713     }
1714 }
1715 
1716 void blk_drain_all(void)
1717 {
1718     BlockBackend *blk = NULL;
1719 
1720     bdrv_drain_all_begin();
1721 
1722     while ((blk = blk_all_next(blk)) != NULL) {
1723         AioContext *ctx = blk_get_aio_context(blk);
1724 
1725         aio_context_acquire(ctx);
1726 
1727         /* We may have -ENOMEDIUM completions in flight */
1728         AIO_WAIT_WHILE(ctx, atomic_mb_read(&blk->in_flight) > 0);
1729 
1730         aio_context_release(ctx);
1731     }
1732 
1733     bdrv_drain_all_end();
1734 }
1735 
1736 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
1737                       BlockdevOnError on_write_error)
1738 {
1739     blk->on_read_error = on_read_error;
1740     blk->on_write_error = on_write_error;
1741 }
1742 
1743 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
1744 {
1745     return is_read ? blk->on_read_error : blk->on_write_error;
1746 }
1747 
1748 BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
1749                                       int error)
1750 {
1751     BlockdevOnError on_err = blk_get_on_error(blk, is_read);
1752 
1753     switch (on_err) {
1754     case BLOCKDEV_ON_ERROR_ENOSPC:
1755         return (error == ENOSPC) ?
1756                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
1757     case BLOCKDEV_ON_ERROR_STOP:
1758         return BLOCK_ERROR_ACTION_STOP;
1759     case BLOCKDEV_ON_ERROR_REPORT:
1760         return BLOCK_ERROR_ACTION_REPORT;
1761     case BLOCKDEV_ON_ERROR_IGNORE:
1762         return BLOCK_ERROR_ACTION_IGNORE;
1763     case BLOCKDEV_ON_ERROR_AUTO:
1764     default:
1765         abort();
1766     }
1767 }
1768 
1769 static void send_qmp_error_event(BlockBackend *blk,
1770                                  BlockErrorAction action,
1771                                  bool is_read, int error)
1772 {
1773     IoOperationType optype;
1774     BlockDriverState *bs = blk_bs(blk);
1775 
1776     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
1777     qapi_event_send_block_io_error(blk_name(blk), !!bs,
1778                                    bs ? bdrv_get_node_name(bs) : NULL, optype,
1779                                    action, blk_iostatus_is_enabled(blk),
1780                                    error == ENOSPC, strerror(error));
1781 }
1782 
1783 /* This is done by device models because, while the block layer knows
1784  * about the error, it does not know whether an operation comes from
1785  * the device or the block layer (from a job, for example).
1786  */
1787 void blk_error_action(BlockBackend *blk, BlockErrorAction action,
1788                       bool is_read, int error)
1789 {
1790     assert(error >= 0);
1791 
1792     if (action == BLOCK_ERROR_ACTION_STOP) {
1793         /* First set the iostatus, so that "info block" returns an iostatus
1794          * that matches the events raised so far (an additional error iostatus
1795          * is fine, but not a lost one).
1796          */
1797         blk_iostatus_set_err(blk, error);
1798 
1799         /* Then raise the request to stop the VM and the event.
1800          * qemu_system_vmstop_request_prepare has two effects.  First,
1801          * it ensures that the STOP event always comes after the
1802          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
1803          * can observe the STOP event and do a "cont" before the STOP
1804          * event is issued, the VM will not stop.  In this case, vm_start()
1805          * also ensures that the STOP/RESUME pair of events is emitted.
1806          */
1807         qemu_system_vmstop_request_prepare();
1808         send_qmp_error_event(blk, action, is_read, error);
1809         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
1810     } else {
1811         send_qmp_error_event(blk, action, is_read, error);
1812     }
1813 }
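
/*
 * Hypothetical caller sketch (not code from this file): a device model
 * that gets a negative errno "ret" back from a blk_* request typically
 * maps it to an action and reports it through blk_error_action().  With
 * BLOCK_ERROR_ACTION_STOP the request is usually kept and retried once
 * the VM is resumed; otherwise it is completed immediately.
 *
 *     static bool example_handle_rw_error(BlockBackend *blk,
 *                                         bool is_read, int ret)
 *     {
 *         BlockErrorAction action = blk_get_error_action(blk, is_read, -ret);
 *
 *         blk_error_action(blk, action, is_read, -ret);
 *         return action == BLOCK_ERROR_ACTION_STOP;   // true: retry later
 *     }
 */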
1814 
1815 bool blk_is_read_only(BlockBackend *blk)
1816 {
1817     BlockDriverState *bs = blk_bs(blk);
1818 
1819     if (bs) {
1820         return bdrv_is_read_only(bs);
1821     } else {
1822         return blk->root_state.read_only;
1823     }
1824 }
1825 
1826 bool blk_is_sg(BlockBackend *blk)
1827 {
1828     BlockDriverState *bs = blk_bs(blk);
1829 
1830     if (!bs) {
1831         return false;
1832     }
1833 
1834     return bdrv_is_sg(bs);
1835 }
1836 
1837 bool blk_enable_write_cache(BlockBackend *blk)
1838 {
1839     return blk->enable_write_cache;
1840 }
1841 
1842 void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1843 {
1844     blk->enable_write_cache = wce;
1845 }
1846 
1847 void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1848 {
1849     BlockDriverState *bs = blk_bs(blk);
1850 
1851     if (!bs) {
1852         error_setg(errp, "Device '%s' has no medium", blk->name);
1853         return;
1854     }
1855 
1856     bdrv_invalidate_cache(bs, errp);
1857 }
1858 
1859 bool blk_is_inserted(BlockBackend *blk)
1860 {
1861     BlockDriverState *bs = blk_bs(blk);
1862 
1863     return bs && bdrv_is_inserted(bs);
1864 }
1865 
1866 bool blk_is_available(BlockBackend *blk)
1867 {
1868     return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1869 }
1870 
1871 void blk_lock_medium(BlockBackend *blk, bool locked)
1872 {
1873     BlockDriverState *bs = blk_bs(blk);
1874 
1875     if (bs) {
1876         bdrv_lock_medium(bs, locked);
1877     }
1878 }
1879 
1880 void blk_eject(BlockBackend *blk, bool eject_flag)
1881 {
1882     BlockDriverState *bs = blk_bs(blk);
1883     char *id;
1884 
1885     if (bs) {
1886         bdrv_eject(bs, eject_flag);
1887     }
1888 
1889     /* Whether or not we ejected on the backend,
1890      * the frontend experienced a tray event. */
1891     id = blk_get_attached_dev_id(blk);
1892     qapi_event_send_device_tray_moved(blk_name(blk), id,
1893                                       eject_flag);
1894     g_free(id);
1895 }
1896 
1897 int blk_get_flags(BlockBackend *blk)
1898 {
1899     BlockDriverState *bs = blk_bs(blk);
1900 
1901     if (bs) {
1902         return bdrv_get_flags(bs);
1903     } else {
1904         return blk->root_state.open_flags;
1905     }
1906 }
1907 
1908 /* Returns the minimum request alignment, in bytes; guaranteed nonzero */
1909 uint32_t blk_get_request_alignment(BlockBackend *blk)
1910 {
1911     BlockDriverState *bs = blk_bs(blk);
1912     return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
1913 }
1914 
1915 /* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1916 uint32_t blk_get_max_transfer(BlockBackend *blk)
1917 {
1918     BlockDriverState *bs = blk_bs(blk);
1919     uint32_t max = 0;
1920 
1921     if (bs) {
1922         max = bs->bl.max_transfer;
1923     }
1924     return MIN_NON_ZERO(max, INT_MAX);
1925 }
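
/*
 * Illustrative sketch only (local names are made up): callers that split
 * large guest requests can use the two helpers above to size and align
 * each fragment.
 *
 *     uint32_t align     = blk_get_request_alignment(blk);
 *     uint32_t max_bytes = blk_get_max_transfer(blk);
 *
 *     // keep each fragment <= max_bytes and its offset/length a
 *     // multiple of align
 */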
1926 
1927 int blk_get_max_iov(BlockBackend *blk)
1928 {
1929     return blk->root->bs->bl.max_iov;
1930 }
1931 
1932 void blk_set_guest_block_size(BlockBackend *blk, int align)
1933 {
1934     blk->guest_block_size = align;
1935 }
1936 
1937 void *blk_try_blockalign(BlockBackend *blk, size_t size)
1938 {
1939     return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
1940 }
1941 
1942 void *blk_blockalign(BlockBackend *blk, size_t size)
1943 {
1944     return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
1945 }
1946 
1947 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
1948 {
1949     BlockDriverState *bs = blk_bs(blk);
1950 
1951     if (!bs) {
1952         return false;
1953     }
1954 
1955     return bdrv_op_is_blocked(bs, op, errp);
1956 }
1957 
1958 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
1959 {
1960     BlockDriverState *bs = blk_bs(blk);
1961 
1962     if (bs) {
1963         bdrv_op_unblock(bs, op, reason);
1964     }
1965 }
1966 
1967 void blk_op_block_all(BlockBackend *blk, Error *reason)
1968 {
1969     BlockDriverState *bs = blk_bs(blk);
1970 
1971     if (bs) {
1972         bdrv_op_block_all(bs, reason);
1973     }
1974 }
1975 
1976 void blk_op_unblock_all(BlockBackend *blk, Error *reason)
1977 {
1978     BlockDriverState *bs = blk_bs(blk);
1979 
1980     if (bs) {
1981         bdrv_op_unblock_all(bs, reason);
1982     }
1983 }
1984 
1985 AioContext *blk_get_aio_context(BlockBackend *blk)
1986 {
1987     BlockDriverState *bs = blk_bs(blk);
1988 
1989     if (bs) {
1990         AioContext *ctx = bdrv_get_aio_context(bs);
1991         assert(ctx == blk->ctx);
1992     }
1993 
1994     return blk->ctx;
1995 }
1996 
1997 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
1998 {
1999     BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
2000     return blk_get_aio_context(blk_acb->blk);
2001 }
2002 
2003 static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
2004                                   bool update_root_node, Error **errp)
2005 {
2006     BlockDriverState *bs = blk_bs(blk);
2007     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2008     int ret;
2009 
2010     if (bs) {
2011         if (update_root_node) {
2012             ret = bdrv_child_try_set_aio_context(bs, new_context, blk->root,
2013                                                  errp);
2014             if (ret < 0) {
2015                 return ret;
2016             }
2017         }
2018         if (tgm->throttle_state) {
2019             bdrv_drained_begin(bs);
2020             throttle_group_detach_aio_context(tgm);
2021             throttle_group_attach_aio_context(tgm, new_context);
2022             bdrv_drained_end(bs);
2023         }
2024     }
2025 
2026     blk->ctx = new_context;
2027     return 0;
2028 }
2029 
2030 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2031                         Error **errp)
2032 {
2033     return blk_do_set_aio_context(blk, new_context, true, errp);
2034 }
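
/*
 * Hedged usage sketch: moving a backend into an IOThread's AioContext,
 * roughly as dataplane-style device code does.  "iothread" is an
 * illustrative variable and error handling is elided.
 *
 *     AioContext *new_ctx = iothread_get_aio_context(iothread);
 *     AioContext *old_ctx = blk_get_aio_context(blk);
 *
 *     aio_context_acquire(old_ctx);
 *     blk_set_aio_context(blk, new_ctx, &error_abort);
 *     aio_context_release(old_ctx);
 */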
2035 
2036 static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2037                                      GSList **ignore, Error **errp)
2038 {
2039     BlockBackend *blk = child->opaque;
2040 
2041     if (blk->allow_aio_context_change) {
2042         return true;
2043     }
2044 
2045     /* Only manually created BlockBackends that are not attached to anything
2046      * can change their AioContext without updating their user. */
2047     if (!blk->name || blk->dev) {
2048         /* TODO Add BB name/QOM path */
2049         error_setg(errp, "Cannot change iothread of active block backend");
2050         return false;
2051     }
2052 
2053     return true;
2054 }
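
/*
 * Note: a user that can cope with the AioContext changing underneath it
 * can opt in with blk_set_allow_aio_context_change(blk, true), which sets
 * the allow_aio_context_change flag checked above.
 */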
2055 
2056 static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2057                                  GSList **ignore)
2058 {
2059     BlockBackend *blk = child->opaque;
2060     blk_do_set_aio_context(blk, ctx, false, &error_abort);
2061 }
2062 
2063 void blk_add_aio_context_notifier(BlockBackend *blk,
2064         void (*attached_aio_context)(AioContext *new_context, void *opaque),
2065         void (*detach_aio_context)(void *opaque), void *opaque)
2066 {
2067     BlockBackendAioNotifier *notifier;
2068     BlockDriverState *bs = blk_bs(blk);
2069 
2070     notifier = g_new(BlockBackendAioNotifier, 1);
2071     notifier->attached_aio_context = attached_aio_context;
2072     notifier->detach_aio_context = detach_aio_context;
2073     notifier->opaque = opaque;
2074     QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2075 
2076     if (bs) {
2077         bdrv_add_aio_context_notifier(bs, attached_aio_context,
2078                                       detach_aio_context, opaque);
2079     }
2080 }
2081 
2082 void blk_remove_aio_context_notifier(BlockBackend *blk,
2083                                      void (*attached_aio_context)(AioContext *,
2084                                                                   void *),
2085                                      void (*detach_aio_context)(void *),
2086                                      void *opaque)
2087 {
2088     BlockBackendAioNotifier *notifier;
2089     BlockDriverState *bs = blk_bs(blk);
2090 
2091     if (bs) {
2092         bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2093                                          detach_aio_context, opaque);
2094     }
2095 
2096     QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2097         if (notifier->attached_aio_context == attached_aio_context &&
2098             notifier->detach_aio_context == detach_aio_context &&
2099             notifier->opaque == opaque) {
2100             QLIST_REMOVE(notifier, list);
2101             g_free(notifier);
2102             return;
2103         }
2104     }
2105 
2106     abort();
2107 }
2108 
2109 void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2110 {
2111     notifier_list_add(&blk->remove_bs_notifiers, notify);
2112 }
2113 
2114 void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2115 {
2116     notifier_list_add(&blk->insert_bs_notifiers, notify);
2117 }
2118 
2119 void blk_io_plug(BlockBackend *blk)
2120 {
2121     BlockDriverState *bs = blk_bs(blk);
2122 
2123     if (bs) {
2124         bdrv_io_plug(bs);
2125     }
2126 }
2127 
2128 void blk_io_unplug(BlockBackend *blk)
2129 {
2130     BlockDriverState *bs = blk_bs(blk);
2131 
2132     if (bs) {
2133         bdrv_io_unplug(bs);
2134     }
2135 }
2136 
2137 BlockAcctStats *blk_get_stats(BlockBackend *blk)
2138 {
2139     return &blk->stats;
2140 }
2141 
2142 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2143                   BlockCompletionFunc *cb, void *opaque)
2144 {
2145     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2146 }
2147 
2148 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2149                                       int bytes, BdrvRequestFlags flags)
2150 {
2151     return blk_co_pwritev(blk, offset, bytes, NULL,
2152                           flags | BDRV_REQ_ZERO_WRITE);
2153 }
2154 
2155 int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
2156                           int count)
2157 {
2158     return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
2159                    BDRV_REQ_WRITE_COMPRESSED);
2160 }
2161 
2162 int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
2163                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
2164 {
2165     if (!blk_is_available(blk)) {
2166         error_setg(errp, "No medium inserted");
2167         return -ENOMEDIUM;
2168     }
2169 
2170     return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp);
2171 }
2172 
2173 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2174                      int64_t pos, int size)
2175 {
2176     int ret;
2177 
2178     if (!blk_is_available(blk)) {
2179         return -ENOMEDIUM;
2180     }
2181 
2182     ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2183     if (ret < 0) {
2184         return ret;
2185     }
2186 
2187     if (ret == size && !blk->enable_write_cache) {
2188         ret = bdrv_flush(blk_bs(blk));
2189     }
2190 
2191     return ret < 0 ? ret : size;
2192 }
2193 
2194 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2195 {
2196     if (!blk_is_available(blk)) {
2197         return -ENOMEDIUM;
2198     }
2199 
2200     return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2201 }
2202 
2203 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2204 {
2205     if (!blk_is_available(blk)) {
2206         return -ENOMEDIUM;
2207     }
2208 
2209     return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2210 }
2211 
2212 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2213 {
2214     if (!blk_is_available(blk)) {
2215         return -ENOMEDIUM;
2216     }
2217 
2218     return bdrv_probe_geometry(blk_bs(blk), geo);
2219 }
2220 
2221 /*
2222  * Updates the BlockBackendRootState object with data from the currently
2223  * attached BlockDriverState.
2224  */
2225 void blk_update_root_state(BlockBackend *blk)
2226 {
2227     assert(blk->root);
2228 
2229     blk->root_state.open_flags    = blk->root->bs->open_flags;
2230     blk->root_state.read_only     = blk->root->bs->read_only;
2231     blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2232 }
2233 
2234 /*
2235  * Returns the detect-zeroes setting to be used for bdrv_open() of a
2236  * BlockDriverState which is supposed to inherit the root state.
2237  */
2238 bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2239 {
2240     return blk->root_state.detect_zeroes;
2241 }
2242 
2243 /*
2244  * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2245  * supposed to inherit the root state.
2246  */
2247 int blk_get_open_flags_from_root_state(BlockBackend *blk)
2248 {
2249     int bs_flags;
2250 
2251     bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
2252     bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
2253 
2254     return bs_flags;
2255 }
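
/*
 * Illustrative sketch of how the root_state helpers fit together when a
 * medium is swapped (simplified; not code from this file):
 *
 *     blk_update_root_state(blk);   // snapshot options of the current BDS
 *     blk_remove_bs(blk);           // detach the old medium
 *     ...
 *     // open the replacement with blk_get_open_flags_from_root_state(blk)
 *     // and re-apply blk_get_detect_zeroes_from_root_state(blk)
 */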
2256 
2257 BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2258 {
2259     return &blk->root_state;
2260 }
2261 
2262 int blk_commit_all(void)
2263 {
2264     BlockBackend *blk = NULL;
2265 
2266     while ((blk = blk_all_next(blk)) != NULL) {
2267         AioContext *aio_context = blk_get_aio_context(blk);
2268 
2269         aio_context_acquire(aio_context);
2270         if (blk_is_inserted(blk) && blk->root->bs->backing) {
2271             int ret = bdrv_commit(blk->root->bs);
2272             if (ret < 0) {
2273                 aio_context_release(aio_context);
2274                 return ret;
2275             }
2276         }
2277         aio_context_release(aio_context);
2278     }
2279     return 0;
2280 }
2281 
2282 
2283 /* throttling disk I/O limits */
2284 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2285 {
2286     throttle_group_config(&blk->public.throttle_group_member, cfg);
2287 }
2288 
2289 void blk_io_limits_disable(BlockBackend *blk)
2290 {
2291     BlockDriverState *bs = blk_bs(blk);
2292     ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2293     assert(tgm->throttle_state);
2294     if (bs) {
2295         bdrv_drained_begin(bs);
2296     }
2297     throttle_group_unregister_tgm(tgm);
2298     if (bs) {
2299         bdrv_drained_end(bs);
2300     }
2301 }
2302 
2303 /* should be called before blk_set_io_limits if a limit is set */
2304 void blk_io_limits_enable(BlockBackend *blk, const char *group)
2305 {
2306     assert(!blk->public.throttle_group_member.throttle_state);
2307     throttle_group_register_tgm(&blk->public.throttle_group_member,
2308                                 group, blk_get_aio_context(blk));
2309 }
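
/*
 * Hedged example of the required ordering (group name and values are
 * arbitrary): register the backend with a throttle group first, then
 * configure the limits.
 *
 *     ThrottleConfig cfg;
 *
 *     throttle_config_init(&cfg);
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;   // 100 IOPS
 *     blk_io_limits_enable(blk, "group0");
 *     blk_set_io_limits(blk, &cfg);
 */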
2310 
2311 void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2312 {
2313     /* this BB is not part of any group */
2314     if (!blk->public.throttle_group_member.throttle_state) {
2315         return;
2316     }
2317 
2318     /* this BB is already part of the group we want */
2319     if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2320                 group)) {
2321         return;
2322     }
2323 
2324     /* need to change the group this backend belongs to */
2325     blk_io_limits_disable(blk);
2326     blk_io_limits_enable(blk, group);
2327 }
2328 
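/*
 * The drained_begin/drained_poll/drained_end callbacks below keep the
 * BlockBackend quiescent while its root node is drained: the first
 * drained_begin notifies the device model and lifts the I/O throttling
 * limits, drained_poll reports whether requests are still in flight, and
 * the matching drained_end restores throttling, notifies the device model
 * and resumes requests queued while quiescent.
 */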
2329 static void blk_root_drained_begin(BdrvChild *child)
2330 {
2331     BlockBackend *blk = child->opaque;
2332 
2333     if (++blk->quiesce_counter == 1) {
2334         if (blk->dev_ops && blk->dev_ops->drained_begin) {
2335             blk->dev_ops->drained_begin(blk->dev_opaque);
2336         }
2337     }
2338 
2339     /* Note that blk->root may not be accessible here yet if we are just
2340      * attaching to a BlockDriverState that is drained. Use child instead. */
2341 
2342     if (atomic_fetch_inc(&blk->public.throttle_group_member.io_limits_disabled) == 0) {
2343         throttle_group_restart_tgm(&blk->public.throttle_group_member);
2344     }
2345 }
2346 
2347 static bool blk_root_drained_poll(BdrvChild *child)
2348 {
2349     BlockBackend *blk = child->opaque;
2350     assert(blk->quiesce_counter);
2351     return !!blk->in_flight;
2352 }
2353 
2354 static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
2355 {
2356     BlockBackend *blk = child->opaque;
2357     assert(blk->quiesce_counter);
2358 
2359     assert(blk->public.throttle_group_member.io_limits_disabled);
2360     atomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2361 
2362     if (--blk->quiesce_counter == 0) {
2363         if (blk->dev_ops && blk->dev_ops->drained_end) {
2364             blk->dev_ops->drained_end(blk->dev_opaque);
2365         }
2366         while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
2367             /* Resume all queued requests */
2368         }
2369     }
2370 }
2371 
2372 void blk_register_buf(BlockBackend *blk, void *host, size_t size)
2373 {
2374     bdrv_register_buf(blk_bs(blk), host, size);
2375 }
2376 
2377 void blk_unregister_buf(BlockBackend *blk, void *host)
2378 {
2379     bdrv_unregister_buf(blk_bs(blk), host);
2380 }
2381 
2382 int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2383                                    BlockBackend *blk_out, int64_t off_out,
2384                                    int bytes, BdrvRequestFlags read_flags,
2385                                    BdrvRequestFlags write_flags)
2386 {
2387     int r;
2388     r = blk_check_byte_request(blk_in, off_in, bytes);
2389     if (r) {
2390         return r;
2391     }
2392     r = blk_check_byte_request(blk_out, off_out, bytes);
2393     if (r) {
2394         return r;
2395     }
2396     return bdrv_co_copy_range(blk_in->root, off_in,
2397                               blk_out->root, off_out,
2398                               bytes, read_flags, write_flags);
2399 }
2400 
2401 const BdrvChild *blk_root(BlockBackend *blk)
2402 {
2403     return blk->root;
2404 }
2405