xref: /openbmc/qemu/block.c (revision 806f71ee)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  * Copyright (c) 2020 Virtuozzo International GmbH.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "block/trace.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "block/dirty-bitmap.h"
31 #include "block/fuse.h"
32 #include "block/nbd.h"
33 #include "block/qdict.h"
34 #include "qemu/error-report.h"
35 #include "block/module_block.h"
36 #include "qemu/main-loop.h"
37 #include "qemu/module.h"
38 #include "qapi/error.h"
39 #include "qapi/qmp/qdict.h"
40 #include "qapi/qmp/qjson.h"
41 #include "qapi/qmp/qnull.h"
42 #include "qapi/qmp/qstring.h"
43 #include "qapi/qobject-output-visitor.h"
44 #include "qapi/qapi-visit-block-core.h"
45 #include "sysemu/block-backend.h"
46 #include "qemu/notify.h"
47 #include "qemu/option.h"
48 #include "qemu/coroutine.h"
49 #include "block/qapi.h"
50 #include "qemu/timer.h"
51 #include "qemu/cutils.h"
52 #include "qemu/id.h"
53 #include "qemu/range.h"
54 #include "qemu/rcu.h"
55 #include "block/coroutines.h"
56 
57 #ifdef CONFIG_BSD
58 #include <sys/ioctl.h>
59 #include <sys/queue.h>
60 #if defined(HAVE_SYS_DISK_H)
61 #include <sys/disk.h>
62 #endif
63 #endif
64 
65 #ifdef _WIN32
66 #include <windows.h>
67 #endif
68 
/* Sentinel return value for an emulated synchronous operation that has not
 * completed yet. */
69 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
70 
/* NOTE(review): presumably the nodes that are part of the block graph;
 * nothing in this part of the file inserts into this list -- confirm. */
71 /* Protected by BQL */
72 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
73     QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
74 
/* Every node allocated by bdrv_new() is appended to this list. */
75 /* Protected by BQL */
76 static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
77     QTAILQ_HEAD_INITIALIZER(all_bdrv_states);
78 
/* Registered block drivers; bdrv_register() inserts at the head, and the
 * format/protocol/probe lookups below walk this list. */
79 /* Protected by BQL */
80 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
81     QLIST_HEAD_INITIALIZER(bdrv_drivers);
82 
/* Forward declarations of static helpers (their definitions are not part of
 * this section of the file). */
83 static BlockDriverState *bdrv_open_inherit(const char *filename,
84                                            const char *reference,
85                                            QDict *options, int flags,
86                                            BlockDriverState *parent,
87                                            const BdrvChildClass *child_class,
88                                            BdrvChildRole child_role,
89                                            Error **errp);
90 
91 static bool bdrv_recurse_has_child(BlockDriverState *bs,
92                                    BlockDriverState *child);
93 
94 static void GRAPH_WRLOCK
95 bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs);
96 
97 static void GRAPH_WRLOCK
98 bdrv_remove_child(BdrvChild *child, Transaction *tran);
99 
100 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
101                                BlockReopenQueue *queue,
102                                Transaction *change_child_tran, Error **errp);
103 static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
104 static void bdrv_reopen_abort(BDRVReopenState *reopen_state);
105 
106 static bool bdrv_backing_overridden(BlockDriverState *bs);
107 
108 static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
109                                     GHashTable *visited, Transaction *tran,
110                                     Error **errp);
111 
/* Reported to callers through bdrv_uses_whitelist(). */
112 /* If non-zero, use only whitelisted block drivers */
113 static int use_bdrv_whitelist;
114 
115 #ifdef _WIN32
/*
 * Return 1 if @filename starts with a drive specifier, i.e. a single ASCII
 * letter immediately followed by ':' (for example "c:"), and 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];

    /* Only look at the second character once the first one is a letter */
    if (!((letter >= 'a' && letter <= 'z') ||
          (letter >= 'A' && letter <= 'Z'))) {
        return 0;
    }

    return filename[1] == ':';
}
122 
123 int is_windows_drive(const char *filename)
124 {
125     if (is_windows_drive_prefix(filename) &&
126         filename[2] == '\0')
127         return 1;
128     if (strstart(filename, "\\\\.\\", NULL) ||
129         strstart(filename, "//./", NULL))
130         return 1;
131     return 0;
132 }
133 #endif
134 
135 size_t bdrv_opt_mem_align(BlockDriverState *bs)
136 {
137     if (!bs || !bs->drv) {
138         /* page size or 4k (hdd sector size) should be on the safe side */
139         return MAX(4096, qemu_real_host_page_size());
140     }
141     IO_CODE();
142 
143     return bs->bl.opt_mem_alignment;
144 }
145 
146 size_t bdrv_min_mem_align(BlockDriverState *bs)
147 {
148     if (!bs || !bs->drv) {
149         /* page size or 4k (hdd sector size) should be on the safe side */
150         return MAX(4096, qemu_real_host_page_size());
151     }
152     IO_CODE();
153 
154     return bs->bl.min_mem_alignment;
155 }
156 
/*
 * Check whether @path starts with "<protocol>:", i.e. whether a colon
 * appears before the first path separator.  On Windows, drive names and
 * drive prefixes are never treated as a protocol.
 */
int path_has_protocol(const char *path)
{
#ifdef _WIN32
    const char *stop_chars = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    const char *stop_chars = ":/";
#endif

    /* The first colon or separator decides; the NUL terminator means "no" */
    return path[strcspn(path, stop_chars)] == ':';
}
174 
/*
 * Return 1 if @path is absolute and 0 otherwise.  On Windows, drive names
 * and drive-prefixed paths count as absolute, and both '/' and '\' are
 * accepted as the leading separator.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return first == '/' || first == '\\';
#else
    return first == '/';
#endif
}
187 
/*
 * Resolve @filename against @base_path.
 *
 * If @filename is absolute, a duplicate of it is returned.  Otherwise the
 * result is @base_path up to and including its last path separator (or, if
 * that comes later, its "<protocol>:" prefix) followed by @filename, so
 * URL-style base paths are supported.
 *
 * The returned string is allocated with g_strdup()/g_malloc(); the caller
 * releases it with g_free().
 */
char *path_combine(const char *base_path, const char *filename)
{
    const char *prefix_end = base_path;
    const char *last_sep;
    size_t prefix_len, filename_len;
    char *result;

    if (path_is_absolute(filename)) {
        return g_strdup(filename);
    }

    /* Never cut into the "<protocol>:" part of the base path */
    if (path_has_protocol(base_path)) {
        const char *colon = strchr(base_path, ':');

        if (colon) {
            prefix_end = colon + 1;
        }
    }

    last_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *last_backslash = strrchr(base_path, '\\');

        /*
         * Only order the two pointers when both are valid: a relational
         * comparison involving a null pointer is undefined behaviour.
         */
        if (last_backslash && (!last_sep || last_backslash > last_sep)) {
            last_sep = last_backslash;
        }
    }
#endif
    /* Keep everything up to and including the last separator, if any */
    if (last_sep && last_sep + 1 > prefix_end) {
        prefix_end = last_sep + 1;
    }

    /* prefix_end never precedes base_path, so the difference fits size_t */
    prefix_len = prefix_end - base_path;
    filename_len = strlen(filename);

    result = g_malloc(prefix_len + filename_len + 1);
    memcpy(result, base_path, prefix_len);
    memcpy(result + prefix_len, filename, filename_len + 1);

    return result;
}
236 
237 /*
238  * Helper function for bdrv_parse_filename() implementations to remove optional
239  * protocol prefixes (especially "file:") from a filename and for putting the
240  * stripped filename into the options QDict if there is such a prefix.
241  */
242 void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
243                                       QDict *options)
244 {
245     if (strstart(filename, prefix, &filename)) {
246         /* Stripping the explicit protocol prefix may result in a protocol
247          * prefix being (wrongly) detected (if the filename contains a colon) */
248         if (path_has_protocol(filename)) {
249             GString *fat_filename;
250 
251             /* This means there is some colon before the first slash; therefore,
252              * this cannot be an absolute path */
253             assert(!path_is_absolute(filename));
254 
255             /* And we can thus fix the protocol detection issue by prefixing it
256              * by "./" */
257             fat_filename = g_string_new("./");
258             g_string_append(fat_filename, filename);
259 
260             assert(!path_has_protocol(fat_filename->str));
261 
262             qdict_put(options, "filename",
263                       qstring_from_gstring(fat_filename));
264         } else {
265             /* If no protocol prefix was detected, we can use the shortened
266              * filename as-is */
267             qdict_put_str(options, "filename", filename);
268         }
269     }
270 }
271 
272 
273 /* Returns whether the image file is opened as read-only. Note that this can
274  * return false and writing to the image file is still not possible because the
275  * image is inactivated. */
276 bool bdrv_is_read_only(BlockDriverState *bs)
277 {
278     IO_CODE();
279     return !(bs->open_flags & BDRV_O_RDWR);
280 }
281 
282 static int GRAPH_RDLOCK
283 bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
284                        bool ignore_allow_rdw, Error **errp)
285 {
286     IO_CODE();
287 
288     /* Do not set read_only if copy_on_read is enabled */
289     if (bs->copy_on_read && read_only) {
290         error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
291                    bdrv_get_device_or_node_name(bs));
292         return -EINVAL;
293     }
294 
295     /* Do not clear read_only if it is prohibited */
296     if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
297         !ignore_allow_rdw)
298     {
299         error_setg(errp, "Node '%s' is read only",
300                    bdrv_get_device_or_node_name(bs));
301         return -EPERM;
302     }
303 
304     return 0;
305 }
306 
307 /*
308  * Called by a driver that can only provide a read-only image.
309  *
310  * Returns 0 if the node is already read-only or it could switch the node to
311  * read-only because BDRV_O_AUTO_RDONLY is set.
312  *
313  * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
314  * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
315  * is not NULL, it is used as the error message for the Error object.
316  */
317 int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
318                               Error **errp)
319 {
320     int ret = 0;
321     IO_CODE();
322 
323     if (!(bs->open_flags & BDRV_O_RDWR)) {
324         return 0;
325     }
326     if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
327         goto fail;
328     }
329 
330     ret = bdrv_can_set_read_only(bs, true, false, NULL);
331     if (ret < 0) {
332         goto fail;
333     }
334 
335     bs->open_flags &= ~BDRV_O_RDWR;
336 
337     return 0;
338 
339 fail:
340     error_setg(errp, "%s", errmsg ?: "Image is read-only");
341     return -EACCES;
342 }
343 
344 /*
345  * If @backing is empty, this function returns NULL without setting
346  * @errp.  In all other cases, NULL will only be returned with @errp
347  * set.
348  *
349  * Therefore, a return value of NULL without @errp set means that
350  * there is no backing file; if @errp is set, there is one but its
351  * absolute filename cannot be generated.
352  */
353 char *bdrv_get_full_backing_filename_from_filename(const char *backed,
354                                                    const char *backing,
355                                                    Error **errp)
356 {
357     if (backing[0] == '\0') {
358         return NULL;
359     } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
360         return g_strdup(backing);
361     } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
362         error_setg(errp, "Cannot use relative backing file names for '%s'",
363                    backed);
364         return NULL;
365     } else {
366         return path_combine(backed, backing);
367     }
368 }
369 
370 /*
371  * If @filename is empty or NULL, this function returns NULL without
372  * setting @errp.  In all other cases, NULL will only be returned with
373  * @errp set.
374  */
375 static char * GRAPH_RDLOCK
376 bdrv_make_absolute_filename(BlockDriverState *relative_to,
377                             const char *filename, Error **errp)
378 {
379     char *dir, *full_name;
380 
381     if (!filename || filename[0] == '\0') {
382         return NULL;
383     } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
384         return g_strdup(filename);
385     }
386 
387     dir = bdrv_dirname(relative_to, errp);
388     if (!dir) {
389         return NULL;
390     }
391 
392     full_name = g_strconcat(dir, filename, NULL);
393     g_free(dir);
394     return full_name;
395 }
396 
397 char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
398 {
399     GLOBAL_STATE_CODE();
400     return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
401 }
402 
403 void bdrv_register(BlockDriver *bdrv)
404 {
405     assert(bdrv->format_name);
406     GLOBAL_STATE_CODE();
407     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
408 }
409 
410 BlockDriverState *bdrv_new(void)
411 {
412     BlockDriverState *bs;
413     int i;
414 
415     GLOBAL_STATE_CODE();
416 
417     bs = g_new0(BlockDriverState, 1);
418     QLIST_INIT(&bs->dirty_bitmaps);
419     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
420         QLIST_INIT(&bs->op_blockers[i]);
421     }
422     qemu_mutex_init(&bs->reqs_lock);
423     qemu_mutex_init(&bs->dirty_bitmap_mutex);
424     bs->refcnt = 1;
425     bs->aio_context = qemu_get_aio_context();
426 
427     qemu_co_queue_init(&bs->flush_queue);
428 
429     qemu_co_mutex_init(&bs->bsc_modify_lock);
430     bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
431 
432     for (i = 0; i < bdrv_drain_all_count; i++) {
433         bdrv_drained_begin(bs);
434     }
435 
436     QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
437 
438     return bs;
439 }
440 
441 static BlockDriver *bdrv_do_find_format(const char *format_name)
442 {
443     BlockDriver *drv1;
444     GLOBAL_STATE_CODE();
445 
446     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
447         if (!strcmp(drv1->format_name, format_name)) {
448             return drv1;
449         }
450     }
451 
452     return NULL;
453 }
454 
455 BlockDriver *bdrv_find_format(const char *format_name)
456 {
457     BlockDriver *drv1;
458     int i;
459 
460     GLOBAL_STATE_CODE();
461 
462     drv1 = bdrv_do_find_format(format_name);
463     if (drv1) {
464         return drv1;
465     }
466 
467     /* The driver isn't registered, maybe we need to load a module */
468     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
469         if (!strcmp(block_driver_modules[i].format_name, format_name)) {
470             Error *local_err = NULL;
471             int rv = block_module_load(block_driver_modules[i].library_name,
472                                        &local_err);
473             if (rv > 0) {
474                 return bdrv_do_find_format(format_name);
475             } else if (rv < 0) {
476                 error_report_err(local_err);
477             }
478             break;
479         }
480     }
481     return NULL;
482 }
483 
484 static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
485 {
486     static const char *whitelist_rw[] = {
487         CONFIG_BDRV_RW_WHITELIST
488         NULL
489     };
490     static const char *whitelist_ro[] = {
491         CONFIG_BDRV_RO_WHITELIST
492         NULL
493     };
494     const char **p;
495 
496     if (!whitelist_rw[0] && !whitelist_ro[0]) {
497         return 1;               /* no whitelist, anything goes */
498     }
499 
500     for (p = whitelist_rw; *p; p++) {
501         if (!strcmp(format_name, *p)) {
502             return 1;
503         }
504     }
505     if (read_only) {
506         for (p = whitelist_ro; *p; p++) {
507             if (!strcmp(format_name, *p)) {
508                 return 1;
509             }
510         }
511     }
512     return 0;
513 }
514 
515 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
516 {
517     GLOBAL_STATE_CODE();
518     return bdrv_format_is_whitelisted(drv->format_name, read_only);
519 }
520 
521 bool bdrv_uses_whitelist(void)
522 {
523     return use_bdrv_whitelist;
524 }
525 
/*
 * Bundle of image-creation arguments (driver, filename, options) together
 * with result slots (return code, error).
 * NOTE(review): not referenced in this part of the file; presumably used to
 * pass a creation request to a coroutine -- confirm against its users.
 */
526 typedef struct CreateCo {
527     BlockDriver *drv;
528     char *filename;
529     QemuOpts *opts;
530     int ret;
531     Error *err;
532 } CreateCo;
533 
534 int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
535                                 QemuOpts *opts, Error **errp)
536 {
537     int ret;
538     GLOBAL_STATE_CODE();
539     ERRP_GUARD();
540 
541     if (!drv->bdrv_co_create_opts) {
542         error_setg(errp, "Driver '%s' does not support image creation",
543                    drv->format_name);
544         return -ENOTSUP;
545     }
546 
547     ret = drv->bdrv_co_create_opts(drv, filename, opts, errp);
548     if (ret < 0 && !*errp) {
549         error_setg_errno(errp, -ret, "Could not create image");
550     }
551 
552     return ret;
553 }
554 
555 /**
556  * Helper function for bdrv_create_file_fallback(): Resize @blk to at
557  * least the given @minimum_size.
558  *
559  * On success, return @blk's actual length.
560  * Otherwise, return -errno.
561  */
562 static int64_t coroutine_fn GRAPH_UNLOCKED
563 create_file_fallback_truncate(BlockBackend *blk, int64_t minimum_size,
564                               Error **errp)
565 {
566     Error *local_err = NULL;
567     int64_t size;
568     int ret;
569 
570     GLOBAL_STATE_CODE();
571 
572     ret = blk_co_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
573                           &local_err);
574     if (ret < 0 && ret != -ENOTSUP) {
575         error_propagate(errp, local_err);
576         return ret;
577     }
578 
579     size = blk_co_getlength(blk);
580     if (size < 0) {
581         error_free(local_err);
582         error_setg_errno(errp, -size,
583                          "Failed to inquire the new image file's length");
584         return size;
585     }
586 
587     if (size < minimum_size) {
588         /* Need to grow the image, but we failed to do that */
589         error_propagate(errp, local_err);
590         return -ENOTSUP;
591     }
592 
593     error_free(local_err);
594     local_err = NULL;
595 
596     return size;
597 }
598 
599 /**
600  * Helper function for bdrv_create_file_fallback(): Zero the first
601  * sector to remove any potentially pre-existing image header.
602  */
603 static int coroutine_fn
604 create_file_fallback_zero_first_sector(BlockBackend *blk,
605                                        int64_t current_size,
606                                        Error **errp)
607 {
608     int64_t bytes_to_clear;
609     int ret;
610 
611     GLOBAL_STATE_CODE();
612 
613     bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
614     if (bytes_to_clear) {
615         ret = blk_co_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
616         if (ret < 0) {
617             error_setg_errno(errp, -ret,
618                              "Failed to clear the new image's first sector");
619             return ret;
620         }
621     }
622 
623     return 0;
624 }
625 
626 /**
627  * Simple implementation of bdrv_co_create_opts for protocol drivers
628  * which only support creation via opening a file
629  * (usually existing raw storage device)
630  */
631 int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
632                                             const char *filename,
633                                             QemuOpts *opts,
634                                             Error **errp)
635 {
636     BlockBackend *blk;
637     QDict *options;
638     int64_t size = 0;
639     char *buf = NULL;
640     PreallocMode prealloc;
641     Error *local_err = NULL;
642     int ret;
643 
644     GLOBAL_STATE_CODE();
645 
646     size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
647     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
648     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
649                                PREALLOC_MODE_OFF, &local_err);
650     g_free(buf);
651     if (local_err) {
652         error_propagate(errp, local_err);
653         return -EINVAL;
654     }
655 
656     if (prealloc != PREALLOC_MODE_OFF) {
657         error_setg(errp, "Unsupported preallocation mode '%s'",
658                    PreallocMode_str(prealloc));
659         return -ENOTSUP;
660     }
661 
662     options = qdict_new();
663     qdict_put_str(options, "driver", drv->format_name);
664 
665     blk = blk_co_new_open(filename, NULL, options,
666                           BDRV_O_RDWR | BDRV_O_RESIZE, errp);
667     if (!blk) {
668         error_prepend(errp, "Protocol driver '%s' does not support creating "
669                       "new images, so an existing image must be selected as "
670                       "the target; however, opening the given target as an "
671                       "existing image failed: ",
672                       drv->format_name);
673         return -EINVAL;
674     }
675 
676     size = create_file_fallback_truncate(blk, size, errp);
677     if (size < 0) {
678         ret = size;
679         goto out;
680     }
681 
682     ret = create_file_fallback_zero_first_sector(blk, size, errp);
683     if (ret < 0) {
684         goto out;
685     }
686 
687     ret = 0;
688 out:
689     blk_co_unref(blk);
690     return ret;
691 }
692 
693 int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
694                                      Error **errp)
695 {
696     QemuOpts *protocol_opts;
697     BlockDriver *drv;
698     QDict *qdict;
699     int ret;
700 
701     GLOBAL_STATE_CODE();
702 
703     drv = bdrv_find_protocol(filename, true, errp);
704     if (drv == NULL) {
705         return -ENOENT;
706     }
707 
708     if (!drv->create_opts) {
709         error_setg(errp, "Driver '%s' does not support image creation",
710                    drv->format_name);
711         return -ENOTSUP;
712     }
713 
714     /*
715      * 'opts' contains a QemuOptsList with a combination of format and protocol
716      * default values.
717      *
718      * The format properly removes its options, but the default values remain
719      * in 'opts->list'.  So if the protocol has options with the same name
720      * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
721      * of the format, since for overlapping options, the format wins.
722      *
723      * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
724      * only the set options, and then convert it back to QemuOpts, using the
725      * create_opts of the protocol. So the new QemuOpts, will contain only the
726      * protocol defaults.
727      */
728     qdict = qemu_opts_to_qdict(opts, NULL);
729     protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
730     if (protocol_opts == NULL) {
731         ret = -EINVAL;
732         goto out;
733     }
734 
735     ret = bdrv_co_create(drv, filename, protocol_opts, errp);
736 out:
737     qemu_opts_del(protocol_opts);
738     qobject_unref(qdict);
739     return ret;
740 }
741 
742 int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
743 {
744     Error *local_err = NULL;
745     int ret;
746 
747     IO_CODE();
748     assert(bs != NULL);
749     assert_bdrv_graph_readable();
750 
751     if (!bs->drv) {
752         error_setg(errp, "Block node '%s' is not opened", bs->filename);
753         return -ENOMEDIUM;
754     }
755 
756     if (!bs->drv->bdrv_co_delete_file) {
757         error_setg(errp, "Driver '%s' does not support image deletion",
758                    bs->drv->format_name);
759         return -ENOTSUP;
760     }
761 
762     ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
763     if (ret < 0) {
764         error_propagate(errp, local_err);
765     }
766 
767     return ret;
768 }
769 
770 void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
771 {
772     Error *local_err = NULL;
773     int ret;
774     IO_CODE();
775 
776     if (!bs) {
777         return;
778     }
779 
780     ret = bdrv_co_delete_file(bs, &local_err);
781     /*
782      * ENOTSUP will happen if the block driver doesn't support
783      * the 'bdrv_co_delete_file' interface. This is a predictable
784      * scenario and shouldn't be reported back to the user.
785      */
786     if (ret == -ENOTSUP) {
787         error_free(local_err);
788     } else if (ret < 0) {
789         error_report_err(local_err);
790     }
791 }
792 
793 /**
794  * Try to get @bs's logical and physical block size.
795  * On success, store them in @bsz struct and return 0.
796  * On failure return -errno.
797  * @bs must not be empty.
798  */
799 int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
800 {
801     BlockDriver *drv = bs->drv;
802     BlockDriverState *filtered = bdrv_filter_bs(bs);
803     GLOBAL_STATE_CODE();
804 
805     if (drv && drv->bdrv_probe_blocksizes) {
806         return drv->bdrv_probe_blocksizes(bs, bsz);
807     } else if (filtered) {
808         return bdrv_probe_blocksizes(filtered, bsz);
809     }
810 
811     return -ENOTSUP;
812 }
813 
814 /**
815  * Try to get @bs's geometry (cyls, heads, sectors).
816  * On success, store them in @geo struct and return 0.
817  * On failure return -errno.
818  * @bs must not be empty.
819  */
820 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
821 {
822     BlockDriver *drv = bs->drv;
823     BlockDriverState *filtered = bdrv_filter_bs(bs);
824     GLOBAL_STATE_CODE();
825 
826     if (drv && drv->bdrv_probe_geometry) {
827         return drv->bdrv_probe_geometry(bs, geo);
828     } else if (filtered) {
829         return bdrv_probe_geometry(filtered, geo);
830     }
831 
832     return -ENOTSUP;
833 }
834 
835 /*
836  * Create a uniquely-named empty temporary file.
837  * Return the actual file name used upon success, otherwise NULL.
838  * This string should be freed with g_free() when not needed any longer.
839  *
840  * Note: creating a temporary file for the caller to (re)open is
841  * inherently racy. Use g_file_open_tmp() instead whenever practical.
842  */
843 char *create_tmp_file(Error **errp)
844 {
845     int fd;
846     const char *tmpdir;
847     g_autofree char *filename = NULL;
848 
849     tmpdir = g_get_tmp_dir();
850 #ifndef _WIN32
851     /*
852      * See commit 69bef79 ("block: use /var/tmp instead of /tmp for -snapshot")
853      *
854      * This function is used to create temporary disk images (like -snapshot),
855      * so the files can become very large. /tmp is often a tmpfs where as
856      * /var/tmp is usually on a disk, so more appropriate for disk images.
857      */
858     if (!g_strcmp0(tmpdir, "/tmp")) {
859         tmpdir = "/var/tmp";
860     }
861 #endif
862 
863     filename = g_strdup_printf("%s/vl.XXXXXX", tmpdir);
864     fd = g_mkstemp(filename);
865     if (fd < 0) {
866         error_setg_errno(errp, errno, "Could not open temporary file '%s'",
867                          filename);
868         return NULL;
869     }
870     close(fd);
871 
872     return g_steal_pointer(&filename);
873 }
874 
875 /*
876  * Detect host devices. By convention, /dev/cdrom[N] is always
877  * recognized as a host CDROM.
878  */
879 static BlockDriver *find_hdev_driver(const char *filename)
880 {
881     int score_max = 0, score;
882     BlockDriver *drv = NULL, *d;
883     GLOBAL_STATE_CODE();
884 
885     QLIST_FOREACH(d, &bdrv_drivers, list) {
886         if (d->bdrv_probe_device) {
887             score = d->bdrv_probe_device(filename);
888             if (score > score_max) {
889                 score_max = score;
890                 drv = d;
891             }
892         }
893     }
894 
895     return drv;
896 }
897 
898 static BlockDriver *bdrv_do_find_protocol(const char *protocol)
899 {
900     BlockDriver *drv1;
901     GLOBAL_STATE_CODE();
902 
903     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
904         if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
905             return drv1;
906         }
907     }
908 
909     return NULL;
910 }
911 
912 BlockDriver *bdrv_find_protocol(const char *filename,
913                                 bool allow_protocol_prefix,
914                                 Error **errp)
915 {
916     BlockDriver *drv1;
917     char protocol[128];
918     int len;
919     const char *p;
920     int i;
921 
922     GLOBAL_STATE_CODE();
923     /* TODO Drivers without bdrv_file_open must be specified explicitly */
924 
925     /*
926      * XXX(hch): we really should not let host device detection
927      * override an explicit protocol specification, but moving this
928      * later breaks access to device names with colons in them.
929      * Thanks to the brain-dead persistent naming schemes on udev-
930      * based Linux systems those actually are quite common.
931      */
932     drv1 = find_hdev_driver(filename);
933     if (drv1) {
934         return drv1;
935     }
936 
937     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
938         return &bdrv_file;
939     }
940 
941     p = strchr(filename, ':');
942     assert(p != NULL);
943     len = p - filename;
944     if (len > sizeof(protocol) - 1)
945         len = sizeof(protocol) - 1;
946     memcpy(protocol, filename, len);
947     protocol[len] = '\0';
948 
949     drv1 = bdrv_do_find_protocol(protocol);
950     if (drv1) {
951         return drv1;
952     }
953 
954     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
955         if (block_driver_modules[i].protocol_name &&
956             !strcmp(block_driver_modules[i].protocol_name, protocol)) {
957             int rv = block_module_load(block_driver_modules[i].library_name, errp);
958             if (rv > 0) {
959                 drv1 = bdrv_do_find_protocol(protocol);
960             } else if (rv < 0) {
961                 return NULL;
962             }
963             break;
964         }
965     }
966 
967     if (!drv1) {
968         error_setg(errp, "Unknown protocol '%s'", protocol);
969     }
970     return drv1;
971 }
972 
973 /*
974  * Guess image format by probing its contents.
975  * This is not a good idea when your image is raw (CVE-2008-2004), but
976  * we do it anyway for backward compatibility.
977  *
978  * @buf         contains the image's first @buf_size bytes.
979  * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
980  *              but can be smaller if the image file is smaller)
981  * @filename    is its filename.
982  *
983  * For all block drivers, call the bdrv_probe() method to get its
984  * probing score.
985  * Return the first block driver with the highest probing score.
986  */
987 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
988                             const char *filename)
989 {
990     int score_max = 0, score;
991     BlockDriver *drv = NULL, *d;
992     IO_CODE();
993 
994     QLIST_FOREACH(d, &bdrv_drivers, list) {
995         if (d->bdrv_probe) {
996             score = d->bdrv_probe(buf, buf_size, filename);
997             if (score > score_max) {
998                 score_max = score;
999                 drv = d;
1000             }
1001         }
1002     }
1003 
1004     return drv;
1005 }
1006 
1007 static int find_image_format(BlockBackend *file, const char *filename,
1008                              BlockDriver **pdrv, Error **errp)
1009 {
1010     BlockDriver *drv;
1011     uint8_t buf[BLOCK_PROBE_BUF_SIZE];
1012     int ret = 0;
1013 
1014     GLOBAL_STATE_CODE();
1015 
1016     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
1017     if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
1018         *pdrv = &bdrv_raw;
1019         return ret;
1020     }
1021 
1022     ret = blk_pread(file, 0, sizeof(buf), buf, 0);
1023     if (ret < 0) {
1024         error_setg_errno(errp, -ret, "Could not read image for determining its "
1025                          "format");
1026         *pdrv = NULL;
1027         return ret;
1028     }
1029 
1030     drv = bdrv_probe_all(buf, sizeof(buf), filename);
1031     if (!drv) {
1032         error_setg(errp, "Could not determine image format: No compatible "
1033                    "driver found");
1034         *pdrv = NULL;
1035         return -ENOENT;
1036     }
1037 
1038     *pdrv = drv;
1039     return 0;
1040 }
1041 
1042 /**
1043  * Set the current 'total_sectors' value
1044  * Return 0 on success, -errno on error.
1045  */
1046 int coroutine_fn bdrv_co_refresh_total_sectors(BlockDriverState *bs,
1047                                                int64_t hint)
1048 {
1049     BlockDriver *drv = bs->drv;
1050     IO_CODE();
1051     assert_bdrv_graph_readable();
1052 
1053     if (!drv) {
1054         return -ENOMEDIUM;
1055     }
1056 
1057     /* Do not attempt drv->bdrv_co_getlength() on scsi-generic devices */
1058     if (bdrv_is_sg(bs))
1059         return 0;
1060 
1061     /* query actual device if possible, otherwise just trust the hint */
1062     if (drv->bdrv_co_getlength) {
1063         int64_t length = drv->bdrv_co_getlength(bs);
1064         if (length < 0) {
1065             return length;
1066         }
1067         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
1068     }
1069 
1070     bs->total_sectors = hint;
1071 
1072     if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
1073         return -EFBIG;
1074     }
1075 
1076     return 0;
1077 }
1078 
1079 /**
1080  * Combines a QDict of new block driver @options with any missing options taken
1081  * from @old_options, so that leaving out an option defaults to its old value.
1082  */
1083 static void bdrv_join_options(BlockDriverState *bs, QDict *options,
1084                               QDict *old_options)
1085 {
1086     GLOBAL_STATE_CODE();
1087     if (bs->drv && bs->drv->bdrv_join_options) {
1088         bs->drv->bdrv_join_options(options, old_options);
1089     } else {
1090         qdict_join(options, old_options, false);
1091     }
1092 }
1093 
1094 static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
1095                                                             int open_flags,
1096                                                             Error **errp)
1097 {
1098     Error *local_err = NULL;
1099     char *value = qemu_opt_get_del(opts, "detect-zeroes");
1100     BlockdevDetectZeroesOptions detect_zeroes =
1101         qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
1102                         BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
1103     GLOBAL_STATE_CODE();
1104     g_free(value);
1105     if (local_err) {
1106         error_propagate(errp, local_err);
1107         return detect_zeroes;
1108     }
1109 
1110     if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
1111         !(open_flags & BDRV_O_UNMAP))
1112     {
1113         error_setg(errp, "setting detect-zeroes to unmap is not allowed "
1114                    "without setting discard operation to unmap");
1115     }
1116 
1117     return detect_zeroes;
1118 }
1119 
1120 /**
1121  * Set open flags for aio engine
1122  *
1123  * Return 0 on success, -1 if the engine specified is invalid
1124  */
1125 int bdrv_parse_aio(const char *mode, int *flags)
1126 {
1127     if (!strcmp(mode, "threads")) {
1128         /* do nothing, default */
1129     } else if (!strcmp(mode, "native")) {
1130         *flags |= BDRV_O_NATIVE_AIO;
1131 #ifdef CONFIG_LINUX_IO_URING
1132     } else if (!strcmp(mode, "io_uring")) {
1133         *flags |= BDRV_O_IO_URING;
1134 #endif
1135     } else {
1136         return -1;
1137     }
1138 
1139     return 0;
1140 }
1141 
1142 /**
1143  * Set open flags for a given discard mode
1144  *
1145  * Return 0 on success, -1 if the discard mode was invalid.
1146  */
1147 int bdrv_parse_discard_flags(const char *mode, int *flags)
1148 {
1149     *flags &= ~BDRV_O_UNMAP;
1150 
1151     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
1152         /* do nothing */
1153     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
1154         *flags |= BDRV_O_UNMAP;
1155     } else {
1156         return -1;
1157     }
1158 
1159     return 0;
1160 }
1161 
1162 /**
1163  * Set open flags for a given cache mode
1164  *
1165  * Return 0 on success, -1 if the cache mode was invalid.
1166  */
1167 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
1168 {
1169     *flags &= ~BDRV_O_CACHE_MASK;
1170 
1171     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
1172         *writethrough = false;
1173         *flags |= BDRV_O_NOCACHE;
1174     } else if (!strcmp(mode, "directsync")) {
1175         *writethrough = true;
1176         *flags |= BDRV_O_NOCACHE;
1177     } else if (!strcmp(mode, "writeback")) {
1178         *writethrough = false;
1179     } else if (!strcmp(mode, "unsafe")) {
1180         *writethrough = false;
1181         *flags |= BDRV_O_NO_FLUSH;
1182     } else if (!strcmp(mode, "writethrough")) {
1183         *writethrough = true;
1184     } else {
1185         return -1;
1186     }
1187 
1188     return 0;
1189 }
1190 
1191 static char *bdrv_child_get_parent_desc(BdrvChild *c)
1192 {
1193     BlockDriverState *parent = c->opaque;
1194     return g_strdup_printf("node '%s'", bdrv_get_node_name(parent));
1195 }
1196 
1197 static void GRAPH_RDLOCK bdrv_child_cb_drained_begin(BdrvChild *child)
1198 {
1199     BlockDriverState *bs = child->opaque;
1200     bdrv_do_drained_begin_quiesce(bs, NULL);
1201 }
1202 
1203 static bool GRAPH_RDLOCK bdrv_child_cb_drained_poll(BdrvChild *child)
1204 {
1205     BlockDriverState *bs = child->opaque;
1206     return bdrv_drain_poll(bs, NULL, false);
1207 }
1208 
1209 static void GRAPH_RDLOCK bdrv_child_cb_drained_end(BdrvChild *child)
1210 {
1211     BlockDriverState *bs = child->opaque;
1212     bdrv_drained_end(bs);
1213 }
1214 
1215 static int bdrv_child_cb_inactivate(BdrvChild *child)
1216 {
1217     BlockDriverState *bs = child->opaque;
1218     GLOBAL_STATE_CODE();
1219     assert(bs->open_flags & BDRV_O_INACTIVE);
1220     return 0;
1221 }
1222 
1223 static bool bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx,
1224                                          GHashTable *visited, Transaction *tran,
1225                                          Error **errp)
1226 {
1227     BlockDriverState *bs = child->opaque;
1228     return bdrv_change_aio_context(bs, ctx, visited, tran, errp);
1229 }
1230 
1231 /*
1232  * Returns the options and flags that a temporary snapshot should get, based on
1233  * the originally requested flags (the originally requested image will have
1234  * flags like a backing file)
1235  */
1236 static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
1237                                        int parent_flags, QDict *parent_options)
1238 {
1239     GLOBAL_STATE_CODE();
1240     *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
1241 
1242     /* For temporary files, unconditional cache=unsafe is fine */
1243     qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
1244     qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
1245 
1246     /* Copy the read-only and discard options from the parent */
1247     qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1248     qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
1249 
1250     /* aio=native doesn't work for cache.direct=off, so disable it for the
1251      * temporary snapshot */
1252     *child_flags &= ~BDRV_O_NATIVE_AIO;
1253 }
1254 
1255 static void GRAPH_WRLOCK bdrv_backing_attach(BdrvChild *c)
1256 {
1257     BlockDriverState *parent = c->opaque;
1258     BlockDriverState *backing_hd = c->bs;
1259 
1260     GLOBAL_STATE_CODE();
1261     assert(!parent->backing_blocker);
1262     error_setg(&parent->backing_blocker,
1263                "node is used as backing hd of '%s'",
1264                bdrv_get_device_or_node_name(parent));
1265 
1266     bdrv_refresh_filename(backing_hd);
1267 
1268     parent->open_flags &= ~BDRV_O_NO_BACKING;
1269 
1270     bdrv_op_block_all(backing_hd, parent->backing_blocker);
1271     /* Otherwise we won't be able to commit or stream */
1272     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1273                     parent->backing_blocker);
1274     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
1275                     parent->backing_blocker);
1276     /*
1277      * We do backup in 3 ways:
1278      * 1. drive backup
1279      *    The target bs is new opened, and the source is top BDS
1280      * 2. blockdev backup
1281      *    Both the source and the target are top BDSes.
1282      * 3. internal backup(used for block replication)
1283      *    Both the source and the target are backing file
1284      *
1285      * In case 1 and 2, neither the source nor the target is the backing file.
1286      * In case 3, we will block the top BDS, so there is only one block job
1287      * for the top BDS and its backing chain.
1288      */
1289     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
1290                     parent->backing_blocker);
1291     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
1292                     parent->backing_blocker);
1293 }
1294 
1295 static void bdrv_backing_detach(BdrvChild *c)
1296 {
1297     BlockDriverState *parent = c->opaque;
1298 
1299     GLOBAL_STATE_CODE();
1300     assert(parent->backing_blocker);
1301     bdrv_op_unblock_all(c->bs, parent->backing_blocker);
1302     error_free(parent->backing_blocker);
1303     parent->backing_blocker = NULL;
1304 }
1305 
1306 static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
1307                                         const char *filename, Error **errp)
1308 {
1309     BlockDriverState *parent = c->opaque;
1310     bool read_only = bdrv_is_read_only(parent);
1311     int ret;
1312     GLOBAL_STATE_CODE();
1313 
1314     if (read_only) {
1315         ret = bdrv_reopen_set_read_only(parent, false, errp);
1316         if (ret < 0) {
1317             return ret;
1318         }
1319     }
1320 
1321     ret = bdrv_change_backing_file(parent, filename,
1322                                    base->drv ? base->drv->format_name : "",
1323                                    false);
1324     if (ret < 0) {
1325         error_setg_errno(errp, -ret, "Could not update backing file link");
1326     }
1327 
1328     if (read_only) {
1329         bdrv_reopen_set_read_only(parent, true, NULL);
1330     }
1331 
1332     return ret;
1333 }
1334 
1335 /*
1336  * Returns the options and flags that a generic child of a BDS should
1337  * get, based on the given options and flags for the parent BDS.
1338  */
1339 static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
1340                                    int *child_flags, QDict *child_options,
1341                                    int parent_flags, QDict *parent_options)
1342 {
1343     int flags = parent_flags;
1344     GLOBAL_STATE_CODE();
1345 
1346     /*
1347      * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
1348      * Generally, the question to answer is: Should this child be
1349      * format-probed by default?
1350      */
1351 
1352     /*
1353      * Pure and non-filtered data children of non-format nodes should
1354      * be probed by default (even when the node itself has BDRV_O_PROTOCOL
1355      * set).  This only affects a very limited set of drivers (namely
1356      * quorum and blkverify when this comment was written).
1357      * Force-clear BDRV_O_PROTOCOL then.
1358      */
1359     if (!parent_is_format &&
1360         (role & BDRV_CHILD_DATA) &&
1361         !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
1362     {
1363         flags &= ~BDRV_O_PROTOCOL;
1364     }
1365 
1366     /*
1367      * All children of format nodes (except for COW children) and all
1368      * metadata children in general should never be format-probed.
1369      * Force-set BDRV_O_PROTOCOL then.
1370      */
1371     if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
1372         (role & BDRV_CHILD_METADATA))
1373     {
1374         flags |= BDRV_O_PROTOCOL;
1375     }
1376 
1377     /*
1378      * If the cache mode isn't explicitly set, inherit direct and no-flush from
1379      * the parent.
1380      */
1381     qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
1382     qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
1383     qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
1384 
1385     if (role & BDRV_CHILD_COW) {
1386         /* backing files are opened read-only by default */
1387         qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
1388         qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
1389     } else {
1390         /* Inherit the read-only option from the parent if it's not set */
1391         qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1392         qdict_copy_default(child_options, parent_options,
1393                            BDRV_OPT_AUTO_READ_ONLY);
1394     }
1395 
1396     /*
1397      * bdrv_co_pdiscard() respects unmap policy for the parent, so we
1398      * can default to enable it on lower layers regardless of the
1399      * parent option.
1400      */
1401     qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
1402 
1403     /* Clear flags that only apply to the top layer */
1404     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
1405 
1406     if (role & BDRV_CHILD_METADATA) {
1407         flags &= ~BDRV_O_NO_IO;
1408     }
1409     if (role & BDRV_CHILD_COW) {
1410         flags &= ~BDRV_O_TEMPORARY;
1411     }
1412 
1413     *child_flags = flags;
1414 }
1415 
1416 static void GRAPH_WRLOCK bdrv_child_cb_attach(BdrvChild *child)
1417 {
1418     BlockDriverState *bs = child->opaque;
1419 
1420     assert_bdrv_graph_writable();
1421     QLIST_INSERT_HEAD(&bs->children, child, next);
1422     if (bs->drv->is_filter || (child->role & BDRV_CHILD_FILTERED)) {
1423         /*
1424          * Here we handle filters and block/raw-format.c when it behave like
1425          * filter. They generally have a single PRIMARY child, which is also the
1426          * FILTERED child, and that they may have multiple more children, which
1427          * are neither PRIMARY nor FILTERED. And never we have a COW child here.
1428          * So bs->file will be the PRIMARY child, unless the PRIMARY child goes
1429          * into bs->backing on exceptional cases; and bs->backing will be
1430          * nothing else.
1431          */
1432         assert(!(child->role & BDRV_CHILD_COW));
1433         if (child->role & BDRV_CHILD_PRIMARY) {
1434             assert(child->role & BDRV_CHILD_FILTERED);
1435             assert(!bs->backing);
1436             assert(!bs->file);
1437 
1438             if (bs->drv->filtered_child_is_backing) {
1439                 bs->backing = child;
1440             } else {
1441                 bs->file = child;
1442             }
1443         } else {
1444             assert(!(child->role & BDRV_CHILD_FILTERED));
1445         }
1446     } else if (child->role & BDRV_CHILD_COW) {
1447         assert(bs->drv->supports_backing);
1448         assert(!(child->role & BDRV_CHILD_PRIMARY));
1449         assert(!bs->backing);
1450         bs->backing = child;
1451         bdrv_backing_attach(child);
1452     } else if (child->role & BDRV_CHILD_PRIMARY) {
1453         assert(!bs->file);
1454         bs->file = child;
1455     }
1456 }
1457 
1458 static void GRAPH_WRLOCK bdrv_child_cb_detach(BdrvChild *child)
1459 {
1460     BlockDriverState *bs = child->opaque;
1461 
1462     if (child->role & BDRV_CHILD_COW) {
1463         bdrv_backing_detach(child);
1464     }
1465 
1466     assert_bdrv_graph_writable();
1467     QLIST_REMOVE(child, next);
1468     if (child == bs->backing) {
1469         assert(child != bs->file);
1470         bs->backing = NULL;
1471     } else if (child == bs->file) {
1472         bs->file = NULL;
1473     }
1474 }
1475 
1476 static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
1477                                          const char *filename, Error **errp)
1478 {
1479     if (c->role & BDRV_CHILD_COW) {
1480         return bdrv_backing_update_filename(c, base, filename, errp);
1481     }
1482     return 0;
1483 }
1484 
1485 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
1486 {
1487     BlockDriverState *bs = c->opaque;
1488     IO_CODE();
1489 
1490     return bdrv_get_aio_context(bs);
1491 }
1492 
/*
 * BdrvChildClass for children whose parent is a block node
 * (.parent_is_bds is set).  The callbacks referenced here all treat
 * BdrvChild.opaque as the parent BlockDriverState.
 */
const BdrvChildClass child_of_bds = {
    .parent_is_bds   = true,
    .get_parent_desc = bdrv_child_get_parent_desc,
    .inherit_options = bdrv_inherited_options,
    .drained_begin   = bdrv_child_cb_drained_begin,
    .drained_poll    = bdrv_child_cb_drained_poll,
    .drained_end     = bdrv_child_cb_drained_end,
    .attach          = bdrv_child_cb_attach,
    .detach          = bdrv_child_cb_detach,
    .inactivate      = bdrv_child_cb_inactivate,
    .change_aio_ctx  = bdrv_child_cb_change_aio_ctx,
    .update_filename = bdrv_child_cb_update_filename,
    .get_parent_aio_context = child_of_bds_get_parent_aio_context,
};
1507 
1508 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
1509 {
1510     IO_CODE();
1511     return c->klass->get_parent_aio_context(c);
1512 }
1513 
1514 static int bdrv_open_flags(BlockDriverState *bs, int flags)
1515 {
1516     int open_flags = flags;
1517     GLOBAL_STATE_CODE();
1518 
1519     /*
1520      * Clear flags that are internal to the block layer before opening the
1521      * image.
1522      */
1523     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
1524 
1525     return open_flags;
1526 }
1527 
1528 static void update_flags_from_options(int *flags, QemuOpts *opts)
1529 {
1530     GLOBAL_STATE_CODE();
1531 
1532     *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
1533 
1534     if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
1535         *flags |= BDRV_O_NO_FLUSH;
1536     }
1537 
1538     if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
1539         *flags |= BDRV_O_NOCACHE;
1540     }
1541 
1542     if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
1543         *flags |= BDRV_O_RDWR;
1544     }
1545 
1546     if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
1547         *flags |= BDRV_O_AUTO_RDONLY;
1548     }
1549 }
1550 
1551 static void update_options_from_flags(QDict *options, int flags)
1552 {
1553     GLOBAL_STATE_CODE();
1554     if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
1555         qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
1556     }
1557     if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
1558         qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
1559                        flags & BDRV_O_NO_FLUSH);
1560     }
1561     if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
1562         qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
1563     }
1564     if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
1565         qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
1566                        flags & BDRV_O_AUTO_RDONLY);
1567     }
1568 }
1569 
1570 static void bdrv_assign_node_name(BlockDriverState *bs,
1571                                   const char *node_name,
1572                                   Error **errp)
1573 {
1574     char *gen_node_name = NULL;
1575     GLOBAL_STATE_CODE();
1576 
1577     if (!node_name) {
1578         node_name = gen_node_name = id_generate(ID_BLOCK);
1579     } else if (!id_wellformed(node_name)) {
1580         /*
1581          * Check for empty string or invalid characters, but not if it is
1582          * generated (generated names use characters not available to the user)
1583          */
1584         error_setg(errp, "Invalid node-name: '%s'", node_name);
1585         return;
1586     }
1587 
1588     /* takes care of avoiding namespaces collisions */
1589     if (blk_by_name(node_name)) {
1590         error_setg(errp, "node-name=%s is conflicting with a device id",
1591                    node_name);
1592         goto out;
1593     }
1594 
1595     /* takes care of avoiding duplicates node names */
1596     if (bdrv_find_node(node_name)) {
1597         error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
1598         goto out;
1599     }
1600 
1601     /* Make sure that the node name isn't truncated */
1602     if (strlen(node_name) >= sizeof(bs->node_name)) {
1603         error_setg(errp, "Node name too long");
1604         goto out;
1605     }
1606 
1607     /* copy node name into the bs and insert it into the graph list */
1608     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
1609     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
1610 out:
1611     g_free(gen_node_name);
1612 }
1613 
/*
 * Open @bs with block driver @drv.
 *
 * Assigns the node name, allocates the driver state (bs->opaque), calls the
 * driver's bdrv_file_open() or bdrv_open() callback (if it has one) and then
 * refreshes the total sector count and the block limits.
 *
 * Returns 0 on success and a negative errno value with @errp set on failure.
 * If the driver's open callback fails, bs->drv and bs->opaque are reset and
 * bs->file is unreferenced.  Failures at later steps return without that
 * cleanup.
 *
 * The caller must always hold @bs AioContext lock, because this function calls
 * bdrv_refresh_total_sectors() which polls when called from non-coroutine
 * context.
 */
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
                 QDict *options, int open_flags, Error **errp)
{
    AioContext *ctx;
    Error *local_err = NULL;
    int i, ret;
    GLOBAL_STATE_CODE();

    /* Nothing has been allocated yet, so a naming error can return directly */
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    /* bdrv_file_open() takes precedence over bdrv_open() if both are set */
    if (drv->bdrv_file_open) {
        assert(!drv->bdrv_needs_filename || bs->filename[0]);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else if (drv->bdrv_open) {
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    } else {
        ret = 0;
    }

    if (ret < 0) {
        /* Prefer the driver's own error; otherwise build a generic message */
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto open_failed;
    }

    /* The driver must only advertise request flags that exist */
    assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
    assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));

    /*
     * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
     * drivers that pass read/write requests through to a child the trouble of
     * declaring support explicitly.
     *
     * Drivers must not propagate this flag accidentally when they initiate I/O
     * to a bounce buffer. That case should be rare though.
     */
    bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
    bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;

    /* Get the context after .bdrv_open, it can change the context */
    ctx = bdrv_get_aio_context(bs);
    aio_context_acquire(ctx);

    ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        aio_context_release(ctx);
        /*
         * NOTE(review): bs->drv and bs->opaque stay set on this path;
         * presumably the caller's teardown closes the driver - confirm.
         */
        return ret;
    }

    /* bdrv_refresh_limits() is called with the graph read lock held */
    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, &local_err);
    bdrv_graph_rdunlock_main_loop();
    aio_context_release(ctx);

    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bdrv_min_mem_align(bs) != 0);
    assert(is_power_of_2(bs->bl.request_alignment));

    /* Call the driver's drain_begin once per quiesce level already on @bs */
    for (i = 0; i < bs->quiesce_counter; i++) {
        if (drv->bdrv_drain_begin) {
            drv->bdrv_drain_begin(bs);
        }
    }

    return 0;
open_failed:
    /* Undo what was set up before the driver's open callback failed */
    bs->drv = NULL;
    if (bs->file != NULL) {
        bdrv_graph_wrlock(NULL);
        bdrv_unref_child(bs, bs->file);
        bdrv_graph_wrunlock();
        assert(!bs->file);
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    return ret;
}
1715 
1716 /*
1717  * Create and open a block node.
1718  *
1719  * @options is a QDict of options to pass to the block drivers, or NULL for an
1720  * empty set of options. The reference to the QDict belongs to the block layer
1721  * after the call (even on failure), so if the caller intends to reuse the
1722  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
1723  */
1724 BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv,
1725                                             const char *node_name,
1726                                             QDict *options, int flags,
1727                                             Error **errp)
1728 {
1729     BlockDriverState *bs;
1730     int ret;
1731 
1732     GLOBAL_STATE_CODE();
1733 
1734     bs = bdrv_new();
1735     bs->open_flags = flags;
1736     bs->options = options ?: qdict_new();
1737     bs->explicit_options = qdict_clone_shallow(bs->options);
1738     bs->opaque = NULL;
1739 
1740     update_options_from_flags(bs->options, flags);
1741 
1742     ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
1743     if (ret < 0) {
1744         qobject_unref(bs->explicit_options);
1745         bs->explicit_options = NULL;
1746         qobject_unref(bs->options);
1747         bs->options = NULL;
1748         bdrv_unref(bs);
1749         return NULL;
1750     }
1751 
1752     return bs;
1753 }
1754 
1755 /* Create and open a block node. */
1756 BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
1757                                        int flags, Error **errp)
1758 {
1759     GLOBAL_STATE_CODE();
1760     return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
1761 }
1762 
/*
 * Runtime options, registered under the name "bdrv_common".
 * bdrv_open_common() creates a QemuOpts from this list and absorbs the
 * matching keys from the options QDict.
 */
QemuOptsList bdrv_runtime_opts = {
    .name = "bdrv_common",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
    .desc = {
        {
            .name = "node-name",
            .type = QEMU_OPT_STRING,
            .help = "Node name of the block device node",
        },
        {
            .name = "driver",
            .type = QEMU_OPT_STRING,
            .help = "Block driver to use for the node",
        },
        {
            .name = BDRV_OPT_CACHE_DIRECT,
            .type = QEMU_OPT_BOOL,
            .help = "Bypass software writeback cache on the host",
        },
        {
            .name = BDRV_OPT_CACHE_NO_FLUSH,
            .type = QEMU_OPT_BOOL,
            .help = "Ignore flush requests",
        },
        {
            .name = BDRV_OPT_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node is opened in read-only mode",
        },
        {
            .name = BDRV_OPT_AUTO_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node can become read-only if opening read-write fails",
        },
        {
            /* Parsed by bdrv_parse_detect_zeroes() */
            .name = "detect-zeroes",
            .type = QEMU_OPT_STRING,
            .help = "try to optimize zero writes (off, on, unmap)",
        },
        {
            /* Parsed by bdrv_parse_discard_flags() */
            .name = BDRV_OPT_DISCARD,
            .type = QEMU_OPT_STRING,
            .help = "discard operation (ignore/off, unmap/on)",
        },
        {
            .name = BDRV_OPT_FORCE_SHARE,
            .type = QEMU_OPT_BOOL,
            .help = "always accept other writers (default: off)",
        },
        { /* end of list */ }
    },
};
1815 
/*
 * Minimal set of image creation options ("simple-create-opts"): the virtual
 * disk size and a preallocation mode whose only allowed value is "off".
 */
QemuOptsList bdrv_create_opts_simple = {
    .name = "simple-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off)"
        },
        { /* end of list */ }
    }
};
1833 
1834 /*
1835  * Common part for opening disk images and files
1836  *
1837  * Removes all processed options from *options.
1838  */
1839 static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
1840                             QDict *options, Error **errp)
1841 {
1842     int ret, open_flags;
1843     const char *filename;
1844     const char *driver_name = NULL;
1845     const char *node_name = NULL;
1846     const char *discard;
1847     QemuOpts *opts;
1848     BlockDriver *drv;
1849     Error *local_err = NULL;
1850     bool ro;
1851 
1852     assert(bs->file == NULL);
1853     assert(options != NULL && bs->options != options);
1854     GLOBAL_STATE_CODE();
1855 
1856     opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
1857     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
1858         ret = -EINVAL;
1859         goto fail_opts;
1860     }
1861 
1862     update_flags_from_options(&bs->open_flags, opts);
1863 
1864     driver_name = qemu_opt_get(opts, "driver");
1865     drv = bdrv_find_format(driver_name);
1866     assert(drv != NULL);
1867 
1868     bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);
1869 
1870     if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
1871         error_setg(errp,
1872                    BDRV_OPT_FORCE_SHARE
1873                    "=on can only be used with read-only images");
1874         ret = -EINVAL;
1875         goto fail_opts;
1876     }
1877 
1878     if (file != NULL) {
1879         bdrv_graph_rdlock_main_loop();
1880         bdrv_refresh_filename(blk_bs(file));
1881         bdrv_graph_rdunlock_main_loop();
1882 
1883         filename = blk_bs(file)->filename;
1884     } else {
1885         /*
1886          * Caution: while qdict_get_try_str() is fine, getting
1887          * non-string types would require more care.  When @options
1888          * come from -blockdev or blockdev_add, its members are typed
1889          * according to the QAPI schema, but when they come from
1890          * -drive, they're all QString.
1891          */
1892         filename = qdict_get_try_str(options, "filename");
1893     }
1894 
1895     if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
1896         error_setg(errp, "The '%s' block driver requires a file name",
1897                    drv->format_name);
1898         ret = -EINVAL;
1899         goto fail_opts;
1900     }
1901 
1902     trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
1903                            drv->format_name);
1904 
1905     ro = bdrv_is_read_only(bs);
1906 
1907     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
1908         if (!ro && bdrv_is_whitelisted(drv, true)) {
1909             bdrv_graph_rdlock_main_loop();
1910             ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
1911             bdrv_graph_rdunlock_main_loop();
1912         } else {
1913             ret = -ENOTSUP;
1914         }
1915         if (ret < 0) {
1916             error_setg(errp,
1917                        !ro && bdrv_is_whitelisted(drv, true)
1918                        ? "Driver '%s' can only be used for read-only devices"
1919                        : "Driver '%s' is not whitelisted",
1920                        drv->format_name);
1921             goto fail_opts;
1922         }
1923     }
1924 
1925     /* bdrv_new() and bdrv_close() make it so */
1926     assert(qatomic_read(&bs->copy_on_read) == 0);
1927 
1928     if (bs->open_flags & BDRV_O_COPY_ON_READ) {
1929         if (!ro) {
1930             bdrv_enable_copy_on_read(bs);
1931         } else {
1932             error_setg(errp, "Can't use copy-on-read on read-only device");
1933             ret = -EINVAL;
1934             goto fail_opts;
1935         }
1936     }
1937 
1938     discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
1939     if (discard != NULL) {
1940         if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
1941             error_setg(errp, "Invalid discard option");
1942             ret = -EINVAL;
1943             goto fail_opts;
1944         }
1945     }
1946 
1947     bs->detect_zeroes =
1948         bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
1949     if (local_err) {
1950         error_propagate(errp, local_err);
1951         ret = -EINVAL;
1952         goto fail_opts;
1953     }
1954 
1955     if (filename != NULL) {
1956         pstrcpy(bs->filename, sizeof(bs->filename), filename);
1957     } else {
1958         bs->filename[0] = '\0';
1959     }
1960     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
1961 
1962     /* Open the image, either directly or using a protocol */
1963     open_flags = bdrv_open_flags(bs, bs->open_flags);
1964     node_name = qemu_opt_get(opts, "node-name");
1965 
1966     assert(!drv->bdrv_file_open || file == NULL);
1967     ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
1968     if (ret < 0) {
1969         goto fail_opts;
1970     }
1971 
1972     qemu_opts_del(opts);
1973     return 0;
1974 
1975 fail_opts:
1976     qemu_opts_del(opts);
1977     return ret;
1978 }
1979 
1980 static QDict *parse_json_filename(const char *filename, Error **errp)
1981 {
1982     QObject *options_obj;
1983     QDict *options;
1984     int ret;
1985     GLOBAL_STATE_CODE();
1986 
1987     ret = strstart(filename, "json:", &filename);
1988     assert(ret);
1989 
1990     options_obj = qobject_from_json(filename, errp);
1991     if (!options_obj) {
1992         error_prepend(errp, "Could not parse the JSON options: ");
1993         return NULL;
1994     }
1995 
1996     options = qobject_to(QDict, options_obj);
1997     if (!options) {
1998         qobject_unref(options_obj);
1999         error_setg(errp, "Invalid JSON object given");
2000         return NULL;
2001     }
2002 
2003     qdict_flatten(options);
2004 
2005     return options;
2006 }
2007 
2008 static void parse_json_protocol(QDict *options, const char **pfilename,
2009                                 Error **errp)
2010 {
2011     QDict *json_options;
2012     Error *local_err = NULL;
2013     GLOBAL_STATE_CODE();
2014 
2015     /* Parse json: pseudo-protocol */
2016     if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
2017         return;
2018     }
2019 
2020     json_options = parse_json_filename(*pfilename, &local_err);
2021     if (local_err) {
2022         error_propagate(errp, local_err);
2023         return;
2024     }
2025 
2026     /* Options given in the filename have lower priority than options
2027      * specified directly */
2028     qdict_join(options, json_options, false);
2029     qobject_unref(json_options);
2030     *pfilename = NULL;
2031 }
2032 
2033 /*
2034  * Fills in default options for opening images and converts the legacy
2035  * filename/flags pair to option QDict entries.
2036  * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
2037  * block driver has been specified explicitly.
2038  */
2039 static int bdrv_fill_options(QDict **options, const char *filename,
2040                              int *flags, Error **errp)
2041 {
2042     const char *drvname;
2043     bool protocol = *flags & BDRV_O_PROTOCOL;
2044     bool parse_filename = false;
2045     BlockDriver *drv = NULL;
2046     Error *local_err = NULL;
2047 
2048     GLOBAL_STATE_CODE();
2049 
2050     /*
2051      * Caution: while qdict_get_try_str() is fine, getting non-string
2052      * types would require more care.  When @options come from
2053      * -blockdev or blockdev_add, its members are typed according to
2054      * the QAPI schema, but when they come from -drive, they're all
2055      * QString.
2056      */
2057     drvname = qdict_get_try_str(*options, "driver");
2058     if (drvname) {
2059         drv = bdrv_find_format(drvname);
2060         if (!drv) {
2061             error_setg(errp, "Unknown driver '%s'", drvname);
2062             return -ENOENT;
2063         }
2064         /* If the user has explicitly specified the driver, this choice should
2065          * override the BDRV_O_PROTOCOL flag */
2066         protocol = drv->bdrv_file_open;
2067     }
2068 
2069     if (protocol) {
2070         *flags |= BDRV_O_PROTOCOL;
2071     } else {
2072         *flags &= ~BDRV_O_PROTOCOL;
2073     }
2074 
2075     /* Translate cache options from flags into options */
2076     update_options_from_flags(*options, *flags);
2077 
2078     /* Fetch the file name from the options QDict if necessary */
2079     if (protocol && filename) {
2080         if (!qdict_haskey(*options, "filename")) {
2081             qdict_put_str(*options, "filename", filename);
2082             parse_filename = true;
2083         } else {
2084             error_setg(errp, "Can't specify 'file' and 'filename' options at "
2085                              "the same time");
2086             return -EINVAL;
2087         }
2088     }
2089 
2090     /* Find the right block driver */
2091     /* See cautionary note on accessing @options above */
2092     filename = qdict_get_try_str(*options, "filename");
2093 
2094     if (!drvname && protocol) {
2095         if (filename) {
2096             drv = bdrv_find_protocol(filename, parse_filename, errp);
2097             if (!drv) {
2098                 return -EINVAL;
2099             }
2100 
2101             drvname = drv->format_name;
2102             qdict_put_str(*options, "driver", drvname);
2103         } else {
2104             error_setg(errp, "Must specify either driver or file");
2105             return -EINVAL;
2106         }
2107     }
2108 
2109     assert(drv || !protocol);
2110 
2111     /* Driver-specific filename parsing */
2112     if (drv && drv->bdrv_parse_filename && parse_filename) {
2113         drv->bdrv_parse_filename(filename, *options, &local_err);
2114         if (local_err) {
2115             error_propagate(errp, local_err);
2116             return -EINVAL;
2117         }
2118 
2119         if (!drv->bdrv_needs_filename) {
2120             qdict_del(*options, "filename");
2121         }
2122     }
2123 
2124     return 0;
2125 }
2126 
2127 typedef struct BlockReopenQueueEntry {
2128      bool prepared;
2129      BDRVReopenState state;
2130      QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
2131 } BlockReopenQueueEntry;
2132 
2133 /*
2134  * Return the flags that @bs will have after the reopens in @q have
2135  * successfully completed. If @q is NULL (or @bs is not contained in @q),
2136  * return the current flags.
2137  */
2138 static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
2139 {
2140     BlockReopenQueueEntry *entry;
2141 
2142     if (q != NULL) {
2143         QTAILQ_FOREACH(entry, q, entry) {
2144             if (entry->state.bs == bs) {
2145                 return entry->state.flags;
2146             }
2147         }
2148     }
2149 
2150     return bs->open_flags;
2151 }
2152 
2153 /* Returns whether the image file can be written to after the reopen queue @q
2154  * has been successfully applied, or right now if @q is NULL. */
2155 static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
2156                                           BlockReopenQueue *q)
2157 {
2158     int flags = bdrv_reopen_get_flags(q, bs);
2159 
2160     return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
2161 }
2162 
2163 /*
2164  * Return whether the BDS can be written to.  This is not necessarily
2165  * the same as !bdrv_is_read_only(bs), as inactivated images may not
2166  * be written to but do not count as read-only images.
2167  */
2168 bool bdrv_is_writable(BlockDriverState *bs)
2169 {
2170     IO_CODE();
2171     return bdrv_is_writable_after_reopen(bs, NULL);
2172 }
2173 
2174 static char *bdrv_child_user_desc(BdrvChild *c)
2175 {
2176     GLOBAL_STATE_CODE();
2177     return c->klass->get_parent_desc(c);
2178 }
2179 
2180 /*
2181  * Check that @a allows everything that @b needs. @a and @b must reference same
2182  * child node.
2183  */
2184 static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
2185 {
2186     const char *child_bs_name;
2187     g_autofree char *a_user = NULL;
2188     g_autofree char *b_user = NULL;
2189     g_autofree char *perms = NULL;
2190 
2191     assert(a->bs);
2192     assert(a->bs == b->bs);
2193     GLOBAL_STATE_CODE();
2194 
2195     if ((b->perm & a->shared_perm) == b->perm) {
2196         return true;
2197     }
2198 
2199     child_bs_name = bdrv_get_node_name(b->bs);
2200     a_user = bdrv_child_user_desc(a);
2201     b_user = bdrv_child_user_desc(b);
2202     perms = bdrv_perm_names(b->perm & ~a->shared_perm);
2203 
2204     error_setg(errp, "Permission conflict on node '%s': permissions '%s' are "
2205                "both required by %s (uses node '%s' as '%s' child) and "
2206                "unshared by %s (uses node '%s' as '%s' child).",
2207                child_bs_name, perms,
2208                b_user, child_bs_name, b->name,
2209                a_user, child_bs_name, a->name);
2210 
2211     return false;
2212 }
2213 
2214 static bool GRAPH_RDLOCK
2215 bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
2216 {
2217     BdrvChild *a, *b;
2218     GLOBAL_STATE_CODE();
2219 
2220     /*
2221      * During the loop we'll look at each pair twice. That's correct because
2222      * bdrv_a_allow_b() is asymmetric and we should check each pair in both
2223      * directions.
2224      */
2225     QLIST_FOREACH(a, &bs->parents, next_parent) {
2226         QLIST_FOREACH(b, &bs->parents, next_parent) {
2227             if (a == b) {
2228                 continue;
2229             }
2230 
2231             if (!bdrv_a_allow_b(a, b, errp)) {
2232                 return true;
2233             }
2234         }
2235     }
2236 
2237     return false;
2238 }
2239 
2240 static void GRAPH_RDLOCK
2241 bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
2242                 BdrvChild *c, BdrvChildRole role,
2243                 BlockReopenQueue *reopen_queue,
2244                 uint64_t parent_perm, uint64_t parent_shared,
2245                 uint64_t *nperm, uint64_t *nshared)
2246 {
2247     assert(bs->drv && bs->drv->bdrv_child_perm);
2248     GLOBAL_STATE_CODE();
2249     bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
2250                              parent_perm, parent_shared,
2251                              nperm, nshared);
2252     /* TODO Take force_share from reopen_queue */
2253     if (child_bs && child_bs->force_share) {
2254         *nshared = BLK_PERM_ALL;
2255     }
2256 }
2257 
2258 /*
2259  * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
2260  * nodes that are already in the @list, of course) so that final list is
2261  * topologically sorted. Return the result (GSList @list object is updated, so
2262  * don't use old reference after function call).
2263  *
2264  * On function start @list must be already topologically sorted and for any node
2265  * in the @list the whole subtree of the node must be in the @list as well. The
2266  * simplest way to satisfy this criteria: use only result of
2267  * bdrv_topological_dfs() or NULL as @list parameter.
2268  */
2269 static GSList * GRAPH_RDLOCK
2270 bdrv_topological_dfs(GSList *list, GHashTable *found, BlockDriverState *bs)
2271 {
2272     BdrvChild *child;
2273     g_autoptr(GHashTable) local_found = NULL;
2274 
2275     GLOBAL_STATE_CODE();
2276 
2277     if (!found) {
2278         assert(!list);
2279         found = local_found = g_hash_table_new(NULL, NULL);
2280     }
2281 
2282     if (g_hash_table_contains(found, bs)) {
2283         return list;
2284     }
2285     g_hash_table_add(found, bs);
2286 
2287     QLIST_FOREACH(child, &bs->children, next) {
2288         list = bdrv_topological_dfs(list, found, child->bs);
2289     }
2290 
2291     return g_slist_prepend(list, bs);
2292 }
2293 
2294 typedef struct BdrvChildSetPermState {
2295     BdrvChild *child;
2296     uint64_t old_perm;
2297     uint64_t old_shared_perm;
2298 } BdrvChildSetPermState;
2299 
2300 static void bdrv_child_set_perm_abort(void *opaque)
2301 {
2302     BdrvChildSetPermState *s = opaque;
2303 
2304     GLOBAL_STATE_CODE();
2305 
2306     s->child->perm = s->old_perm;
2307     s->child->shared_perm = s->old_shared_perm;
2308 }
2309 
2310 static TransactionActionDrv bdrv_child_set_pem_drv = {
2311     .abort = bdrv_child_set_perm_abort,
2312     .clean = g_free,
2313 };
2314 
2315 static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
2316                                 uint64_t shared, Transaction *tran)
2317 {
2318     BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
2319     GLOBAL_STATE_CODE();
2320 
2321     *s = (BdrvChildSetPermState) {
2322         .child = c,
2323         .old_perm = c->perm,
2324         .old_shared_perm = c->shared_perm,
2325     };
2326 
2327     c->perm = perm;
2328     c->shared_perm = shared;
2329 
2330     tran_add(tran, &bdrv_child_set_pem_drv, s);
2331 }
2332 
2333 static void GRAPH_RDLOCK bdrv_drv_set_perm_commit(void *opaque)
2334 {
2335     BlockDriverState *bs = opaque;
2336     uint64_t cumulative_perms, cumulative_shared_perms;
2337     GLOBAL_STATE_CODE();
2338 
2339     if (bs->drv->bdrv_set_perm) {
2340         bdrv_get_cumulative_perm(bs, &cumulative_perms,
2341                                  &cumulative_shared_perms);
2342         bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
2343     }
2344 }
2345 
2346 static void GRAPH_RDLOCK bdrv_drv_set_perm_abort(void *opaque)
2347 {
2348     BlockDriverState *bs = opaque;
2349     GLOBAL_STATE_CODE();
2350 
2351     if (bs->drv->bdrv_abort_perm_update) {
2352         bs->drv->bdrv_abort_perm_update(bs);
2353     }
2354 }
2355 
2356 TransactionActionDrv bdrv_drv_set_perm_drv = {
2357     .abort = bdrv_drv_set_perm_abort,
2358     .commit = bdrv_drv_set_perm_commit,
2359 };
2360 
2361 /*
2362  * After calling this function, the transaction @tran may only be completed
2363  * while holding a reader lock for the graph.
2364  */
2365 static int GRAPH_RDLOCK
2366 bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared_perm,
2367                   Transaction *tran, Error **errp)
2368 {
2369     GLOBAL_STATE_CODE();
2370     if (!bs->drv) {
2371         return 0;
2372     }
2373 
2374     if (bs->drv->bdrv_check_perm) {
2375         int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
2376         if (ret < 0) {
2377             return ret;
2378         }
2379     }
2380 
2381     if (tran) {
2382         tran_add(tran, &bdrv_drv_set_perm_drv, bs);
2383     }
2384 
2385     return 0;
2386 }
2387 
2388 typedef struct BdrvReplaceChildState {
2389     BdrvChild *child;
2390     BlockDriverState *old_bs;
2391 } BdrvReplaceChildState;
2392 
2393 static void GRAPH_WRLOCK bdrv_replace_child_commit(void *opaque)
2394 {
2395     BdrvReplaceChildState *s = opaque;
2396     GLOBAL_STATE_CODE();
2397 
2398     bdrv_schedule_unref(s->old_bs);
2399 }
2400 
2401 static void GRAPH_WRLOCK bdrv_replace_child_abort(void *opaque)
2402 {
2403     BdrvReplaceChildState *s = opaque;
2404     BlockDriverState *new_bs = s->child->bs;
2405 
2406     GLOBAL_STATE_CODE();
2407     assert_bdrv_graph_writable();
2408 
2409     /* old_bs reference is transparently moved from @s to @s->child */
2410     if (!s->child->bs) {
2411         /*
2412          * The parents were undrained when removing old_bs from the child. New
2413          * requests can't have been made, though, because the child was empty.
2414          *
2415          * TODO Make bdrv_replace_child_noperm() transactionable to avoid
2416          * undraining the parent in the first place. Once this is done, having
2417          * new_bs drained when calling bdrv_replace_child_tran() is not a
2418          * requirement any more.
2419          */
2420         bdrv_parent_drained_begin_single(s->child);
2421         assert(!bdrv_parent_drained_poll_single(s->child));
2422     }
2423     assert(s->child->quiesced_parent);
2424     bdrv_replace_child_noperm(s->child, s->old_bs);
2425 
2426     bdrv_unref(new_bs);
2427 }
2428 
2429 static TransactionActionDrv bdrv_replace_child_drv = {
2430     .commit = bdrv_replace_child_commit,
2431     .abort = bdrv_replace_child_abort,
2432     .clean = g_free,
2433 };
2434 
2435 /*
2436  * bdrv_replace_child_tran
2437  *
2438  * Note: real unref of old_bs is done only on commit.
2439  *
2440  * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
2441  * kept drained until the transaction is completed.
2442  *
2443  * After calling this function, the transaction @tran may only be completed
2444  * while holding a writer lock for the graph.
2445  *
2446  * The function doesn't update permissions, caller is responsible for this.
2447  */
2448 static void GRAPH_WRLOCK
2449 bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
2450                         Transaction *tran)
2451 {
2452     BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
2453 
2454     assert(child->quiesced_parent);
2455     assert(!new_bs || new_bs->quiesce_counter);
2456 
2457     *s = (BdrvReplaceChildState) {
2458         .child = child,
2459         .old_bs = child->bs,
2460     };
2461     tran_add(tran, &bdrv_replace_child_drv, s);
2462 
2463     if (new_bs) {
2464         bdrv_ref(new_bs);
2465     }
2466 
2467     bdrv_replace_child_noperm(child, new_bs);
2468     /* old_bs reference is transparently moved from @child to @s */
2469 }
2470 
2471 /*
2472  * Refresh permissions in @bs subtree. The function is intended to be called
2473  * after some graph modification that was done without permission update.
2474  *
2475  * After calling this function, the transaction @tran may only be completed
2476  * while holding a reader lock for the graph.
2477  */
2478 static int GRAPH_RDLOCK
2479 bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
2480                        Transaction *tran, Error **errp)
2481 {
2482     BlockDriver *drv = bs->drv;
2483     BdrvChild *c;
2484     int ret;
2485     uint64_t cumulative_perms, cumulative_shared_perms;
2486     GLOBAL_STATE_CODE();
2487 
2488     bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
2489 
2490     /* Write permissions never work with read-only images */
2491     if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2492         !bdrv_is_writable_after_reopen(bs, q))
2493     {
2494         if (!bdrv_is_writable_after_reopen(bs, NULL)) {
2495             error_setg(errp, "Block node is read-only");
2496         } else {
2497             error_setg(errp, "Read-only block node '%s' cannot support "
2498                        "read-write users", bdrv_get_node_name(bs));
2499         }
2500 
2501         return -EPERM;
2502     }
2503 
2504     /*
2505      * Unaligned requests will automatically be aligned to bl.request_alignment
2506      * and without RESIZE we can't extend requests to write to space beyond the
2507      * end of the image, so it's required that the image size is aligned.
2508      */
2509     if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2510         !(cumulative_perms & BLK_PERM_RESIZE))
2511     {
2512         if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
2513             error_setg(errp, "Cannot get 'write' permission without 'resize': "
2514                              "Image size is not a multiple of request "
2515                              "alignment");
2516             return -EPERM;
2517         }
2518     }
2519 
2520     /* Check this node */
2521     if (!drv) {
2522         return 0;
2523     }
2524 
2525     ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
2526                             errp);
2527     if (ret < 0) {
2528         return ret;
2529     }
2530 
2531     /* Drivers that never have children can omit .bdrv_child_perm() */
2532     if (!drv->bdrv_child_perm) {
2533         assert(QLIST_EMPTY(&bs->children));
2534         return 0;
2535     }
2536 
2537     /* Check all children */
2538     QLIST_FOREACH(c, &bs->children, next) {
2539         uint64_t cur_perm, cur_shared;
2540 
2541         bdrv_child_perm(bs, c->bs, c, c->role, q,
2542                         cumulative_perms, cumulative_shared_perms,
2543                         &cur_perm, &cur_shared);
2544         bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
2545     }
2546 
2547     return 0;
2548 }
2549 
2550 /*
2551  * @list is a product of bdrv_topological_dfs() (may be called several times) -
2552  * a topologically sorted subgraph.
2553  *
2554  * After calling this function, the transaction @tran may only be completed
2555  * while holding a reader lock for the graph.
2556  */
2557 static int GRAPH_RDLOCK
2558 bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
2559                       Error **errp)
2560 {
2561     int ret;
2562     BlockDriverState *bs;
2563     GLOBAL_STATE_CODE();
2564 
2565     for ( ; list; list = list->next) {
2566         bs = list->data;
2567 
2568         if (bdrv_parent_perms_conflict(bs, errp)) {
2569             return -EINVAL;
2570         }
2571 
2572         ret = bdrv_node_refresh_perm(bs, q, tran, errp);
2573         if (ret < 0) {
2574             return ret;
2575         }
2576     }
2577 
2578     return 0;
2579 }
2580 
2581 /*
2582  * @list is any list of nodes. List is completed by all subtrees and
2583  * topologically sorted. It's not a problem if some node occurs in the @list
2584  * several times.
2585  *
2586  * After calling this function, the transaction @tran may only be completed
2587  * while holding a reader lock for the graph.
2588  */
2589 static int GRAPH_RDLOCK
2590 bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
2591                         Error **errp)
2592 {
2593     g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL);
2594     g_autoptr(GSList) refresh_list = NULL;
2595 
2596     for ( ; list; list = list->next) {
2597         refresh_list = bdrv_topological_dfs(refresh_list, found, list->data);
2598     }
2599 
2600     return bdrv_do_refresh_perms(refresh_list, q, tran, errp);
2601 }
2602 
2603 void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
2604                               uint64_t *shared_perm)
2605 {
2606     BdrvChild *c;
2607     uint64_t cumulative_perms = 0;
2608     uint64_t cumulative_shared_perms = BLK_PERM_ALL;
2609 
2610     GLOBAL_STATE_CODE();
2611 
2612     QLIST_FOREACH(c, &bs->parents, next_parent) {
2613         cumulative_perms |= c->perm;
2614         cumulative_shared_perms &= c->shared_perm;
2615     }
2616 
2617     *perm = cumulative_perms;
2618     *shared_perm = cumulative_shared_perms;
2619 }
2620 
2621 char *bdrv_perm_names(uint64_t perm)
2622 {
2623     struct perm_name {
2624         uint64_t perm;
2625         const char *name;
2626     } permissions[] = {
2627         { BLK_PERM_CONSISTENT_READ, "consistent read" },
2628         { BLK_PERM_WRITE,           "write" },
2629         { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
2630         { BLK_PERM_RESIZE,          "resize" },
2631         { 0, NULL }
2632     };
2633 
2634     GString *result = g_string_sized_new(30);
2635     struct perm_name *p;
2636 
2637     for (p = permissions; p->name; p++) {
2638         if (perm & p->perm) {
2639             if (result->len > 0) {
2640                 g_string_append(result, ", ");
2641             }
2642             g_string_append(result, p->name);
2643         }
2644     }
2645 
2646     return g_string_free(result, FALSE);
2647 }
2648 
2649 
2650 /*
2651  * @tran is allowed to be NULL. In this case no rollback is possible.
2652  *
2653  * After calling this function, the transaction @tran may only be completed
2654  * while holding a reader lock for the graph.
2655  */
2656 static int GRAPH_RDLOCK
2657 bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran, Error **errp)
2658 {
2659     int ret;
2660     Transaction *local_tran = NULL;
2661     g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
2662     GLOBAL_STATE_CODE();
2663 
2664     if (!tran) {
2665         tran = local_tran = tran_new();
2666     }
2667 
2668     ret = bdrv_do_refresh_perms(list, NULL, tran, errp);
2669 
2670     if (local_tran) {
2671         tran_finalize(local_tran, ret);
2672     }
2673 
2674     return ret;
2675 }
2676 
2677 int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
2678                             Error **errp)
2679 {
2680     Error *local_err = NULL;
2681     Transaction *tran = tran_new();
2682     int ret;
2683 
2684     GLOBAL_STATE_CODE();
2685 
2686     bdrv_child_set_perm(c, perm, shared, tran);
2687 
2688     ret = bdrv_refresh_perms(c->bs, tran, &local_err);
2689 
2690     tran_finalize(tran, ret);
2691 
2692     if (ret < 0) {
2693         if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
2694             /* tighten permissions */
2695             error_propagate(errp, local_err);
2696         } else {
2697             /*
2698              * Our caller may intend to only loosen restrictions and
2699              * does not expect this function to fail.  Errors are not
2700              * fatal in such a case, so we can just hide them from our
2701              * caller.
2702              */
2703             error_free(local_err);
2704             ret = 0;
2705         }
2706     }
2707 
2708     return ret;
2709 }
2710 
2711 int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
2712 {
2713     uint64_t parent_perms, parent_shared;
2714     uint64_t perms, shared;
2715 
2716     GLOBAL_STATE_CODE();
2717 
2718     bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
2719     bdrv_child_perm(bs, c->bs, c, c->role, NULL,
2720                     parent_perms, parent_shared, &perms, &shared);
2721 
2722     return bdrv_child_try_set_perm(c, perms, shared, errp);
2723 }
2724 
2725 /*
2726  * Default implementation for .bdrv_child_perm() for block filters:
2727  * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
2728  * filtered child.
2729  */
2730 static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
2731                                       BdrvChildRole role,
2732                                       BlockReopenQueue *reopen_queue,
2733                                       uint64_t perm, uint64_t shared,
2734                                       uint64_t *nperm, uint64_t *nshared)
2735 {
2736     GLOBAL_STATE_CODE();
2737     *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
2738     *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
2739 }
2740 
2741 static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
2742                                        BdrvChildRole role,
2743                                        BlockReopenQueue *reopen_queue,
2744                                        uint64_t perm, uint64_t shared,
2745                                        uint64_t *nperm, uint64_t *nshared)
2746 {
2747     assert(role & BDRV_CHILD_COW);
2748     GLOBAL_STATE_CODE();
2749 
2750     /*
2751      * We want consistent read from backing files if the parent needs it.
2752      * No other operations are performed on backing files.
2753      */
2754     perm &= BLK_PERM_CONSISTENT_READ;
2755 
2756     /*
2757      * If the parent can deal with changing data, we're okay with a
2758      * writable and resizable backing file.
2759      * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
2760      */
2761     if (shared & BLK_PERM_WRITE) {
2762         shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2763     } else {
2764         shared = 0;
2765     }
2766 
2767     shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
2768 
2769     if (bs->open_flags & BDRV_O_INACTIVE) {
2770         shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2771     }
2772 
2773     *nperm = perm;
2774     *nshared = shared;
2775 }
2776 
2777 static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
2778                                            BdrvChildRole role,
2779                                            BlockReopenQueue *reopen_queue,
2780                                            uint64_t perm, uint64_t shared,
2781                                            uint64_t *nperm, uint64_t *nshared)
2782 {
2783     int flags;
2784 
2785     GLOBAL_STATE_CODE();
2786     assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
2787 
2788     flags = bdrv_reopen_get_flags(reopen_queue, bs);
2789 
2790     /*
2791      * Apart from the modifications below, the same permissions are
2792      * forwarded and left alone as for filters
2793      */
2794     bdrv_filter_default_perms(bs, c, role, reopen_queue,
2795                               perm, shared, &perm, &shared);
2796 
2797     if (role & BDRV_CHILD_METADATA) {
2798         /* Format drivers may touch metadata even if the guest doesn't write */
2799         if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
2800             perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2801         }
2802 
2803         /*
2804          * bs->file always needs to be consistent because of the
2805          * metadata. We can never allow other users to resize or write
2806          * to it.
2807          */
2808         if (!(flags & BDRV_O_NO_IO)) {
2809             perm |= BLK_PERM_CONSISTENT_READ;
2810         }
2811         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
2812     }
2813 
2814     if (role & BDRV_CHILD_DATA) {
2815         /*
2816          * Technically, everything in this block is a subset of the
2817          * BDRV_CHILD_METADATA path taken above, and so this could
2818          * be an "else if" branch.  However, that is not obvious, and
2819          * this function is not performance critical, therefore we let
2820          * this be an independent "if".
2821          */
2822 
2823         /*
2824          * We cannot allow other users to resize the file because the
2825          * format driver might have some assumptions about the size
2826          * (e.g. because it is stored in metadata, or because the file
2827          * is split into fixed-size data files).
2828          */
2829         shared &= ~BLK_PERM_RESIZE;
2830 
2831         /*
2832          * WRITE_UNCHANGED often cannot be performed as such on the
2833          * data file.  For example, the qcow2 driver may still need to
2834          * write copied clusters on copy-on-read.
2835          */
2836         if (perm & BLK_PERM_WRITE_UNCHANGED) {
2837             perm |= BLK_PERM_WRITE;
2838         }
2839 
2840         /*
2841          * If the data file is written to, the format driver may
2842          * expect to be able to resize it by writing beyond the EOF.
2843          */
2844         if (perm & BLK_PERM_WRITE) {
2845             perm |= BLK_PERM_RESIZE;
2846         }
2847     }
2848 
2849     if (bs->open_flags & BDRV_O_INACTIVE) {
2850         shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2851     }
2852 
2853     *nperm = perm;
2854     *nshared = shared;
2855 }
2856 
2857 void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
2858                         BdrvChildRole role, BlockReopenQueue *reopen_queue,
2859                         uint64_t perm, uint64_t shared,
2860                         uint64_t *nperm, uint64_t *nshared)
2861 {
2862     GLOBAL_STATE_CODE();
2863     if (role & BDRV_CHILD_FILTERED) {
2864         assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
2865                          BDRV_CHILD_COW)));
2866         bdrv_filter_default_perms(bs, c, role, reopen_queue,
2867                                   perm, shared, nperm, nshared);
2868     } else if (role & BDRV_CHILD_COW) {
2869         assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
2870         bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
2871                                    perm, shared, nperm, nshared);
2872     } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
2873         bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
2874                                        perm, shared, nperm, nshared);
2875     } else {
2876         g_assert_not_reached();
2877     }
2878 }
2879 
2880 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
2881 {
2882     static const uint64_t permissions[] = {
2883         [BLOCK_PERMISSION_CONSISTENT_READ]  = BLK_PERM_CONSISTENT_READ,
2884         [BLOCK_PERMISSION_WRITE]            = BLK_PERM_WRITE,
2885         [BLOCK_PERMISSION_WRITE_UNCHANGED]  = BLK_PERM_WRITE_UNCHANGED,
2886         [BLOCK_PERMISSION_RESIZE]           = BLK_PERM_RESIZE,
2887     };
2888 
2889     QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
2890     QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
2891 
2892     assert(qapi_perm < BLOCK_PERMISSION__MAX);
2893 
2894     return permissions[qapi_perm];
2895 }
2896 
2897 /*
2898  * Replaces the node that a BdrvChild points to without updating permissions.
2899  *
2900  * If @new_bs is non-NULL, the parent of @child must already be drained through
2901  * @child and the caller must hold the AioContext lock for @new_bs.
2902  */
2903 static void GRAPH_WRLOCK
2904 bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
2905 {
2906     BlockDriverState *old_bs = child->bs;
2907     int new_bs_quiesce_counter;
2908 
2909     assert(!child->frozen);
2910 
2911     /*
2912      * If we want to change the BdrvChild to point to a drained node as its new
2913      * child->bs, we need to make sure that its new parent is drained, too. In
2914      * other words, either child->quiesce_parent must already be true or we must
2915      * be able to set it and keep the parent's quiesce_counter consistent with
2916      * that, but without polling or starting new requests (this function
2917      * guarantees that it doesn't poll, and starting new requests would be
2918      * against the invariants of drain sections).
2919      *
2920      * To keep things simple, we pick the first option (child->quiesce_parent
2921      * must already be true). We also generalise the rule a bit to make it
2922      * easier to verify in callers and more likely to be covered in test cases:
2923      * The parent must be quiesced through this child even if new_bs isn't
2924      * currently drained.
2925      *
2926      * The only exception is for callers that always pass new_bs == NULL. In
2927      * this case, we obviously never need to consider the case of a drained
2928      * new_bs, so we can keep the callers simpler by allowing them not to drain
2929      * the parent.
2930      */
2931     assert(!new_bs || child->quiesced_parent);
2932     assert(old_bs != new_bs);
2933     GLOBAL_STATE_CODE();
2934 
2935     if (old_bs && new_bs) {
2936         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
2937     }
2938 
2939     if (old_bs) {
2940         if (child->klass->detach) {
2941             child->klass->detach(child);
2942         }
2943         QLIST_REMOVE(child, next_parent);
2944     }
2945 
2946     child->bs = new_bs;
2947 
2948     if (new_bs) {
2949         QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
2950         if (child->klass->attach) {
2951             child->klass->attach(child);
2952         }
2953     }
2954 
2955     /*
2956      * If the parent was drained through this BdrvChild previously, but new_bs
2957      * is not drained, allow requests to come in only after the new node has
2958      * been attached.
2959      */
2960     new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
2961     if (!new_bs_quiesce_counter && child->quiesced_parent) {
2962         bdrv_parent_drained_end_single(child);
2963     }
2964 }
2965 
2966 /**
2967  * Free the given @child.
2968  *
2969  * The child must be empty (i.e. `child->bs == NULL`) and it must be
2970  * unused (i.e. not in a children list).
2971  */
2972 static void bdrv_child_free(BdrvChild *child)
2973 {
2974     assert(!child->bs);
2975     GLOBAL_STATE_CODE();
2976     GRAPH_RDLOCK_GUARD_MAINLOOP();
2977 
2978     assert(!child->next.le_prev); /* not in children list */
2979 
2980     g_free(child->name);
2981     g_free(child);
2982 }
2983 
2984 typedef struct BdrvAttachChildCommonState {
2985     BdrvChild *child;
2986     AioContext *old_parent_ctx;
2987     AioContext *old_child_ctx;
2988 } BdrvAttachChildCommonState;
2989 
2990 static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque)
2991 {
2992     BdrvAttachChildCommonState *s = opaque;
2993     BlockDriverState *bs = s->child->bs;
2994 
2995     GLOBAL_STATE_CODE();
2996     assert_bdrv_graph_writable();
2997 
2998     bdrv_replace_child_noperm(s->child, NULL);
2999 
3000     if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
3001         bdrv_try_change_aio_context(bs, s->old_child_ctx, NULL, &error_abort);
3002     }
3003 
3004     if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) {
3005         Transaction *tran;
3006         GHashTable *visited;
3007         bool ret;
3008 
3009         tran = tran_new();
3010 
3011         /* No need to visit `child`, because it has been detached already */
3012         visited = g_hash_table_new(NULL, NULL);
3013         ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx,
3014                                               visited, tran, &error_abort);
3015         g_hash_table_destroy(visited);
3016 
3017         /* transaction is supposed to always succeed */
3018         assert(ret == true);
3019         tran_commit(tran);
3020     }
3021 
3022     bdrv_schedule_unref(bs);
3023     bdrv_child_free(s->child);
3024 }
3025 
3026 static TransactionActionDrv bdrv_attach_child_common_drv = {
3027     .abort = bdrv_attach_child_common_abort,
3028     .clean = g_free,
3029 };
3030 
3031 /*
3032  * Common part of attaching bdrv child to bs or to blk or to job
3033  *
3034  * Function doesn't update permissions, caller is responsible for this.
3035  *
3036  * After calling this function, the transaction @tran may only be completed
3037  * while holding a writer lock for the graph.
3038  *
3039  * Returns new created child.
3040  *
3041  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
3042  * @child_bs can move to a different AioContext in this function. Callers must
3043  * make sure that their AioContext locking is still correct after this.
3044  */
3045 static BdrvChild * GRAPH_WRLOCK
3046 bdrv_attach_child_common(BlockDriverState *child_bs,
3047                          const char *child_name,
3048                          const BdrvChildClass *child_class,
3049                          BdrvChildRole child_role,
3050                          uint64_t perm, uint64_t shared_perm,
3051                          void *opaque,
3052                          Transaction *tran, Error **errp)
3053 {
3054     BdrvChild *new_child;
3055     AioContext *parent_ctx, *new_child_ctx;
3056     AioContext *child_ctx = bdrv_get_aio_context(child_bs);
3057 
3058     assert(child_class->get_parent_desc);
3059     GLOBAL_STATE_CODE();
3060 
3061     new_child = g_new(BdrvChild, 1);
3062     *new_child = (BdrvChild) {
3063         .bs             = NULL,
3064         .name           = g_strdup(child_name),
3065         .klass          = child_class,
3066         .role           = child_role,
3067         .perm           = perm,
3068         .shared_perm    = shared_perm,
3069         .opaque         = opaque,
3070     };
3071 
3072     /*
3073      * If the AioContexts don't match, first try to move the subtree of
3074      * child_bs into the AioContext of the new parent. If this doesn't work,
3075      * try moving the parent into the AioContext of child_bs instead.
3076      */
3077     parent_ctx = bdrv_child_get_parent_aio_context(new_child);
3078     if (child_ctx != parent_ctx) {
3079         Error *local_err = NULL;
3080         int ret = bdrv_try_change_aio_context(child_bs, parent_ctx, NULL,
3081                                               &local_err);
3082 
3083         if (ret < 0 && child_class->change_aio_ctx) {
3084             Transaction *aio_ctx_tran = tran_new();
3085             GHashTable *visited = g_hash_table_new(NULL, NULL);
3086             bool ret_child;
3087 
3088             g_hash_table_add(visited, new_child);
3089             ret_child = child_class->change_aio_ctx(new_child, child_ctx,
3090                                                     visited, aio_ctx_tran,
3091                                                     NULL);
3092             if (ret_child == true) {
3093                 error_free(local_err);
3094                 ret = 0;
3095             }
3096             tran_finalize(aio_ctx_tran, ret_child == true ? 0 : -1);
3097             g_hash_table_destroy(visited);
3098         }
3099 
3100         if (ret < 0) {
3101             error_propagate(errp, local_err);
3102             bdrv_child_free(new_child);
3103             return NULL;
3104         }
3105     }
3106 
3107     new_child_ctx = bdrv_get_aio_context(child_bs);
3108     if (new_child_ctx != child_ctx) {
3109         aio_context_release(child_ctx);
3110         aio_context_acquire(new_child_ctx);
3111     }
3112 
3113     bdrv_ref(child_bs);
3114     /*
3115      * Let every new BdrvChild start with a drained parent. Inserting the child
3116      * in the graph with bdrv_replace_child_noperm() will undrain it if
3117      * @child_bs is not drained.
3118      *
3119      * The child was only just created and is not yet visible in global state
3120      * until bdrv_replace_child_noperm() inserts it into the graph, so nobody
3121      * could have sent requests and polling is not necessary.
3122      *
3123      * Note that this means that the parent isn't fully drained yet, we only
3124      * stop new requests from coming in. This is fine, we don't care about the
3125      * old requests here, they are not for this child. If another place enters a
3126      * drain section for the same parent, but wants it to be fully quiesced, it
3127      * will not run most of the the code in .drained_begin() again (which is not
3128      * a problem, we already did this), but it will still poll until the parent
3129      * is fully quiesced, so it will not be negatively affected either.
3130      */
3131     bdrv_parent_drained_begin_single(new_child);
3132     bdrv_replace_child_noperm(new_child, child_bs);
3133 
3134     BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
3135     *s = (BdrvAttachChildCommonState) {
3136         .child = new_child,
3137         .old_parent_ctx = parent_ctx,
3138         .old_child_ctx = child_ctx,
3139     };
3140     tran_add(tran, &bdrv_attach_child_common_drv, s);
3141 
3142     if (new_child_ctx != child_ctx) {
3143         aio_context_release(new_child_ctx);
3144         aio_context_acquire(child_ctx);
3145     }
3146 
3147     return new_child;
3148 }
3149 
3150 /*
3151  * Function doesn't update permissions, caller is responsible for this.
3152  *
3153  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
3154  * @child_bs can move to a different AioContext in this function. Callers must
3155  * make sure that their AioContext locking is still correct after this.
3156  *
3157  * After calling this function, the transaction @tran may only be completed
3158  * while holding a writer lock for the graph.
3159  */
3160 static BdrvChild * GRAPH_WRLOCK
3161 bdrv_attach_child_noperm(BlockDriverState *parent_bs,
3162                          BlockDriverState *child_bs,
3163                          const char *child_name,
3164                          const BdrvChildClass *child_class,
3165                          BdrvChildRole child_role,
3166                          Transaction *tran,
3167                          Error **errp)
3168 {
3169     uint64_t perm, shared_perm;
3170 
3171     assert(parent_bs->drv);
3172     GLOBAL_STATE_CODE();
3173 
3174     if (bdrv_recurse_has_child(child_bs, parent_bs)) {
3175         error_setg(errp, "Making '%s' a %s child of '%s' would create a cycle",
3176                    child_bs->node_name, child_name, parent_bs->node_name);
3177         return NULL;
3178     }
3179 
3180     bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
3181     bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
3182                     perm, shared_perm, &perm, &shared_perm);
3183 
3184     return bdrv_attach_child_common(child_bs, child_name, child_class,
3185                                     child_role, perm, shared_perm, parent_bs,
3186                                     tran, errp);
3187 }
3188 
3189 /*
3190  * This function steals the reference to child_bs from the caller.
3191  * That reference is later dropped by bdrv_root_unref_child().
3192  *
3193  * On failure NULL is returned, errp is set and the reference to
3194  * child_bs is also dropped.
3195  *
3196  * The caller must hold the AioContext lock @child_bs, but not that of @ctx
3197  * (unless @child_bs is already in @ctx).
3198  */
3199 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
3200                                   const char *child_name,
3201                                   const BdrvChildClass *child_class,
3202                                   BdrvChildRole child_role,
3203                                   uint64_t perm, uint64_t shared_perm,
3204                                   void *opaque, Error **errp)
3205 {
3206     int ret;
3207     BdrvChild *child;
3208     Transaction *tran = tran_new();
3209 
3210     GLOBAL_STATE_CODE();
3211 
3212     bdrv_graph_wrlock(child_bs);
3213 
3214     child = bdrv_attach_child_common(child_bs, child_name, child_class,
3215                                    child_role, perm, shared_perm, opaque,
3216                                    tran, errp);
3217     if (!child) {
3218         ret = -EINVAL;
3219         goto out;
3220     }
3221 
3222     ret = bdrv_refresh_perms(child_bs, tran, errp);
3223 
3224 out:
3225     tran_finalize(tran, ret);
3226     bdrv_graph_wrunlock();
3227 
3228     bdrv_unref(child_bs);
3229 
3230     return ret < 0 ? NULL : child;
3231 }
3232 
3233 /*
3234  * This function transfers the reference to child_bs from the caller
3235  * to parent_bs. That reference is later dropped by parent_bs on
3236  * bdrv_close() or if someone calls bdrv_unref_child().
3237  *
3238  * On failure NULL is returned, errp is set and the reference to
3239  * child_bs is also dropped.
3240  *
3241  * If @parent_bs and @child_bs are in different AioContexts, the caller must
3242  * hold the AioContext lock for @child_bs, but not for @parent_bs.
3243  */
3244 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
3245                              BlockDriverState *child_bs,
3246                              const char *child_name,
3247                              const BdrvChildClass *child_class,
3248                              BdrvChildRole child_role,
3249                              Error **errp)
3250 {
3251     int ret;
3252     BdrvChild *child;
3253     Transaction *tran = tran_new();
3254 
3255     GLOBAL_STATE_CODE();
3256 
3257     child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name,
3258                                      child_class, child_role, tran, errp);
3259     if (!child) {
3260         ret = -EINVAL;
3261         goto out;
3262     }
3263 
3264     ret = bdrv_refresh_perms(parent_bs, tran, errp);
3265     if (ret < 0) {
3266         goto out;
3267     }
3268 
3269 out:
3270     tran_finalize(tran, ret);
3271 
3272     bdrv_schedule_unref(child_bs);
3273 
3274     return ret < 0 ? NULL : child;
3275 }
3276 
3277 /* Callers must ensure that child->frozen is false. */
3278 void bdrv_root_unref_child(BdrvChild *child)
3279 {
3280     BlockDriverState *child_bs = child->bs;
3281 
3282     GLOBAL_STATE_CODE();
3283     bdrv_replace_child_noperm(child, NULL);
3284     bdrv_child_free(child);
3285 
3286     if (child_bs) {
3287         /*
3288          * Update permissions for old node. We're just taking a parent away, so
3289          * we're loosening restrictions. Errors of permission update are not
3290          * fatal in this case, ignore them.
3291          */
3292         bdrv_refresh_perms(child_bs, NULL, NULL);
3293 
3294         /*
3295          * When the parent requiring a non-default AioContext is removed, the
3296          * node moves back to the main AioContext
3297          */
3298         bdrv_try_change_aio_context(child_bs, qemu_get_aio_context(), NULL,
3299                                     NULL);
3300     }
3301 
3302     bdrv_schedule_unref(child_bs);
3303 }
3304 
3305 typedef struct BdrvSetInheritsFrom {
3306     BlockDriverState *bs;
3307     BlockDriverState *old_inherits_from;
3308 } BdrvSetInheritsFrom;
3309 
3310 static void bdrv_set_inherits_from_abort(void *opaque)
3311 {
3312     BdrvSetInheritsFrom *s = opaque;
3313 
3314     s->bs->inherits_from = s->old_inherits_from;
3315 }
3316 
3317 static TransactionActionDrv bdrv_set_inherits_from_drv = {
3318     .abort = bdrv_set_inherits_from_abort,
3319     .clean = g_free,
3320 };
3321 
3322 /* @tran is allowed to be NULL. In this case no rollback is possible */
3323 static void bdrv_set_inherits_from(BlockDriverState *bs,
3324                                    BlockDriverState *new_inherits_from,
3325                                    Transaction *tran)
3326 {
3327     if (tran) {
3328         BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
3329 
3330         *s = (BdrvSetInheritsFrom) {
3331             .bs = bs,
3332             .old_inherits_from = bs->inherits_from,
3333         };
3334 
3335         tran_add(tran, &bdrv_set_inherits_from_drv, s);
3336     }
3337 
3338     bs->inherits_from = new_inherits_from;
3339 }
3340 
3341 /**
3342  * Clear all inherits_from pointers from children and grandchildren of
3343  * @root that point to @root, where necessary.
3344  * @tran is allowed to be NULL. In this case no rollback is possible
3345  */
3346 static void GRAPH_WRLOCK
3347 bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
3348                          Transaction *tran)
3349 {
3350     BdrvChild *c;
3351 
3352     if (child->bs->inherits_from == root) {
3353         /*
3354          * Remove inherits_from only when the last reference between root and
3355          * child->bs goes away.
3356          */
3357         QLIST_FOREACH(c, &root->children, next) {
3358             if (c != child && c->bs == child->bs) {
3359                 break;
3360             }
3361         }
3362         if (c == NULL) {
3363             bdrv_set_inherits_from(child->bs, NULL, tran);
3364         }
3365     }
3366 
3367     QLIST_FOREACH(c, &child->bs->children, next) {
3368         bdrv_unset_inherits_from(root, c, tran);
3369     }
3370 }
3371 
3372 /* Callers must ensure that child->frozen is false. */
3373 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
3374 {
3375     GLOBAL_STATE_CODE();
3376     if (child == NULL) {
3377         return;
3378     }
3379 
3380     bdrv_unset_inherits_from(parent, child, NULL);
3381     bdrv_root_unref_child(child);
3382 }
3383 
3384 
3385 static void GRAPH_RDLOCK
3386 bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
3387 {
3388     BdrvChild *c;
3389     GLOBAL_STATE_CODE();
3390     QLIST_FOREACH(c, &bs->parents, next_parent) {
3391         if (c->klass->change_media) {
3392             c->klass->change_media(c, load);
3393         }
3394     }
3395 }
3396 
3397 /* Return true if you can reach parent going through child->inherits_from
3398  * recursively. If parent or child are NULL, return false */
3399 static bool bdrv_inherits_from_recursive(BlockDriverState *child,
3400                                          BlockDriverState *parent)
3401 {
3402     while (child && child != parent) {
3403         child = child->inherits_from;
3404     }
3405 
3406     return child != NULL;
3407 }
3408 
3409 /*
3410  * Return the BdrvChildRole for @bs's backing child.  bs->backing is
3411  * mostly used for COW backing children (role = COW), but also for
3412  * filtered children (role = FILTERED | PRIMARY).
3413  */
3414 static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
3415 {
3416     if (bs->drv && bs->drv->is_filter) {
3417         return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3418     } else {
3419         return BDRV_CHILD_COW;
3420     }
3421 }
3422 
3423 /*
3424  * Sets the bs->backing or bs->file link of a BDS. A new reference is created;
3425  * callers which don't need their own reference any more must call bdrv_unref().
3426  *
3427  * If the respective child is already present (i.e. we're detaching a node),
3428  * that child node must be drained.
3429  *
3430  * Function doesn't update permissions, caller is responsible for this.
3431  *
3432  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
3433  * @child_bs can move to a different AioContext in this function. Callers must
3434  * make sure that their AioContext locking is still correct after this.
3435  *
3436  * After calling this function, the transaction @tran may only be completed
3437  * while holding a writer lock for the graph.
3438  */
3439 static int GRAPH_WRLOCK
3440 bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
3441                                 BlockDriverState *child_bs,
3442                                 bool is_backing,
3443                                 Transaction *tran, Error **errp)
3444 {
3445     bool update_inherits_from =
3446         bdrv_inherits_from_recursive(child_bs, parent_bs);
3447     BdrvChild *child = is_backing ? parent_bs->backing : parent_bs->file;
3448     BdrvChildRole role;
3449 
3450     GLOBAL_STATE_CODE();
3451 
3452     if (!parent_bs->drv) {
3453         /*
3454          * Node without drv is an object without a class :/. TODO: finally fix
3455          * qcow2 driver to never clear bs->drv and implement format corruption
3456          * handling in other way.
3457          */
3458         error_setg(errp, "Node corrupted");
3459         return -EINVAL;
3460     }
3461 
3462     if (child && child->frozen) {
3463         error_setg(errp, "Cannot change frozen '%s' link from '%s' to '%s'",
3464                    child->name, parent_bs->node_name, child->bs->node_name);
3465         return -EPERM;
3466     }
3467 
3468     if (is_backing && !parent_bs->drv->is_filter &&
3469         !parent_bs->drv->supports_backing)
3470     {
3471         error_setg(errp, "Driver '%s' of node '%s' does not support backing "
3472                    "files", parent_bs->drv->format_name, parent_bs->node_name);
3473         return -EINVAL;
3474     }
3475 
3476     if (parent_bs->drv->is_filter) {
3477         role = BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3478     } else if (is_backing) {
3479         role = BDRV_CHILD_COW;
3480     } else {
3481         /*
3482          * We only can use same role as it is in existing child. We don't have
3483          * infrastructure to determine role of file child in generic way
3484          */
3485         if (!child) {
3486             error_setg(errp, "Cannot set file child to format node without "
3487                        "file child");
3488             return -EINVAL;
3489         }
3490         role = child->role;
3491     }
3492 
3493     if (child) {
3494         assert(child->bs->quiesce_counter);
3495         bdrv_unset_inherits_from(parent_bs, child, tran);
3496         bdrv_remove_child(child, tran);
3497     }
3498 
3499     if (!child_bs) {
3500         goto out;
3501     }
3502 
3503     child = bdrv_attach_child_noperm(parent_bs, child_bs,
3504                                      is_backing ? "backing" : "file",
3505                                      &child_of_bds, role,
3506                                      tran, errp);
3507     if (!child) {
3508         return -EINVAL;
3509     }
3510 
3511 
3512     /*
3513      * If inherits_from pointed recursively to bs then let's update it to
3514      * point directly to bs (else it will become NULL).
3515      */
3516     if (update_inherits_from) {
3517         bdrv_set_inherits_from(child_bs, parent_bs, tran);
3518     }
3519 
3520 out:
3521     bdrv_refresh_limits(parent_bs, tran, NULL);
3522 
3523     return 0;
3524 }
3525 
3526 /*
3527  * The caller must hold the AioContext lock for @backing_hd. Both @bs and
3528  * @backing_hd can move to a different AioContext in this function. Callers must
3529  * make sure that their AioContext locking is still correct after this.
3530  *
3531  * If a backing child is already present (i.e. we're detaching a node), that
3532  * child node must be drained.
3533  *
3534  * After calling this function, the transaction @tran may only be completed
3535  * while holding a writer lock for the graph.
3536  */
3537 static int GRAPH_WRLOCK
3538 bdrv_set_backing_noperm(BlockDriverState *bs,
3539                         BlockDriverState *backing_hd,
3540                         Transaction *tran, Error **errp)
3541 {
3542     GLOBAL_STATE_CODE();
3543     return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
3544 }
3545 
3546 int bdrv_set_backing_hd_drained(BlockDriverState *bs,
3547                                 BlockDriverState *backing_hd,
3548                                 Error **errp)
3549 {
3550     int ret;
3551     Transaction *tran = tran_new();
3552 
3553     GLOBAL_STATE_CODE();
3554     assert(bs->quiesce_counter > 0);
3555     if (bs->backing) {
3556         assert(bs->backing->bs->quiesce_counter > 0);
3557     }
3558     bdrv_graph_wrlock(backing_hd);
3559 
3560     ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
3561     if (ret < 0) {
3562         goto out;
3563     }
3564 
3565     ret = bdrv_refresh_perms(bs, tran, errp);
3566 out:
3567     tran_finalize(tran, ret);
3568     bdrv_graph_wrunlock();
3569     return ret;
3570 }
3571 
3572 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
3573                         Error **errp)
3574 {
3575     BlockDriverState *drain_bs = bs->backing ? bs->backing->bs : bs;
3576     int ret;
3577     GLOBAL_STATE_CODE();
3578 
3579     bdrv_ref(drain_bs);
3580     bdrv_drained_begin(drain_bs);
3581     ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
3582     bdrv_drained_end(drain_bs);
3583     bdrv_unref(drain_bs);
3584 
3585     return ret;
3586 }
3587 
3588 /*
3589  * Opens the backing file for a BlockDriverState if not yet open
3590  *
3591  * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
3592  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3593  * itself, all options starting with "${bdref_key}." are considered part of the
3594  * BlockdevRef.
3595  *
3596  * The caller must hold the main AioContext lock.
3597  *
3598  * TODO Can this be unified with bdrv_open_image()?
3599  */
3600 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
3601                            const char *bdref_key, Error **errp)
3602 {
3603     char *backing_filename = NULL;
3604     char *bdref_key_dot;
3605     const char *reference = NULL;
3606     int ret = 0;
3607     bool implicit_backing = false;
3608     BlockDriverState *backing_hd;
3609     AioContext *backing_hd_ctx;
3610     QDict *options;
3611     QDict *tmp_parent_options = NULL;
3612     Error *local_err = NULL;
3613 
3614     GLOBAL_STATE_CODE();
3615 
3616     if (bs->backing != NULL) {
3617         goto free_exit;
3618     }
3619 
3620     /* NULL means an empty set of options */
3621     if (parent_options == NULL) {
3622         tmp_parent_options = qdict_new();
3623         parent_options = tmp_parent_options;
3624     }
3625 
3626     bs->open_flags &= ~BDRV_O_NO_BACKING;
3627 
3628     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3629     qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
3630     g_free(bdref_key_dot);
3631 
3632     /*
3633      * Caution: while qdict_get_try_str() is fine, getting non-string
3634      * types would require more care.  When @parent_options come from
3635      * -blockdev or blockdev_add, its members are typed according to
3636      * the QAPI schema, but when they come from -drive, they're all
3637      * QString.
3638      */
3639     reference = qdict_get_try_str(parent_options, bdref_key);
3640     if (reference || qdict_haskey(options, "file.filename")) {
3641         /* keep backing_filename NULL */
3642     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
3643         qobject_unref(options);
3644         goto free_exit;
3645     } else {
3646         if (qdict_size(options) == 0) {
3647             /* If the user specifies options that do not modify the
3648              * backing file's behavior, we might still consider it the
3649              * implicit backing file.  But it's easier this way, and
3650              * just specifying some of the backing BDS's options is
3651              * only possible with -drive anyway (otherwise the QAPI
3652              * schema forces the user to specify everything). */
3653             implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
3654         }
3655 
3656         bdrv_graph_rdlock_main_loop();
3657         backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
3658         bdrv_graph_rdunlock_main_loop();
3659 
3660         if (local_err) {
3661             ret = -EINVAL;
3662             error_propagate(errp, local_err);
3663             qobject_unref(options);
3664             goto free_exit;
3665         }
3666     }
3667 
3668     if (!bs->drv || !bs->drv->supports_backing) {
3669         ret = -EINVAL;
3670         error_setg(errp, "Driver doesn't support backing files");
3671         qobject_unref(options);
3672         goto free_exit;
3673     }
3674 
3675     if (!reference &&
3676         bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
3677         qdict_put_str(options, "driver", bs->backing_format);
3678     }
3679 
3680     backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
3681                                    &child_of_bds, bdrv_backing_role(bs), errp);
3682     if (!backing_hd) {
3683         bs->open_flags |= BDRV_O_NO_BACKING;
3684         error_prepend(errp, "Could not open backing file: ");
3685         ret = -EINVAL;
3686         goto free_exit;
3687     }
3688 
3689     if (implicit_backing) {
3690         bdrv_graph_rdlock_main_loop();
3691         bdrv_refresh_filename(backing_hd);
3692         bdrv_graph_rdunlock_main_loop();
3693         pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
3694                 backing_hd->filename);
3695     }
3696 
3697     /* Hook up the backing file link; drop our reference, bs owns the
3698      * backing_hd reference now */
3699     backing_hd_ctx = bdrv_get_aio_context(backing_hd);
3700     aio_context_acquire(backing_hd_ctx);
3701     ret = bdrv_set_backing_hd(bs, backing_hd, errp);
3702     bdrv_unref(backing_hd);
3703     aio_context_release(backing_hd_ctx);
3704 
3705     if (ret < 0) {
3706         goto free_exit;
3707     }
3708 
3709     qdict_del(parent_options, bdref_key);
3710 
3711 free_exit:
3712     g_free(backing_filename);
3713     qobject_unref(tmp_parent_options);
3714     return ret;
3715 }
3716 
3717 static BlockDriverState *
3718 bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
3719                    BlockDriverState *parent, const BdrvChildClass *child_class,
3720                    BdrvChildRole child_role, bool allow_none, Error **errp)
3721 {
3722     BlockDriverState *bs = NULL;
3723     QDict *image_options;
3724     char *bdref_key_dot;
3725     const char *reference;
3726 
3727     assert(child_class != NULL);
3728 
3729     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3730     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
3731     g_free(bdref_key_dot);
3732 
3733     /*
3734      * Caution: while qdict_get_try_str() is fine, getting non-string
3735      * types would require more care.  When @options come from
3736      * -blockdev or blockdev_add, its members are typed according to
3737      * the QAPI schema, but when they come from -drive, they're all
3738      * QString.
3739      */
3740     reference = qdict_get_try_str(options, bdref_key);
3741     if (!filename && !reference && !qdict_size(image_options)) {
3742         if (!allow_none) {
3743             error_setg(errp, "A block device must be specified for \"%s\"",
3744                        bdref_key);
3745         }
3746         qobject_unref(image_options);
3747         goto done;
3748     }
3749 
3750     bs = bdrv_open_inherit(filename, reference, image_options, 0,
3751                            parent, child_class, child_role, errp);
3752     if (!bs) {
3753         goto done;
3754     }
3755 
3756 done:
3757     qdict_del(options, bdref_key);
3758     return bs;
3759 }
3760 
3761 /*
3762  * Opens a disk image whose options are given as BlockdevRef in another block
3763  * device's options.
3764  *
3765  * If allow_none is true, no image will be opened if filename is false and no
3766  * BlockdevRef is given. NULL will be returned, but errp remains unset.
3767  *
3768  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
3769  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3770  * itself, all options starting with "${bdref_key}." are considered part of the
3771  * BlockdevRef.
3772  *
3773  * The BlockdevRef will be removed from the options QDict.
3774  *
3775  * The caller must hold the lock of the main AioContext and no other AioContext.
3776  * @parent can move to a different AioContext in this function. Callers must
3777  * make sure that their AioContext locking is still correct after this.
3778  */
3779 BdrvChild *bdrv_open_child(const char *filename,
3780                            QDict *options, const char *bdref_key,
3781                            BlockDriverState *parent,
3782                            const BdrvChildClass *child_class,
3783                            BdrvChildRole child_role,
3784                            bool allow_none, Error **errp)
3785 {
3786     BlockDriverState *bs;
3787     BdrvChild *child;
3788     AioContext *ctx;
3789 
3790     GLOBAL_STATE_CODE();
3791 
3792     bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
3793                             child_role, allow_none, errp);
3794     if (bs == NULL) {
3795         return NULL;
3796     }
3797 
3798     bdrv_graph_wrlock(NULL);
3799     ctx = bdrv_get_aio_context(bs);
3800     aio_context_acquire(ctx);
3801     child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
3802                               errp);
3803     aio_context_release(ctx);
3804     bdrv_graph_wrunlock();
3805 
3806     return child;
3807 }
3808 
3809 /*
3810  * Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
3811  *
3812  * The caller must hold the lock of the main AioContext and no other AioContext.
3813  * @parent can move to a different AioContext in this function. Callers must
3814  * make sure that their AioContext locking is still correct after this.
3815  */
3816 int bdrv_open_file_child(const char *filename,
3817                          QDict *options, const char *bdref_key,
3818                          BlockDriverState *parent, Error **errp)
3819 {
3820     BdrvChildRole role;
3821 
3822     /* commit_top and mirror_top don't use this function */
3823     assert(!parent->drv->filtered_child_is_backing);
3824     role = parent->drv->is_filter ?
3825         (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE;
3826 
3827     if (!bdrv_open_child(filename, options, bdref_key, parent,
3828                          &child_of_bds, role, false, errp))
3829     {
3830         return -EINVAL;
3831     }
3832 
3833     return 0;
3834 }
3835 
3836 /*
3837  * TODO Future callers may need to specify parent/child_class in order for
3838  * option inheritance to work. Existing callers use it for the root node.
3839  */
3840 BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
3841 {
3842     BlockDriverState *bs = NULL;
3843     QObject *obj = NULL;
3844     QDict *qdict = NULL;
3845     const char *reference = NULL;
3846     Visitor *v = NULL;
3847 
3848     GLOBAL_STATE_CODE();
3849 
3850     if (ref->type == QTYPE_QSTRING) {
3851         reference = ref->u.reference;
3852     } else {
3853         BlockdevOptions *options = &ref->u.definition;
3854         assert(ref->type == QTYPE_QDICT);
3855 
3856         v = qobject_output_visitor_new(&obj);
3857         visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
3858         visit_complete(v, &obj);
3859 
3860         qdict = qobject_to(QDict, obj);
3861         qdict_flatten(qdict);
3862 
3863         /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
3864          * compatibility with other callers) rather than what we want as the
3865          * real defaults. Apply the defaults here instead. */
3866         qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
3867         qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
3868         qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
3869         qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
3870 
3871     }
3872 
3873     bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
3874     obj = NULL;
3875     qobject_unref(obj);
3876     visit_free(v);
3877     return bs;
3878 }
3879 
3880 static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
3881                                                    int flags,
3882                                                    QDict *snapshot_options,
3883                                                    Error **errp)
3884 {
3885     g_autofree char *tmp_filename = NULL;
3886     int64_t total_size;
3887     QemuOpts *opts = NULL;
3888     BlockDriverState *bs_snapshot = NULL;
3889     AioContext *ctx = bdrv_get_aio_context(bs);
3890     int ret;
3891 
3892     GLOBAL_STATE_CODE();
3893 
3894     /* if snapshot, we create a temporary backing file and open it
3895        instead of opening 'filename' directly */
3896 
3897     /* Get the required size from the image */
3898     aio_context_acquire(ctx);
3899     total_size = bdrv_getlength(bs);
3900     aio_context_release(ctx);
3901 
3902     if (total_size < 0) {
3903         error_setg_errno(errp, -total_size, "Could not get image size");
3904         goto out;
3905     }
3906 
3907     /* Create the temporary image */
3908     tmp_filename = create_tmp_file(errp);
3909     if (!tmp_filename) {
3910         goto out;
3911     }
3912 
3913     opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
3914                             &error_abort);
3915     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
3916     ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
3917     qemu_opts_del(opts);
3918     if (ret < 0) {
3919         error_prepend(errp, "Could not create temporary overlay '%s': ",
3920                       tmp_filename);
3921         goto out;
3922     }
3923 
3924     /* Prepare options QDict for the temporary file */
3925     qdict_put_str(snapshot_options, "file.driver", "file");
3926     qdict_put_str(snapshot_options, "file.filename", tmp_filename);
3927     qdict_put_str(snapshot_options, "driver", "qcow2");
3928 
3929     bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
3930     snapshot_options = NULL;
3931     if (!bs_snapshot) {
3932         goto out;
3933     }
3934 
3935     aio_context_acquire(ctx);
3936     ret = bdrv_append(bs_snapshot, bs, errp);
3937     aio_context_release(ctx);
3938 
3939     if (ret < 0) {
3940         bs_snapshot = NULL;
3941         goto out;
3942     }
3943 
3944 out:
3945     qobject_unref(snapshot_options);
3946     return bs_snapshot;
3947 }
3948 
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given.
 * In that case no new BDS is created: the node is looked up with
 * bdrv_lookup_bs() and returned with an additional reference taken.
 *
 * Without a reference, a new BDS is allocated with bdrv_new() and opened. If
 * a temporary snapshot was requested (snapshot_flags ends up non-zero after
 * the BDRV_O_SNAPSHOT handling), the node returned is the temporary overlay
 * placed on top of the newly opened BDS.
 *
 * parent and child_class must either both be given or both be NULL. When they
 * are given, flags must be 0 and the effective flags and options are derived
 * from the parent through child_class->inherit_options().
 *
 * Returns the BDS on success. On failure, NULL is returned and the error is
 * reported through errp.
 *
 * Must not be called from coroutine context.
 *
 * The caller must always hold the main AioContext lock.
 */
static BlockDriverState * no_coroutine_fn
bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
                  int flags, BlockDriverState *parent,
                  const BdrvChildClass *child_class, BdrvChildRole child_role,
                  Error **errp)
{
    int ret;
    BlockBackend *file = NULL;
    BlockDriverState *bs;
    BlockDriver *drv = NULL;
    BdrvChild *child;
    const char *drvname;
    const char *backing;
    Error *local_err = NULL;
    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;
    AioContext *ctx = qemu_get_aio_context();

    assert(!child_class || !flags);
    assert(!child_class == !parent);
    GLOBAL_STATE_CODE();
    assert(!qemu_in_coroutine());

    /* TODO We'll eventually have to take a writer lock in this function */
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    /* Reference to an existing node: look it up instead of opening anything */
    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        /* We own the options reference, so drop it even on the error path */
        qobject_unref(options);

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return NULL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return NULL;
        }

        /* The caller gets its own reference to the existing node */
        bdrv_ref(bs);
        return bs;
    }

    bs = bdrv_new();

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* json: syntax counts as explicit options, as if in the QDict */
    parse_json_protocol(options, &filename, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Snapshot of what was given explicitly, taken before inheritance */
    bs->explicit_options = qdict_clone_shallow(options);

    if (child_class) {
        bool parent_is_format;

        if (parent->drv) {
            parent_is_format = parent->drv->is_format;
        } else {
            /*
             * parent->drv is not set yet because this node is opened for
             * (potential) format probing.  That means that @parent is going
             * to be a format node.
             */
            parent_is_format = true;
        }

        bs->inherits_from = parent;
        child_class->inherit_options(child_role, parent_is_format,
                                     &flags, options,
                                     parent->open_flags, parent->options);
    }

    ret = bdrv_fill_options(&options, filename, &flags, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /*
     * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
     * Caution: getting a boolean member of @options requires care.
     * When @options come from -blockdev or blockdev_add, members are
     * typed according to the QAPI schema, but when they come from
     * -drive, they're all QString.
     */
    if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
        !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
        flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
    } else {
        flags &= ~BDRV_O_RDWR;
    }

    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_options = qdict_new();
        bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
                                   flags, options);
        /* Let bdrv_backing_options() override "read-only" */
        qdict_del(options, BDRV_OPT_READ_ONLY);
        bdrv_inherited_options(BDRV_CHILD_COW, true,
                               &flags, options, flags, options);
    }

    /*
     * bs->options keeps the full effective options; from here on @options is
     * a separate shallow copy from which entries are removed as they are
     * processed (see the unknown-option check further down).
     */
    bs->open_flags = flags;
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Find the right image format driver */
    /* See cautionary note on accessing @options above */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));

    /* See cautionary note on accessing @options above */
    backing = qdict_get_try_str(options, "backing");
    /* "backing": null (or the deprecated empty string) disables the backing file */
    if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
        (backing && *backing == '\0'))
    {
        if (backing) {
            warn_report("Use of \"backing\": \"\" is deprecated; "
                        "use \"backing\": null instead");
        }
        flags |= BDRV_O_NO_BACKING;
        qdict_del(bs->explicit_options, "backing");
        qdict_del(bs->options, "backing");
        qdict_del(options, "backing");
    }

    /* Open image file without format layer. This BlockBackend is only used for
     * probing, the block drivers will do their own bdrv_open_child() for the
     * same BDS, which is why we put the node name back into options. */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        BlockDriverState *file_bs;

        file_bs = bdrv_open_child_bs(filename, options, "file", bs,
                                     &child_of_bds, BDRV_CHILD_IMAGE,
                                     true, &local_err);
        if (local_err) {
            goto fail;
        }
        if (file_bs != NULL) {
            /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
             * looking at the header to guess the image format. This works even
             * in cases where a guest would not see a consistent state. */
            ctx = bdrv_get_aio_context(file_bs);
            aio_context_acquire(ctx);
            file = blk_new(ctx, 0, BLK_PERM_ALL);
            blk_insert_bs(file, file_bs, &local_err);
            /* Drop the reference obtained from bdrv_open_child_bs() */
            bdrv_unref(file_bs);
            aio_context_release(ctx);

            if (local_err) {
                goto fail;
            }

            qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        /*
         * This option update would logically belong in bdrv_fill_options(),
         * but we first need to open bs->file for the probing to work, while
         * opening bs->file already requires the (mostly) final set of options
         * so that cache mode etc. can be inherited.
         *
         * Adding the driver later is somewhat ugly, but it's not an option
         * that would ever be inherited, so it's correct. We just need to make
         * sure to update both bs->options (which has the full effective
         * options for bs) and options (which has file.* already removed).
         */
        qdict_put_str(bs->options, "driver", drv->format_name);
        qdict_put_str(options, "driver", drv->format_name);
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        goto fail;
    }

    /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
    assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
    /* file must be NULL if a protocol BDS is about to be created
     * (the inverse results in an error message from bdrv_open_common()) */
    assert(!(flags & BDRV_O_PROTOCOL) || !file);

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* The AioContext could have changed during bdrv_open_common() */
    ctx = bdrv_get_aio_context(bs);

    /* The probing BlockBackend is no longer needed */
    if (file) {
        aio_context_acquire(ctx);
        blk_unref(file);
        aio_context_release(ctx);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Remove all children options and references
     * from bs->options and bs->explicit_options */
    QLIST_FOREACH(child, &bs->children, next) {
        char *child_key_dot;
        child_key_dot = g_strdup_printf("%s.", child->name);
        qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
        qdict_extract_subqdict(bs->options, NULL, child_key_dot);
        qdict_del(bs->explicit_options, child->name);
        qdict_del(bs->options, child->name);
        g_free(child_key_dot);
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        /* Only the first leftover entry is named in the error message */
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp,
                       "Block format '%s' does not support the option '%s'",
                       drv->format_name, entry->key);
        }

        goto close_and_fail;
    }

    bdrv_parent_cb_change_media(bs, true);

    qobject_unref(options);
    options = NULL;

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        BlockDriverState *snapshot_bs;
        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
                                                snapshot_options, &local_err);
        /* bdrv_append_temp_snapshot() consumed the QDict reference */
        snapshot_options = NULL;
        if (local_err) {
            goto close_and_fail;
        }
        /* We are not going to return bs but the overlay on top of it
         * (snapshot_bs); thus, we have to drop the strong reference to bs
         * (which we obtained by calling bdrv_new()). bs will not be deleted,
         * though, because the overlay still has a reference to it. */
        aio_context_acquire(ctx);
        bdrv_unref(bs);
        aio_context_release(ctx);
        bs = snapshot_bs;
    }

    return bs;

    /*
     * Error path used up to and including a failed bdrv_open_common(): the
     * option QDicts attached to bs are released by hand and the pointers are
     * cleared before the node reference is dropped.
     */
fail:
    aio_context_acquire(ctx);
    blk_unref(file);
    qobject_unref(snapshot_options);
    qobject_unref(bs->explicit_options);
    qobject_unref(bs->options);
    qobject_unref(options);
    bs->options = NULL;
    bs->explicit_options = NULL;
    bdrv_unref(bs);
    aio_context_release(ctx);
    error_propagate(errp, local_err);
    return NULL;

    /*
     * Error path used once bdrv_open_common() has succeeded: only the node
     * reference and the local QDicts are released here.
     */
close_and_fail:
    aio_context_acquire(ctx);
    bdrv_unref(bs);
    aio_context_release(ctx);
    qobject_unref(snapshot_options);
    qobject_unref(options);
    error_propagate(errp, local_err);
    return NULL;
}
4270 
/*
 * Open a block node that has no parent: forwards @filename, @reference,
 * @options and @flags to bdrv_open_inherit() with parent, child class and
 * child role left unset, and returns its result.
 *
 * The caller must always hold the main AioContext lock.
 */
BlockDriverState *bdrv_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp)
{
    GLOBAL_STATE_CODE();

    return bdrv_open_inherit(filename, reference, options, flags, NULL,
                             NULL, 0, errp);
}
4280 
/*
 * Return true if the NULL-terminated @list contains @str.
 * A NULL @str or a NULL @list never matches.
 */
static bool is_str_in_list(const char *str, const char *const *list)
{
    const char *const *item;

    if (!str || !list) {
        return false;
    }

    for (item = list; *item; item++) {
        if (strcmp(str, *item) == 0) {
            return true;
        }
    }

    return false;
}
4294 
4295 /*
4296  * Check that every option set in @bs->options is also set in
4297  * @new_opts.
4298  *
4299  * Options listed in the common_options list and in
4300  * @bs->drv->mutable_opts are skipped.
4301  *
4302  * Return 0 on success, otherwise return -EINVAL and set @errp.
4303  */
4304 static int bdrv_reset_options_allowed(BlockDriverState *bs,
4305                                       const QDict *new_opts, Error **errp)
4306 {
4307     const QDictEntry *e;
4308     /* These options are common to all block drivers and are handled
4309      * in bdrv_reopen_prepare() so they can be left out of @new_opts */
4310     const char *const common_options[] = {
4311         "node-name", "discard", "cache.direct", "cache.no-flush",
4312         "read-only", "auto-read-only", "detect-zeroes", NULL
4313     };
4314 
4315     for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
4316         if (!qdict_haskey(new_opts, e->key) &&
4317             !is_str_in_list(e->key, common_options) &&
4318             !is_str_in_list(e->key, bs->drv->mutable_opts)) {
4319             error_setg(errp, "Option '%s' cannot be reset "
4320                        "to its default value", e->key);
4321             return -EINVAL;
4322         }
4323     }
4324 
4325     return 0;
4326 }
4327 
4328 /*
4329  * Returns true if @child can be reached recursively from @bs
4330  */
4331 static bool GRAPH_RDLOCK
4332 bdrv_recurse_has_child(BlockDriverState *bs, BlockDriverState *child)
4333 {
4334     BdrvChild *c;
4335 
4336     if (bs == child) {
4337         return true;
4338     }
4339 
4340     QLIST_FOREACH(c, &bs->children, next) {
4341         if (bdrv_recurse_has_child(c->bs, child)) {
4342             return true;
4343         }
4344     }
4345 
4346     return false;
4347 }
4348 
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue. If it is already in
 * the queue, its existing entry is updated instead of adding a second one.
 *
 * options contains the changed options for the associated bs
 * (the BlockReopenQueue takes ownership). NULL is treated as an empty QDict.
 *
 * klass, role, parent_is_format, parent_options and parent_flags are only
 * used when parent_options is non-NULL: they are passed to
 * klass->inherit_options() to derive the inherited flags and options.
 * Otherwise the flags start out as bdrv_get_flags(bs).
 *
 * keep_old_opts selects whether options that are not given are filled in from
 * the current explicit and effective options of bs. If it is false, the queue
 * entry additionally records whether a "backing" option is missing.
 *
 * Children of bs whose inherits_from points to bs are queued recursively.
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 * bs is drained here and undrained by bdrv_reopen_queue_free().
 *
 * To be called with bs->aio_context locked.
 */
static BlockReopenQueue * GRAPH_RDLOCK
bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs,
                        QDict *options, const BdrvChildClass *klass,
                        BdrvChildRole role, bool parent_is_format,
                        QDict *parent_options, int parent_flags,
                        bool keep_old_opts)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    BdrvChild *child;
    QDict *old_options, *explicit_options, *options_copy;
    int flags;
    QemuOpts *opts;

    GLOBAL_STATE_CODE();

    /*
     * Strictly speaking, draining is illegal under GRAPH_RDLOCK. We know that
     * we've been called with bdrv_graph_rdlock_main_loop(), though, so it's ok
     * in practice.
     */
    bdrv_drained_begin(bs);

    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QTAILQ_INIT(bs_queue);
    }

    if (!options) {
        options = qdict_new();
    }

    /*
     * Check if this BlockDriverState is already in the queue.
     * bs_entry is left NULL if the loop finishes without a match.
     */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bs == bs_entry->state.bs) {
            break;
        }
    }

    /*
     * Precedence of options:
     * 1. Explicitly passed in options (highest)
     * 2. Retained from explicitly set options of bs
     * 3. Inherited from parent node
     * 4. Retained from effective options of bs
     */

    /* Old explicitly set values (don't overwrite by inherited value) */
    if (bs_entry || keep_old_opts) {
        old_options = qdict_clone_shallow(bs_entry ?
                                          bs_entry->state.explicit_options :
                                          bs->explicit_options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* Everything gathered so far counts as explicitly set */
    explicit_options = qdict_clone_shallow(options);

    /* Inherit from parent node */
    if (parent_options) {
        flags = 0;
        klass->inherit_options(role, parent_is_format, &flags, options,
                               parent_flags, parent_options);
    } else {
        flags = bdrv_get_flags(bs);
    }

    if (keep_old_opts) {
        /* Old values are used for options that aren't set yet */
        old_options = qdict_clone_shallow(bs->options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /*
     * We have the final set of options so let's update the flags.
     * The absorbing is done on a throw-away copy so that @options itself
     * is left untouched.
     */
    options_copy = qdict_clone_shallow(options);
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options_copy, NULL);
    update_flags_from_options(&flags, opts);
    qemu_opts_del(opts);
    qobject_unref(options_copy);

    /* bdrv_open_inherit() sets and clears some additional flags internally */
    flags &= ~BDRV_O_PROTOCOL;
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    if (!bs_entry) {
        bs_entry = g_new0(BlockReopenQueueEntry, 1);
        QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
    } else {
        /* Reusing an entry: release the QDicts it held before replacing them */
        qobject_unref(bs_entry->state.options);
        qobject_unref(bs_entry->state.explicit_options);
    }

    bs_entry->state.bs = bs;
    bs_entry->state.options = options;
    bs_entry->state.explicit_options = explicit_options;
    bs_entry->state.flags = flags;

    /*
     * If keep_old_opts is false then it means that unspecified
     * options must be reset to their original value. We don't allow
     * resetting 'backing' but we need to know if the option is
     * missing in order to decide if we have to return an error.
     */
    if (!keep_old_opts) {
        bs_entry->state.backing_missing =
            !qdict_haskey(options, "backing") &&
            !qdict_haskey(options, "backing.driver");
    }

    QLIST_FOREACH(child, &bs->children, next) {
        QDict *new_child_options = NULL;
        bool child_keep_old = keep_old_opts;

        /* reopen can only change the options of block devices that were
         * implicitly created and inherited options. For other (referenced)
         * block devices, a syntax like "backing.foo" results in an error. */
        if (child->bs->inherits_from != bs) {
            continue;
        }

        /* Check if the options contain a child reference */
        if (qdict_haskey(options, child->name)) {
            const char *childref = qdict_get_try_str(options, child->name);
            /*
             * The current child must not be reopened if the child
             * reference is null or points to a different node.
             */
            if (g_strcmp0(childref, child->bs->node_name)) {
                continue;
            }
            /*
             * If the child reference points to the current child then
             * reopen it with its existing set of options (note that
             * it can still inherit new options from the parent).
             */
            child_keep_old = true;
        } else {
            /* Extract child options ("child-name.*") */
            char *child_key_dot = g_strdup_printf("%s.", child->name);
            qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
            qdict_extract_subqdict(options, &new_child_options, child_key_dot);
            g_free(child_key_dot);
        }

        /* The return value is not needed: bs_queue is non-NULL at this point */
        bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
                                child->klass, child->role, bs->drv->is_format,
                                options, flags, child_keep_old);
    }

    return bs_queue;
}
4529 
4530 /* To be called with bs->aio_context locked */
4531 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
4532                                     BlockDriverState *bs,
4533                                     QDict *options, bool keep_old_opts)
4534 {
4535     GLOBAL_STATE_CODE();
4536     GRAPH_RDLOCK_GUARD_MAINLOOP();
4537 
4538     return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
4539                                    NULL, 0, keep_old_opts);
4540 }
4541 
4542 void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
4543 {
4544     GLOBAL_STATE_CODE();
4545     if (bs_queue) {
4546         BlockReopenQueueEntry *bs_entry, *next;
4547         QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
4548             AioContext *ctx = bdrv_get_aio_context(bs_entry->state.bs);
4549 
4550             aio_context_acquire(ctx);
4551             bdrv_drained_end(bs_entry->state.bs);
4552             aio_context_release(ctx);
4553 
4554             qobject_unref(bs_entry->state.explicit_options);
4555             qobject_unref(bs_entry->state.options);
4556             g_free(bs_entry);
4557         }
4558         g_free(bs_queue);
4559     }
4560 }
4561 
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * All affected nodes must be drained between bdrv_reopen_queue() and
 * bdrv_reopen_multiple().
 *
 * To be called from the main thread, with all other AioContexts unlocked.
 *
 * Returns 0 on success and a negative value on failure. In both cases
 * bs_queue is released with bdrv_reopen_queue_free() before returning, so
 * the caller must not use it afterwards.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    AioContext *ctx;
    Transaction *tran = tran_new();
    g_autoptr(GSList) refresh_list = NULL;

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bs_queue != NULL);
    GLOBAL_STATE_CODE();

    /*
     * Phase 1: flush every queued node. Nothing has been prepared yet, so a
     * failure here reaches the abort path with no entry marked 'prepared'.
     */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_flush(bs_entry->state.bs);
        aio_context_release(ctx);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Error flushing drive");
            goto abort;
        }
    }

    /*
     * Phase 2: prepare every entry, each under its own node's AioContext
     * lock. 'prepared' records which entries need bdrv_reopen_abort() if a
     * later step fails; the entry whose prepare failed is not marked.
     */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        assert(bs_entry->state.bs->quiesce_counter > 0);
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
        aio_context_release(ctx);
        if (ret < 0) {
            goto abort;
        }
        bs_entry->prepared = true;
    }

    /*
     * Phase 3: collect the nodes whose permissions must be refreshed: every
     * reopened node plus any old backing/file child recorded during prepare.
     */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        BDRVReopenState *state = &bs_entry->state;

        refresh_list = g_slist_prepend(refresh_list, state->bs);
        if (state->old_backing_bs) {
            refresh_list = g_slist_prepend(refresh_list, state->old_backing_bs);
        }
        if (state->old_file_bs) {
            refresh_list = g_slist_prepend(refresh_list, state->old_file_bs);
        }
    }

    /*
     * Note that the file-posix driver relies on the permission update done
     * during reopen (even if no permission changed), because it wants "new"
     * permissions for reconfiguring the fd and that's why it does it in
     * raw_check_perm(), not in raw_reopen_prepare() which is called with
     * "old" permissions.
     */
    bdrv_graph_rdlock_main_loop();
    ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
    bdrv_graph_rdunlock_main_loop();

    if (ret < 0) {
        goto abort;
    }

    /*
     * If we reach this point, we have success and just need to apply the
     * changes.
     *
     * Reverse order is used to accommodate the qcow2 driver: on commit it
     * needs to write the IN_USE flag to the image, to mark bitmaps in the
     * image as invalid. But children usually come after parents in the
     * reopen queue, so go from the last to the first element.
     */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        bdrv_reopen_commit(&bs_entry->state);
        aio_context_release(ctx);
    }

    /* The transaction is completed while holding the graph writer lock */
    bdrv_graph_wrlock(NULL);
    tran_commit(tran);
    bdrv_graph_wrunlock();

    /* Optional per-driver hook, run after the transaction was committed */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        BlockDriverState *bs = bs_entry->state.bs;

        if (bs->drv->bdrv_reopen_commit_post) {
            ctx = bdrv_get_aio_context(bs);
            aio_context_acquire(ctx);
            bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

    ret = 0;
    goto cleanup;

abort:
    /* Roll back the staged changes, again under the graph writer lock */
    bdrv_graph_wrlock(NULL);
    tran_abort(tran);
    bdrv_graph_wrunlock();

    /* Only entries whose prepare step completed are aborted here */
    QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (bs_entry->prepared) {
            ctx = bdrv_get_aio_context(bs_entry->state.bs);
            aio_context_acquire(ctx);
            bdrv_reopen_abort(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

cleanup:
    /* Reached on success and on failure: the queue is always consumed */
    bdrv_reopen_queue_free(bs_queue);

    return ret;
}
4695 
4696 int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
4697                 Error **errp)
4698 {
4699     AioContext *ctx = bdrv_get_aio_context(bs);
4700     BlockReopenQueue *queue;
4701     int ret;
4702 
4703     GLOBAL_STATE_CODE();
4704 
4705     queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
4706 
4707     if (ctx != qemu_get_aio_context()) {
4708         aio_context_release(ctx);
4709     }
4710     ret = bdrv_reopen_multiple(queue, errp);
4711 
4712     if (ctx != qemu_get_aio_context()) {
4713         aio_context_acquire(ctx);
4714     }
4715 
4716     return ret;
4717 }
4718 
4719 int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
4720                               Error **errp)
4721 {
4722     QDict *opts = qdict_new();
4723 
4724     GLOBAL_STATE_CODE();
4725 
4726     qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only);
4727 
4728     return bdrv_reopen(bs, opts, true, errp);
4729 }
4730 
/*
 * Take a BDRVReopenState and check if the value of 'backing' (when
 * @is_backing is true) or 'file' (when it is false) in the
 * reopen_state->options QDict is valid or not.
 *
 * If the option is missing from the QDict then return 0.
 *
 * If the option contains the node name of the current child of
 * reopen_state->bs, or of the node that bdrv_skip_implicit_filters()
 * yields for that child, then return 0.
 *
 * If the option contains a different node name (or is null, which is only
 * accepted for 'backing') then the current child is recorded in
 * reopen_state->old_backing_bs or reopen_state->old_file_bs and the
 * replacement is staged in @tran via bdrv_set_file_or_backing_noperm().
 *
 * After calling this function, the transaction @tran may only be completed
 * while holding a writer lock for the graph.
 *
 * Return 0 on success, otherwise return < 0 and set @errp:
 * -EINVAL if the node name cannot be resolved, if the new child would
 * create a cycle, or if a filter node has no child of the requested kind;
 * -EPERM if the current child is implicit; otherwise whatever
 * bdrv_set_file_or_backing_noperm() returns.
 *
 * The caller must hold the AioContext lock of @reopen_state->bs.
 * @reopen_state->bs can move to a different AioContext in this function.
 * Callers must make sure that their AioContext locking is still correct after
 * this.
 */
static int GRAPH_UNLOCKED
bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
                                  bool is_backing, Transaction *tran,
                                  Error **errp)
{
    BlockDriverState *bs = reopen_state->bs;
    BlockDriverState *new_child_bs;
    BlockDriverState *old_child_bs = is_backing ? child_bs(bs->backing) :
                                                  child_bs(bs->file);
    const char *child_name = is_backing ? "backing" : "file";
    QObject *value;
    const char *str;
    AioContext *ctx, *old_ctx;
    bool has_child;
    int ret;

    GLOBAL_STATE_CODE();

    /* Option not given: nothing to validate or change */
    value = qdict_get(reopen_state->options, child_name);
    if (value == NULL) {
        return 0;
    }

    switch (qobject_type(value)) {
    case QTYPE_QNULL:
        assert(is_backing); /* The 'file' option does not allow a null value */
        new_child_bs = NULL;
        break;
    case QTYPE_QSTRING:
        /* Resolve the node name; bdrv_lookup_bs() sets @errp on failure */
        str = qstring_get_str(qobject_to(QString, value));
        new_child_bs = bdrv_lookup_bs(NULL, str, errp);
        if (new_child_bs == NULL) {
            return -EINVAL;
        }

        /* Reject a new child that already has @bs somewhere below it */
        bdrv_graph_rdlock_main_loop();
        has_child = bdrv_recurse_has_child(new_child_bs, bs);
        bdrv_graph_rdunlock_main_loop();

        if (has_child) {
            error_setg(errp, "Making '%s' a %s child of '%s' would create a "
                       "cycle", str, child_name, bs->node_name);
            return -EINVAL;
        }
        break;
    default:
        /*
         * The options QDict has been flattened, so 'backing' and 'file'
         * do not allow any other data type here.
         */
        g_assert_not_reached();
    }

    /* The requested child is already the current one: nothing to do */
    if (old_child_bs == new_child_bs) {
        return 0;
    }

    if (old_child_bs) {
        /* Naming the node below the implicit filters counts as "unchanged" */
        if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
            return 0;
        }

        if (old_child_bs->implicit) {
            error_setg(errp, "Cannot replace implicit %s child of %s",
                       child_name, bs->node_name);
            return -EPERM;
        }
    }

    if (bs->drv->is_filter && !old_child_bs) {
        /*
         * Filters always have a file or a backing child, so we are trying to
         * change wrong child
         */
        error_setg(errp, "'%s' is a %s filter node that does not support a "
                   "%s child", bs->node_name, bs->drv->format_name, child_name);
        return -EINVAL;
    }

    /* Remember the child being replaced in the reopen state */
    if (is_backing) {
        reopen_state->old_backing_bs = old_child_bs;
    } else {
        reopen_state->old_file_bs = old_child_bs;
    }

    /* Hold a reference and a drained section on the old child for the swap */
    if (old_child_bs) {
        bdrv_ref(old_child_bs);
        bdrv_drained_begin(old_child_bs);
    }

    /*
     * If the new child lives in a different AioContext than @bs, swap which
     * AioContext lock is held for the duration of the graph change.
     */
    old_ctx = bdrv_get_aio_context(bs);
    ctx = bdrv_get_aio_context(new_child_bs);
    if (old_ctx != ctx) {
        aio_context_release(old_ctx);
        aio_context_acquire(ctx);
    }

    bdrv_graph_wrlock(new_child_bs);

    ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
                                          tran, errp);

    bdrv_graph_wrunlock();

    /* Restore the AioContext lock the caller held on entry */
    if (old_ctx != ctx) {
        aio_context_release(ctx);
        aio_context_acquire(old_ctx);
    }

    if (old_child_bs) {
        bdrv_drained_end(old_child_bs);
        bdrv_unref(old_child_bs);
    }

    return ret;
}
4872 
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * reopen_state describes the node to reopen (reopen_state->bs) together
 *              with its new options and flags
 * queue is the reopen queue
 * change_child_tran is the transaction in which 'backing'/'file' child
 *                   changes are staged
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * If the driver's .bdrv_reopen_prepare() succeeded but a later check in
 * this function fails, the driver's .bdrv_reopen_abort() is called here
 * before returning, to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 * The caller must hold the AioContext lock of @reopen_state->bs.
 *
 * After calling this function, the transaction @change_child_tran may only be
 * completed while holding a writer lock for the graph.
 */
static int GRAPH_UNLOCKED
bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                    Transaction *change_child_tran, Error **errp)
{
    int ret = -1;
    int old_flags;
    Error *local_err = NULL;
    BlockDriver *drv;
    QemuOpts *opts;
    QDict *orig_reopen_opts;
    char *discard = NULL;
    bool read_only;
    bool drv_prepared = false;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    GLOBAL_STATE_CODE();
    drv = reopen_state->bs->drv;

    /* This function and each driver's bdrv_reopen_prepare() remove
     * entries from reopen_state->options as they are processed, so
     * we need to make a copy of the original QDict. */
    orig_reopen_opts = qdict_clone_shallow(reopen_state->options);

    /* Process generic block layer options */
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, reopen_state->options, errp)) {
        ret = -EINVAL;
        goto error;
    }

    /* This was already called in bdrv_reopen_queue_child() so the flags
     * are up-to-date. This time we simply want to remove the options from
     * QemuOpts in order to indicate that they have been processed. */
    old_flags = reopen_state->flags;
    update_flags_from_options(&reopen_state->flags, opts);
    assert(old_flags == reopen_state->flags);

    /* 'discard' is taken out of opts; the string is freed at the end */
    discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD);
    if (discard != NULL) {
        if (bdrv_parse_discard_flags(discard, &reopen_state->flags) != 0) {
            error_setg(errp, "Invalid discard option");
            ret = -EINVAL;
            goto error;
        }
    }

    reopen_state->detect_zeroes =
        bdrv_parse_detect_zeroes(opts, reopen_state->flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto error;
    }

    /* All other options (including node-name and driver) must be unchanged.
     * Put them back into the QDict, so that they are checked at the end
     * of this function. */
    qemu_opts_to_qdict(opts, reopen_state->options);

    /* If we are to stay read-only, do not allow permission change
     * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is
     * not set, or if the BDS still has copy_on_read enabled */
    read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_graph_rdlock_main_loop();
    ret = bdrv_can_set_read_only(reopen_state->bs, read_only, true, &local_err);
    bdrv_graph_rdunlock_main_loop();
    /* Failure is detected through local_err; ret carries the error code */
    if (local_err) {
        error_propagate(errp, local_err);
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        /*
         * If a driver-specific option is missing, it means that we
         * should reset it to its default value.
         * But not all options allow that, so we need to check it first.
         */
        ret = bdrv_reset_options_allowed(reopen_state->bs,
                                         reopen_state->options, errp);
        if (ret) {
            goto error;
        }

        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* The driver gave no error message: report a generic one */
                bdrv_graph_rdlock_main_loop();
                bdrv_refresh_filename(reopen_state->bs);
                bdrv_graph_rdunlock_main_loop();
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        bdrv_graph_rdlock_main_loop();
        error_setg(errp, "Block format '%s' used by node '%s' "
                   "does not support reopening files", drv->format_name,
                   bdrv_get_device_or_node_name(reopen_state->bs));
        bdrv_graph_rdunlock_main_loop();
        ret = -1;
        goto error;
    }

    /* From here on, a failure must also undo the driver's prepare step */
    drv_prepared = true;

    /*
     * We must provide the 'backing' option if the BDS has a backing
     * file or if the image file has a backing file name as part of
     * its metadata. Otherwise the 'backing' option can be omitted.
     */
    if (drv->supports_backing && reopen_state->backing_missing &&
        (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
        error_setg(errp, "backing is missing for '%s'",
                   reopen_state->bs->node_name);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Allow changing the 'backing' option. The new value can be
     * either a reference to an existing node (using its node name)
     * or NULL to simply detach the current backing file.
     */
    ret = bdrv_reopen_parse_file_or_backing(reopen_state, true,
                                            change_child_tran, errp);
    if (ret < 0) {
        goto error;
    }
    qdict_del(reopen_state->options, "backing");

    /* Allow changing the 'file' option. In this case NULL is not allowed */
    ret = bdrv_reopen_parse_file_or_backing(reopen_state, false,
                                            change_child_tran, errp);
    if (ret < 0) {
        goto error;
    }
    qdict_del(reopen_state->options, "file");

    /* Options that are not handled are only okay if they are unchanged
     * compared to the old state. It is expected that some options are only
     * used for the initial open, but not reopen (e.g. filename) */
    if (qdict_size(reopen_state->options)) {
        const QDictEntry *entry = qdict_first(reopen_state->options);

        GRAPH_RDLOCK_GUARD_MAINLOOP();

        do {
            QObject *new = entry->value;
            QObject *old = qdict_get(reopen_state->bs->options, entry->key);

            /* Allow child references (child_name=node_name) as long as they
             * point to the current child (i.e. everything stays the same). */
            if (qobject_type(new) == QTYPE_QSTRING) {
                BdrvChild *child;
                QLIST_FOREACH(child, &reopen_state->bs->children, next) {
                    if (!strcmp(child->name, entry->key)) {
                        break;
                    }
                }

                if (child) {
                    if (!strcmp(child->bs->node_name,
                                qstring_get_str(qobject_to(QString, new)))) {
                        continue; /* Found child with this name, skip option */
                    }
                }
            }

            /*
             * TODO: When using -drive to specify blockdev options, all values
             * will be strings; however, when using -blockdev, blockdev-add or
             * filenames using the json:{} pseudo-protocol, they will be
             * correctly typed.
             * In contrast, reopening options are (currently) always strings
             * (because you can only specify them through qemu-io; all other
             * callers do not specify any options).
             * Therefore, when using anything other than -drive to create a BDS,
             * this cannot detect non-string options as unchanged, because
             * qobject_is_equal() always returns false for objects of different
             * type.  In the future, this should be remedied by correctly typing
             * all options.  For now, this is not too big of an issue because
             * the user can simply omit options which cannot be changed anyway,
             * so they will stay unchanged.
             */
            if (!qobject_is_equal(new, old)) {
                error_setg(errp, "Cannot change the option '%s'", entry->key);
                ret = -EINVAL;
                goto error;
            }
        } while ((entry = qdict_next(reopen_state->options, entry)));
    }

    ret = 0;

    /* Restore the original reopen_state->options QDict */
    qobject_unref(reopen_state->options);
    reopen_state->options = qobject_ref(orig_reopen_opts);

    /* The success path falls through: the cleanup below runs in all cases */
error:
    if (ret < 0 && drv_prepared) {
        /* drv->bdrv_reopen_prepare() has succeeded, so we need to
         * call drv->bdrv_reopen_abort() before signaling an error
         * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
         * when the respective bdrv_reopen_prepare() has failed) */
        if (drv->bdrv_reopen_abort) {
            drv->bdrv_reopen_abort(reopen_state);
        }
    }
    qemu_opts_del(opts);
    qobject_unref(orig_reopen_opts);
    g_free(discard);
    return ret;
}
5114 
5115 /*
5116  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
5117  * makes them final by swapping the staging BlockDriverState contents into
5118  * the active BlockDriverState contents.
5119  */
5120 static void GRAPH_UNLOCKED bdrv_reopen_commit(BDRVReopenState *reopen_state)
5121 {
5122     BlockDriver *drv;
5123     BlockDriverState *bs;
5124     BdrvChild *child;
5125 
5126     assert(reopen_state != NULL);
5127     bs = reopen_state->bs;
5128     drv = bs->drv;
5129     assert(drv != NULL);
5130     GLOBAL_STATE_CODE();
5131 
5132     /* If there are any driver level actions to take */
5133     if (drv->bdrv_reopen_commit) {
5134         drv->bdrv_reopen_commit(reopen_state);
5135     }
5136 
5137     GRAPH_RDLOCK_GUARD_MAINLOOP();
5138 
5139     /* set BDS specific flags now */
5140     qobject_unref(bs->explicit_options);
5141     qobject_unref(bs->options);
5142     qobject_ref(reopen_state->explicit_options);
5143     qobject_ref(reopen_state->options);
5144 
5145     bs->explicit_options   = reopen_state->explicit_options;
5146     bs->options            = reopen_state->options;
5147     bs->open_flags         = reopen_state->flags;
5148     bs->detect_zeroes      = reopen_state->detect_zeroes;
5149 
5150     /* Remove child references from bs->options and bs->explicit_options.
5151      * Child options were already removed in bdrv_reopen_queue_child() */
5152     QLIST_FOREACH(child, &bs->children, next) {
5153         qdict_del(bs->explicit_options, child->name);
5154         qdict_del(bs->options, child->name);
5155     }
5156     /* backing is probably removed, so it's not handled by previous loop */
5157     qdict_del(bs->explicit_options, "backing");
5158     qdict_del(bs->options, "backing");
5159 
5160     bdrv_refresh_limits(bs, NULL, NULL);
5161     bdrv_refresh_total_sectors(bs, bs->total_sectors);
5162 }
5163 
5164 /*
5165  * Abort the reopen, and delete and free the staged changes in
5166  * reopen_state
5167  */
5168 static void GRAPH_UNLOCKED bdrv_reopen_abort(BDRVReopenState *reopen_state)
5169 {
5170     BlockDriver *drv;
5171 
5172     assert(reopen_state != NULL);
5173     drv = reopen_state->bs->drv;
5174     assert(drv != NULL);
5175     GLOBAL_STATE_CODE();
5176 
5177     if (drv->bdrv_reopen_abort) {
5178         drv->bdrv_reopen_abort(reopen_state);
5179     }
5180 }
5181 
5182 
/*
 * Tear down the state of a BlockDriverState that has no references left
 * (asserted via bs->refcnt).
 *
 * Inside a drained section this flushes the node, calls the driver's
 * .bdrv_close() callback (if any), detaches the driver, unreferences all
 * children under the graph writer lock, frees or resets the per-node state
 * (opaque data, option dicts, block status cache, dirty bitmaps, AIO
 * notifiers) and finally ends any quiesce that is still outstanding.
 */
static void bdrv_close(BlockDriverState *bs)
{
    BdrvAioNotifier *ban, *ban_next;
    BdrvChild *child, *next;

    GLOBAL_STATE_CODE();
    assert(!bs->refcnt);

    bdrv_drained_begin(bs); /* complete I/O */
    bdrv_flush(bs); /* return value is not checked here */
    bdrv_drain(bs); /* in case flush left pending I/O */

    if (bs->drv) {
        if (bs->drv->bdrv_close) {
            /* Must unfreeze all children, so bdrv_unref_child() works */
            bs->drv->bdrv_close(bs);
        }
        bs->drv = NULL;
    }

    /* Detach every child; _SAFE because entries go away during the walk */
    bdrv_graph_wrlock(bs);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_unref_child(bs, child);
    }
    bdrv_graph_wrunlock();

    /* With all children gone, neither primary child pointer may remain */
    assert(!bs->backing);
    assert(!bs->file);

    /* Free the driver's private data and reset the generic node fields */
    g_free(bs->opaque);
    bs->opaque = NULL;
    qatomic_set(&bs->copy_on_read, 0);
    bs->backing_file[0] = '\0';
    bs->backing_format[0] = '\0';
    bs->total_sectors = 0;
    bs->encrypted = false;
    bs->sg = false;
    qobject_unref(bs->options);
    qobject_unref(bs->explicit_options);
    bs->options = NULL;
    bs->explicit_options = NULL;
    qobject_unref(bs->full_open_options);
    bs->full_open_options = NULL;
    g_free(bs->block_status_cache);
    bs->block_status_cache = NULL;

    bdrv_release_named_dirty_bitmaps(bs);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    /* Free the registered AIO notifiers and leave an empty list behind */
    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
        g_free(ban);
    }
    QLIST_INIT(&bs->aio_notifiers);
    bdrv_drained_end(bs);

    /*
     * If we're still inside some bdrv_drain_all_begin()/end() sections, end
     * them now since this BDS won't exist anymore when bdrv_drain_all_end()
     * gets called.
     */
    if (bs->quiesce_counter) {
        bdrv_drain_all_end_quiesce(bs);
    }
}
5246 
5247 void bdrv_close_all(void)
5248 {
5249     GLOBAL_STATE_CODE();
5250     assert(job_next(NULL) == NULL);
5251 
5252     /* Drop references from requests still in flight, such as canceled block
5253      * jobs whose AIO context has not been polled yet */
5254     bdrv_drain_all();
5255 
5256     blk_remove_all_bs();
5257     blockdev_close_all_bdrv_states();
5258 
5259     assert(QTAILQ_EMPTY(&all_bdrv_states));
5260 }
5261 
5262 static bool GRAPH_RDLOCK should_update_child(BdrvChild *c, BlockDriverState *to)
5263 {
5264     GQueue *queue;
5265     GHashTable *found;
5266     bool ret;
5267 
5268     if (c->klass->stay_at_node) {
5269         return false;
5270     }
5271 
5272     /* If the child @c belongs to the BDS @to, replacing the current
5273      * c->bs by @to would mean to create a loop.
5274      *
5275      * Such a case occurs when appending a BDS to a backing chain.
5276      * For instance, imagine the following chain:
5277      *
5278      *   guest device -> node A -> further backing chain...
5279      *
5280      * Now we create a new BDS B which we want to put on top of this
5281      * chain, so we first attach A as its backing node:
5282      *
5283      *                   node B
5284      *                     |
5285      *                     v
5286      *   guest device -> node A -> further backing chain...
5287      *
5288      * Finally we want to replace A by B.  When doing that, we want to
5289      * replace all pointers to A by pointers to B -- except for the
5290      * pointer from B because (1) that would create a loop, and (2)
5291      * that pointer should simply stay intact:
5292      *
5293      *   guest device -> node B
5294      *                     |
5295      *                     v
5296      *                   node A -> further backing chain...
5297      *
5298      * In general, when replacing a node A (c->bs) by a node B (@to),
5299      * if A is a child of B, that means we cannot replace A by B there
5300      * because that would create a loop.  Silently detaching A from B
5301      * is also not really an option.  So overall just leaving A in
5302      * place there is the most sensible choice.
5303      *
5304      * We would also create a loop in any cases where @c is only
5305      * indirectly referenced by @to. Prevent this by returning false
5306      * if @c is found (by breadth-first search) anywhere in the whole
5307      * subtree of @to.
5308      */
5309 
5310     ret = true;
5311     found = g_hash_table_new(NULL, NULL);
5312     g_hash_table_add(found, to);
5313     queue = g_queue_new();
5314     g_queue_push_tail(queue, to);
5315 
5316     while (!g_queue_is_empty(queue)) {
5317         BlockDriverState *v = g_queue_pop_head(queue);
5318         BdrvChild *c2;
5319 
5320         QLIST_FOREACH(c2, &v->children, next) {
5321             if (c2 == c) {
5322                 ret = false;
5323                 break;
5324             }
5325 
5326             if (g_hash_table_contains(found, c2->bs)) {
5327                 continue;
5328             }
5329 
5330             g_queue_push_tail(queue, c2->bs);
5331             g_hash_table_add(found, c2->bs);
5332         }
5333     }
5334 
5335     g_queue_free(queue);
5336     g_hash_table_destroy(found);
5337 
5338     return ret;
5339 }
5340 
5341 static void bdrv_remove_child_commit(void *opaque)
5342 {
5343     GLOBAL_STATE_CODE();
5344     bdrv_child_free(opaque);
5345 }
5346 
/*
 * Transaction action used by bdrv_remove_child(). Only a commit handler is
 * set: the child is freed when the transaction is committed.
 */
static TransactionActionDrv bdrv_remove_child_drv = {
    .commit = bdrv_remove_child_commit,
};
5350 
5351 /*
5352  * Function doesn't update permissions, caller is responsible for this.
5353  *
5354  * @child->bs (if non-NULL) must be drained.
5355  *
5356  * After calling this function, the transaction @tran may only be completed
5357  * while holding a writer lock for the graph.
5358  */
5359 static void GRAPH_WRLOCK bdrv_remove_child(BdrvChild *child, Transaction *tran)
5360 {
5361     if (!child) {
5362         return;
5363     }
5364 
5365     if (child->bs) {
5366         assert(child->quiesced_parent);
5367         bdrv_replace_child_tran(child, NULL, tran);
5368     }
5369 
5370     tran_add(tran, &bdrv_remove_child_drv, child);
5371 }
5372 
5373 /*
5374  * Both @from and @to (if non-NULL) must be drained. @to must be kept drained
5375  * until the transaction is completed.
5376  *
5377  * After calling this function, the transaction @tran may only be completed
5378  * while holding a writer lock for the graph.
5379  */
5380 static int GRAPH_WRLOCK
5381 bdrv_replace_node_noperm(BlockDriverState *from,
5382                          BlockDriverState *to,
5383                          bool auto_skip, Transaction *tran,
5384                          Error **errp)
5385 {
5386     BdrvChild *c, *next;
5387 
5388     GLOBAL_STATE_CODE();
5389 
5390     assert(from->quiesce_counter);
5391     assert(to->quiesce_counter);
5392 
5393     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
5394         assert(c->bs == from);
5395         if (!should_update_child(c, to)) {
5396             if (auto_skip) {
5397                 continue;
5398             }
5399             error_setg(errp, "Should not change '%s' link to '%s'",
5400                        c->name, from->node_name);
5401             return -EINVAL;
5402         }
5403         if (c->frozen) {
5404             error_setg(errp, "Cannot change '%s' link to '%s'",
5405                        c->name, from->node_name);
5406             return -EPERM;
5407         }
5408         bdrv_replace_child_tran(c, to, tran);
5409     }
5410 
5411     return 0;
5412 }
5413 
5414 /*
5415  * With auto_skip=true bdrv_replace_node_common skips updating from parents
5416  * if it creates a parent-child relation loop or if parent is block-job.
5417  *
5418  * With auto_skip=false the error is returned if from has a parent which should
5419  * not be updated.
5420  *
5421  * With @detach_subchain=true @to must be in a backing chain of @from. In this
5422  * case backing link of the cow-parent of @to is removed.
5423  */
5424 static int bdrv_replace_node_common(BlockDriverState *from,
5425                                     BlockDriverState *to,
5426                                     bool auto_skip, bool detach_subchain,
5427                                     Error **errp)
5428 {
5429     Transaction *tran = tran_new();
5430     g_autoptr(GSList) refresh_list = NULL;
5431     BlockDriverState *to_cow_parent = NULL;
5432     int ret;
5433 
5434     GLOBAL_STATE_CODE();
5435 
5436     if (detach_subchain) {
5437         assert(bdrv_chain_contains(from, to));
5438         assert(from != to);
5439         for (to_cow_parent = from;
5440              bdrv_filter_or_cow_bs(to_cow_parent) != to;
5441              to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
5442         {
5443             ;
5444         }
5445     }
5446 
5447     /* Make sure that @from doesn't go away until we have successfully attached
5448      * all of its parents to @to. */
5449     bdrv_ref(from);
5450 
5451     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
5452     assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
5453     bdrv_drained_begin(from);
5454     bdrv_drained_begin(to);
5455 
5456     bdrv_graph_wrlock(to);
5457 
5458     /*
5459      * Do the replacement without permission update.
5460      * Replacement may influence the permissions, we should calculate new
5461      * permissions based on new graph. If we fail, we'll roll-back the
5462      * replacement.
5463      */
5464     ret = bdrv_replace_node_noperm(from, to, auto_skip, tran, errp);
5465     if (ret < 0) {
5466         goto out;
5467     }
5468 
5469     if (detach_subchain) {
5470         /* to_cow_parent is already drained because from is drained */
5471         bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
5472     }
5473 
5474     refresh_list = g_slist_prepend(refresh_list, to);
5475     refresh_list = g_slist_prepend(refresh_list, from);
5476 
5477     ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5478     if (ret < 0) {
5479         goto out;
5480     }
5481 
5482     ret = 0;
5483 
5484 out:
5485     tran_finalize(tran, ret);
5486     bdrv_graph_wrunlock();
5487 
5488     bdrv_drained_end(to);
5489     bdrv_drained_end(from);
5490     bdrv_unref(from);
5491 
5492     return ret;
5493 }
5494 
5495 int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
5496                       Error **errp)
5497 {
5498     GLOBAL_STATE_CODE();
5499 
5500     return bdrv_replace_node_common(from, to, true, false, errp);
5501 }
5502 
5503 int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
5504 {
5505     GLOBAL_STATE_CODE();
5506 
5507     return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
5508                                     errp);
5509 }
5510 
5511 /*
5512  * Add new bs contents at the top of an image chain while the chain is
5513  * live, while keeping required fields on the top layer.
5514  *
5515  * This will modify the BlockDriverState fields, and swap contents
5516  * between bs_new and bs_top. Both bs_new and bs_top are modified.
5517  *
5518  * bs_new must not be attached to a BlockBackend and must not have backing
5519  * child.
5520  *
5521  * This function does not create any image files.
5522  *
5523  * The caller must hold the AioContext lock for @bs_top.
5524  */
5525 int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
5526                 Error **errp)
5527 {
5528     int ret;
5529     BdrvChild *child;
5530     Transaction *tran = tran_new();
5531     AioContext *old_context, *new_context = NULL;
5532 
5533     GLOBAL_STATE_CODE();
5534 
5535     assert(!bs_new->backing);
5536 
5537     old_context = bdrv_get_aio_context(bs_top);
5538     bdrv_drained_begin(bs_top);
5539 
5540     /*
5541      * bdrv_drained_begin() requires that only the AioContext of the drained
5542      * node is locked, and at this point it can still differ from the AioContext
5543      * of bs_top.
5544      */
5545     new_context = bdrv_get_aio_context(bs_new);
5546     aio_context_release(old_context);
5547     aio_context_acquire(new_context);
5548     bdrv_drained_begin(bs_new);
5549     aio_context_release(new_context);
5550     aio_context_acquire(old_context);
5551     new_context = NULL;
5552 
5553     bdrv_graph_wrlock(bs_top);
5554 
5555     child = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
5556                                      &child_of_bds, bdrv_backing_role(bs_new),
5557                                      tran, errp);
5558     if (!child) {
5559         ret = -EINVAL;
5560         goto out;
5561     }
5562 
5563     /*
5564      * bdrv_attach_child_noperm could change the AioContext of bs_top and
5565      * bs_new, but at least they are in the same AioContext now. This is the
5566      * AioContext that we need to lock for the rest of the function.
5567      */
5568     new_context = bdrv_get_aio_context(bs_top);
5569 
5570     if (old_context != new_context) {
5571         aio_context_release(old_context);
5572         aio_context_acquire(new_context);
5573     }
5574 
5575     ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
5576     if (ret < 0) {
5577         goto out;
5578     }
5579 
5580     ret = bdrv_refresh_perms(bs_new, tran, errp);
5581 out:
5582     tran_finalize(tran, ret);
5583 
5584     bdrv_refresh_limits(bs_top, NULL, NULL);
5585     bdrv_graph_wrunlock();
5586 
5587     bdrv_drained_end(bs_top);
5588     bdrv_drained_end(bs_new);
5589 
5590     if (new_context && old_context != new_context) {
5591         aio_context_release(new_context);
5592         aio_context_acquire(old_context);
5593     }
5594 
5595     return ret;
5596 }
5597 
5598 /* Not for empty child */
5599 int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
5600                           Error **errp)
5601 {
5602     int ret;
5603     Transaction *tran = tran_new();
5604     g_autoptr(GSList) refresh_list = NULL;
5605     BlockDriverState *old_bs = child->bs;
5606 
5607     GLOBAL_STATE_CODE();
5608 
5609     bdrv_ref(old_bs);
5610     bdrv_drained_begin(old_bs);
5611     bdrv_drained_begin(new_bs);
5612     bdrv_graph_wrlock(new_bs);
5613 
5614     bdrv_replace_child_tran(child, new_bs, tran);
5615 
5616     refresh_list = g_slist_prepend(refresh_list, old_bs);
5617     refresh_list = g_slist_prepend(refresh_list, new_bs);
5618 
5619     ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5620 
5621     tran_finalize(tran, ret);
5622 
5623     bdrv_graph_wrunlock();
5624     bdrv_drained_end(old_bs);
5625     bdrv_drained_end(new_bs);
5626     bdrv_unref(old_bs);
5627 
5628     return ret;
5629 }
5630 
5631 static void bdrv_delete(BlockDriverState *bs)
5632 {
5633     assert(bdrv_op_blocker_is_empty(bs));
5634     assert(!bs->refcnt);
5635     GLOBAL_STATE_CODE();
5636 
5637     /* remove from list, if necessary */
5638     if (bs->node_name[0] != '\0') {
5639         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
5640     }
5641     QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);
5642 
5643     bdrv_close(bs);
5644 
5645     qemu_mutex_destroy(&bs->reqs_lock);
5646 
5647     g_free(bs);
5648 }
5649 
5650 
5651 /*
5652  * Replace @bs by newly created block node.
5653  *
5654  * @options is a QDict of options to pass to the block drivers, or NULL for an
5655  * empty set of options. The reference to the QDict belongs to the block layer
5656  * after the call (even on failure), so if the caller intends to reuse the
5657  * dictionary, it needs to use qobject_ref() before calling bdrv_open.
5658  *
5659  * The caller holds the AioContext lock for @bs. It must make sure that @bs
5660  * stays in the same AioContext, i.e. @options must not refer to nodes in a
5661  * different AioContext.
5662  */
5663 BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
5664                                    int flags, Error **errp)
5665 {
5666     ERRP_GUARD();
5667     int ret;
5668     AioContext *ctx = bdrv_get_aio_context(bs);
5669     BlockDriverState *new_node_bs = NULL;
5670     const char *drvname, *node_name;
5671     BlockDriver *drv;
5672 
5673     drvname = qdict_get_try_str(options, "driver");
5674     if (!drvname) {
5675         error_setg(errp, "driver is not specified");
5676         goto fail;
5677     }
5678 
5679     drv = bdrv_find_format(drvname);
5680     if (!drv) {
5681         error_setg(errp, "Unknown driver: '%s'", drvname);
5682         goto fail;
5683     }
5684 
5685     node_name = qdict_get_try_str(options, "node-name");
5686 
5687     GLOBAL_STATE_CODE();
5688 
5689     aio_context_release(ctx);
5690     aio_context_acquire(qemu_get_aio_context());
5691     new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
5692                                             errp);
5693     aio_context_release(qemu_get_aio_context());
5694     aio_context_acquire(ctx);
5695     assert(bdrv_get_aio_context(bs) == ctx);
5696 
5697     options = NULL; /* bdrv_new_open_driver() eats options */
5698     if (!new_node_bs) {
5699         error_prepend(errp, "Could not create node: ");
5700         goto fail;
5701     }
5702 
5703     bdrv_drained_begin(bs);
5704     ret = bdrv_replace_node(bs, new_node_bs, errp);
5705     bdrv_drained_end(bs);
5706 
5707     if (ret < 0) {
5708         error_prepend(errp, "Could not replace node: ");
5709         goto fail;
5710     }
5711 
5712     return new_node_bs;
5713 
5714 fail:
5715     qobject_unref(options);
5716     bdrv_unref(new_node_bs);
5717     return NULL;
5718 }
5719 
5720 /*
5721  * Run consistency checks on an image
5722  *
5723  * Returns 0 if the check could be completed (it doesn't mean that the image is
5724  * free of errors) or -errno when an internal error occurred. The results of the
5725  * check are stored in res.
5726  */
5727 int coroutine_fn bdrv_co_check(BlockDriverState *bs,
5728                                BdrvCheckResult *res, BdrvCheckMode fix)
5729 {
5730     IO_CODE();
5731     assert_bdrv_graph_readable();
5732     if (bs->drv == NULL) {
5733         return -ENOMEDIUM;
5734     }
5735     if (bs->drv->bdrv_co_check == NULL) {
5736         return -ENOTSUP;
5737     }
5738 
5739     memset(res, 0, sizeof(*res));
5740     return bs->drv->bdrv_co_check(bs, res, fix);
5741 }
5742 
5743 /*
5744  * Return values:
5745  * 0        - success
5746  * -EINVAL  - backing format specified, but no file
5747  * -ENOSPC  - can't update the backing file because no space is left in the
5748  *            image file header
5749  * -ENOTSUP - format driver doesn't support changing the backing file
5750  */
5751 int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
5752                              const char *backing_fmt, bool require)
5753 {
5754     BlockDriver *drv = bs->drv;
5755     int ret;
5756 
5757     GLOBAL_STATE_CODE();
5758 
5759     if (!drv) {
5760         return -ENOMEDIUM;
5761     }
5762 
5763     /* Backing file format doesn't make sense without a backing file */
5764     if (backing_fmt && !backing_file) {
5765         return -EINVAL;
5766     }
5767 
5768     if (require && backing_file && !backing_fmt) {
5769         return -EINVAL;
5770     }
5771 
5772     if (drv->bdrv_change_backing_file != NULL) {
5773         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
5774     } else {
5775         ret = -ENOTSUP;
5776     }
5777 
5778     if (ret == 0) {
5779         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
5780         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
5781         pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
5782                 backing_file ?: "");
5783     }
5784     return ret;
5785 }
5786 
5787 /*
5788  * Finds the first non-filter node above bs in the chain between
5789  * active and bs.  The returned node is either an immediate parent of
5790  * bs, or there are only filter nodes between the two.
5791  *
5792  * Returns NULL if bs is not found in active's image chain,
5793  * or if active == bs.
5794  *
5795  * Returns the bottommost base image if bs == NULL.
5796  */
5797 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
5798                                     BlockDriverState *bs)
5799 {
5800 
5801     GLOBAL_STATE_CODE();
5802 
5803     bs = bdrv_skip_filters(bs);
5804     active = bdrv_skip_filters(active);
5805 
5806     while (active) {
5807         BlockDriverState *next = bdrv_backing_chain_next(active);
5808         if (bs == next) {
5809             return active;
5810         }
5811         active = next;
5812     }
5813 
5814     return NULL;
5815 }
5816 
5817 /* Given a BDS, searches for the base layer. */
5818 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
5819 {
5820     GLOBAL_STATE_CODE();
5821 
5822     return bdrv_find_overlay(bs, NULL);
5823 }
5824 
5825 /*
5826  * Return true if at least one of the COW (backing) and filter links
5827  * between @bs and @base is frozen. @errp is set if that's the case.
5828  * @base must be reachable from @bs, or NULL.
5829  */
5830 bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
5831                                   Error **errp)
5832 {
5833     BlockDriverState *i;
5834     BdrvChild *child;
5835 
5836     GLOBAL_STATE_CODE();
5837 
5838     for (i = bs; i != base; i = child_bs(child)) {
5839         child = bdrv_filter_or_cow_child(i);
5840 
5841         if (child && child->frozen) {
5842             error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
5843                        child->name, i->node_name, child->bs->node_name);
5844             return true;
5845         }
5846     }
5847 
5848     return false;
5849 }
5850 
5851 /*
5852  * Freeze all COW (backing) and filter links between @bs and @base.
5853  * If any of the links is already frozen the operation is aborted and
5854  * none of the links are modified.
5855  * @base must be reachable from @bs, or NULL.
5856  * Returns 0 on success. On failure returns < 0 and sets @errp.
5857  */
5858 int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
5859                               Error **errp)
5860 {
5861     BlockDriverState *i;
5862     BdrvChild *child;
5863 
5864     GLOBAL_STATE_CODE();
5865 
5866     if (bdrv_is_backing_chain_frozen(bs, base, errp)) {
5867         return -EPERM;
5868     }
5869 
5870     for (i = bs; i != base; i = child_bs(child)) {
5871         child = bdrv_filter_or_cow_child(i);
5872         if (child && child->bs->never_freeze) {
5873             error_setg(errp, "Cannot freeze '%s' link to '%s'",
5874                        child->name, child->bs->node_name);
5875             return -EPERM;
5876         }
5877     }
5878 
5879     for (i = bs; i != base; i = child_bs(child)) {
5880         child = bdrv_filter_or_cow_child(i);
5881         if (child) {
5882             child->frozen = true;
5883         }
5884     }
5885 
5886     return 0;
5887 }
5888 
5889 /*
5890  * Unfreeze all COW (backing) and filter links between @bs and @base.
5891  * The caller must ensure that all links are frozen before using this
5892  * function.
5893  * @base must be reachable from @bs, or NULL.
5894  */
5895 void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base)
5896 {
5897     BlockDriverState *i;
5898     BdrvChild *child;
5899 
5900     GLOBAL_STATE_CODE();
5901 
5902     for (i = bs; i != base; i = child_bs(child)) {
5903         child = bdrv_filter_or_cow_child(i);
5904         if (child) {
5905             assert(child->frozen);
5906             child->frozen = false;
5907         }
5908     }
5909 }
5910 
5911 /*
5912  * Drops images above 'base' up to and including 'top', and sets the image
5913  * above 'top' to have base as its backing file.
5914  *
5915  * Requires that the overlay to 'top' is opened r/w, so that the backing file
5916  * information in 'bs' can be properly updated.
5917  *
5918  * E.g., this will convert the following chain:
5919  * bottom <- base <- intermediate <- top <- active
5920  *
5921  * to
5922  *
5923  * bottom <- base <- active
5924  *
5925  * It is allowed for bottom==base, in which case it converts:
5926  *
5927  * base <- intermediate <- top <- active
5928  *
5929  * to
5930  *
5931  * base <- active
5932  *
5933  * If backing_file_str is non-NULL, it will be used when modifying top's
5934  * overlay image metadata.
5935  *
5936  * Error conditions:
5937  *  if active == top, that is considered an error
5938  *
5939  */
5940 int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
5941                            const char *backing_file_str)
5942 {
5943     BlockDriverState *explicit_top = top;
5944     bool update_inherits_from;
5945     BdrvChild *c;
5946     Error *local_err = NULL;
5947     int ret = -EIO;
5948     g_autoptr(GSList) updated_children = NULL;
5949     GSList *p;
5950 
5951     GLOBAL_STATE_CODE();
5952 
5953     bdrv_ref(top);
5954     bdrv_drained_begin(base);
5955     bdrv_graph_rdlock_main_loop();
5956 
5957     if (!top->drv || !base->drv) {
5958         goto exit;
5959     }
5960 
5961     /* Make sure that base is in the backing chain of top */
5962     if (!bdrv_chain_contains(top, base)) {
5963         goto exit;
5964     }
5965 
5966     /* If 'base' recursively inherits from 'top' then we should set
5967      * base->inherits_from to top->inherits_from after 'top' and all
5968      * other intermediate nodes have been dropped.
5969      * If 'top' is an implicit node (e.g. "commit_top") we should skip
5970      * it because no one inherits from it. We use explicit_top for that. */
5971     explicit_top = bdrv_skip_implicit_filters(explicit_top);
5972     update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);
5973 
5974     /* success - we can delete the intermediate states, and link top->base */
5975     if (!backing_file_str) {
5976         bdrv_refresh_filename(base);
5977         backing_file_str = base->filename;
5978     }
5979 
5980     QLIST_FOREACH(c, &top->parents, next_parent) {
5981         updated_children = g_slist_prepend(updated_children, c);
5982     }
5983 
5984     /*
5985      * It seems correct to pass detach_subchain=true here, but it triggers
5986      * one more yet not fixed bug, when due to nested aio_poll loop we switch to
5987      * another drained section, which modify the graph (for example, removing
5988      * the child, which we keep in updated_children list). So, it's a TODO.
5989      *
5990      * Note, bug triggered if pass detach_subchain=true here and run
5991      * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
5992      * That's a FIXME.
5993      */
5994     bdrv_replace_node_common(top, base, false, false, &local_err);
5995     if (local_err) {
5996         error_report_err(local_err);
5997         goto exit;
5998     }
5999 
6000     for (p = updated_children; p; p = p->next) {
6001         c = p->data;
6002 
6003         if (c->klass->update_filename) {
6004             ret = c->klass->update_filename(c, base, backing_file_str,
6005                                             &local_err);
6006             if (ret < 0) {
6007                 /*
6008                  * TODO: Actually, we want to rollback all previous iterations
6009                  * of this loop, and (which is almost impossible) previous
6010                  * bdrv_replace_node()...
6011                  *
6012                  * Note, that c->klass->update_filename may lead to permission
6013                  * update, so it's a bad idea to call it inside permission
6014                  * update transaction of bdrv_replace_node.
6015                  */
6016                 error_report_err(local_err);
6017                 goto exit;
6018             }
6019         }
6020     }
6021 
6022     if (update_inherits_from) {
6023         base->inherits_from = explicit_top->inherits_from;
6024     }
6025 
6026     ret = 0;
6027 exit:
6028     bdrv_graph_rdunlock_main_loop();
6029     bdrv_drained_end(base);
6030     bdrv_unref(top);
6031     return ret;
6032 }
6033 
6034 /**
6035  * Implementation of BlockDriver.bdrv_co_get_allocated_file_size() that
6036  * sums the size of all data-bearing children.  (This excludes backing
6037  * children.)
6038  */
6039 static int64_t coroutine_fn GRAPH_RDLOCK
6040 bdrv_sum_allocated_file_size(BlockDriverState *bs)
6041 {
6042     BdrvChild *child;
6043     int64_t child_size, sum = 0;
6044 
6045     QLIST_FOREACH(child, &bs->children, next) {
6046         if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
6047                            BDRV_CHILD_FILTERED))
6048         {
6049             child_size = bdrv_co_get_allocated_file_size(child->bs);
6050             if (child_size < 0) {
6051                 return child_size;
6052             }
6053             sum += child_size;
6054         }
6055     }
6056 
6057     return sum;
6058 }
6059 
6060 /**
6061  * Length of a allocated file in bytes. Sparse files are counted by actual
6062  * allocated space. Return < 0 if error or unknown.
6063  */
6064 int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs)
6065 {
6066     BlockDriver *drv = bs->drv;
6067     IO_CODE();
6068     assert_bdrv_graph_readable();
6069 
6070     if (!drv) {
6071         return -ENOMEDIUM;
6072     }
6073     if (drv->bdrv_co_get_allocated_file_size) {
6074         return drv->bdrv_co_get_allocated_file_size(bs);
6075     }
6076 
6077     if (drv->bdrv_file_open) {
6078         /*
6079          * Protocol drivers default to -ENOTSUP (most of their data is
6080          * not stored in any of their children (if they even have any),
6081          * so there is no generic way to figure it out).
6082          */
6083         return -ENOTSUP;
6084     } else if (drv->is_filter) {
6085         /* Filter drivers default to the size of their filtered child */
6086         return bdrv_co_get_allocated_file_size(bdrv_filter_bs(bs));
6087     } else {
6088         /* Other drivers default to summing their children's sizes */
6089         return bdrv_sum_allocated_file_size(bs);
6090     }
6091 }
6092 
6093 /*
6094  * bdrv_measure:
6095  * @drv: Format driver
6096  * @opts: Creation options for new image
6097  * @in_bs: Existing image containing data for new image (may be NULL)
6098  * @errp: Error object
6099  * Returns: A #BlockMeasureInfo (free using qapi_free_BlockMeasureInfo())
6100  *          or NULL on error
6101  *
6102  * Calculate file size required to create a new image.
6103  *
6104  * If @in_bs is given then space for allocated clusters and zero clusters
6105  * from that image are included in the calculation.  If @opts contains a
6106  * backing file that is shared by @in_bs then backing clusters may be omitted
6107  * from the calculation.
6108  *
6109  * If @in_bs is NULL then the calculation includes no allocated clusters
6110  * unless a preallocation option is given in @opts.
6111  *
6112  * Note that @in_bs may use a different BlockDriver from @drv.
6113  *
6114  * If an error occurs the @errp pointer is set.
6115  */
6116 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
6117                                BlockDriverState *in_bs, Error **errp)
6118 {
6119     IO_CODE();
6120     if (!drv->bdrv_measure) {
6121         error_setg(errp, "Block driver '%s' does not support size measurement",
6122                    drv->format_name);
6123         return NULL;
6124     }
6125 
6126     return drv->bdrv_measure(opts, in_bs, errp);
6127 }
6128 
6129 /**
6130  * Return number of sectors on success, -errno on error.
6131  */
6132 int64_t coroutine_fn bdrv_co_nb_sectors(BlockDriverState *bs)
6133 {
6134     BlockDriver *drv = bs->drv;
6135     IO_CODE();
6136     assert_bdrv_graph_readable();
6137 
6138     if (!drv)
6139         return -ENOMEDIUM;
6140 
6141     if (bs->bl.has_variable_length) {
6142         int ret = bdrv_co_refresh_total_sectors(bs, bs->total_sectors);
6143         if (ret < 0) {
6144             return ret;
6145         }
6146     }
6147     return bs->total_sectors;
6148 }
6149 
6150 /*
6151  * This wrapper is written by hand because this function is in the hot I/O path,
6152  * via blk_get_geometry.
6153  */
6154 int64_t coroutine_mixed_fn bdrv_nb_sectors(BlockDriverState *bs)
6155 {
6156     BlockDriver *drv = bs->drv;
6157     IO_CODE();
6158 
6159     if (!drv)
6160         return -ENOMEDIUM;
6161 
6162     if (bs->bl.has_variable_length) {
6163         int ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
6164         if (ret < 0) {
6165             return ret;
6166         }
6167     }
6168 
6169     return bs->total_sectors;
6170 }
6171 
6172 /**
6173  * Return length in bytes on success, -errno on error.
6174  * The length is always a multiple of BDRV_SECTOR_SIZE.
6175  */
6176 int64_t coroutine_fn bdrv_co_getlength(BlockDriverState *bs)
6177 {
6178     int64_t ret;
6179     IO_CODE();
6180     assert_bdrv_graph_readable();
6181 
6182     ret = bdrv_co_nb_sectors(bs);
6183     if (ret < 0) {
6184         return ret;
6185     }
6186     if (ret > INT64_MAX / BDRV_SECTOR_SIZE) {
6187         return -EFBIG;
6188     }
6189     return ret * BDRV_SECTOR_SIZE;
6190 }
6191 
6192 bool bdrv_is_sg(BlockDriverState *bs)
6193 {
6194     IO_CODE();
6195     return bs->sg;
6196 }
6197 
6198 /**
6199  * Return whether the given node supports compressed writes.
6200  */
6201 bool bdrv_supports_compressed_writes(BlockDriverState *bs)
6202 {
6203     BlockDriverState *filtered;
6204     IO_CODE();
6205 
6206     if (!bs->drv || !block_driver_can_compress(bs->drv)) {
6207         return false;
6208     }
6209 
6210     filtered = bdrv_filter_bs(bs);
6211     if (filtered) {
6212         /*
6213          * Filters can only forward compressed writes, so we have to
6214          * check the child.
6215          */
6216         return bdrv_supports_compressed_writes(filtered);
6217     }
6218 
6219     return true;
6220 }
6221 
6222 const char *bdrv_get_format_name(BlockDriverState *bs)
6223 {
6224     IO_CODE();
6225     return bs->drv ? bs->drv->format_name : NULL;
6226 }
6227 
/*
 * qsort() comparison callback for arrays of C strings: @a and @b point to
 * the array elements (i.e. to string pointers), which are compared with
 * strcmp().
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *lhs = *(char *const *)a;
    const char *rhs = *(char *const *)b;

    return strcmp(lhs, rhs);
}
6232 
6233 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
6234                          void *opaque, bool read_only)
6235 {
6236     BlockDriver *drv;
6237     int count = 0;
6238     int i;
6239     const char **formats = NULL;
6240 
6241     GLOBAL_STATE_CODE();
6242 
6243     QLIST_FOREACH(drv, &bdrv_drivers, list) {
6244         if (drv->format_name) {
6245             bool found = false;
6246 
6247             if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
6248                 continue;
6249             }
6250 
6251             i = count;
6252             while (formats && i && !found) {
6253                 found = !strcmp(formats[--i], drv->format_name);
6254             }
6255 
6256             if (!found) {
6257                 formats = g_renew(const char *, formats, count + 1);
6258                 formats[count++] = drv->format_name;
6259             }
6260         }
6261     }
6262 
6263     for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
6264         const char *format_name = block_driver_modules[i].format_name;
6265 
6266         if (format_name) {
6267             bool found = false;
6268             int j = count;
6269 
6270             if (use_bdrv_whitelist &&
6271                 !bdrv_format_is_whitelisted(format_name, read_only)) {
6272                 continue;
6273             }
6274 
6275             while (formats && j && !found) {
6276                 found = !strcmp(formats[--j], format_name);
6277             }
6278 
6279             if (!found) {
6280                 formats = g_renew(const char *, formats, count + 1);
6281                 formats[count++] = format_name;
6282             }
6283         }
6284     }
6285 
6286     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
6287 
6288     for (i = 0; i < count; i++) {
6289         it(opaque, formats[i]);
6290     }
6291 
6292     g_free(formats);
6293 }
6294 
6295 /* This function is to find a node in the bs graph */
6296 BlockDriverState *bdrv_find_node(const char *node_name)
6297 {
6298     BlockDriverState *bs;
6299 
6300     assert(node_name);
6301     GLOBAL_STATE_CODE();
6302 
6303     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6304         if (!strcmp(node_name, bs->node_name)) {
6305             return bs;
6306         }
6307     }
6308     return NULL;
6309 }
6310 
6311 /* Put this QMP function here so it can access the static graph_bdrv_states. */
6312 BlockDeviceInfoList *bdrv_named_nodes_list(bool flat,
6313                                            Error **errp)
6314 {
6315     BlockDeviceInfoList *list;
6316     BlockDriverState *bs;
6317 
6318     GLOBAL_STATE_CODE();
6319     GRAPH_RDLOCK_GUARD_MAINLOOP();
6320 
6321     list = NULL;
6322     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6323         BlockDeviceInfo *info = bdrv_block_device_info(NULL, bs, flat, errp);
6324         if (!info) {
6325             qapi_free_BlockDeviceInfoList(list);
6326             return NULL;
6327         }
6328         QAPI_LIST_PREPEND(list, info);
6329     }
6330 
6331     return list;
6332 }
6333 
6334 typedef struct XDbgBlockGraphConstructor {
6335     XDbgBlockGraph *graph;
6336     GHashTable *graph_nodes;
6337 } XDbgBlockGraphConstructor;
6338 
6339 static XDbgBlockGraphConstructor *xdbg_graph_new(void)
6340 {
6341     XDbgBlockGraphConstructor *gr = g_new(XDbgBlockGraphConstructor, 1);
6342 
6343     gr->graph = g_new0(XDbgBlockGraph, 1);
6344     gr->graph_nodes = g_hash_table_new(NULL, NULL);
6345 
6346     return gr;
6347 }
6348 
6349 static XDbgBlockGraph *xdbg_graph_finalize(XDbgBlockGraphConstructor *gr)
6350 {
6351     XDbgBlockGraph *graph = gr->graph;
6352 
6353     g_hash_table_destroy(gr->graph_nodes);
6354     g_free(gr);
6355 
6356     return graph;
6357 }
6358 
6359 static uintptr_t xdbg_graph_node_num(XDbgBlockGraphConstructor *gr, void *node)
6360 {
6361     uintptr_t ret = (uintptr_t)g_hash_table_lookup(gr->graph_nodes, node);
6362 
6363     if (ret != 0) {
6364         return ret;
6365     }
6366 
6367     /*
6368      * Start counting from 1, not 0, because 0 interferes with not-found (NULL)
6369      * answer of g_hash_table_lookup.
6370      */
6371     ret = g_hash_table_size(gr->graph_nodes) + 1;
6372     g_hash_table_insert(gr->graph_nodes, node, (void *)ret);
6373 
6374     return ret;
6375 }
6376 
6377 static void xdbg_graph_add_node(XDbgBlockGraphConstructor *gr, void *node,
6378                                 XDbgBlockGraphNodeType type, const char *name)
6379 {
6380     XDbgBlockGraphNode *n;
6381 
6382     n = g_new0(XDbgBlockGraphNode, 1);
6383 
6384     n->id = xdbg_graph_node_num(gr, node);
6385     n->type = type;
6386     n->name = g_strdup(name);
6387 
6388     QAPI_LIST_PREPEND(gr->graph->nodes, n);
6389 }
6390 
6391 static void xdbg_graph_add_edge(XDbgBlockGraphConstructor *gr, void *parent,
6392                                 const BdrvChild *child)
6393 {
6394     BlockPermission qapi_perm;
6395     XDbgBlockGraphEdge *edge;
6396     GLOBAL_STATE_CODE();
6397 
6398     edge = g_new0(XDbgBlockGraphEdge, 1);
6399 
6400     edge->parent = xdbg_graph_node_num(gr, parent);
6401     edge->child = xdbg_graph_node_num(gr, child->bs);
6402     edge->name = g_strdup(child->name);
6403 
6404     for (qapi_perm = 0; qapi_perm < BLOCK_PERMISSION__MAX; qapi_perm++) {
6405         uint64_t flag = bdrv_qapi_perm_to_blk_perm(qapi_perm);
6406 
6407         if (flag & child->perm) {
6408             QAPI_LIST_PREPEND(edge->perm, qapi_perm);
6409         }
6410         if (flag & child->shared_perm) {
6411             QAPI_LIST_PREPEND(edge->shared_perm, qapi_perm);
6412         }
6413     }
6414 
6415     QAPI_LIST_PREPEND(gr->graph->edges, edge);
6416 }
6417 
6418 
6419 XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
6420 {
6421     BlockBackend *blk;
6422     BlockJob *job;
6423     BlockDriverState *bs;
6424     BdrvChild *child;
6425     XDbgBlockGraphConstructor *gr = xdbg_graph_new();
6426 
6427     GLOBAL_STATE_CODE();
6428 
6429     for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
6430         char *allocated_name = NULL;
6431         const char *name = blk_name(blk);
6432 
6433         if (!*name) {
6434             name = allocated_name = blk_get_attached_dev_id(blk);
6435         }
6436         xdbg_graph_add_node(gr, blk, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_BACKEND,
6437                            name);
6438         g_free(allocated_name);
6439         if (blk_root(blk)) {
6440             xdbg_graph_add_edge(gr, blk, blk_root(blk));
6441         }
6442     }
6443 
6444     WITH_JOB_LOCK_GUARD() {
6445         for (job = block_job_next_locked(NULL); job;
6446              job = block_job_next_locked(job)) {
6447             GSList *el;
6448 
6449             xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
6450                                 job->job.id);
6451             for (el = job->nodes; el; el = el->next) {
6452                 xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
6453             }
6454         }
6455     }
6456 
6457     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6458         xdbg_graph_add_node(gr, bs, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_DRIVER,
6459                            bs->node_name);
6460         QLIST_FOREACH(child, &bs->children, next) {
6461             xdbg_graph_add_edge(gr, bs, child);
6462         }
6463     }
6464 
6465     return xdbg_graph_finalize(gr);
6466 }
6467 
6468 BlockDriverState *bdrv_lookup_bs(const char *device,
6469                                  const char *node_name,
6470                                  Error **errp)
6471 {
6472     BlockBackend *blk;
6473     BlockDriverState *bs;
6474 
6475     GLOBAL_STATE_CODE();
6476 
6477     if (device) {
6478         blk = blk_by_name(device);
6479 
6480         if (blk) {
6481             bs = blk_bs(blk);
6482             if (!bs) {
6483                 error_setg(errp, "Device '%s' has no medium", device);
6484             }
6485 
6486             return bs;
6487         }
6488     }
6489 
6490     if (node_name) {
6491         bs = bdrv_find_node(node_name);
6492 
6493         if (bs) {
6494             return bs;
6495         }
6496     }
6497 
6498     error_setg(errp, "Cannot find device=\'%s\' nor node-name=\'%s\'",
6499                      device ? device : "",
6500                      node_name ? node_name : "");
6501     return NULL;
6502 }
6503 
6504 /* If 'base' is in the same chain as 'top', return true. Otherwise,
6505  * return false.  If either argument is NULL, return false. */
6506 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
6507 {
6508 
6509     GLOBAL_STATE_CODE();
6510 
6511     while (top && top != base) {
6512         top = bdrv_filter_or_cow_bs(top);
6513     }
6514 
6515     return top != NULL;
6516 }
6517 
6518 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
6519 {
6520     GLOBAL_STATE_CODE();
6521     if (!bs) {
6522         return QTAILQ_FIRST(&graph_bdrv_states);
6523     }
6524     return QTAILQ_NEXT(bs, node_list);
6525 }
6526 
6527 BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
6528 {
6529     GLOBAL_STATE_CODE();
6530     if (!bs) {
6531         return QTAILQ_FIRST(&all_bdrv_states);
6532     }
6533     return QTAILQ_NEXT(bs, bs_list);
6534 }
6535 
6536 const char *bdrv_get_node_name(const BlockDriverState *bs)
6537 {
6538     IO_CODE();
6539     return bs->node_name;
6540 }
6541 
6542 const char *bdrv_get_parent_name(const BlockDriverState *bs)
6543 {
6544     BdrvChild *c;
6545     const char *name;
6546     IO_CODE();
6547 
6548     /* If multiple parents have a name, just pick the first one. */
6549     QLIST_FOREACH(c, &bs->parents, next_parent) {
6550         if (c->klass->get_name) {
6551             name = c->klass->get_name(c);
6552             if (name && *name) {
6553                 return name;
6554             }
6555         }
6556     }
6557 
6558     return NULL;
6559 }
6560 
6561 /* TODO check what callers really want: bs->node_name or blk_name() */
6562 const char *bdrv_get_device_name(const BlockDriverState *bs)
6563 {
6564     IO_CODE();
6565     return bdrv_get_parent_name(bs) ?: "";
6566 }
6567 
6568 /* This can be used to identify nodes that might not have a device
6569  * name associated. Since node and device names live in the same
6570  * namespace, the result is unambiguous. The exception is if both are
6571  * absent, then this returns an empty (non-null) string. */
6572 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
6573 {
6574     IO_CODE();
6575     return bdrv_get_parent_name(bs) ?: bs->node_name;
6576 }
6577 
6578 int bdrv_get_flags(BlockDriverState *bs)
6579 {
6580     IO_CODE();
6581     return bs->open_flags;
6582 }
6583 
6584 int bdrv_has_zero_init_1(BlockDriverState *bs)
6585 {
6586     GLOBAL_STATE_CODE();
6587     return 1;
6588 }
6589 
6590 int bdrv_has_zero_init(BlockDriverState *bs)
6591 {
6592     BlockDriverState *filtered;
6593     GLOBAL_STATE_CODE();
6594 
6595     if (!bs->drv) {
6596         return 0;
6597     }
6598 
6599     /* If BS is a copy on write image, it is initialized to
6600        the contents of the base image, which may not be zeroes.  */
6601     if (bdrv_cow_child(bs)) {
6602         return 0;
6603     }
6604     if (bs->drv->bdrv_has_zero_init) {
6605         return bs->drv->bdrv_has_zero_init(bs);
6606     }
6607 
6608     filtered = bdrv_filter_bs(bs);
6609     if (filtered) {
6610         return bdrv_has_zero_init(filtered);
6611     }
6612 
6613     /* safe default */
6614     return 0;
6615 }
6616 
6617 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
6618 {
6619     IO_CODE();
6620     if (!(bs->open_flags & BDRV_O_UNMAP)) {
6621         return false;
6622     }
6623 
6624     return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
6625 }
6626 
6627 void bdrv_get_backing_filename(BlockDriverState *bs,
6628                                char *filename, int filename_size)
6629 {
6630     IO_CODE();
6631     pstrcpy(filename, filename_size, bs->backing_file);
6632 }
6633 
6634 int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
6635 {
6636     int ret;
6637     BlockDriver *drv = bs->drv;
6638     IO_CODE();
6639     assert_bdrv_graph_readable();
6640 
6641     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
6642     if (!drv) {
6643         return -ENOMEDIUM;
6644     }
6645     if (!drv->bdrv_co_get_info) {
6646         BlockDriverState *filtered = bdrv_filter_bs(bs);
6647         if (filtered) {
6648             return bdrv_co_get_info(filtered, bdi);
6649         }
6650         return -ENOTSUP;
6651     }
6652     memset(bdi, 0, sizeof(*bdi));
6653     ret = drv->bdrv_co_get_info(bs, bdi);
6654     if (bdi->subcluster_size == 0) {
6655         /*
6656          * If the driver left this unset, subclusters are not supported.
6657          * Then it is safe to treat each cluster as having only one subcluster.
6658          */
6659         bdi->subcluster_size = bdi->cluster_size;
6660     }
6661     if (ret < 0) {
6662         return ret;
6663     }
6664 
6665     if (bdi->cluster_size > BDRV_MAX_ALIGNMENT) {
6666         return -EINVAL;
6667     }
6668 
6669     return 0;
6670 }
6671 
6672 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
6673                                           Error **errp)
6674 {
6675     BlockDriver *drv = bs->drv;
6676     IO_CODE();
6677     if (drv && drv->bdrv_get_specific_info) {
6678         return drv->bdrv_get_specific_info(bs, errp);
6679     }
6680     return NULL;
6681 }
6682 
6683 BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
6684 {
6685     BlockDriver *drv = bs->drv;
6686     IO_CODE();
6687     if (!drv || !drv->bdrv_get_specific_stats) {
6688         return NULL;
6689     }
6690     return drv->bdrv_get_specific_stats(bs);
6691 }
6692 
6693 void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event)
6694 {
6695     IO_CODE();
6696     assert_bdrv_graph_readable();
6697 
6698     if (!bs || !bs->drv || !bs->drv->bdrv_co_debug_event) {
6699         return;
6700     }
6701 
6702     bs->drv->bdrv_co_debug_event(bs, event);
6703 }
6704 
6705 static BlockDriverState * GRAPH_RDLOCK
6706 bdrv_find_debug_node(BlockDriverState *bs)
6707 {
6708     GLOBAL_STATE_CODE();
6709     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
6710         bs = bdrv_primary_bs(bs);
6711     }
6712 
6713     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
6714         assert(bs->drv->bdrv_debug_remove_breakpoint);
6715         return bs;
6716     }
6717 
6718     return NULL;
6719 }
6720 
6721 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
6722                           const char *tag)
6723 {
6724     GLOBAL_STATE_CODE();
6725     GRAPH_RDLOCK_GUARD_MAINLOOP();
6726 
6727     bs = bdrv_find_debug_node(bs);
6728     if (bs) {
6729         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
6730     }
6731 
6732     return -ENOTSUP;
6733 }
6734 
6735 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
6736 {
6737     GLOBAL_STATE_CODE();
6738     GRAPH_RDLOCK_GUARD_MAINLOOP();
6739 
6740     bs = bdrv_find_debug_node(bs);
6741     if (bs) {
6742         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
6743     }
6744 
6745     return -ENOTSUP;
6746 }
6747 
6748 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
6749 {
6750     GLOBAL_STATE_CODE();
6751     GRAPH_RDLOCK_GUARD_MAINLOOP();
6752 
6753     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
6754         bs = bdrv_primary_bs(bs);
6755     }
6756 
6757     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
6758         return bs->drv->bdrv_debug_resume(bs, tag);
6759     }
6760 
6761     return -ENOTSUP;
6762 }
6763 
6764 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
6765 {
6766     GLOBAL_STATE_CODE();
6767     GRAPH_RDLOCK_GUARD_MAINLOOP();
6768 
6769     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
6770         bs = bdrv_primary_bs(bs);
6771     }
6772 
6773     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
6774         return bs->drv->bdrv_debug_is_suspended(bs, tag);
6775     }
6776 
6777     return false;
6778 }
6779 
6780 /* backing_file can either be relative, or absolute, or a protocol.  If it is
6781  * relative, it must be relative to the chain.  So, passing in bs->filename
6782  * from a BDS as backing_file should not be done, as that may be relative to
6783  * the CWD rather than the chain. */
6784 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
6785         const char *backing_file)
6786 {
6787     char *filename_full = NULL;
6788     char *backing_file_full = NULL;
6789     char *filename_tmp = NULL;
6790     int is_protocol = 0;
6791     bool filenames_refreshed = false;
6792     BlockDriverState *curr_bs = NULL;
6793     BlockDriverState *retval = NULL;
6794     BlockDriverState *bs_below;
6795 
6796     GLOBAL_STATE_CODE();
6797     GRAPH_RDLOCK_GUARD_MAINLOOP();
6798 
6799     if (!bs || !bs->drv || !backing_file) {
6800         return NULL;
6801     }
6802 
6803     filename_full     = g_malloc(PATH_MAX);
6804     backing_file_full = g_malloc(PATH_MAX);
6805 
6806     is_protocol = path_has_protocol(backing_file);
6807 
6808     /*
6809      * Being largely a legacy function, skip any filters here
6810      * (because filters do not have normal filenames, so they cannot
6811      * match anyway; and allowing json:{} filenames is a bit out of
6812      * scope).
6813      */
6814     for (curr_bs = bdrv_skip_filters(bs);
6815          bdrv_cow_child(curr_bs) != NULL;
6816          curr_bs = bs_below)
6817     {
6818         bs_below = bdrv_backing_chain_next(curr_bs);
6819 
6820         if (bdrv_backing_overridden(curr_bs)) {
6821             /*
6822              * If the backing file was overridden, we can only compare
6823              * directly against the backing node's filename.
6824              */
6825 
6826             if (!filenames_refreshed) {
6827                 /*
6828                  * This will automatically refresh all of the
6829                  * filenames in the rest of the backing chain, so we
6830                  * only need to do this once.
6831                  */
6832                 bdrv_refresh_filename(bs_below);
6833                 filenames_refreshed = true;
6834             }
6835 
6836             if (strcmp(backing_file, bs_below->filename) == 0) {
6837                 retval = bs_below;
6838                 break;
6839             }
6840         } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
6841             /*
6842              * If either of the filename paths is actually a protocol, then
6843              * compare unmodified paths; otherwise make paths relative.
6844              */
6845             char *backing_file_full_ret;
6846 
6847             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
6848                 retval = bs_below;
6849                 break;
6850             }
6851             /* Also check against the full backing filename for the image */
6852             backing_file_full_ret = bdrv_get_full_backing_filename(curr_bs,
6853                                                                    NULL);
6854             if (backing_file_full_ret) {
6855                 bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
6856                 g_free(backing_file_full_ret);
6857                 if (equal) {
6858                     retval = bs_below;
6859                     break;
6860                 }
6861             }
6862         } else {
6863             /* If not an absolute filename path, make it relative to the current
6864              * image's filename path */
6865             filename_tmp = bdrv_make_absolute_filename(curr_bs, backing_file,
6866                                                        NULL);
6867             /* We are going to compare canonicalized absolute pathnames */
6868             if (!filename_tmp || !realpath(filename_tmp, filename_full)) {
6869                 g_free(filename_tmp);
6870                 continue;
6871             }
6872             g_free(filename_tmp);
6873 
6874             /* We need to make sure the backing filename we are comparing against
6875              * is relative to the current image filename (or absolute) */
6876             filename_tmp = bdrv_get_full_backing_filename(curr_bs, NULL);
6877             if (!filename_tmp || !realpath(filename_tmp, backing_file_full)) {
6878                 g_free(filename_tmp);
6879                 continue;
6880             }
6881             g_free(filename_tmp);
6882 
6883             if (strcmp(backing_file_full, filename_full) == 0) {
6884                 retval = bs_below;
6885                 break;
6886             }
6887         }
6888     }
6889 
6890     g_free(filename_full);
6891     g_free(backing_file_full);
6892     return retval;
6893 }
6894 
6895 void bdrv_init(void)
6896 {
6897 #ifdef CONFIG_BDRV_WHITELIST_TOOLS
6898     use_bdrv_whitelist = 1;
6899 #endif
6900     module_call_init(MODULE_INIT_BLOCK);
6901 }
6902 
6903 void bdrv_init_with_whitelist(void)
6904 {
6905     use_bdrv_whitelist = 1;
6906     bdrv_init();
6907 }
6908 
6909 int bdrv_activate(BlockDriverState *bs, Error **errp)
6910 {
6911     BdrvChild *child, *parent;
6912     Error *local_err = NULL;
6913     int ret;
6914     BdrvDirtyBitmap *bm;
6915 
6916     GLOBAL_STATE_CODE();
6917     GRAPH_RDLOCK_GUARD_MAINLOOP();
6918 
6919     if (!bs->drv)  {
6920         return -ENOMEDIUM;
6921     }
6922 
6923     QLIST_FOREACH(child, &bs->children, next) {
6924         bdrv_activate(child->bs, &local_err);
6925         if (local_err) {
6926             error_propagate(errp, local_err);
6927             return -EINVAL;
6928         }
6929     }
6930 
6931     /*
6932      * Update permissions, they may differ for inactive nodes.
6933      *
6934      * Note that the required permissions of inactive images are always a
6935      * subset of the permissions required after activating the image. This
6936      * allows us to just get the permissions upfront without restricting
6937      * bdrv_co_invalidate_cache().
6938      *
6939      * It also means that in error cases, we don't have to try and revert to
6940      * the old permissions (which is an operation that could fail, too). We can
6941      * just keep the extended permissions for the next time that an activation
6942      * of the image is tried.
6943      */
6944     if (bs->open_flags & BDRV_O_INACTIVE) {
6945         bs->open_flags &= ~BDRV_O_INACTIVE;
6946         ret = bdrv_refresh_perms(bs, NULL, errp);
6947         if (ret < 0) {
6948             bs->open_flags |= BDRV_O_INACTIVE;
6949             return ret;
6950         }
6951 
6952         ret = bdrv_invalidate_cache(bs, errp);
6953         if (ret < 0) {
6954             bs->open_flags |= BDRV_O_INACTIVE;
6955             return ret;
6956         }
6957 
6958         FOR_EACH_DIRTY_BITMAP(bs, bm) {
6959             bdrv_dirty_bitmap_skip_store(bm, false);
6960         }
6961 
6962         ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
6963         if (ret < 0) {
6964             bs->open_flags |= BDRV_O_INACTIVE;
6965             error_setg_errno(errp, -ret, "Could not refresh total sector count");
6966             return ret;
6967         }
6968     }
6969 
6970     QLIST_FOREACH(parent, &bs->parents, next_parent) {
6971         if (parent->klass->activate) {
6972             parent->klass->activate(parent, &local_err);
6973             if (local_err) {
6974                 bs->open_flags |= BDRV_O_INACTIVE;
6975                 error_propagate(errp, local_err);
6976                 return -EINVAL;
6977             }
6978         }
6979     }
6980 
6981     return 0;
6982 }
6983 
6984 int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
6985 {
6986     Error *local_err = NULL;
6987     IO_CODE();
6988 
6989     assert(!(bs->open_flags & BDRV_O_INACTIVE));
6990     assert_bdrv_graph_readable();
6991 
6992     if (bs->drv->bdrv_co_invalidate_cache) {
6993         bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
6994         if (local_err) {
6995             error_propagate(errp, local_err);
6996             return -EINVAL;
6997         }
6998     }
6999 
7000     return 0;
7001 }
7002 
7003 void bdrv_activate_all(Error **errp)
7004 {
7005     BlockDriverState *bs;
7006     BdrvNextIterator it;
7007 
7008     GLOBAL_STATE_CODE();
7009     GRAPH_RDLOCK_GUARD_MAINLOOP();
7010 
7011     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
7012         AioContext *aio_context = bdrv_get_aio_context(bs);
7013         int ret;
7014 
7015         aio_context_acquire(aio_context);
7016         ret = bdrv_activate(bs, errp);
7017         aio_context_release(aio_context);
7018         if (ret < 0) {
7019             bdrv_next_cleanup(&it);
7020             return;
7021         }
7022     }
7023 }
7024 
7025 static bool GRAPH_RDLOCK
7026 bdrv_has_bds_parent(BlockDriverState *bs, bool only_active)
7027 {
7028     BdrvChild *parent;
7029     GLOBAL_STATE_CODE();
7030 
7031     QLIST_FOREACH(parent, &bs->parents, next_parent) {
7032         if (parent->klass->parent_is_bds) {
7033             BlockDriverState *parent_bs = parent->opaque;
7034             if (!only_active || !(parent_bs->open_flags & BDRV_O_INACTIVE)) {
7035                 return true;
7036             }
7037         }
7038     }
7039 
7040     return false;
7041 }
7042 
7043 static int GRAPH_RDLOCK bdrv_inactivate_recurse(BlockDriverState *bs)
7044 {
7045     BdrvChild *child, *parent;
7046     int ret;
7047     uint64_t cumulative_perms, cumulative_shared_perms;
7048 
7049     GLOBAL_STATE_CODE();
7050 
7051     if (!bs->drv) {
7052         return -ENOMEDIUM;
7053     }
7054 
7055     /* Make sure that we don't inactivate a child before its parent.
7056      * It will be covered by recursion from the yet active parent. */
7057     if (bdrv_has_bds_parent(bs, true)) {
7058         return 0;
7059     }
7060 
7061     assert(!(bs->open_flags & BDRV_O_INACTIVE));
7062 
7063     /* Inactivate this node */
7064     if (bs->drv->bdrv_inactivate) {
7065         ret = bs->drv->bdrv_inactivate(bs);
7066         if (ret < 0) {
7067             return ret;
7068         }
7069     }
7070 
7071     QLIST_FOREACH(parent, &bs->parents, next_parent) {
7072         if (parent->klass->inactivate) {
7073             ret = parent->klass->inactivate(parent);
7074             if (ret < 0) {
7075                 return ret;
7076             }
7077         }
7078     }
7079 
7080     bdrv_get_cumulative_perm(bs, &cumulative_perms,
7081                              &cumulative_shared_perms);
7082     if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
7083         /* Our inactive parents still need write access. Inactivation failed. */
7084         return -EPERM;
7085     }
7086 
7087     bs->open_flags |= BDRV_O_INACTIVE;
7088 
7089     /*
7090      * Update permissions, they may differ for inactive nodes.
7091      * We only tried to loosen restrictions, so errors are not fatal, ignore
7092      * them.
7093      */
7094     bdrv_refresh_perms(bs, NULL, NULL);
7095 
7096     /* Recursively inactivate children */
7097     QLIST_FOREACH(child, &bs->children, next) {
7098         ret = bdrv_inactivate_recurse(child->bs);
7099         if (ret < 0) {
7100             return ret;
7101         }
7102     }
7103 
7104     return 0;
7105 }
7106 
7107 int bdrv_inactivate_all(void)
7108 {
7109     BlockDriverState *bs = NULL;
7110     BdrvNextIterator it;
7111     int ret = 0;
7112     GSList *aio_ctxs = NULL, *ctx;
7113 
7114     GLOBAL_STATE_CODE();
7115     GRAPH_RDLOCK_GUARD_MAINLOOP();
7116 
7117     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
7118         AioContext *aio_context = bdrv_get_aio_context(bs);
7119 
7120         if (!g_slist_find(aio_ctxs, aio_context)) {
7121             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
7122             aio_context_acquire(aio_context);
7123         }
7124     }
7125 
7126     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
7127         /* Nodes with BDS parents are covered by recursion from the last
7128          * parent that gets inactivated. Don't inactivate them a second
7129          * time if that has already happened. */
7130         if (bdrv_has_bds_parent(bs, false)) {
7131             continue;
7132         }
7133         ret = bdrv_inactivate_recurse(bs);
7134         if (ret < 0) {
7135             bdrv_next_cleanup(&it);
7136             goto out;
7137         }
7138     }
7139 
7140 out:
7141     for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
7142         AioContext *aio_context = ctx->data;
7143         aio_context_release(aio_context);
7144     }
7145     g_slist_free(aio_ctxs);
7146 
7147     return ret;
7148 }
7149 
7150 /**************************************************************/
7151 /* removable device support */
7152 
7153 /**
7154  * Return TRUE if the media is present
7155  */
7156 bool coroutine_fn bdrv_co_is_inserted(BlockDriverState *bs)
7157 {
7158     BlockDriver *drv = bs->drv;
7159     BdrvChild *child;
7160     IO_CODE();
7161     assert_bdrv_graph_readable();
7162 
7163     if (!drv) {
7164         return false;
7165     }
7166     if (drv->bdrv_co_is_inserted) {
7167         return drv->bdrv_co_is_inserted(bs);
7168     }
7169     QLIST_FOREACH(child, &bs->children, next) {
7170         if (!bdrv_co_is_inserted(child->bs)) {
7171             return false;
7172         }
7173     }
7174     return true;
7175 }
7176 
7177 /**
7178  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
7179  */
7180 void coroutine_fn bdrv_co_eject(BlockDriverState *bs, bool eject_flag)
7181 {
7182     BlockDriver *drv = bs->drv;
7183     IO_CODE();
7184     assert_bdrv_graph_readable();
7185 
7186     if (drv && drv->bdrv_co_eject) {
7187         drv->bdrv_co_eject(bs, eject_flag);
7188     }
7189 }
7190 
7191 /**
7192  * Lock or unlock the media (if it is locked, the user won't be able
7193  * to eject it manually).
7194  */
7195 void coroutine_fn bdrv_co_lock_medium(BlockDriverState *bs, bool locked)
7196 {
7197     BlockDriver *drv = bs->drv;
7198     IO_CODE();
7199     assert_bdrv_graph_readable();
7200     trace_bdrv_lock_medium(bs, locked);
7201 
7202     if (drv && drv->bdrv_co_lock_medium) {
7203         drv->bdrv_co_lock_medium(bs, locked);
7204     }
7205 }
7206 
7207 /* Get a reference to bs */
7208 void bdrv_ref(BlockDriverState *bs)
7209 {
7210     GLOBAL_STATE_CODE();
7211     bs->refcnt++;
7212 }
7213 
7214 /* Release a previously grabbed reference to bs.
7215  * If after releasing, reference count is zero, the BlockDriverState is
7216  * deleted. */
7217 void bdrv_unref(BlockDriverState *bs)
7218 {
7219     GLOBAL_STATE_CODE();
7220     if (!bs) {
7221         return;
7222     }
7223     assert(bs->refcnt > 0);
7224     if (--bs->refcnt == 0) {
7225         bdrv_delete(bs);
7226     }
7227 }
7228 
7229 /*
7230  * Release a BlockDriverState reference while holding the graph write lock.
7231  *
7232  * Calling bdrv_unref() directly is forbidden while holding the graph lock
7233  * because bdrv_close() both involves polling and taking the graph lock
7234  * internally. bdrv_schedule_unref() instead delays decreasing the refcount and
7235  * possibly closing @bs until the graph lock is released.
7236  */
7237 void bdrv_schedule_unref(BlockDriverState *bs)
7238 {
7239     if (!bs) {
7240         return;
7241     }
7242     aio_bh_schedule_oneshot(qemu_get_aio_context(),
7243                             (QEMUBHFunc *) bdrv_unref, bs);
7244 }
7245 
7246 struct BdrvOpBlocker {
7247     Error *reason;
7248     QLIST_ENTRY(BdrvOpBlocker) list;
7249 };
7250 
7251 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
7252 {
7253     BdrvOpBlocker *blocker;
7254     GLOBAL_STATE_CODE();
7255 
7256     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7257     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
7258         blocker = QLIST_FIRST(&bs->op_blockers[op]);
7259         error_propagate_prepend(errp, error_copy(blocker->reason),
7260                                 "Node '%s' is busy: ",
7261                                 bdrv_get_device_or_node_name(bs));
7262         return true;
7263     }
7264     return false;
7265 }
7266 
7267 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
7268 {
7269     BdrvOpBlocker *blocker;
7270     GLOBAL_STATE_CODE();
7271     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7272 
7273     blocker = g_new0(BdrvOpBlocker, 1);
7274     blocker->reason = reason;
7275     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
7276 }
7277 
7278 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
7279 {
7280     BdrvOpBlocker *blocker, *next;
7281     GLOBAL_STATE_CODE();
7282     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7283     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
7284         if (blocker->reason == reason) {
7285             QLIST_REMOVE(blocker, list);
7286             g_free(blocker);
7287         }
7288     }
7289 }
7290 
7291 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
7292 {
7293     int i;
7294     GLOBAL_STATE_CODE();
7295     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7296         bdrv_op_block(bs, i, reason);
7297     }
7298 }
7299 
7300 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
7301 {
7302     int i;
7303     GLOBAL_STATE_CODE();
7304     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7305         bdrv_op_unblock(bs, i, reason);
7306     }
7307 }
7308 
7309 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
7310 {
7311     int i;
7312     GLOBAL_STATE_CODE();
7313     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7314         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
7315             return false;
7316         }
7317     }
7318     return true;
7319 }
7320 
7321 /*
7322  * Must not be called while holding the lock of an AioContext other than the
7323  * current one.
7324  */
7325 void bdrv_img_create(const char *filename, const char *fmt,
7326                      const char *base_filename, const char *base_fmt,
7327                      char *options, uint64_t img_size, int flags, bool quiet,
7328                      Error **errp)
7329 {
7330     QemuOptsList *create_opts = NULL;
7331     QemuOpts *opts = NULL;
7332     const char *backing_fmt, *backing_file;
7333     int64_t size;
7334     BlockDriver *drv, *proto_drv;
7335     Error *local_err = NULL;
7336     int ret = 0;
7337 
7338     GLOBAL_STATE_CODE();
7339 
7340     /* Find driver and parse its options */
7341     drv = bdrv_find_format(fmt);
7342     if (!drv) {
7343         error_setg(errp, "Unknown file format '%s'", fmt);
7344         return;
7345     }
7346 
7347     proto_drv = bdrv_find_protocol(filename, true, errp);
7348     if (!proto_drv) {
7349         return;
7350     }
7351 
7352     if (!drv->create_opts) {
7353         error_setg(errp, "Format driver '%s' does not support image creation",
7354                    drv->format_name);
7355         return;
7356     }
7357 
7358     if (!proto_drv->create_opts) {
7359         error_setg(errp, "Protocol driver '%s' does not support image creation",
7360                    proto_drv->format_name);
7361         return;
7362     }
7363 
7364     aio_context_acquire(qemu_get_aio_context());
7365 
7366     /* Create parameter list */
7367     create_opts = qemu_opts_append(create_opts, drv->create_opts);
7368     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
7369 
7370     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
7371 
7372     /* Parse -o options */
7373     if (options) {
7374         if (!qemu_opts_do_parse(opts, options, NULL, errp)) {
7375             goto out;
7376         }
7377     }
7378 
7379     if (!qemu_opt_get(opts, BLOCK_OPT_SIZE)) {
7380         qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
7381     } else if (img_size != UINT64_C(-1)) {
7382         error_setg(errp, "The image size must be specified only once");
7383         goto out;
7384     }
7385 
7386     if (base_filename) {
7387         if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename,
7388                           NULL)) {
7389             error_setg(errp, "Backing file not supported for file format '%s'",
7390                        fmt);
7391             goto out;
7392         }
7393     }
7394 
7395     if (base_fmt) {
7396         if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, NULL)) {
7397             error_setg(errp, "Backing file format not supported for file "
7398                              "format '%s'", fmt);
7399             goto out;
7400         }
7401     }
7402 
7403     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
7404     if (backing_file) {
7405         if (!strcmp(filename, backing_file)) {
7406             error_setg(errp, "Error: Trying to create an image with the "
7407                              "same filename as the backing file");
7408             goto out;
7409         }
7410         if (backing_file[0] == '\0') {
7411             error_setg(errp, "Expected backing file name, got empty string");
7412             goto out;
7413         }
7414     }
7415 
7416     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
7417 
7418     /* The size for the image must always be specified, unless we have a backing
7419      * file and we have not been forbidden from opening it. */
7420     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, img_size);
7421     if (backing_file && !(flags & BDRV_O_NO_BACKING)) {
7422         BlockDriverState *bs;
7423         char *full_backing;
7424         int back_flags;
7425         QDict *backing_options = NULL;
7426 
7427         full_backing =
7428             bdrv_get_full_backing_filename_from_filename(filename, backing_file,
7429                                                          &local_err);
7430         if (local_err) {
7431             goto out;
7432         }
7433         assert(full_backing);
7434 
7435         /*
7436          * No need to do I/O here, which allows us to open encrypted
7437          * backing images without needing the secret
7438          */
7439         back_flags = flags;
7440         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
7441         back_flags |= BDRV_O_NO_IO;
7442 
7443         backing_options = qdict_new();
7444         if (backing_fmt) {
7445             qdict_put_str(backing_options, "driver", backing_fmt);
7446         }
7447         qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
7448 
7449         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
7450                        &local_err);
7451         g_free(full_backing);
7452         if (!bs) {
7453             error_append_hint(&local_err, "Could not open backing image.\n");
7454             goto out;
7455         } else {
7456             if (!backing_fmt) {
7457                 error_setg(&local_err,
7458                            "Backing file specified without backing format");
7459                 error_append_hint(&local_err, "Detected format of %s.\n",
7460                                   bs->drv->format_name);
7461                 goto out;
7462             }
7463             if (size == -1) {
7464                 /* Opened BS, have no size */
7465                 size = bdrv_getlength(bs);
7466                 if (size < 0) {
7467                     error_setg_errno(errp, -size, "Could not get size of '%s'",
7468                                      backing_file);
7469                     bdrv_unref(bs);
7470                     goto out;
7471                 }
7472                 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
7473             }
7474             bdrv_unref(bs);
7475         }
7476         /* (backing_file && !(flags & BDRV_O_NO_BACKING)) */
7477     } else if (backing_file && !backing_fmt) {
7478         error_setg(&local_err,
7479                    "Backing file specified without backing format");
7480         goto out;
7481     }
7482 
7483     if (size == -1) {
7484         error_setg(errp, "Image creation needs a size parameter");
7485         goto out;
7486     }
7487 
7488     if (!quiet) {
7489         printf("Formatting '%s', fmt=%s ", filename, fmt);
7490         qemu_opts_print(opts, " ");
7491         puts("");
7492         fflush(stdout);
7493     }
7494 
7495     ret = bdrv_create(drv, filename, opts, &local_err);
7496 
7497     if (ret == -EFBIG) {
7498         /* This is generally a better message than whatever the driver would
7499          * deliver (especially because of the cluster_size_hint), since that
7500          * is most probably not much different from "image too large". */
7501         const char *cluster_size_hint = "";
7502         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
7503             cluster_size_hint = " (try using a larger cluster size)";
7504         }
7505         error_setg(errp, "The image size is too large for file format '%s'"
7506                    "%s", fmt, cluster_size_hint);
7507         error_free(local_err);
7508         local_err = NULL;
7509     }
7510 
7511 out:
7512     qemu_opts_del(opts);
7513     qemu_opts_free(create_opts);
7514     error_propagate(errp, local_err);
7515     aio_context_release(qemu_get_aio_context());
7516 }
7517 
7518 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
7519 {
7520     IO_CODE();
7521     return bs ? bs->aio_context : qemu_get_aio_context();
7522 }
7523 
7524 AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs)
7525 {
7526     Coroutine *self = qemu_coroutine_self();
7527     AioContext *old_ctx = qemu_coroutine_get_aio_context(self);
7528     AioContext *new_ctx;
7529     IO_CODE();
7530 
7531     /*
7532      * Increase bs->in_flight to ensure that this operation is completed before
7533      * moving the node to a different AioContext. Read new_ctx only afterwards.
7534      */
7535     bdrv_inc_in_flight(bs);
7536 
7537     new_ctx = bdrv_get_aio_context(bs);
7538     aio_co_reschedule_self(new_ctx);
7539     return old_ctx;
7540 }
7541 
/*
 * Counterpart of bdrv_co_enter(): reschedule the calling coroutine into
 * @old_ctx (the value bdrv_co_enter() returned) and then drop the in-flight
 * count on @bs that bdrv_co_enter() took.
 */
void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
{
    IO_CODE();
    aio_co_reschedule_self(old_ctx);
    bdrv_dec_in_flight(bs);
}
7548 
7549 void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
7550 {
7551     AioContext *ctx = bdrv_get_aio_context(bs);
7552 
7553     /* In the main thread, bs->aio_context won't change concurrently */
7554     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
7555 
7556     /*
7557      * We're in coroutine context, so we already hold the lock of the main
7558      * loop AioContext. Don't lock it twice to avoid deadlocks.
7559      */
7560     assert(qemu_in_coroutine());
7561     if (ctx != qemu_get_aio_context()) {
7562         aio_context_acquire(ctx);
7563     }
7564 }
7565 
7566 void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
7567 {
7568     AioContext *ctx = bdrv_get_aio_context(bs);
7569 
7570     assert(qemu_in_coroutine());
7571     if (ctx != qemu_get_aio_context()) {
7572         aio_context_release(ctx);
7573     }
7574 }
7575 
7576 static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
7577 {
7578     GLOBAL_STATE_CODE();
7579     QLIST_REMOVE(ban, list);
7580     g_free(ban);
7581 }
7582 
7583 static void bdrv_detach_aio_context(BlockDriverState *bs)
7584 {
7585     BdrvAioNotifier *baf, *baf_tmp;
7586 
7587     assert(!bs->walking_aio_notifiers);
7588     GLOBAL_STATE_CODE();
7589     bs->walking_aio_notifiers = true;
7590     QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
7591         if (baf->deleted) {
7592             bdrv_do_remove_aio_context_notifier(baf);
7593         } else {
7594             baf->detach_aio_context(baf->opaque);
7595         }
7596     }
7597     /* Never mind iterating again to check for ->deleted.  bdrv_close() will
7598      * remove remaining aio notifiers if we aren't called again.
7599      */
7600     bs->walking_aio_notifiers = false;
7601 
7602     if (bs->drv && bs->drv->bdrv_detach_aio_context) {
7603         bs->drv->bdrv_detach_aio_context(bs);
7604     }
7605 
7606     bs->aio_context = NULL;
7607 }
7608 
7609 static void bdrv_attach_aio_context(BlockDriverState *bs,
7610                                     AioContext *new_context)
7611 {
7612     BdrvAioNotifier *ban, *ban_tmp;
7613     GLOBAL_STATE_CODE();
7614 
7615     bs->aio_context = new_context;
7616 
7617     if (bs->drv && bs->drv->bdrv_attach_aio_context) {
7618         bs->drv->bdrv_attach_aio_context(bs, new_context);
7619     }
7620 
7621     assert(!bs->walking_aio_notifiers);
7622     bs->walking_aio_notifiers = true;
7623     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_tmp) {
7624         if (ban->deleted) {
7625             bdrv_do_remove_aio_context_notifier(ban);
7626         } else {
7627             ban->attached_aio_context(new_context, ban->opaque);
7628         }
7629     }
7630     bs->walking_aio_notifiers = false;
7631 }
7632 
/*
 * State of one pending AioContext change for a single node.  It is allocated
 * in bdrv_change_aio_context() and passed as the opaque pointer to the
 * set_aio_context transaction callbacks.
 */
typedef struct BdrvStateSetAioContext {
    AioContext *new_ctx;    /* context @bs is attached to on commit */
    BlockDriverState *bs;   /* node whose AioContext is being changed */
} BdrvStateSetAioContext;
7637 
7638 static bool bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx,
7639                                            GHashTable *visited,
7640                                            Transaction *tran,
7641                                            Error **errp)
7642 {
7643     GLOBAL_STATE_CODE();
7644     if (g_hash_table_contains(visited, c)) {
7645         return true;
7646     }
7647     g_hash_table_add(visited, c);
7648 
7649     /*
7650      * A BdrvChildClass that doesn't handle AioContext changes cannot
7651      * tolerate any AioContext changes
7652      */
7653     if (!c->klass->change_aio_ctx) {
7654         char *user = bdrv_child_user_desc(c);
7655         error_setg(errp, "Changing iothreads is not supported by %s", user);
7656         g_free(user);
7657         return false;
7658     }
7659     if (!c->klass->change_aio_ctx(c, ctx, visited, tran, errp)) {
7660         assert(!errp || *errp);
7661         return false;
7662     }
7663     return true;
7664 }
7665 
7666 bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx,
7667                                    GHashTable *visited, Transaction *tran,
7668                                    Error **errp)
7669 {
7670     GLOBAL_STATE_CODE();
7671     if (g_hash_table_contains(visited, c)) {
7672         return true;
7673     }
7674     g_hash_table_add(visited, c);
7675     return bdrv_change_aio_context(c->bs, ctx, visited, tran, errp);
7676 }
7677 
7678 static void bdrv_set_aio_context_clean(void *opaque)
7679 {
7680     BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
7681     BlockDriverState *bs = (BlockDriverState *) state->bs;
7682 
7683     /* Paired with bdrv_drained_begin in bdrv_change_aio_context() */
7684     bdrv_drained_end(bs);
7685 
7686     g_free(state);
7687 }
7688 
7689 static void bdrv_set_aio_context_commit(void *opaque)
7690 {
7691     BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
7692     BlockDriverState *bs = (BlockDriverState *) state->bs;
7693     AioContext *new_context = state->new_ctx;
7694     AioContext *old_context = bdrv_get_aio_context(bs);
7695 
7696     /*
7697      * Take the old AioContex when detaching it from bs.
7698      * At this point, new_context lock is already acquired, and we are now
7699      * also taking old_context. This is safe as long as bdrv_detach_aio_context
7700      * does not call AIO_POLL_WHILE().
7701      */
7702     if (old_context != qemu_get_aio_context()) {
7703         aio_context_acquire(old_context);
7704     }
7705     bdrv_detach_aio_context(bs);
7706     if (old_context != qemu_get_aio_context()) {
7707         aio_context_release(old_context);
7708     }
7709     bdrv_attach_aio_context(bs, new_context);
7710 }
7711 
7712 static TransactionActionDrv set_aio_context = {
7713     .commit = bdrv_set_aio_context_commit,
7714     .clean = bdrv_set_aio_context_clean,
7715 };
7716 
7717 /*
7718  * Changes the AioContext used for fd handlers, timers, and BHs by this
7719  * BlockDriverState and all its children and parents.
7720  *
7721  * Must be called from the main AioContext.
7722  *
7723  * The caller must own the AioContext lock for the old AioContext of bs, but it
7724  * must not own the AioContext lock for new_context (unless new_context is the
7725  * same as the current context of bs).
7726  *
7727  * @visited will accumulate all visited BdrvChild objects. The caller is
7728  * responsible for freeing the list afterwards.
7729  */
7730 static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
7731                                     GHashTable *visited, Transaction *tran,
7732                                     Error **errp)
7733 {
7734     BdrvChild *c;
7735     BdrvStateSetAioContext *state;
7736 
7737     GLOBAL_STATE_CODE();
7738 
7739     if (bdrv_get_aio_context(bs) == ctx) {
7740         return true;
7741     }
7742 
7743     bdrv_graph_rdlock_main_loop();
7744     QLIST_FOREACH(c, &bs->parents, next_parent) {
7745         if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) {
7746             bdrv_graph_rdunlock_main_loop();
7747             return false;
7748         }
7749     }
7750 
7751     QLIST_FOREACH(c, &bs->children, next) {
7752         if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) {
7753             bdrv_graph_rdunlock_main_loop();
7754             return false;
7755         }
7756     }
7757     bdrv_graph_rdunlock_main_loop();
7758 
7759     state = g_new(BdrvStateSetAioContext, 1);
7760     *state = (BdrvStateSetAioContext) {
7761         .new_ctx = ctx,
7762         .bs = bs,
7763     };
7764 
7765     /* Paired with bdrv_drained_end in bdrv_set_aio_context_clean() */
7766     bdrv_drained_begin(bs);
7767 
7768     tran_add(tran, &set_aio_context, state);
7769 
7770     return true;
7771 }
7772 
7773 /*
7774  * Change bs's and recursively all of its parents' and children's AioContext
7775  * to the given new context, returning an error if that isn't possible.
7776  *
7777  * If ignore_child is not NULL, that child (and its subgraph) will not
7778  * be touched.
7779  *
7780  * This function still requires the caller to take the bs current
7781  * AioContext lock, otherwise draining will fail since AIO_WAIT_WHILE
7782  * assumes the lock is always held if bs is in another AioContext.
7783  * For the same reason, it temporarily also holds the new AioContext, since
7784  * bdrv_drained_end calls BDRV_POLL_WHILE that assumes the lock is taken too.
7785  * Therefore the new AioContext lock must not be taken by the caller.
7786  */
7787 int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
7788                                 BdrvChild *ignore_child, Error **errp)
7789 {
7790     Transaction *tran;
7791     GHashTable *visited;
7792     int ret;
7793     AioContext *old_context = bdrv_get_aio_context(bs);
7794     GLOBAL_STATE_CODE();
7795 
7796     /*
7797      * Recursion phase: go through all nodes of the graph.
7798      * Take care of checking that all nodes support changing AioContext
7799      * and drain them, building a linear list of callbacks to run if everything
7800      * is successful (the transaction itself).
7801      */
7802     tran = tran_new();
7803     visited = g_hash_table_new(NULL, NULL);
7804     if (ignore_child) {
7805         g_hash_table_add(visited, ignore_child);
7806     }
7807     ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp);
7808     g_hash_table_destroy(visited);
7809 
7810     /*
7811      * Linear phase: go through all callbacks collected in the transaction.
7812      * Run all callbacks collected in the recursion to switch all nodes
7813      * AioContext lock (transaction commit), or undo all changes done in the
7814      * recursion (transaction abort).
7815      */
7816 
7817     if (!ret) {
7818         /* Just run clean() callbacks. No AioContext changed. */
7819         tran_abort(tran);
7820         return -EPERM;
7821     }
7822 
7823     /*
7824      * Release old AioContext, it won't be needed anymore, as all
7825      * bdrv_drained_begin() have been called already.
7826      */
7827     if (qemu_get_aio_context() != old_context) {
7828         aio_context_release(old_context);
7829     }
7830 
7831     /*
7832      * Acquire new AioContext since bdrv_drained_end() is going to be called
7833      * after we switched all nodes in the new AioContext, and the function
7834      * assumes that the lock of the bs is always taken.
7835      */
7836     if (qemu_get_aio_context() != ctx) {
7837         aio_context_acquire(ctx);
7838     }
7839 
7840     tran_commit(tran);
7841 
7842     if (qemu_get_aio_context() != ctx) {
7843         aio_context_release(ctx);
7844     }
7845 
7846     /* Re-acquire the old AioContext, since the caller takes and releases it. */
7847     if (qemu_get_aio_context() != old_context) {
7848         aio_context_acquire(old_context);
7849     }
7850 
7851     return 0;
7852 }
7853 
7854 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
7855         void (*attached_aio_context)(AioContext *new_context, void *opaque),
7856         void (*detach_aio_context)(void *opaque), void *opaque)
7857 {
7858     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
7859     *ban = (BdrvAioNotifier){
7860         .attached_aio_context = attached_aio_context,
7861         .detach_aio_context   = detach_aio_context,
7862         .opaque               = opaque
7863     };
7864     GLOBAL_STATE_CODE();
7865 
7866     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
7867 }
7868 
7869 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
7870                                       void (*attached_aio_context)(AioContext *,
7871                                                                    void *),
7872                                       void (*detach_aio_context)(void *),
7873                                       void *opaque)
7874 {
7875     BdrvAioNotifier *ban, *ban_next;
7876     GLOBAL_STATE_CODE();
7877 
7878     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
7879         if (ban->attached_aio_context == attached_aio_context &&
7880             ban->detach_aio_context   == detach_aio_context   &&
7881             ban->opaque               == opaque               &&
7882             ban->deleted              == false)
7883         {
7884             if (bs->walking_aio_notifiers) {
7885                 ban->deleted = true;
7886             } else {
7887                 bdrv_do_remove_aio_context_notifier(ban);
7888             }
7889             return;
7890         }
7891     }
7892 
7893     abort();
7894 }
7895 
7896 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
7897                        BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
7898                        bool force,
7899                        Error **errp)
7900 {
7901     GLOBAL_STATE_CODE();
7902     if (!bs->drv) {
7903         error_setg(errp, "Node is ejected");
7904         return -ENOMEDIUM;
7905     }
7906     if (!bs->drv->bdrv_amend_options) {
7907         error_setg(errp, "Block driver '%s' does not support option amendment",
7908                    bs->drv->format_name);
7909         return -ENOTSUP;
7910     }
7911     return bs->drv->bdrv_amend_options(bs, opts, status_cb,
7912                                        cb_opaque, force, errp);
7913 }
7914 
7915 /*
7916  * This function checks whether the given @to_replace is allowed to be
7917  * replaced by a node that always shows the same data as @bs.  This is
7918  * used for example to verify whether the mirror job can replace
7919  * @to_replace by the target mirrored from @bs.
7920  * To be replaceable, @bs and @to_replace may either be guaranteed to
7921  * always show the same data (because they are only connected through
7922  * filters), or some driver may allow replacing one of its children
7923  * because it can guarantee that this child's data is not visible at
7924  * all (for example, for dissenting quorum children that have no other
7925  * parents).
7926  */
7927 bool bdrv_recurse_can_replace(BlockDriverState *bs,
7928                               BlockDriverState *to_replace)
7929 {
7930     BlockDriverState *filtered;
7931 
7932     GLOBAL_STATE_CODE();
7933 
7934     if (!bs || !bs->drv) {
7935         return false;
7936     }
7937 
7938     if (bs == to_replace) {
7939         return true;
7940     }
7941 
7942     /* See what the driver can do */
7943     if (bs->drv->bdrv_recurse_can_replace) {
7944         return bs->drv->bdrv_recurse_can_replace(bs, to_replace);
7945     }
7946 
7947     /* For filters without an own implementation, we can recurse on our own */
7948     filtered = bdrv_filter_bs(bs);
7949     if (filtered) {
7950         return bdrv_recurse_can_replace(filtered, to_replace);
7951     }
7952 
7953     /* Safe default */
7954     return false;
7955 }
7956 
7957 /*
7958  * Check whether the given @node_name can be replaced by a node that
7959  * has the same data as @parent_bs.  If so, return @node_name's BDS;
7960  * NULL otherwise.
7961  *
7962  * @node_name must be a (recursive) *child of @parent_bs (or this
7963  * function will return NULL).
7964  *
7965  * The result (whether the node can be replaced or not) is only valid
7966  * for as long as no graph or permission changes occur.
7967  */
7968 BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
7969                                         const char *node_name, Error **errp)
7970 {
7971     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
7972     AioContext *aio_context;
7973 
7974     GLOBAL_STATE_CODE();
7975 
7976     if (!to_replace_bs) {
7977         error_setg(errp, "Failed to find node with node-name='%s'", node_name);
7978         return NULL;
7979     }
7980 
7981     aio_context = bdrv_get_aio_context(to_replace_bs);
7982     aio_context_acquire(aio_context);
7983 
7984     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
7985         to_replace_bs = NULL;
7986         goto out;
7987     }
7988 
7989     /* We don't want arbitrary node of the BDS chain to be replaced only the top
7990      * most non filter in order to prevent data corruption.
7991      * Another benefit is that this tests exclude backing files which are
7992      * blocked by the backing blockers.
7993      */
7994     if (!bdrv_recurse_can_replace(parent_bs, to_replace_bs)) {
7995         error_setg(errp, "Cannot replace '%s' by a node mirrored from '%s', "
7996                    "because it cannot be guaranteed that doing so would not "
7997                    "lead to an abrupt change of visible data",
7998                    node_name, parent_bs->node_name);
7999         to_replace_bs = NULL;
8000         goto out;
8001     }
8002 
8003 out:
8004     aio_context_release(aio_context);
8005     return to_replace_bs;
8006 }
8007 
8008 /**
8009  * Iterates through the list of runtime option keys that are said to
8010  * be "strong" for a BDS.  An option is called "strong" if it changes
8011  * a BDS's data.  For example, the null block driver's "size" and
8012  * "read-zeroes" options are strong, but its "latency-ns" option is
8013  * not.
8014  *
8015  * If a key returned by this function ends with a dot, all options
8016  * starting with that prefix are strong.
8017  */
8018 static const char *const *strong_options(BlockDriverState *bs,
8019                                          const char *const *curopt)
8020 {
8021     static const char *const global_options[] = {
8022         "driver", "filename", NULL
8023     };
8024 
8025     if (!curopt) {
8026         return &global_options[0];
8027     }
8028 
8029     curopt++;
8030     if (curopt == &global_options[ARRAY_SIZE(global_options) - 1] && bs->drv) {
8031         curopt = bs->drv->strong_runtime_opts;
8032     }
8033 
8034     return (curopt && *curopt) ? curopt : NULL;
8035 }
8036 
8037 /**
8038  * Copies all strong runtime options from bs->options to the given
8039  * QDict.  The set of strong option keys is determined by invoking
8040  * strong_options().
8041  *
8042  * Returns true iff any strong option was present in bs->options (and
8043  * thus copied to the target QDict) with the exception of "filename"
8044  * and "driver".  The caller is expected to use this value to decide
8045  * whether the existence of strong options prevents the generation of
8046  * a plain filename.
8047  */
8048 static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs)
8049 {
8050     bool found_any = false;
8051     const char *const *option_name = NULL;
8052 
8053     if (!bs->drv) {
8054         return false;
8055     }
8056 
8057     while ((option_name = strong_options(bs, option_name))) {
8058         bool option_given = false;
8059 
8060         assert(strlen(*option_name) > 0);
8061         if ((*option_name)[strlen(*option_name) - 1] != '.') {
8062             QObject *entry = qdict_get(bs->options, *option_name);
8063             if (!entry) {
8064                 continue;
8065             }
8066 
8067             qdict_put_obj(d, *option_name, qobject_ref(entry));
8068             option_given = true;
8069         } else {
8070             const QDictEntry *entry;
8071             for (entry = qdict_first(bs->options); entry;
8072                  entry = qdict_next(bs->options, entry))
8073             {
8074                 if (strstart(qdict_entry_key(entry), *option_name, NULL)) {
8075                     qdict_put_obj(d, qdict_entry_key(entry),
8076                                   qobject_ref(qdict_entry_value(entry)));
8077                     option_given = true;
8078                 }
8079             }
8080         }
8081 
8082         /* While "driver" and "filename" need to be included in a JSON filename,
8083          * their existence does not prohibit generation of a plain filename. */
8084         if (!found_any && option_given &&
8085             strcmp(*option_name, "driver") && strcmp(*option_name, "filename"))
8086         {
8087             found_any = true;
8088         }
8089     }
8090 
8091     if (!qdict_haskey(d, "driver")) {
8092         /* Drivers created with bdrv_new_open_driver() may not have a
8093          * @driver option.  Add it here. */
8094         qdict_put_str(d, "driver", bs->drv->format_name);
8095     }
8096 
8097     return found_any;
8098 }
8099 
8100 /* Note: This function may return false positives; it may return true
8101  * even if opening the backing file specified by bs's image header
8102  * would result in exactly bs->backing. */
8103 static bool bdrv_backing_overridden(BlockDriverState *bs)
8104 {
8105     GLOBAL_STATE_CODE();
8106     if (bs->backing) {
8107         return strcmp(bs->auto_backing_file,
8108                       bs->backing->bs->filename);
8109     } else {
8110         /* No backing BDS, so if the image header reports any backing
8111          * file, it must have been suppressed */
8112         return bs->auto_backing_file[0] != '\0';
8113     }
8114 }
8115 
8116 /* Updates the following BDS fields:
8117  *  - exact_filename: A filename which may be used for opening a block device
8118  *                    which (mostly) equals the given BDS (even without any
8119  *                    other options; so reading and writing must return the same
8120  *                    results, but caching etc. may be different)
8121  *  - full_open_options: Options which, when given when opening a block device
8122  *                       (without a filename), result in a BDS (mostly)
8123  *                       equalling the given one
8124  *  - filename: If exact_filename is set, it is copied here. Otherwise,
8125  *              full_open_options is converted to a JSON object, prefixed with
8126  *              "json:" (for use through the JSON pseudo protocol) and put here.
8127  */
8128 void bdrv_refresh_filename(BlockDriverState *bs)
8129 {
8130     BlockDriver *drv = bs->drv;
8131     BdrvChild *child;
8132     BlockDriverState *primary_child_bs;
8133     QDict *opts;
8134     bool backing_overridden;
8135     bool generate_json_filename; /* Whether our default implementation should
8136                                     fill exact_filename (false) or not (true) */
8137 
8138     GLOBAL_STATE_CODE();
8139 
8140     if (!drv) {
8141         return;
8142     }
8143 
8144     /* This BDS's file name may depend on any of its children's file names, so
8145      * refresh those first */
8146     QLIST_FOREACH(child, &bs->children, next) {
8147         bdrv_refresh_filename(child->bs);
8148     }
8149 
8150     if (bs->implicit) {
8151         /* For implicit nodes, just copy everything from the single child */
8152         child = QLIST_FIRST(&bs->children);
8153         assert(QLIST_NEXT(child, next) == NULL);
8154 
8155         pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
8156                 child->bs->exact_filename);
8157         pstrcpy(bs->filename, sizeof(bs->filename), child->bs->filename);
8158 
8159         qobject_unref(bs->full_open_options);
8160         bs->full_open_options = qobject_ref(child->bs->full_open_options);
8161 
8162         return;
8163     }
8164 
8165     backing_overridden = bdrv_backing_overridden(bs);
8166 
8167     if (bs->open_flags & BDRV_O_NO_IO) {
8168         /* Without I/O, the backing file does not change anything.
8169          * Therefore, in such a case (primarily qemu-img), we can
8170          * pretend the backing file has not been overridden even if
8171          * it technically has been. */
8172         backing_overridden = false;
8173     }
8174 
8175     /* Gather the options QDict */
8176     opts = qdict_new();
8177     generate_json_filename = append_strong_runtime_options(opts, bs);
8178     generate_json_filename |= backing_overridden;
8179 
8180     if (drv->bdrv_gather_child_options) {
8181         /* Some block drivers may not want to present all of their children's
8182          * options, or name them differently from BdrvChild.name */
8183         drv->bdrv_gather_child_options(bs, opts, backing_overridden);
8184     } else {
8185         QLIST_FOREACH(child, &bs->children, next) {
8186             if (child == bs->backing && !backing_overridden) {
8187                 /* We can skip the backing BDS if it has not been overridden */
8188                 continue;
8189             }
8190 
8191             qdict_put(opts, child->name,
8192                       qobject_ref(child->bs->full_open_options));
8193         }
8194 
8195         if (backing_overridden && !bs->backing) {
8196             /* Force no backing file */
8197             qdict_put_null(opts, "backing");
8198         }
8199     }
8200 
8201     qobject_unref(bs->full_open_options);
8202     bs->full_open_options = opts;
8203 
8204     primary_child_bs = bdrv_primary_bs(bs);
8205 
8206     if (drv->bdrv_refresh_filename) {
8207         /* Obsolete information is of no use here, so drop the old file name
8208          * information before refreshing it */
8209         bs->exact_filename[0] = '\0';
8210 
8211         drv->bdrv_refresh_filename(bs);
8212     } else if (primary_child_bs) {
8213         /*
8214          * Try to reconstruct valid information from the underlying
8215          * file -- this only works for format nodes (filter nodes
8216          * cannot be probed and as such must be selected by the user
8217          * either through an options dict, or through a special
8218          * filename which the filter driver must construct in its
8219          * .bdrv_refresh_filename() implementation).
8220          */
8221 
8222         bs->exact_filename[0] = '\0';
8223 
8224         /*
8225          * We can use the underlying file's filename if:
8226          * - it has a filename,
8227          * - the current BDS is not a filter,
8228          * - the file is a protocol BDS, and
8229          * - opening that file (as this BDS's format) will automatically create
8230          *   the BDS tree we have right now, that is:
8231          *   - the user did not significantly change this BDS's behavior with
8232          *     some explicit (strong) options
8233          *   - no non-file child of this BDS has been overridden by the user
8234          *   Both of these conditions are represented by generate_json_filename.
8235          */
8236         if (primary_child_bs->exact_filename[0] &&
8237             primary_child_bs->drv->bdrv_file_open &&
8238             !drv->is_filter && !generate_json_filename)
8239         {
8240             strcpy(bs->exact_filename, primary_child_bs->exact_filename);
8241         }
8242     }
8243 
8244     if (bs->exact_filename[0]) {
8245         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
8246     } else {
8247         GString *json = qobject_to_json(QOBJECT(bs->full_open_options));
8248         if (snprintf(bs->filename, sizeof(bs->filename), "json:%s",
8249                      json->str) >= sizeof(bs->filename)) {
8250             /* Give user a hint if we truncated things. */
8251             strcpy(bs->filename + sizeof(bs->filename) - 4, "...");
8252         }
8253         g_string_free(json, true);
8254     }
8255 }
8256 
8257 char *bdrv_dirname(BlockDriverState *bs, Error **errp)
8258 {
8259     BlockDriver *drv = bs->drv;
8260     BlockDriverState *child_bs;
8261 
8262     GLOBAL_STATE_CODE();
8263 
8264     if (!drv) {
8265         error_setg(errp, "Node '%s' is ejected", bs->node_name);
8266         return NULL;
8267     }
8268 
8269     if (drv->bdrv_dirname) {
8270         return drv->bdrv_dirname(bs, errp);
8271     }
8272 
8273     child_bs = bdrv_primary_bs(bs);
8274     if (child_bs) {
8275         return bdrv_dirname(child_bs, errp);
8276     }
8277 
8278     bdrv_refresh_filename(bs);
8279     if (bs->exact_filename[0] != '\0') {
8280         return path_combine(bs->exact_filename, "");
8281     }
8282 
8283     error_setg(errp, "Cannot generate a base directory for %s nodes",
8284                drv->format_name);
8285     return NULL;
8286 }
8287 
8288 /*
8289  * Hot add/remove a BDS's child. So the user can take a child offline when
8290  * it is broken and take a new child online
8291  */
8292 void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
8293                     Error **errp)
8294 {
8295     GLOBAL_STATE_CODE();
8296     if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
8297         error_setg(errp, "The node %s does not support adding a child",
8298                    bdrv_get_device_or_node_name(parent_bs));
8299         return;
8300     }
8301 
8302     /*
8303      * Non-zoned block drivers do not follow zoned storage constraints
8304      * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
8305      * drivers in a graph.
8306      */
8307     if (!parent_bs->drv->supports_zoned_children &&
8308         child_bs->bl.zoned == BLK_Z_HM) {
8309         /*
8310          * The host-aware model allows zoned storage constraints and random
8311          * write. Allow mixing host-aware and non-zoned drivers. Using
8312          * host-aware device as a regular device.
8313          */
8314         error_setg(errp, "Cannot add a %s child to a %s parent",
8315                    child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
8316                    parent_bs->drv->supports_zoned_children ?
8317                    "support zoned children" : "not support zoned children");
8318         return;
8319     }
8320 
8321     if (!QLIST_EMPTY(&child_bs->parents)) {
8322         error_setg(errp, "The node %s already has a parent",
8323                    child_bs->node_name);
8324         return;
8325     }
8326 
8327     parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
8328 }
8329 
8330 void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
8331 {
8332     BdrvChild *tmp;
8333 
8334     GLOBAL_STATE_CODE();
8335     if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
8336         error_setg(errp, "The node %s does not support removing a child",
8337                    bdrv_get_device_or_node_name(parent_bs));
8338         return;
8339     }
8340 
8341     QLIST_FOREACH(tmp, &parent_bs->children, next) {
8342         if (tmp == child) {
8343             break;
8344         }
8345     }
8346 
8347     if (!tmp) {
8348         error_setg(errp, "The node %s does not have a child named %s",
8349                    bdrv_get_device_or_node_name(parent_bs),
8350                    bdrv_get_device_or_node_name(child->bs));
8351         return;
8352     }
8353 
8354     parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
8355 }
8356 
8357 int bdrv_make_empty(BdrvChild *c, Error **errp)
8358 {
8359     BlockDriver *drv = c->bs->drv;
8360     int ret;
8361 
8362     GLOBAL_STATE_CODE();
8363     assert(c->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED));
8364 
8365     if (!drv->bdrv_make_empty) {
8366         error_setg(errp, "%s does not support emptying nodes",
8367                    drv->format_name);
8368         return -ENOTSUP;
8369     }
8370 
8371     ret = drv->bdrv_make_empty(c->bs);
8372     if (ret < 0) {
8373         error_setg_errno(errp, -ret, "Failed to empty %s",
8374                          c->bs->filename);
8375         return ret;
8376     }
8377 
8378     return 0;
8379 }
8380 
8381 /*
8382  * Return the child that @bs acts as an overlay for, and from which data may be
8383  * copied in COW or COR operations.  Usually this is the backing file.
8384  */
8385 BdrvChild *bdrv_cow_child(BlockDriverState *bs)
8386 {
8387     IO_CODE();
8388 
8389     if (!bs || !bs->drv) {
8390         return NULL;
8391     }
8392 
8393     if (bs->drv->is_filter) {
8394         return NULL;
8395     }
8396 
8397     if (!bs->backing) {
8398         return NULL;
8399     }
8400 
8401     assert(bs->backing->role & BDRV_CHILD_COW);
8402     return bs->backing;
8403 }
8404 
8405 /*
8406  * If @bs acts as a filter for exactly one of its children, return
8407  * that child.
8408  */
8409 BdrvChild *bdrv_filter_child(BlockDriverState *bs)
8410 {
8411     BdrvChild *c;
8412     IO_CODE();
8413 
8414     if (!bs || !bs->drv) {
8415         return NULL;
8416     }
8417 
8418     if (!bs->drv->is_filter) {
8419         return NULL;
8420     }
8421 
8422     /* Only one of @backing or @file may be used */
8423     assert(!(bs->backing && bs->file));
8424 
8425     c = bs->backing ?: bs->file;
8426     if (!c) {
8427         return NULL;
8428     }
8429 
8430     assert(c->role & BDRV_CHILD_FILTERED);
8431     return c;
8432 }
8433 
8434 /*
8435  * Return either the result of bdrv_cow_child() or bdrv_filter_child(),
8436  * whichever is non-NULL.
8437  *
8438  * Return NULL if both are NULL.
8439  */
8440 BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
8441 {
8442     BdrvChild *cow_child = bdrv_cow_child(bs);
8443     BdrvChild *filter_child = bdrv_filter_child(bs);
8444     IO_CODE();
8445 
8446     /* Filter nodes cannot have COW backing files */
8447     assert(!(cow_child && filter_child));
8448 
8449     return cow_child ?: filter_child;
8450 }
8451 
8452 /*
8453  * Return the primary child of this node: For filters, that is the
8454  * filtered child.  For other nodes, that is usually the child storing
8455  * metadata.
8456  * (A generally more helpful description is that this is (usually) the
8457  * child that has the same filename as @bs.)
8458  *
8459  * Drivers do not necessarily have a primary child; for example quorum
8460  * does not.
8461  */
8462 BdrvChild *bdrv_primary_child(BlockDriverState *bs)
8463 {
8464     BdrvChild *c, *found = NULL;
8465     IO_CODE();
8466 
8467     QLIST_FOREACH(c, &bs->children, next) {
8468         if (c->role & BDRV_CHILD_PRIMARY) {
8469             assert(!found);
8470             found = c;
8471         }
8472     }
8473 
8474     return found;
8475 }
8476 
8477 static BlockDriverState *bdrv_do_skip_filters(BlockDriverState *bs,
8478                                               bool stop_on_explicit_filter)
8479 {
8480     BdrvChild *c;
8481 
8482     if (!bs) {
8483         return NULL;
8484     }
8485 
8486     while (!(stop_on_explicit_filter && !bs->implicit)) {
8487         c = bdrv_filter_child(bs);
8488         if (!c) {
8489             /*
8490              * A filter that is embedded in a working block graph must
8491              * have a child.  Assert this here so this function does
8492              * not return a filter node that is not expected by the
8493              * caller.
8494              */
8495             assert(!bs->drv || !bs->drv->is_filter);
8496             break;
8497         }
8498         bs = c->bs;
8499     }
8500     /*
8501      * Note that this treats nodes with bs->drv == NULL as not being
8502      * filters (bs->drv == NULL should be replaced by something else
8503      * anyway).
8504      * The advantage of this behavior is that this function will thus
8505      * always return a non-NULL value (given a non-NULL @bs).
8506      */
8507 
8508     return bs;
8509 }
8510 
8511 /*
8512  * Return the first BDS that has not been added implicitly or that
8513  * does not have a filtered child down the chain starting from @bs
8514  * (including @bs itself).
8515  */
8516 BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
8517 {
8518     GLOBAL_STATE_CODE();
8519     return bdrv_do_skip_filters(bs, true);
8520 }
8521 
8522 /*
8523  * Return the first BDS that does not have a filtered child down the
8524  * chain starting from @bs (including @bs itself).
8525  */
8526 BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
8527 {
8528     IO_CODE();
8529     return bdrv_do_skip_filters(bs, false);
8530 }
8531 
8532 /*
8533  * For a backing chain, return the first non-filter backing image of
8534  * the first non-filter image.
8535  */
8536 BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
8537 {
8538     IO_CODE();
8539     return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
8540 }
8541 
8542 /**
8543  * Check whether [offset, offset + bytes) overlaps with the cached
8544  * block-status data region.
8545  *
8546  * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
8547  * which is what bdrv_bsc_is_data()'s interface needs.
8548  * Otherwise, *pnum is not touched.
8549  */
8550 static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
8551                                            int64_t offset, int64_t bytes,
8552                                            int64_t *pnum)
8553 {
8554     BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
8555     bool overlaps;
8556 
8557     overlaps =
8558         qatomic_read(&bsc->valid) &&
8559         ranges_overlap(offset, bytes, bsc->data_start,
8560                        bsc->data_end - bsc->data_start);
8561 
8562     if (overlaps && pnum) {
8563         *pnum = bsc->data_end - offset;
8564     }
8565 
8566     return overlaps;
8567 }
8568 
8569 /**
8570  * See block_int.h for this function's documentation.
8571  */
8572 bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
8573 {
8574     IO_CODE();
8575     RCU_READ_LOCK_GUARD();
8576     return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
8577 }
8578 
8579 /**
8580  * See block_int.h for this function's documentation.
8581  */
8582 void bdrv_bsc_invalidate_range(BlockDriverState *bs,
8583                                int64_t offset, int64_t bytes)
8584 {
8585     IO_CODE();
8586     RCU_READ_LOCK_GUARD();
8587 
8588     if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
8589         qatomic_set(&bs->block_status_cache->valid, false);
8590     }
8591 }
8592 
8593 /**
8594  * See block_int.h for this function's documentation.
8595  */
8596 void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
8597 {
8598     BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
8599     BdrvBlockStatusCache *old_bsc;
8600     IO_CODE();
8601 
8602     *new_bsc = (BdrvBlockStatusCache) {
8603         .valid = true,
8604         .data_start = offset,
8605         .data_end = offset + bytes,
8606     };
8607 
8608     QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
8609 
8610     old_bsc = qatomic_rcu_read(&bs->block_status_cache);
8611     qatomic_rcu_set(&bs->block_status_cache, new_bsc);
8612     if (old_bsc) {
8613         g_free_rcu(old_bsc, rcu);
8614     }
8615 }
8616