xref: /openbmc/qemu/block.c (revision c7c2ff0c)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/sysemu.h"
32 #include "qemu/notify.h"
33 #include "block/coroutine.h"
34 #include "block/qapi.h"
35 #include "qmp-commands.h"
36 #include "qemu/timer.h"
37 #include "qapi-event.h"
38 
39 #ifdef CONFIG_BSD
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/queue.h>
44 #ifndef __DragonFly__
45 #include <sys/disk.h>
46 #endif
47 #endif
48 
49 #ifdef _WIN32
50 #include <windows.h>
51 #endif
52 
53 struct BdrvDirtyBitmap {
54     HBitmap *bitmap;
55     QLIST_ENTRY(BdrvDirtyBitmap) list;
56 };
57 
58 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59 
60 #define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */
61 
62 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
63 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
64         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65         BlockDriverCompletionFunc *cb, void *opaque);
66 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
67         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
68         BlockDriverCompletionFunc *cb, void *opaque);
69 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
70                                          int64_t sector_num, int nb_sectors,
71                                          QEMUIOVector *iov);
72 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
73                                          int64_t sector_num, int nb_sectors,
74                                          QEMUIOVector *iov);
75 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
76     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
77     BdrvRequestFlags flags);
78 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
79     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
80     BdrvRequestFlags flags);
81 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
82                                                int64_t sector_num,
83                                                QEMUIOVector *qiov,
84                                                int nb_sectors,
85                                                BdrvRequestFlags flags,
86                                                BlockDriverCompletionFunc *cb,
87                                                void *opaque,
88                                                bool is_write);
89 static void coroutine_fn bdrv_co_do_rw(void *opaque);
90 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
91     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
92 
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94     QTAILQ_HEAD_INITIALIZER(bdrv_states);
95 
96 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
97     QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
98 
99 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
100     QLIST_HEAD_INITIALIZER(bdrv_drivers);
101 
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
104 
105 #ifdef _WIN32
106 static int is_windows_drive_prefix(const char *filename)
107 {
108     return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109              (filename[0] >= 'A' && filename[0] <= 'Z')) &&
110             filename[1] == ':');
111 }
112 
113 int is_windows_drive(const char *filename)
114 {
115     if (is_windows_drive_prefix(filename) &&
116         filename[2] == '\0')
117         return 1;
118     if (strstart(filename, "\\\\.\\", NULL) ||
119         strstart(filename, "//./", NULL))
120         return 1;
121     return 0;
122 }
123 #endif
124 
125 /* throttling disk I/O limits */
126 void bdrv_set_io_limits(BlockDriverState *bs,
127                         ThrottleConfig *cfg)
128 {
129     int i;
130 
131     throttle_config(&bs->throttle_state, cfg);
132 
133     for (i = 0; i < 2; i++) {
134         qemu_co_enter_next(&bs->throttled_reqs[i]);
135     }
136 }
137 
138 /* this function drain all the throttled IOs */
139 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
140 {
141     bool drained = false;
142     bool enabled = bs->io_limits_enabled;
143     int i;
144 
145     bs->io_limits_enabled = false;
146 
147     for (i = 0; i < 2; i++) {
148         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
149             drained = true;
150         }
151     }
152 
153     bs->io_limits_enabled = enabled;
154 
155     return drained;
156 }
157 
158 void bdrv_io_limits_disable(BlockDriverState *bs)
159 {
160     bs->io_limits_enabled = false;
161 
162     bdrv_start_throttled_reqs(bs);
163 
164     throttle_destroy(&bs->throttle_state);
165 }
166 
167 static void bdrv_throttle_read_timer_cb(void *opaque)
168 {
169     BlockDriverState *bs = opaque;
170     qemu_co_enter_next(&bs->throttled_reqs[0]);
171 }
172 
173 static void bdrv_throttle_write_timer_cb(void *opaque)
174 {
175     BlockDriverState *bs = opaque;
176     qemu_co_enter_next(&bs->throttled_reqs[1]);
177 }
178 
179 /* should be called before bdrv_set_io_limits if a limit is set */
180 void bdrv_io_limits_enable(BlockDriverState *bs)
181 {
182     assert(!bs->io_limits_enabled);
183     throttle_init(&bs->throttle_state,
184                   bdrv_get_aio_context(bs),
185                   QEMU_CLOCK_VIRTUAL,
186                   bdrv_throttle_read_timer_cb,
187                   bdrv_throttle_write_timer_cb,
188                   bs);
189     bs->io_limits_enabled = true;
190 }
191 
192 /* This function makes an IO wait if needed
193  *
194  * @nb_sectors: the number of sectors of the IO
195  * @is_write:   is the IO a write
196  */
197 static void bdrv_io_limits_intercept(BlockDriverState *bs,
198                                      unsigned int bytes,
199                                      bool is_write)
200 {
201     /* does this io must wait */
202     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
203 
204     /* if must wait or any request of this type throttled queue the IO */
205     if (must_wait ||
206         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
207         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
208     }
209 
210     /* the IO will be executed, do the accounting */
211     throttle_account(&bs->throttle_state, is_write, bytes);
212 
213 
214     /* if the next request must wait -> do nothing */
215     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
216         return;
217     }
218 
219     /* else queue next request for execution */
220     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
221 }
222 
223 size_t bdrv_opt_mem_align(BlockDriverState *bs)
224 {
225     if (!bs || !bs->drv) {
226         /* 4k should be on the safe side */
227         return 4096;
228     }
229 
230     return bs->bl.opt_mem_alignment;
231 }
232 
233 /* check if the path starts with "<protocol>:" */
234 static int path_has_protocol(const char *path)
235 {
236     const char *p;
237 
238 #ifdef _WIN32
239     if (is_windows_drive(path) ||
240         is_windows_drive_prefix(path)) {
241         return 0;
242     }
243     p = path + strcspn(path, ":/\\");
244 #else
245     p = path + strcspn(path, ":/");
246 #endif
247 
248     return *p == ':';
249 }
250 
251 int path_is_absolute(const char *path)
252 {
253 #ifdef _WIN32
254     /* specific case for names like: "\\.\d:" */
255     if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
256         return 1;
257     }
258     return (*path == '/' || *path == '\\');
259 #else
260     return (*path == '/');
261 #endif
262 }
263 
264 /* if filename is absolute, just copy it to dest. Otherwise, build a
265    path to it by considering it is relative to base_path. URL are
266    supported. */
267 void path_combine(char *dest, int dest_size,
268                   const char *base_path,
269                   const char *filename)
270 {
271     const char *p, *p1;
272     int len;
273 
274     if (dest_size <= 0)
275         return;
276     if (path_is_absolute(filename)) {
277         pstrcpy(dest, dest_size, filename);
278     } else {
279         p = strchr(base_path, ':');
280         if (p)
281             p++;
282         else
283             p = base_path;
284         p1 = strrchr(base_path, '/');
285 #ifdef _WIN32
286         {
287             const char *p2;
288             p2 = strrchr(base_path, '\\');
289             if (!p1 || p2 > p1)
290                 p1 = p2;
291         }
292 #endif
293         if (p1)
294             p1++;
295         else
296             p1 = base_path;
297         if (p1 > p)
298             p = p1;
299         len = p - base_path;
300         if (len > dest_size - 1)
301             len = dest_size - 1;
302         memcpy(dest, base_path, len);
303         dest[len] = '\0';
304         pstrcat(dest, dest_size, filename);
305     }
306 }
307 
308 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
309 {
310     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
311         pstrcpy(dest, sz, bs->backing_file);
312     } else {
313         path_combine(dest, sz, bs->filename, bs->backing_file);
314     }
315 }
316 
317 void bdrv_register(BlockDriver *bdrv)
318 {
319     /* Block drivers without coroutine functions need emulation */
320     if (!bdrv->bdrv_co_readv) {
321         bdrv->bdrv_co_readv = bdrv_co_readv_em;
322         bdrv->bdrv_co_writev = bdrv_co_writev_em;
323 
324         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
325          * the block driver lacks aio we need to emulate that too.
326          */
327         if (!bdrv->bdrv_aio_readv) {
328             /* add AIO emulation layer */
329             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
330             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
331         }
332     }
333 
334     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
335 }
336 
337 /* create a new block device (by default it is empty) */
338 BlockDriverState *bdrv_new(const char *device_name, Error **errp)
339 {
340     BlockDriverState *bs;
341     int i;
342 
343     if (bdrv_find(device_name)) {
344         error_setg(errp, "Device with id '%s' already exists",
345                    device_name);
346         return NULL;
347     }
348     if (bdrv_find_node(device_name)) {
349         error_setg(errp, "Device with node-name '%s' already exists",
350                    device_name);
351         return NULL;
352     }
353 
354     bs = g_new0(BlockDriverState, 1);
355     QLIST_INIT(&bs->dirty_bitmaps);
356     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
357     if (device_name[0] != '\0') {
358         QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
359     }
360     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
361         QLIST_INIT(&bs->op_blockers[i]);
362     }
363     bdrv_iostatus_disable(bs);
364     notifier_list_init(&bs->close_notifiers);
365     notifier_with_return_list_init(&bs->before_write_notifiers);
366     qemu_co_queue_init(&bs->throttled_reqs[0]);
367     qemu_co_queue_init(&bs->throttled_reqs[1]);
368     bs->refcnt = 1;
369     bs->aio_context = qemu_get_aio_context();
370 
371     return bs;
372 }
373 
374 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
375 {
376     notifier_list_add(&bs->close_notifiers, notify);
377 }
378 
379 BlockDriver *bdrv_find_format(const char *format_name)
380 {
381     BlockDriver *drv1;
382     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
383         if (!strcmp(drv1->format_name, format_name)) {
384             return drv1;
385         }
386     }
387     return NULL;
388 }
389 
390 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
391 {
392     static const char *whitelist_rw[] = {
393         CONFIG_BDRV_RW_WHITELIST
394     };
395     static const char *whitelist_ro[] = {
396         CONFIG_BDRV_RO_WHITELIST
397     };
398     const char **p;
399 
400     if (!whitelist_rw[0] && !whitelist_ro[0]) {
401         return 1;               /* no whitelist, anything goes */
402     }
403 
404     for (p = whitelist_rw; *p; p++) {
405         if (!strcmp(drv->format_name, *p)) {
406             return 1;
407         }
408     }
409     if (read_only) {
410         for (p = whitelist_ro; *p; p++) {
411             if (!strcmp(drv->format_name, *p)) {
412                 return 1;
413             }
414         }
415     }
416     return 0;
417 }
418 
419 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
420                                           bool read_only)
421 {
422     BlockDriver *drv = bdrv_find_format(format_name);
423     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
424 }
425 
426 typedef struct CreateCo {
427     BlockDriver *drv;
428     char *filename;
429     QemuOpts *opts;
430     int ret;
431     Error *err;
432 } CreateCo;
433 
434 static void coroutine_fn bdrv_create_co_entry(void *opaque)
435 {
436     Error *local_err = NULL;
437     int ret;
438 
439     CreateCo *cco = opaque;
440     assert(cco->drv);
441 
442     ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
443     if (local_err) {
444         error_propagate(&cco->err, local_err);
445     }
446     cco->ret = ret;
447 }
448 
449 int bdrv_create(BlockDriver *drv, const char* filename,
450                 QemuOpts *opts, Error **errp)
451 {
452     int ret;
453 
454     Coroutine *co;
455     CreateCo cco = {
456         .drv = drv,
457         .filename = g_strdup(filename),
458         .opts = opts,
459         .ret = NOT_DONE,
460         .err = NULL,
461     };
462 
463     if (!drv->bdrv_create) {
464         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
465         ret = -ENOTSUP;
466         goto out;
467     }
468 
469     if (qemu_in_coroutine()) {
470         /* Fast-path if already in coroutine context */
471         bdrv_create_co_entry(&cco);
472     } else {
473         co = qemu_coroutine_create(bdrv_create_co_entry);
474         qemu_coroutine_enter(co, &cco);
475         while (cco.ret == NOT_DONE) {
476             aio_poll(qemu_get_aio_context(), true);
477         }
478     }
479 
480     ret = cco.ret;
481     if (ret < 0) {
482         if (cco.err) {
483             error_propagate(errp, cco.err);
484         } else {
485             error_setg_errno(errp, -ret, "Could not create image");
486         }
487     }
488 
489 out:
490     g_free(cco.filename);
491     return ret;
492 }
493 
494 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
495 {
496     BlockDriver *drv;
497     Error *local_err = NULL;
498     int ret;
499 
500     drv = bdrv_find_protocol(filename, true);
501     if (drv == NULL) {
502         error_setg(errp, "Could not find protocol for file '%s'", filename);
503         return -ENOENT;
504     }
505 
506     ret = bdrv_create(drv, filename, opts, &local_err);
507     if (local_err) {
508         error_propagate(errp, local_err);
509     }
510     return ret;
511 }
512 
513 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
514 {
515     BlockDriver *drv = bs->drv;
516     Error *local_err = NULL;
517 
518     memset(&bs->bl, 0, sizeof(bs->bl));
519 
520     if (!drv) {
521         return;
522     }
523 
524     /* Take some limits from the children as a default */
525     if (bs->file) {
526         bdrv_refresh_limits(bs->file, &local_err);
527         if (local_err) {
528             error_propagate(errp, local_err);
529             return;
530         }
531         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
532         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
533     } else {
534         bs->bl.opt_mem_alignment = 512;
535     }
536 
537     if (bs->backing_hd) {
538         bdrv_refresh_limits(bs->backing_hd, &local_err);
539         if (local_err) {
540             error_propagate(errp, local_err);
541             return;
542         }
543         bs->bl.opt_transfer_length =
544             MAX(bs->bl.opt_transfer_length,
545                 bs->backing_hd->bl.opt_transfer_length);
546         bs->bl.opt_mem_alignment =
547             MAX(bs->bl.opt_mem_alignment,
548                 bs->backing_hd->bl.opt_mem_alignment);
549     }
550 
551     /* Then let the driver override it */
552     if (drv->bdrv_refresh_limits) {
553         drv->bdrv_refresh_limits(bs, errp);
554     }
555 }
556 
557 /*
558  * Create a uniquely-named empty temporary file.
559  * Return 0 upon success, otherwise a negative errno value.
560  */
561 int get_tmp_filename(char *filename, int size)
562 {
563 #ifdef _WIN32
564     char temp_dir[MAX_PATH];
565     /* GetTempFileName requires that its output buffer (4th param)
566        have length MAX_PATH or greater.  */
567     assert(size >= MAX_PATH);
568     return (GetTempPath(MAX_PATH, temp_dir)
569             && GetTempFileName(temp_dir, "qem", 0, filename)
570             ? 0 : -GetLastError());
571 #else
572     int fd;
573     const char *tmpdir;
574     tmpdir = getenv("TMPDIR");
575     if (!tmpdir) {
576         tmpdir = "/var/tmp";
577     }
578     if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
579         return -EOVERFLOW;
580     }
581     fd = mkstemp(filename);
582     if (fd < 0) {
583         return -errno;
584     }
585     if (close(fd) != 0) {
586         unlink(filename);
587         return -errno;
588     }
589     return 0;
590 #endif
591 }
592 
593 /*
594  * Detect host devices. By convention, /dev/cdrom[N] is always
595  * recognized as a host CDROM.
596  */
597 static BlockDriver *find_hdev_driver(const char *filename)
598 {
599     int score_max = 0, score;
600     BlockDriver *drv = NULL, *d;
601 
602     QLIST_FOREACH(d, &bdrv_drivers, list) {
603         if (d->bdrv_probe_device) {
604             score = d->bdrv_probe_device(filename);
605             if (score > score_max) {
606                 score_max = score;
607                 drv = d;
608             }
609         }
610     }
611 
612     return drv;
613 }
614 
615 BlockDriver *bdrv_find_protocol(const char *filename,
616                                 bool allow_protocol_prefix)
617 {
618     BlockDriver *drv1;
619     char protocol[128];
620     int len;
621     const char *p;
622 
623     /* TODO Drivers without bdrv_file_open must be specified explicitly */
624 
625     /*
626      * XXX(hch): we really should not let host device detection
627      * override an explicit protocol specification, but moving this
628      * later breaks access to device names with colons in them.
629      * Thanks to the brain-dead persistent naming schemes on udev-
630      * based Linux systems those actually are quite common.
631      */
632     drv1 = find_hdev_driver(filename);
633     if (drv1) {
634         return drv1;
635     }
636 
637     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
638         return bdrv_find_format("file");
639     }
640 
641     p = strchr(filename, ':');
642     assert(p != NULL);
643     len = p - filename;
644     if (len > sizeof(protocol) - 1)
645         len = sizeof(protocol) - 1;
646     memcpy(protocol, filename, len);
647     protocol[len] = '\0';
648     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
649         if (drv1->protocol_name &&
650             !strcmp(drv1->protocol_name, protocol)) {
651             return drv1;
652         }
653     }
654     return NULL;
655 }
656 
657 static int find_image_format(BlockDriverState *bs, const char *filename,
658                              BlockDriver **pdrv, Error **errp)
659 {
660     int score, score_max;
661     BlockDriver *drv1, *drv;
662     uint8_t buf[2048];
663     int ret = 0;
664 
665     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
666     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
667         drv = bdrv_find_format("raw");
668         if (!drv) {
669             error_setg(errp, "Could not find raw image format");
670             ret = -ENOENT;
671         }
672         *pdrv = drv;
673         return ret;
674     }
675 
676     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
677     if (ret < 0) {
678         error_setg_errno(errp, -ret, "Could not read image for determining its "
679                          "format");
680         *pdrv = NULL;
681         return ret;
682     }
683 
684     score_max = 0;
685     drv = NULL;
686     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
687         if (drv1->bdrv_probe) {
688             score = drv1->bdrv_probe(buf, ret, filename);
689             if (score > score_max) {
690                 score_max = score;
691                 drv = drv1;
692             }
693         }
694     }
695     if (!drv) {
696         error_setg(errp, "Could not determine image format: No compatible "
697                    "driver found");
698         ret = -ENOENT;
699     }
700     *pdrv = drv;
701     return ret;
702 }
703 
704 /**
705  * Set the current 'total_sectors' value
706  * Return 0 on success, -errno on error.
707  */
708 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
709 {
710     BlockDriver *drv = bs->drv;
711 
712     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
713     if (bs->sg)
714         return 0;
715 
716     /* query actual device if possible, otherwise just trust the hint */
717     if (drv->bdrv_getlength) {
718         int64_t length = drv->bdrv_getlength(bs);
719         if (length < 0) {
720             return length;
721         }
722         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
723     }
724 
725     bs->total_sectors = hint;
726     return 0;
727 }
728 
729 /**
730  * Set open flags for a given discard mode
731  *
732  * Return 0 on success, -1 if the discard mode was invalid.
733  */
734 int bdrv_parse_discard_flags(const char *mode, int *flags)
735 {
736     *flags &= ~BDRV_O_UNMAP;
737 
738     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
739         /* do nothing */
740     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
741         *flags |= BDRV_O_UNMAP;
742     } else {
743         return -1;
744     }
745 
746     return 0;
747 }
748 
749 /**
750  * Set open flags for a given cache mode
751  *
752  * Return 0 on success, -1 if the cache mode was invalid.
753  */
754 int bdrv_parse_cache_flags(const char *mode, int *flags)
755 {
756     *flags &= ~BDRV_O_CACHE_MASK;
757 
758     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
759         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
760     } else if (!strcmp(mode, "directsync")) {
761         *flags |= BDRV_O_NOCACHE;
762     } else if (!strcmp(mode, "writeback")) {
763         *flags |= BDRV_O_CACHE_WB;
764     } else if (!strcmp(mode, "unsafe")) {
765         *flags |= BDRV_O_CACHE_WB;
766         *flags |= BDRV_O_NO_FLUSH;
767     } else if (!strcmp(mode, "writethrough")) {
768         /* this is the default */
769     } else {
770         return -1;
771     }
772 
773     return 0;
774 }
775 
776 /**
777  * The copy-on-read flag is actually a reference count so multiple users may
778  * use the feature without worrying about clobbering its previous state.
779  * Copy-on-read stays enabled until all users have called to disable it.
780  */
781 void bdrv_enable_copy_on_read(BlockDriverState *bs)
782 {
783     bs->copy_on_read++;
784 }
785 
786 void bdrv_disable_copy_on_read(BlockDriverState *bs)
787 {
788     assert(bs->copy_on_read > 0);
789     bs->copy_on_read--;
790 }
791 
792 /*
793  * Returns the flags that a temporary snapshot should get, based on the
794  * originally requested flags (the originally requested image will have flags
795  * like a backing file)
796  */
797 static int bdrv_temp_snapshot_flags(int flags)
798 {
799     return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
800 }
801 
802 /*
803  * Returns the flags that bs->file should get, based on the given flags for
804  * the parent BDS
805  */
806 static int bdrv_inherited_flags(int flags)
807 {
808     /* Enable protocol handling, disable format probing for bs->file */
809     flags |= BDRV_O_PROTOCOL;
810 
811     /* Our block drivers take care to send flushes and respect unmap policy,
812      * so we can enable both unconditionally on lower layers. */
813     flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
814 
815     /* Clear flags that only apply to the top layer */
816     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
817 
818     return flags;
819 }
820 
821 /*
822  * Returns the flags that bs->backing_hd should get, based on the given flags
823  * for the parent BDS
824  */
825 static int bdrv_backing_flags(int flags)
826 {
827     /* backing files always opened read-only */
828     flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
829 
830     /* snapshot=on is handled on the top layer */
831     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
832 
833     return flags;
834 }
835 
836 static int bdrv_open_flags(BlockDriverState *bs, int flags)
837 {
838     int open_flags = flags | BDRV_O_CACHE_WB;
839 
840     /*
841      * Clear flags that are internal to the block layer before opening the
842      * image.
843      */
844     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
845 
846     /*
847      * Snapshots should be writable.
848      */
849     if (flags & BDRV_O_TEMPORARY) {
850         open_flags |= BDRV_O_RDWR;
851     }
852 
853     return open_flags;
854 }
855 
856 static void bdrv_assign_node_name(BlockDriverState *bs,
857                                   const char *node_name,
858                                   Error **errp)
859 {
860     if (!node_name) {
861         return;
862     }
863 
864     /* empty string node name is invalid */
865     if (node_name[0] == '\0') {
866         error_setg(errp, "Empty node name");
867         return;
868     }
869 
870     /* takes care of avoiding namespaces collisions */
871     if (bdrv_find(node_name)) {
872         error_setg(errp, "node-name=%s is conflicting with a device id",
873                    node_name);
874         return;
875     }
876 
877     /* takes care of avoiding duplicates node names */
878     if (bdrv_find_node(node_name)) {
879         error_setg(errp, "Duplicate node name");
880         return;
881     }
882 
883     /* copy node name into the bs and insert it into the graph list */
884     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
885     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
886 }
887 
888 /*
889  * Common part for opening disk images and files
890  *
891  * Removes all processed options from *options.
892  */
893 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
894     QDict *options, int flags, BlockDriver *drv, Error **errp)
895 {
896     int ret, open_flags;
897     const char *filename;
898     const char *node_name = NULL;
899     Error *local_err = NULL;
900 
901     assert(drv != NULL);
902     assert(bs->file == NULL);
903     assert(options != NULL && bs->options != options);
904 
905     if (file != NULL) {
906         filename = file->filename;
907     } else {
908         filename = qdict_get_try_str(options, "filename");
909     }
910 
911     if (drv->bdrv_needs_filename && !filename) {
912         error_setg(errp, "The '%s' block driver requires a file name",
913                    drv->format_name);
914         return -EINVAL;
915     }
916 
917     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
918 
919     node_name = qdict_get_try_str(options, "node-name");
920     bdrv_assign_node_name(bs, node_name, &local_err);
921     if (local_err) {
922         error_propagate(errp, local_err);
923         return -EINVAL;
924     }
925     qdict_del(options, "node-name");
926 
927     /* bdrv_open() with directly using a protocol as drv. This layer is already
928      * opened, so assign it to bs (while file becomes a closed BlockDriverState)
929      * and return immediately. */
930     if (file != NULL && drv->bdrv_file_open) {
931         bdrv_swap(file, bs);
932         return 0;
933     }
934 
935     bs->open_flags = flags;
936     bs->guest_block_size = 512;
937     bs->request_alignment = 512;
938     bs->zero_beyond_eof = true;
939     open_flags = bdrv_open_flags(bs, flags);
940     bs->read_only = !(open_flags & BDRV_O_RDWR);
941     bs->growable = !!(flags & BDRV_O_PROTOCOL);
942 
943     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
944         error_setg(errp,
945                    !bs->read_only && bdrv_is_whitelisted(drv, true)
946                         ? "Driver '%s' can only be used for read-only devices"
947                         : "Driver '%s' is not whitelisted",
948                    drv->format_name);
949         return -ENOTSUP;
950     }
951 
952     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
953     if (flags & BDRV_O_COPY_ON_READ) {
954         if (!bs->read_only) {
955             bdrv_enable_copy_on_read(bs);
956         } else {
957             error_setg(errp, "Can't use copy-on-read on read-only device");
958             return -EINVAL;
959         }
960     }
961 
962     if (filename != NULL) {
963         pstrcpy(bs->filename, sizeof(bs->filename), filename);
964     } else {
965         bs->filename[0] = '\0';
966     }
967     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
968 
969     bs->drv = drv;
970     bs->opaque = g_malloc0(drv->instance_size);
971 
972     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
973 
974     /* Open the image, either directly or using a protocol */
975     if (drv->bdrv_file_open) {
976         assert(file == NULL);
977         assert(!drv->bdrv_needs_filename || filename != NULL);
978         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
979     } else {
980         if (file == NULL) {
981             error_setg(errp, "Can't use '%s' as a block driver for the "
982                        "protocol level", drv->format_name);
983             ret = -EINVAL;
984             goto free_and_fail;
985         }
986         bs->file = file;
987         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
988     }
989 
990     if (ret < 0) {
991         if (local_err) {
992             error_propagate(errp, local_err);
993         } else if (bs->filename[0]) {
994             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
995         } else {
996             error_setg_errno(errp, -ret, "Could not open image");
997         }
998         goto free_and_fail;
999     }
1000 
1001     ret = refresh_total_sectors(bs, bs->total_sectors);
1002     if (ret < 0) {
1003         error_setg_errno(errp, -ret, "Could not refresh total sector count");
1004         goto free_and_fail;
1005     }
1006 
1007     bdrv_refresh_limits(bs, &local_err);
1008     if (local_err) {
1009         error_propagate(errp, local_err);
1010         ret = -EINVAL;
1011         goto free_and_fail;
1012     }
1013 
1014     assert(bdrv_opt_mem_align(bs) != 0);
1015     assert((bs->request_alignment != 0) || bs->sg);
1016     return 0;
1017 
1018 free_and_fail:
1019     bs->file = NULL;
1020     g_free(bs->opaque);
1021     bs->opaque = NULL;
1022     bs->drv = NULL;
1023     return ret;
1024 }
1025 
1026 static QDict *parse_json_filename(const char *filename, Error **errp)
1027 {
1028     QObject *options_obj;
1029     QDict *options;
1030     int ret;
1031 
1032     ret = strstart(filename, "json:", &filename);
1033     assert(ret);
1034 
1035     options_obj = qobject_from_json(filename);
1036     if (!options_obj) {
1037         error_setg(errp, "Could not parse the JSON options");
1038         return NULL;
1039     }
1040 
1041     if (qobject_type(options_obj) != QTYPE_QDICT) {
1042         qobject_decref(options_obj);
1043         error_setg(errp, "Invalid JSON object given");
1044         return NULL;
1045     }
1046 
1047     options = qobject_to_qdict(options_obj);
1048     qdict_flatten(options);
1049 
1050     return options;
1051 }
1052 
1053 /*
1054  * Fills in default options for opening images and converts the legacy
1055  * filename/flags pair to option QDict entries.
1056  */
1057 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1058                              BlockDriver *drv, Error **errp)
1059 {
1060     const char *filename = *pfilename;
1061     const char *drvname;
1062     bool protocol = flags & BDRV_O_PROTOCOL;
1063     bool parse_filename = false;
1064     Error *local_err = NULL;
1065 
1066     /* Parse json: pseudo-protocol */
1067     if (filename && g_str_has_prefix(filename, "json:")) {
1068         QDict *json_options = parse_json_filename(filename, &local_err);
1069         if (local_err) {
1070             error_propagate(errp, local_err);
1071             return -EINVAL;
1072         }
1073 
1074         /* Options given in the filename have lower priority than options
1075          * specified directly */
1076         qdict_join(*options, json_options, false);
1077         QDECREF(json_options);
1078         *pfilename = filename = NULL;
1079     }
1080 
1081     /* Fetch the file name from the options QDict if necessary */
1082     if (protocol && filename) {
1083         if (!qdict_haskey(*options, "filename")) {
1084             qdict_put(*options, "filename", qstring_from_str(filename));
1085             parse_filename = true;
1086         } else {
1087             error_setg(errp, "Can't specify 'file' and 'filename' options at "
1088                              "the same time");
1089             return -EINVAL;
1090         }
1091     }
1092 
1093     /* Find the right block driver */
1094     filename = qdict_get_try_str(*options, "filename");
1095     drvname = qdict_get_try_str(*options, "driver");
1096 
1097     if (drv) {
1098         if (drvname) {
1099             error_setg(errp, "Driver specified twice");
1100             return -EINVAL;
1101         }
1102         drvname = drv->format_name;
1103         qdict_put(*options, "driver", qstring_from_str(drvname));
1104     } else {
1105         if (!drvname && protocol) {
1106             if (filename) {
1107                 drv = bdrv_find_protocol(filename, parse_filename);
1108                 if (!drv) {
1109                     error_setg(errp, "Unknown protocol");
1110                     return -EINVAL;
1111                 }
1112 
1113                 drvname = drv->format_name;
1114                 qdict_put(*options, "driver", qstring_from_str(drvname));
1115             } else {
1116                 error_setg(errp, "Must specify either driver or file");
1117                 return -EINVAL;
1118             }
1119         } else if (drvname) {
1120             drv = bdrv_find_format(drvname);
1121             if (!drv) {
1122                 error_setg(errp, "Unknown driver '%s'", drvname);
1123                 return -ENOENT;
1124             }
1125         }
1126     }
1127 
1128     assert(drv || !protocol);
1129 
1130     /* Driver-specific filename parsing */
1131     if (drv && drv->bdrv_parse_filename && parse_filename) {
1132         drv->bdrv_parse_filename(filename, *options, &local_err);
1133         if (local_err) {
1134             error_propagate(errp, local_err);
1135             return -EINVAL;
1136         }
1137 
1138         if (!drv->bdrv_needs_filename) {
1139             qdict_del(*options, "filename");
1140         }
1141     }
1142 
1143     return 0;
1144 }
1145 
1146 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1147 {
1148 
1149     if (bs->backing_hd) {
1150         assert(bs->backing_blocker);
1151         bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1152     } else if (backing_hd) {
1153         error_setg(&bs->backing_blocker,
1154                    "device is used as backing hd of '%s'",
1155                    bs->device_name);
1156     }
1157 
1158     bs->backing_hd = backing_hd;
1159     if (!backing_hd) {
1160         error_free(bs->backing_blocker);
1161         bs->backing_blocker = NULL;
1162         goto out;
1163     }
1164     bs->open_flags &= ~BDRV_O_NO_BACKING;
1165     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1166     pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1167             backing_hd->drv ? backing_hd->drv->format_name : "");
1168 
1169     bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1170     /* Otherwise we won't be able to commit due to check in bdrv_commit */
1171     bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
1172                     bs->backing_blocker);
1173 out:
1174     bdrv_refresh_limits(bs, NULL);
1175 }
1176 
1177 /*
1178  * Opens the backing file for a BlockDriverState if not yet open
1179  *
1180  * options is a QDict of options to pass to the block drivers, or NULL for an
1181  * empty set of options. The reference to the QDict is transferred to this
1182  * function (even on failure), so if the caller intends to reuse the dictionary,
1183  * it needs to use QINCREF() before calling bdrv_file_open.
1184  */
1185 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1186 {
1187     char *backing_filename = g_malloc0(PATH_MAX);
1188     int ret = 0;
1189     BlockDriver *back_drv = NULL;
1190     BlockDriverState *backing_hd;
1191     Error *local_err = NULL;
1192 
1193     if (bs->backing_hd != NULL) {
1194         QDECREF(options);
1195         goto free_exit;
1196     }
1197 
1198     /* NULL means an empty set of options */
1199     if (options == NULL) {
1200         options = qdict_new();
1201     }
1202 
1203     bs->open_flags &= ~BDRV_O_NO_BACKING;
1204     if (qdict_haskey(options, "file.filename")) {
1205         backing_filename[0] = '\0';
1206     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1207         QDECREF(options);
1208         goto free_exit;
1209     } else {
1210         bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1211     }
1212 
1213     if (!bs->drv || !bs->drv->supports_backing) {
1214         ret = -EINVAL;
1215         error_setg(errp, "Driver doesn't support backing files");
1216         QDECREF(options);
1217         goto free_exit;
1218     }
1219 
1220     backing_hd = bdrv_new("", errp);
1221 
1222     if (bs->backing_format[0] != '\0') {
1223         back_drv = bdrv_find_format(bs->backing_format);
1224     }
1225 
1226     assert(bs->backing_hd == NULL);
1227     ret = bdrv_open(&backing_hd,
1228                     *backing_filename ? backing_filename : NULL, NULL, options,
1229                     bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
1230     if (ret < 0) {
1231         bdrv_unref(backing_hd);
1232         backing_hd = NULL;
1233         bs->open_flags |= BDRV_O_NO_BACKING;
1234         error_setg(errp, "Could not open backing file: %s",
1235                    error_get_pretty(local_err));
1236         error_free(local_err);
1237         goto free_exit;
1238     }
1239     bdrv_set_backing_hd(bs, backing_hd);
1240 
1241 free_exit:
1242     g_free(backing_filename);
1243     return ret;
1244 }
1245 
1246 /*
1247  * Opens a disk image whose options are given as BlockdevRef in another block
1248  * device's options.
1249  *
1250  * If allow_none is true, no image will be opened if filename is false and no
1251  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1252  *
1253  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1254  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1255  * itself, all options starting with "${bdref_key}." are considered part of the
1256  * BlockdevRef.
1257  *
1258  * The BlockdevRef will be removed from the options QDict.
1259  *
1260  * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1261  */
1262 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1263                     QDict *options, const char *bdref_key, int flags,
1264                     bool allow_none, Error **errp)
1265 {
1266     QDict *image_options;
1267     int ret;
1268     char *bdref_key_dot;
1269     const char *reference;
1270 
1271     assert(pbs);
1272     assert(*pbs == NULL);
1273 
1274     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1275     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1276     g_free(bdref_key_dot);
1277 
1278     reference = qdict_get_try_str(options, bdref_key);
1279     if (!filename && !reference && !qdict_size(image_options)) {
1280         if (allow_none) {
1281             ret = 0;
1282         } else {
1283             error_setg(errp, "A block device must be specified for \"%s\"",
1284                        bdref_key);
1285             ret = -EINVAL;
1286         }
1287         QDECREF(image_options);
1288         goto done;
1289     }
1290 
1291     ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1292 
1293 done:
1294     qdict_del(options, bdref_key);
1295     return ret;
1296 }
1297 
1298 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1299 {
1300     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1301     char *tmp_filename = g_malloc0(PATH_MAX + 1);
1302     int64_t total_size;
1303     BlockDriver *bdrv_qcow2;
1304     QemuOpts *opts = NULL;
1305     QDict *snapshot_options;
1306     BlockDriverState *bs_snapshot;
1307     Error *local_err;
1308     int ret;
1309 
1310     /* if snapshot, we create a temporary backing file and open it
1311        instead of opening 'filename' directly */
1312 
1313     /* Get the required size from the image */
1314     total_size = bdrv_getlength(bs);
1315     if (total_size < 0) {
1316         ret = total_size;
1317         error_setg_errno(errp, -total_size, "Could not get image size");
1318         goto out;
1319     }
1320 
1321     /* Create the temporary image */
1322     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1323     if (ret < 0) {
1324         error_setg_errno(errp, -ret, "Could not get temporary filename");
1325         goto out;
1326     }
1327 
1328     bdrv_qcow2 = bdrv_find_format("qcow2");
1329     opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
1330                             &error_abort);
1331     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
1332     ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
1333     qemu_opts_del(opts);
1334     if (ret < 0) {
1335         error_setg_errno(errp, -ret, "Could not create temporary overlay "
1336                          "'%s': %s", tmp_filename,
1337                          error_get_pretty(local_err));
1338         error_free(local_err);
1339         goto out;
1340     }
1341 
1342     /* Prepare a new options QDict for the temporary file */
1343     snapshot_options = qdict_new();
1344     qdict_put(snapshot_options, "file.driver",
1345               qstring_from_str("file"));
1346     qdict_put(snapshot_options, "file.filename",
1347               qstring_from_str(tmp_filename));
1348 
1349     bs_snapshot = bdrv_new("", &error_abort);
1350 
1351     ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1352                     flags, bdrv_qcow2, &local_err);
1353     if (ret < 0) {
1354         error_propagate(errp, local_err);
1355         goto out;
1356     }
1357 
1358     bdrv_append(bs_snapshot, bs);
1359 
1360 out:
1361     g_free(tmp_filename);
1362     return ret;
1363 }
1364 
1365 /*
1366  * Opens a disk image (raw, qcow2, vmdk, ...)
1367  *
1368  * options is a QDict of options to pass to the block drivers, or NULL for an
1369  * empty set of options. The reference to the QDict belongs to the block layer
1370  * after the call (even on failure), so if the caller intends to reuse the
1371  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1372  *
1373  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1374  * If it is not NULL, the referenced BDS will be reused.
1375  *
1376  * The reference parameter may be used to specify an existing block device which
1377  * should be opened. If specified, neither options nor a filename may be given,
1378  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1379  */
1380 int bdrv_open(BlockDriverState **pbs, const char *filename,
1381               const char *reference, QDict *options, int flags,
1382               BlockDriver *drv, Error **errp)
1383 {
1384     int ret;
1385     BlockDriverState *file = NULL, *bs;
1386     const char *drvname;
1387     Error *local_err = NULL;
1388     int snapshot_flags = 0;
1389 
1390     assert(pbs);
1391 
1392     if (reference) {
1393         bool options_non_empty = options ? qdict_size(options) : false;
1394         QDECREF(options);
1395 
1396         if (*pbs) {
1397             error_setg(errp, "Cannot reuse an existing BDS when referencing "
1398                        "another block device");
1399             return -EINVAL;
1400         }
1401 
1402         if (filename || options_non_empty) {
1403             error_setg(errp, "Cannot reference an existing block device with "
1404                        "additional options or a new filename");
1405             return -EINVAL;
1406         }
1407 
1408         bs = bdrv_lookup_bs(reference, reference, errp);
1409         if (!bs) {
1410             return -ENODEV;
1411         }
1412         bdrv_ref(bs);
1413         *pbs = bs;
1414         return 0;
1415     }
1416 
1417     if (*pbs) {
1418         bs = *pbs;
1419     } else {
1420         bs = bdrv_new("", &error_abort);
1421     }
1422 
1423     /* NULL means an empty set of options */
1424     if (options == NULL) {
1425         options = qdict_new();
1426     }
1427 
1428     ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1429     if (local_err) {
1430         goto fail;
1431     }
1432 
1433     /* Find the right image format driver */
1434     drv = NULL;
1435     drvname = qdict_get_try_str(options, "driver");
1436     if (drvname) {
1437         drv = bdrv_find_format(drvname);
1438         qdict_del(options, "driver");
1439         if (!drv) {
1440             error_setg(errp, "Unknown driver: '%s'", drvname);
1441             ret = -EINVAL;
1442             goto fail;
1443         }
1444     }
1445 
1446     assert(drvname || !(flags & BDRV_O_PROTOCOL));
1447     if (drv && !drv->bdrv_file_open) {
1448         /* If the user explicitly wants a format driver here, we'll need to add
1449          * another layer for the protocol in bs->file */
1450         flags &= ~BDRV_O_PROTOCOL;
1451     }
1452 
1453     bs->options = options;
1454     options = qdict_clone_shallow(options);
1455 
1456     /* Open image file without format layer */
1457     if ((flags & BDRV_O_PROTOCOL) == 0) {
1458         if (flags & BDRV_O_RDWR) {
1459             flags |= BDRV_O_ALLOW_RDWR;
1460         }
1461         if (flags & BDRV_O_SNAPSHOT) {
1462             snapshot_flags = bdrv_temp_snapshot_flags(flags);
1463             flags = bdrv_backing_flags(flags);
1464         }
1465 
1466         assert(file == NULL);
1467         ret = bdrv_open_image(&file, filename, options, "file",
1468                               bdrv_inherited_flags(flags),
1469                               true, &local_err);
1470         if (ret < 0) {
1471             goto fail;
1472         }
1473     }
1474 
1475     /* Image format probing */
1476     if (!drv && file) {
1477         ret = find_image_format(file, filename, &drv, &local_err);
1478         if (ret < 0) {
1479             goto fail;
1480         }
1481     } else if (!drv) {
1482         error_setg(errp, "Must specify either driver or file");
1483         ret = -EINVAL;
1484         goto fail;
1485     }
1486 
1487     /* Open the image */
1488     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1489     if (ret < 0) {
1490         goto fail;
1491     }
1492 
1493     if (file && (bs->file != file)) {
1494         bdrv_unref(file);
1495         file = NULL;
1496     }
1497 
1498     /* If there is a backing file, use it */
1499     if ((flags & BDRV_O_NO_BACKING) == 0) {
1500         QDict *backing_options;
1501 
1502         qdict_extract_subqdict(options, &backing_options, "backing.");
1503         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1504         if (ret < 0) {
1505             goto close_and_fail;
1506         }
1507     }
1508 
1509     bdrv_refresh_filename(bs);
1510 
1511     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1512      * temporary snapshot afterwards. */
1513     if (snapshot_flags) {
1514         ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1515         if (local_err) {
1516             goto close_and_fail;
1517         }
1518     }
1519 
1520     /* Check if any unknown options were used */
1521     if (options && (qdict_size(options) != 0)) {
1522         const QDictEntry *entry = qdict_first(options);
1523         if (flags & BDRV_O_PROTOCOL) {
1524             error_setg(errp, "Block protocol '%s' doesn't support the option "
1525                        "'%s'", drv->format_name, entry->key);
1526         } else {
1527             error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1528                        "support the option '%s'", drv->format_name,
1529                        bs->device_name, entry->key);
1530         }
1531 
1532         ret = -EINVAL;
1533         goto close_and_fail;
1534     }
1535 
1536     if (!bdrv_key_required(bs)) {
1537         bdrv_dev_change_media_cb(bs, true);
1538     } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1539                && !runstate_check(RUN_STATE_INMIGRATE)
1540                && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1541         error_setg(errp,
1542                    "Guest must be stopped for opening of encrypted image");
1543         ret = -EBUSY;
1544         goto close_and_fail;
1545     }
1546 
1547     QDECREF(options);
1548     *pbs = bs;
1549     return 0;
1550 
1551 fail:
1552     if (file != NULL) {
1553         bdrv_unref(file);
1554     }
1555     QDECREF(bs->options);
1556     QDECREF(options);
1557     bs->options = NULL;
1558     if (!*pbs) {
1559         /* If *pbs is NULL, a new BDS has been created in this function and
1560            needs to be freed now. Otherwise, it does not need to be closed,
1561            since it has not really been opened yet. */
1562         bdrv_unref(bs);
1563     }
1564     if (local_err) {
1565         error_propagate(errp, local_err);
1566     }
1567     return ret;
1568 
1569 close_and_fail:
1570     /* See fail path, but now the BDS has to be always closed */
1571     if (*pbs) {
1572         bdrv_close(bs);
1573     } else {
1574         bdrv_unref(bs);
1575     }
1576     QDECREF(options);
1577     if (local_err) {
1578         error_propagate(errp, local_err);
1579     }
1580     return ret;
1581 }
1582 
1583 typedef struct BlockReopenQueueEntry {
1584      bool prepared;
1585      BDRVReopenState state;
1586      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1587 } BlockReopenQueueEntry;
1588 
1589 /*
1590  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1591  * reopen of multiple devices.
1592  *
1593  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1594  * already performed, or alternatively may be NULL a new BlockReopenQueue will
1595  * be created and initialized. This newly created BlockReopenQueue should be
1596  * passed back in for subsequent calls that are intended to be of the same
1597  * atomic 'set'.
1598  *
1599  * bs is the BlockDriverState to add to the reopen queue.
1600  *
1601  * flags contains the open flags for the associated bs
1602  *
1603  * returns a pointer to bs_queue, which is either the newly allocated
1604  * bs_queue, or the existing bs_queue being used.
1605  *
1606  */
1607 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1608                                     BlockDriverState *bs, int flags)
1609 {
1610     assert(bs != NULL);
1611 
1612     BlockReopenQueueEntry *bs_entry;
1613     if (bs_queue == NULL) {
1614         bs_queue = g_new0(BlockReopenQueue, 1);
1615         QSIMPLEQ_INIT(bs_queue);
1616     }
1617 
1618     /* bdrv_open() masks this flag out */
1619     flags &= ~BDRV_O_PROTOCOL;
1620 
1621     if (bs->file) {
1622         bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1623     }
1624 
1625     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1626     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1627 
1628     bs_entry->state.bs = bs;
1629     bs_entry->state.flags = flags;
1630 
1631     return bs_queue;
1632 }
1633 
1634 /*
1635  * Reopen multiple BlockDriverStates atomically & transactionally.
1636  *
1637  * The queue passed in (bs_queue) must have been built up previous
1638  * via bdrv_reopen_queue().
1639  *
1640  * Reopens all BDS specified in the queue, with the appropriate
1641  * flags.  All devices are prepared for reopen, and failure of any
1642  * device will cause all device changes to be abandonded, and intermediate
1643  * data cleaned up.
1644  *
1645  * If all devices prepare successfully, then the changes are committed
1646  * to all devices.
1647  *
1648  */
1649 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1650 {
1651     int ret = -1;
1652     BlockReopenQueueEntry *bs_entry, *next;
1653     Error *local_err = NULL;
1654 
1655     assert(bs_queue != NULL);
1656 
1657     bdrv_drain_all();
1658 
1659     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1660         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1661             error_propagate(errp, local_err);
1662             goto cleanup;
1663         }
1664         bs_entry->prepared = true;
1665     }
1666 
1667     /* If we reach this point, we have success and just need to apply the
1668      * changes
1669      */
1670     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1671         bdrv_reopen_commit(&bs_entry->state);
1672     }
1673 
1674     ret = 0;
1675 
1676 cleanup:
1677     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1678         if (ret && bs_entry->prepared) {
1679             bdrv_reopen_abort(&bs_entry->state);
1680         }
1681         g_free(bs_entry);
1682     }
1683     g_free(bs_queue);
1684     return ret;
1685 }
1686 
1687 
1688 /* Reopen a single BlockDriverState with the specified flags. */
1689 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1690 {
1691     int ret = -1;
1692     Error *local_err = NULL;
1693     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1694 
1695     ret = bdrv_reopen_multiple(queue, &local_err);
1696     if (local_err != NULL) {
1697         error_propagate(errp, local_err);
1698     }
1699     return ret;
1700 }
1701 
1702 
1703 /*
1704  * Prepares a BlockDriverState for reopen. All changes are staged in the
1705  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1706  * the block driver layer .bdrv_reopen_prepare()
1707  *
1708  * bs is the BlockDriverState to reopen
1709  * flags are the new open flags
1710  * queue is the reopen queue
1711  *
1712  * Returns 0 on success, non-zero on error.  On error errp will be set
1713  * as well.
1714  *
1715  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1716  * It is the responsibility of the caller to then call the abort() or
1717  * commit() for any other BDS that have been left in a prepare() state
1718  *
1719  */
1720 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1721                         Error **errp)
1722 {
1723     int ret = -1;
1724     Error *local_err = NULL;
1725     BlockDriver *drv;
1726 
1727     assert(reopen_state != NULL);
1728     assert(reopen_state->bs->drv != NULL);
1729     drv = reopen_state->bs->drv;
1730 
1731     /* if we are to stay read-only, do not allow permission change
1732      * to r/w */
1733     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1734         reopen_state->flags & BDRV_O_RDWR) {
1735         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1736                   reopen_state->bs->device_name);
1737         goto error;
1738     }
1739 
1740 
1741     ret = bdrv_flush(reopen_state->bs);
1742     if (ret) {
1743         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1744                   strerror(-ret));
1745         goto error;
1746     }
1747 
1748     if (drv->bdrv_reopen_prepare) {
1749         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1750         if (ret) {
1751             if (local_err != NULL) {
1752                 error_propagate(errp, local_err);
1753             } else {
1754                 error_setg(errp, "failed while preparing to reopen image '%s'",
1755                            reopen_state->bs->filename);
1756             }
1757             goto error;
1758         }
1759     } else {
1760         /* It is currently mandatory to have a bdrv_reopen_prepare()
1761          * handler for each supported drv. */
1762         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1763                   drv->format_name, reopen_state->bs->device_name,
1764                  "reopening of file");
1765         ret = -1;
1766         goto error;
1767     }
1768 
1769     ret = 0;
1770 
1771 error:
1772     return ret;
1773 }
1774 
1775 /*
1776  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1777  * makes them final by swapping the staging BlockDriverState contents into
1778  * the active BlockDriverState contents.
1779  */
1780 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1781 {
1782     BlockDriver *drv;
1783 
1784     assert(reopen_state != NULL);
1785     drv = reopen_state->bs->drv;
1786     assert(drv != NULL);
1787 
1788     /* If there are any driver level actions to take */
1789     if (drv->bdrv_reopen_commit) {
1790         drv->bdrv_reopen_commit(reopen_state);
1791     }
1792 
1793     /* set BDS specific flags now */
1794     reopen_state->bs->open_flags         = reopen_state->flags;
1795     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1796                                               BDRV_O_CACHE_WB);
1797     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1798 
1799     bdrv_refresh_limits(reopen_state->bs, NULL);
1800 }
1801 
1802 /*
1803  * Abort the reopen, and delete and free the staged changes in
1804  * reopen_state
1805  */
1806 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1807 {
1808     BlockDriver *drv;
1809 
1810     assert(reopen_state != NULL);
1811     drv = reopen_state->bs->drv;
1812     assert(drv != NULL);
1813 
1814     if (drv->bdrv_reopen_abort) {
1815         drv->bdrv_reopen_abort(reopen_state);
1816     }
1817 }
1818 
1819 
1820 void bdrv_close(BlockDriverState *bs)
1821 {
1822     BdrvAioNotifier *ban, *ban_next;
1823 
1824     if (bs->job) {
1825         block_job_cancel_sync(bs->job);
1826     }
1827     bdrv_drain_all(); /* complete I/O */
1828     bdrv_flush(bs);
1829     bdrv_drain_all(); /* in case flush left pending I/O */
1830     notifier_list_notify(&bs->close_notifiers, bs);
1831 
1832     if (bs->drv) {
1833         if (bs->backing_hd) {
1834             BlockDriverState *backing_hd = bs->backing_hd;
1835             bdrv_set_backing_hd(bs, NULL);
1836             bdrv_unref(backing_hd);
1837         }
1838         bs->drv->bdrv_close(bs);
1839         g_free(bs->opaque);
1840         bs->opaque = NULL;
1841         bs->drv = NULL;
1842         bs->copy_on_read = 0;
1843         bs->backing_file[0] = '\0';
1844         bs->backing_format[0] = '\0';
1845         bs->total_sectors = 0;
1846         bs->encrypted = 0;
1847         bs->valid_key = 0;
1848         bs->sg = 0;
1849         bs->growable = 0;
1850         bs->zero_beyond_eof = false;
1851         QDECREF(bs->options);
1852         bs->options = NULL;
1853         QDECREF(bs->full_open_options);
1854         bs->full_open_options = NULL;
1855 
1856         if (bs->file != NULL) {
1857             bdrv_unref(bs->file);
1858             bs->file = NULL;
1859         }
1860     }
1861 
1862     bdrv_dev_change_media_cb(bs, false);
1863 
1864     /*throttling disk I/O limits*/
1865     if (bs->io_limits_enabled) {
1866         bdrv_io_limits_disable(bs);
1867     }
1868 
1869     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1870         g_free(ban);
1871     }
1872     QLIST_INIT(&bs->aio_notifiers);
1873 }
1874 
1875 void bdrv_close_all(void)
1876 {
1877     BlockDriverState *bs;
1878 
1879     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1880         AioContext *aio_context = bdrv_get_aio_context(bs);
1881 
1882         aio_context_acquire(aio_context);
1883         bdrv_close(bs);
1884         aio_context_release(aio_context);
1885     }
1886 }
1887 
1888 /* Check if any requests are in-flight (including throttled requests) */
1889 static bool bdrv_requests_pending(BlockDriverState *bs)
1890 {
1891     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1892         return true;
1893     }
1894     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1895         return true;
1896     }
1897     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1898         return true;
1899     }
1900     if (bs->file && bdrv_requests_pending(bs->file)) {
1901         return true;
1902     }
1903     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1904         return true;
1905     }
1906     return false;
1907 }
1908 
1909 /*
1910  * Wait for pending requests to complete across all BlockDriverStates
1911  *
1912  * This function does not flush data to disk, use bdrv_flush_all() for that
1913  * after calling this function.
1914  *
1915  * Note that completion of an asynchronous I/O operation can trigger any
1916  * number of other I/O operations on other devices---for example a coroutine
1917  * can be arbitrarily complex and a constant flow of I/O can come until the
1918  * coroutine is complete.  Because of this, it is not possible to have a
1919  * function to drain a single device's I/O queue.
1920  */
1921 void bdrv_drain_all(void)
1922 {
1923     /* Always run first iteration so any pending completion BHs run */
1924     bool busy = true;
1925     BlockDriverState *bs;
1926 
1927     while (busy) {
1928         busy = false;
1929 
1930         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1931             AioContext *aio_context = bdrv_get_aio_context(bs);
1932             bool bs_busy;
1933 
1934             aio_context_acquire(aio_context);
1935             bdrv_flush_io_queue(bs);
1936             bdrv_start_throttled_reqs(bs);
1937             bs_busy = bdrv_requests_pending(bs);
1938             bs_busy |= aio_poll(aio_context, bs_busy);
1939             aio_context_release(aio_context);
1940 
1941             busy |= bs_busy;
1942         }
1943     }
1944 }
1945 
1946 /* make a BlockDriverState anonymous by removing from bdrv_state and
1947  * graph_bdrv_state list.
1948    Also, NULL terminate the device_name to prevent double remove */
1949 void bdrv_make_anon(BlockDriverState *bs)
1950 {
1951     if (bs->device_name[0] != '\0') {
1952         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1953     }
1954     bs->device_name[0] = '\0';
1955     if (bs->node_name[0] != '\0') {
1956         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1957     }
1958     bs->node_name[0] = '\0';
1959 }
1960 
1961 static void bdrv_rebind(BlockDriverState *bs)
1962 {
1963     if (bs->drv && bs->drv->bdrv_rebind) {
1964         bs->drv->bdrv_rebind(bs);
1965     }
1966 }
1967 
1968 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1969                                      BlockDriverState *bs_src)
1970 {
1971     /* move some fields that need to stay attached to the device */
1972 
1973     /* dev info */
1974     bs_dest->dev_ops            = bs_src->dev_ops;
1975     bs_dest->dev_opaque         = bs_src->dev_opaque;
1976     bs_dest->dev                = bs_src->dev;
1977     bs_dest->guest_block_size   = bs_src->guest_block_size;
1978     bs_dest->copy_on_read       = bs_src->copy_on_read;
1979 
1980     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1981 
1982     /* i/o throttled req */
1983     memcpy(&bs_dest->throttle_state,
1984            &bs_src->throttle_state,
1985            sizeof(ThrottleState));
1986     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1987     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1988     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1989 
1990     /* r/w error */
1991     bs_dest->on_read_error      = bs_src->on_read_error;
1992     bs_dest->on_write_error     = bs_src->on_write_error;
1993 
1994     /* i/o status */
1995     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1996     bs_dest->iostatus           = bs_src->iostatus;
1997 
1998     /* dirty bitmap */
1999     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2000 
2001     /* reference count */
2002     bs_dest->refcnt             = bs_src->refcnt;
2003 
2004     /* job */
2005     bs_dest->job                = bs_src->job;
2006 
2007     /* keep the same entry in bdrv_states */
2008     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
2009             bs_src->device_name);
2010     bs_dest->device_list = bs_src->device_list;
2011     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2012            sizeof(bs_dest->op_blockers));
2013 }
2014 
2015 /*
2016  * Swap bs contents for two image chains while they are live,
2017  * while keeping required fields on the BlockDriverState that is
2018  * actually attached to a device.
2019  *
2020  * This will modify the BlockDriverState fields, and swap contents
2021  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2022  *
2023  * bs_new is required to be anonymous.
2024  *
2025  * This function does not create any image files.
2026  */
2027 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2028 {
2029     BlockDriverState tmp;
2030 
2031     /* The code needs to swap the node_name but simply swapping node_list won't
2032      * work so first remove the nodes from the graph list, do the swap then
2033      * insert them back if needed.
2034      */
2035     if (bs_new->node_name[0] != '\0') {
2036         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2037     }
2038     if (bs_old->node_name[0] != '\0') {
2039         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2040     }
2041 
2042     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2043     assert(bs_new->device_name[0] == '\0');
2044     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2045     assert(bs_new->job == NULL);
2046     assert(bs_new->dev == NULL);
2047     assert(bs_new->io_limits_enabled == false);
2048     assert(!throttle_have_timer(&bs_new->throttle_state));
2049 
2050     tmp = *bs_new;
2051     *bs_new = *bs_old;
2052     *bs_old = tmp;
2053 
2054     /* there are some fields that should not be swapped, move them back */
2055     bdrv_move_feature_fields(&tmp, bs_old);
2056     bdrv_move_feature_fields(bs_old, bs_new);
2057     bdrv_move_feature_fields(bs_new, &tmp);
2058 
2059     /* bs_new shouldn't be in bdrv_states even after the swap!  */
2060     assert(bs_new->device_name[0] == '\0');
2061 
2062     /* Check a few fields that should remain attached to the device */
2063     assert(bs_new->dev == NULL);
2064     assert(bs_new->job == NULL);
2065     assert(bs_new->io_limits_enabled == false);
2066     assert(!throttle_have_timer(&bs_new->throttle_state));
2067 
2068     /* insert the nodes back into the graph node list if needed */
2069     if (bs_new->node_name[0] != '\0') {
2070         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2071     }
2072     if (bs_old->node_name[0] != '\0') {
2073         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2074     }
2075 
2076     bdrv_rebind(bs_new);
2077     bdrv_rebind(bs_old);
2078 }
2079 
2080 /*
2081  * Add new bs contents at the top of an image chain while the chain is
2082  * live, while keeping required fields on the top layer.
2083  *
2084  * This will modify the BlockDriverState fields, and swap contents
2085  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2086  *
2087  * bs_new is required to be anonymous.
2088  *
2089  * This function does not create any image files.
2090  */
2091 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2092 {
2093     bdrv_swap(bs_new, bs_top);
2094 
2095     /* The contents of 'tmp' will become bs_top, as we are
2096      * swapping bs_new and bs_top contents. */
2097     bdrv_set_backing_hd(bs_top, bs_new);
2098 }
2099 
2100 static void bdrv_delete(BlockDriverState *bs)
2101 {
2102     assert(!bs->dev);
2103     assert(!bs->job);
2104     assert(bdrv_op_blocker_is_empty(bs));
2105     assert(!bs->refcnt);
2106     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2107 
2108     bdrv_close(bs);
2109 
2110     /* remove from list, if necessary */
2111     bdrv_make_anon(bs);
2112 
2113     g_free(bs);
2114 }
2115 
2116 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2117 /* TODO change to DeviceState *dev when all users are qdevified */
2118 {
2119     if (bs->dev) {
2120         return -EBUSY;
2121     }
2122     bs->dev = dev;
2123     bdrv_iostatus_reset(bs);
2124 
2125     /* We're expecting I/O from the device so bump up coroutine pool size */
2126     qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
2127     return 0;
2128 }
2129 
2130 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2131 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2132 {
2133     if (bdrv_attach_dev(bs, dev) < 0) {
2134         abort();
2135     }
2136 }
2137 
2138 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2139 /* TODO change to DeviceState *dev when all users are qdevified */
2140 {
2141     assert(bs->dev == dev);
2142     bs->dev = NULL;
2143     bs->dev_ops = NULL;
2144     bs->dev_opaque = NULL;
2145     bs->guest_block_size = 512;
2146     qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
2147 }
2148 
2149 /* TODO change to return DeviceState * when all users are qdevified */
2150 void *bdrv_get_attached_dev(BlockDriverState *bs)
2151 {
2152     return bs->dev;
2153 }
2154 
2155 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2156                       void *opaque)
2157 {
2158     bs->dev_ops = ops;
2159     bs->dev_opaque = opaque;
2160 }
2161 
2162 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2163 {
2164     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2165         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2166         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2167         if (tray_was_closed) {
2168             /* tray open */
2169             qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2170                                               true, &error_abort);
2171         }
2172         if (load) {
2173             /* tray close */
2174             qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2175                                               false, &error_abort);
2176         }
2177     }
2178 }
2179 
2180 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2181 {
2182     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2183 }
2184 
2185 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2186 {
2187     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2188         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2189     }
2190 }
2191 
2192 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2193 {
2194     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2195         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2196     }
2197     return false;
2198 }
2199 
2200 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2201 {
2202     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2203         bs->dev_ops->resize_cb(bs->dev_opaque);
2204     }
2205 }
2206 
2207 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2208 {
2209     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2210         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2211     }
2212     return false;
2213 }
2214 
2215 /*
2216  * Run consistency checks on an image
2217  *
2218  * Returns 0 if the check could be completed (it doesn't mean that the image is
2219  * free of errors) or -errno when an internal error occurred. The results of the
2220  * check are stored in res.
2221  */
2222 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2223 {
2224     if (bs->drv == NULL) {
2225         return -ENOMEDIUM;
2226     }
2227     if (bs->drv->bdrv_check == NULL) {
2228         return -ENOTSUP;
2229     }
2230 
2231     memset(res, 0, sizeof(*res));
2232     return bs->drv->bdrv_check(bs, res, fix);
2233 }
2234 
2235 #define COMMIT_BUF_SECTORS 2048
2236 
2237 /* commit COW file into the raw image */
2238 int bdrv_commit(BlockDriverState *bs)
2239 {
2240     BlockDriver *drv = bs->drv;
2241     int64_t sector, total_sectors, length, backing_length;
2242     int n, ro, open_flags;
2243     int ret = 0;
2244     uint8_t *buf = NULL;
2245     char filename[PATH_MAX];
2246 
2247     if (!drv)
2248         return -ENOMEDIUM;
2249 
2250     if (!bs->backing_hd) {
2251         return -ENOTSUP;
2252     }
2253 
2254     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2255         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2256         return -EBUSY;
2257     }
2258 
2259     ro = bs->backing_hd->read_only;
2260     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2261     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2262     open_flags =  bs->backing_hd->open_flags;
2263 
2264     if (ro) {
2265         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2266             return -EACCES;
2267         }
2268     }
2269 
2270     length = bdrv_getlength(bs);
2271     if (length < 0) {
2272         ret = length;
2273         goto ro_cleanup;
2274     }
2275 
2276     backing_length = bdrv_getlength(bs->backing_hd);
2277     if (backing_length < 0) {
2278         ret = backing_length;
2279         goto ro_cleanup;
2280     }
2281 
2282     /* If our top snapshot is larger than the backing file image,
2283      * grow the backing file image if possible.  If not possible,
2284      * we must return an error */
2285     if (length > backing_length) {
2286         ret = bdrv_truncate(bs->backing_hd, length);
2287         if (ret < 0) {
2288             goto ro_cleanup;
2289         }
2290     }
2291 
2292     total_sectors = length >> BDRV_SECTOR_BITS;
2293 
2294     /* qemu_try_blockalign() for bs will choose an alignment that works for
2295      * bs->backing_hd as well, so no need to compare the alignment manually. */
2296     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2297     if (buf == NULL) {
2298         ret = -ENOMEM;
2299         goto ro_cleanup;
2300     }
2301 
2302     for (sector = 0; sector < total_sectors; sector += n) {
2303         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2304         if (ret < 0) {
2305             goto ro_cleanup;
2306         }
2307         if (ret) {
2308             ret = bdrv_read(bs, sector, buf, n);
2309             if (ret < 0) {
2310                 goto ro_cleanup;
2311             }
2312 
2313             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2314             if (ret < 0) {
2315                 goto ro_cleanup;
2316             }
2317         }
2318     }
2319 
2320     if (drv->bdrv_make_empty) {
2321         ret = drv->bdrv_make_empty(bs);
2322         if (ret < 0) {
2323             goto ro_cleanup;
2324         }
2325         bdrv_flush(bs);
2326     }
2327 
2328     /*
2329      * Make sure all data we wrote to the backing device is actually
2330      * stable on disk.
2331      */
2332     if (bs->backing_hd) {
2333         bdrv_flush(bs->backing_hd);
2334     }
2335 
2336     ret = 0;
2337 ro_cleanup:
2338     qemu_vfree(buf);
2339 
2340     if (ro) {
2341         /* ignoring error return here */
2342         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2343     }
2344 
2345     return ret;
2346 }
2347 
2348 int bdrv_commit_all(void)
2349 {
2350     BlockDriverState *bs;
2351 
2352     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2353         AioContext *aio_context = bdrv_get_aio_context(bs);
2354 
2355         aio_context_acquire(aio_context);
2356         if (bs->drv && bs->backing_hd) {
2357             int ret = bdrv_commit(bs);
2358             if (ret < 0) {
2359                 aio_context_release(aio_context);
2360                 return ret;
2361             }
2362         }
2363         aio_context_release(aio_context);
2364     }
2365     return 0;
2366 }
2367 
2368 /**
2369  * Remove an active request from the tracked requests list
2370  *
2371  * This function should be called when a tracked request is completing.
2372  */
2373 static void tracked_request_end(BdrvTrackedRequest *req)
2374 {
2375     if (req->serialising) {
2376         req->bs->serialising_in_flight--;
2377     }
2378 
2379     QLIST_REMOVE(req, list);
2380     qemu_co_queue_restart_all(&req->wait_queue);
2381 }
2382 
2383 /**
2384  * Add an active request to the tracked requests list
2385  */
2386 static void tracked_request_begin(BdrvTrackedRequest *req,
2387                                   BlockDriverState *bs,
2388                                   int64_t offset,
2389                                   unsigned int bytes, bool is_write)
2390 {
2391     *req = (BdrvTrackedRequest){
2392         .bs = bs,
2393         .offset         = offset,
2394         .bytes          = bytes,
2395         .is_write       = is_write,
2396         .co             = qemu_coroutine_self(),
2397         .serialising    = false,
2398         .overlap_offset = offset,
2399         .overlap_bytes  = bytes,
2400     };
2401 
2402     qemu_co_queue_init(&req->wait_queue);
2403 
2404     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2405 }
2406 
2407 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2408 {
2409     int64_t overlap_offset = req->offset & ~(align - 1);
2410     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2411                                - overlap_offset;
2412 
2413     if (!req->serialising) {
2414         req->bs->serialising_in_flight++;
2415         req->serialising = true;
2416     }
2417 
2418     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2419     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2420 }
2421 
2422 /**
2423  * Round a region to cluster boundaries
2424  */
2425 void bdrv_round_to_clusters(BlockDriverState *bs,
2426                             int64_t sector_num, int nb_sectors,
2427                             int64_t *cluster_sector_num,
2428                             int *cluster_nb_sectors)
2429 {
2430     BlockDriverInfo bdi;
2431 
2432     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2433         *cluster_sector_num = sector_num;
2434         *cluster_nb_sectors = nb_sectors;
2435     } else {
2436         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2437         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2438         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2439                                             nb_sectors, c);
2440     }
2441 }
2442 
2443 static int bdrv_get_cluster_size(BlockDriverState *bs)
2444 {
2445     BlockDriverInfo bdi;
2446     int ret;
2447 
2448     ret = bdrv_get_info(bs, &bdi);
2449     if (ret < 0 || bdi.cluster_size == 0) {
2450         return bs->request_alignment;
2451     } else {
2452         return bdi.cluster_size;
2453     }
2454 }
2455 
2456 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2457                                      int64_t offset, unsigned int bytes)
2458 {
2459     /*        aaaa   bbbb */
2460     if (offset >= req->overlap_offset + req->overlap_bytes) {
2461         return false;
2462     }
2463     /* bbbb   aaaa        */
2464     if (req->overlap_offset >= offset + bytes) {
2465         return false;
2466     }
2467     return true;
2468 }
2469 
2470 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2471 {
2472     BlockDriverState *bs = self->bs;
2473     BdrvTrackedRequest *req;
2474     bool retry;
2475     bool waited = false;
2476 
2477     if (!bs->serialising_in_flight) {
2478         return false;
2479     }
2480 
2481     do {
2482         retry = false;
2483         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2484             if (req == self || (!req->serialising && !self->serialising)) {
2485                 continue;
2486             }
2487             if (tracked_request_overlaps(req, self->overlap_offset,
2488                                          self->overlap_bytes))
2489             {
2490                 /* Hitting this means there was a reentrant request, for
2491                  * example, a block driver issuing nested requests.  This must
2492                  * never happen since it means deadlock.
2493                  */
2494                 assert(qemu_coroutine_self() != req->co);
2495 
2496                 /* If the request is already (indirectly) waiting for us, or
2497                  * will wait for us as soon as it wakes up, then just go on
2498                  * (instead of producing a deadlock in the former case). */
2499                 if (!req->waiting_for) {
2500                     self->waiting_for = req;
2501                     qemu_co_queue_wait(&req->wait_queue);
2502                     self->waiting_for = NULL;
2503                     retry = true;
2504                     waited = true;
2505                     break;
2506                 }
2507             }
2508         }
2509     } while (retry);
2510 
2511     return waited;
2512 }
2513 
2514 /*
2515  * Return values:
2516  * 0        - success
2517  * -EINVAL  - backing format specified, but no file
2518  * -ENOSPC  - can't update the backing file because no space is left in the
2519  *            image file header
2520  * -ENOTSUP - format driver doesn't support changing the backing file
2521  */
2522 int bdrv_change_backing_file(BlockDriverState *bs,
2523     const char *backing_file, const char *backing_fmt)
2524 {
2525     BlockDriver *drv = bs->drv;
2526     int ret;
2527 
2528     /* Backing file format doesn't make sense without a backing file */
2529     if (backing_fmt && !backing_file) {
2530         return -EINVAL;
2531     }
2532 
2533     if (drv->bdrv_change_backing_file != NULL) {
2534         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2535     } else {
2536         ret = -ENOTSUP;
2537     }
2538 
2539     if (ret == 0) {
2540         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2541         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2542     }
2543     return ret;
2544 }
2545 
2546 /*
2547  * Finds the image layer in the chain that has 'bs' as its backing file.
2548  *
2549  * active is the current topmost image.
2550  *
2551  * Returns NULL if bs is not found in active's image chain,
2552  * or if active == bs.
2553  *
2554  * Returns the bottommost base image if bs == NULL.
2555  */
2556 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2557                                     BlockDriverState *bs)
2558 {
2559     while (active && bs != active->backing_hd) {
2560         active = active->backing_hd;
2561     }
2562 
2563     return active;
2564 }
2565 
2566 /* Given a BDS, searches for the base layer. */
2567 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2568 {
2569     return bdrv_find_overlay(bs, NULL);
2570 }
2571 
2572 typedef struct BlkIntermediateStates {
2573     BlockDriverState *bs;
2574     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2575 } BlkIntermediateStates;
2576 
2577 
2578 /*
2579  * Drops images above 'base' up to and including 'top', and sets the image
2580  * above 'top' to have base as its backing file.
2581  *
2582  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2583  * information in 'bs' can be properly updated.
2584  *
2585  * E.g., this will convert the following chain:
2586  * bottom <- base <- intermediate <- top <- active
2587  *
2588  * to
2589  *
2590  * bottom <- base <- active
2591  *
2592  * It is allowed for bottom==base, in which case it converts:
2593  *
2594  * base <- intermediate <- top <- active
2595  *
2596  * to
2597  *
2598  * base <- active
2599  *
2600  * If backing_file_str is non-NULL, it will be used when modifying top's
2601  * overlay image metadata.
2602  *
2603  * Error conditions:
2604  *  if active == top, that is considered an error
2605  *
2606  */
2607 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2608                            BlockDriverState *base, const char *backing_file_str)
2609 {
2610     BlockDriverState *intermediate;
2611     BlockDriverState *base_bs = NULL;
2612     BlockDriverState *new_top_bs = NULL;
2613     BlkIntermediateStates *intermediate_state, *next;
2614     int ret = -EIO;
2615 
2616     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2617     QSIMPLEQ_INIT(&states_to_delete);
2618 
2619     if (!top->drv || !base->drv) {
2620         goto exit;
2621     }
2622 
2623     new_top_bs = bdrv_find_overlay(active, top);
2624 
2625     if (new_top_bs == NULL) {
2626         /* we could not find the image above 'top', this is an error */
2627         goto exit;
2628     }
2629 
2630     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2631      * to do, no intermediate images */
2632     if (new_top_bs->backing_hd == base) {
2633         ret = 0;
2634         goto exit;
2635     }
2636 
2637     intermediate = top;
2638 
2639     /* now we will go down through the list, and add each BDS we find
2640      * into our deletion queue, until we hit the 'base'
2641      */
2642     while (intermediate) {
2643         intermediate_state = g_new0(BlkIntermediateStates, 1);
2644         intermediate_state->bs = intermediate;
2645         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2646 
2647         if (intermediate->backing_hd == base) {
2648             base_bs = intermediate->backing_hd;
2649             break;
2650         }
2651         intermediate = intermediate->backing_hd;
2652     }
2653     if (base_bs == NULL) {
2654         /* something went wrong, we did not end at the base. safely
2655          * unravel everything, and exit with error */
2656         goto exit;
2657     }
2658 
2659     /* success - we can delete the intermediate states, and link top->base */
2660     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2661     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2662                                    base_bs->drv ? base_bs->drv->format_name : "");
2663     if (ret) {
2664         goto exit;
2665     }
2666     bdrv_set_backing_hd(new_top_bs, base_bs);
2667 
2668     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2669         /* so that bdrv_close() does not recursively close the chain */
2670         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2671         bdrv_unref(intermediate_state->bs);
2672     }
2673     ret = 0;
2674 
2675 exit:
2676     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2677         g_free(intermediate_state);
2678     }
2679     return ret;
2680 }
2681 
2682 
2683 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2684                                    size_t size)
2685 {
2686     int64_t len;
2687 
2688     if (size > INT_MAX) {
2689         return -EIO;
2690     }
2691 
2692     if (!bdrv_is_inserted(bs))
2693         return -ENOMEDIUM;
2694 
2695     if (bs->growable)
2696         return 0;
2697 
2698     len = bdrv_getlength(bs);
2699 
2700     if (offset < 0)
2701         return -EIO;
2702 
2703     if ((offset > len) || (len - offset < size))
2704         return -EIO;
2705 
2706     return 0;
2707 }
2708 
2709 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2710                               int nb_sectors)
2711 {
2712     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2713         return -EIO;
2714     }
2715 
2716     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2717                                    nb_sectors * BDRV_SECTOR_SIZE);
2718 }
2719 
2720 typedef struct RwCo {
2721     BlockDriverState *bs;
2722     int64_t offset;
2723     QEMUIOVector *qiov;
2724     bool is_write;
2725     int ret;
2726     BdrvRequestFlags flags;
2727 } RwCo;
2728 
2729 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2730 {
2731     RwCo *rwco = opaque;
2732 
2733     if (!rwco->is_write) {
2734         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2735                                       rwco->qiov->size, rwco->qiov,
2736                                       rwco->flags);
2737     } else {
2738         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2739                                        rwco->qiov->size, rwco->qiov,
2740                                        rwco->flags);
2741     }
2742 }
2743 
2744 /*
2745  * Process a vectored synchronous request using coroutines
2746  */
2747 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2748                         QEMUIOVector *qiov, bool is_write,
2749                         BdrvRequestFlags flags)
2750 {
2751     Coroutine *co;
2752     RwCo rwco = {
2753         .bs = bs,
2754         .offset = offset,
2755         .qiov = qiov,
2756         .is_write = is_write,
2757         .ret = NOT_DONE,
2758         .flags = flags,
2759     };
2760 
2761     /**
2762      * In sync call context, when the vcpu is blocked, this throttling timer
2763      * will not fire; so the I/O throttling function has to be disabled here
2764      * if it has been enabled.
2765      */
2766     if (bs->io_limits_enabled) {
2767         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2768                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2769         bdrv_io_limits_disable(bs);
2770     }
2771 
2772     if (qemu_in_coroutine()) {
2773         /* Fast-path if already in coroutine context */
2774         bdrv_rw_co_entry(&rwco);
2775     } else {
2776         AioContext *aio_context = bdrv_get_aio_context(bs);
2777 
2778         co = qemu_coroutine_create(bdrv_rw_co_entry);
2779         qemu_coroutine_enter(co, &rwco);
2780         while (rwco.ret == NOT_DONE) {
2781             aio_poll(aio_context, true);
2782         }
2783     }
2784     return rwco.ret;
2785 }
2786 
2787 /*
2788  * Process a synchronous request using coroutines
2789  */
2790 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2791                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2792 {
2793     QEMUIOVector qiov;
2794     struct iovec iov = {
2795         .iov_base = (void *)buf,
2796         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2797     };
2798 
2799     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2800         return -EINVAL;
2801     }
2802 
2803     qemu_iovec_init_external(&qiov, &iov, 1);
2804     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2805                         &qiov, is_write, flags);
2806 }
2807 
2808 /* return < 0 if error. See bdrv_write() for the return codes */
2809 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2810               uint8_t *buf, int nb_sectors)
2811 {
2812     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2813 }
2814 
2815 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2816 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2817                           uint8_t *buf, int nb_sectors)
2818 {
2819     bool enabled;
2820     int ret;
2821 
2822     enabled = bs->io_limits_enabled;
2823     bs->io_limits_enabled = false;
2824     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2825     bs->io_limits_enabled = enabled;
2826     return ret;
2827 }
2828 
2829 /* Return < 0 if error. Important errors are:
2830   -EIO         generic I/O error (may happen for all errors)
2831   -ENOMEDIUM   No media inserted.
2832   -EINVAL      Invalid sector number or nb_sectors
2833   -EACCES      Trying to write a read-only device
2834 */
2835 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2836                const uint8_t *buf, int nb_sectors)
2837 {
2838     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2839 }
2840 
2841 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2842                       int nb_sectors, BdrvRequestFlags flags)
2843 {
2844     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2845                       BDRV_REQ_ZERO_WRITE | flags);
2846 }
2847 
2848 /*
2849  * Completely zero out a block device with the help of bdrv_write_zeroes.
2850  * The operation is sped up by checking the block status and only writing
2851  * zeroes to the device if they currently do not return zeroes. Optional
2852  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2853  *
2854  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2855  */
2856 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2857 {
2858     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2859     int n;
2860 
2861     target_sectors = bdrv_nb_sectors(bs);
2862     if (target_sectors < 0) {
2863         return target_sectors;
2864     }
2865 
2866     for (;;) {
2867         nb_sectors = target_sectors - sector_num;
2868         if (nb_sectors <= 0) {
2869             return 0;
2870         }
2871         if (nb_sectors > INT_MAX) {
2872             nb_sectors = INT_MAX;
2873         }
2874         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2875         if (ret < 0) {
2876             error_report("error getting block status at sector %" PRId64 ": %s",
2877                          sector_num, strerror(-ret));
2878             return ret;
2879         }
2880         if (ret & BDRV_BLOCK_ZERO) {
2881             sector_num += n;
2882             continue;
2883         }
2884         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2885         if (ret < 0) {
2886             error_report("error writing zeroes at sector %" PRId64 ": %s",
2887                          sector_num, strerror(-ret));
2888             return ret;
2889         }
2890         sector_num += n;
2891     }
2892 }
2893 
2894 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2895 {
2896     QEMUIOVector qiov;
2897     struct iovec iov = {
2898         .iov_base = (void *)buf,
2899         .iov_len = bytes,
2900     };
2901     int ret;
2902 
2903     if (bytes < 0) {
2904         return -EINVAL;
2905     }
2906 
2907     qemu_iovec_init_external(&qiov, &iov, 1);
2908     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2909     if (ret < 0) {
2910         return ret;
2911     }
2912 
2913     return bytes;
2914 }
2915 
2916 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2917 {
2918     int ret;
2919 
2920     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2921     if (ret < 0) {
2922         return ret;
2923     }
2924 
2925     return qiov->size;
2926 }
2927 
2928 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2929                 const void *buf, int bytes)
2930 {
2931     QEMUIOVector qiov;
2932     struct iovec iov = {
2933         .iov_base   = (void *) buf,
2934         .iov_len    = bytes,
2935     };
2936 
2937     if (bytes < 0) {
2938         return -EINVAL;
2939     }
2940 
2941     qemu_iovec_init_external(&qiov, &iov, 1);
2942     return bdrv_pwritev(bs, offset, &qiov);
2943 }
2944 
2945 /*
2946  * Writes to the file and ensures that no writes are reordered across this
2947  * request (acts as a barrier)
2948  *
2949  * Returns 0 on success, -errno in error cases.
2950  */
2951 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2952     const void *buf, int count)
2953 {
2954     int ret;
2955 
2956     ret = bdrv_pwrite(bs, offset, buf, count);
2957     if (ret < 0) {
2958         return ret;
2959     }
2960 
2961     /* No flush needed for cache modes that already do it */
2962     if (bs->enable_write_cache) {
2963         bdrv_flush(bs);
2964     }
2965 
2966     return 0;
2967 }
2968 
2969 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2970         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2971 {
2972     /* Perform I/O through a temporary buffer so that users who scribble over
2973      * their read buffer while the operation is in progress do not end up
2974      * modifying the image file.  This is critical for zero-copy guest I/O
2975      * where anything might happen inside guest memory.
2976      */
2977     void *bounce_buffer;
2978 
2979     BlockDriver *drv = bs->drv;
2980     struct iovec iov;
2981     QEMUIOVector bounce_qiov;
2982     int64_t cluster_sector_num;
2983     int cluster_nb_sectors;
2984     size_t skip_bytes;
2985     int ret;
2986 
2987     /* Cover entire cluster so no additional backing file I/O is required when
2988      * allocating cluster in the image file.
2989      */
2990     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2991                            &cluster_sector_num, &cluster_nb_sectors);
2992 
2993     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2994                                    cluster_sector_num, cluster_nb_sectors);
2995 
2996     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2997     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2998     if (bounce_buffer == NULL) {
2999         ret = -ENOMEM;
3000         goto err;
3001     }
3002 
3003     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3004 
3005     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3006                              &bounce_qiov);
3007     if (ret < 0) {
3008         goto err;
3009     }
3010 
3011     if (drv->bdrv_co_write_zeroes &&
3012         buffer_is_zero(bounce_buffer, iov.iov_len)) {
3013         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3014                                       cluster_nb_sectors, 0);
3015     } else {
3016         /* This does not change the data on the disk, it is not necessary
3017          * to flush even in cache=writethrough mode.
3018          */
3019         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3020                                   &bounce_qiov);
3021     }
3022 
3023     if (ret < 0) {
3024         /* It might be okay to ignore write errors for guest requests.  If this
3025          * is a deliberate copy-on-read then we don't want to ignore the error.
3026          * Simply report it in all cases.
3027          */
3028         goto err;
3029     }
3030 
3031     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3032     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3033                         nb_sectors * BDRV_SECTOR_SIZE);
3034 
3035 err:
3036     qemu_vfree(bounce_buffer);
3037     return ret;
3038 }
3039 
3040 /*
3041  * Forwards an already correctly aligned request to the BlockDriver. This
3042  * handles copy on read and zeroing after EOF; any other features must be
3043  * implemented by the caller.
3044  */
3045 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3046     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3047     int64_t align, QEMUIOVector *qiov, int flags)
3048 {
3049     BlockDriver *drv = bs->drv;
3050     int ret;
3051 
3052     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3053     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3054 
3055     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3056     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3057     assert(!qiov || bytes == qiov->size);
3058 
3059     /* Handle Copy on Read and associated serialisation */
3060     if (flags & BDRV_REQ_COPY_ON_READ) {
3061         /* If we touch the same cluster it counts as an overlap.  This
3062          * guarantees that allocating writes will be serialized and not race
3063          * with each other for the same cluster.  For example, in copy-on-read
3064          * it ensures that the CoR read and write operations are atomic and
3065          * guest writes cannot interleave between them. */
3066         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3067     }
3068 
3069     wait_serialising_requests(req);
3070 
3071     if (flags & BDRV_REQ_COPY_ON_READ) {
3072         int pnum;
3073 
3074         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3075         if (ret < 0) {
3076             goto out;
3077         }
3078 
3079         if (!ret || pnum != nb_sectors) {
3080             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3081             goto out;
3082         }
3083     }
3084 
3085     /* Forward the request to the BlockDriver */
3086     if (!(bs->zero_beyond_eof && bs->growable)) {
3087         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3088     } else {
3089         /* Read zeros after EOF of growable BDSes */
3090         int64_t total_sectors, max_nb_sectors;
3091 
3092         total_sectors = bdrv_nb_sectors(bs);
3093         if (total_sectors < 0) {
3094             ret = total_sectors;
3095             goto out;
3096         }
3097 
3098         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3099                                   align >> BDRV_SECTOR_BITS);
3100         if (max_nb_sectors > 0) {
3101             QEMUIOVector local_qiov;
3102             size_t local_sectors;
3103 
3104             max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_BITS);
3105             local_sectors = MIN(max_nb_sectors, nb_sectors);
3106 
3107             qemu_iovec_init(&local_qiov, qiov->niov);
3108             qemu_iovec_concat(&local_qiov, qiov, 0,
3109                               local_sectors * BDRV_SECTOR_SIZE);
3110 
3111             ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3112                                      &local_qiov);
3113 
3114             qemu_iovec_destroy(&local_qiov);
3115         } else {
3116             ret = 0;
3117         }
3118 
3119         /* Reading beyond end of file is supposed to produce zeroes */
3120         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3121             uint64_t offset = MAX(0, total_sectors - sector_num);
3122             uint64_t bytes = (sector_num + nb_sectors - offset) *
3123                               BDRV_SECTOR_SIZE;
3124             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3125         }
3126     }
3127 
3128 out:
3129     return ret;
3130 }
3131 
3132 /*
3133  * Handle a read request in coroutine context
3134  */
3135 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3136     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3137     BdrvRequestFlags flags)
3138 {
3139     BlockDriver *drv = bs->drv;
3140     BdrvTrackedRequest req;
3141 
3142     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3143     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3144     uint8_t *head_buf = NULL;
3145     uint8_t *tail_buf = NULL;
3146     QEMUIOVector local_qiov;
3147     bool use_local_qiov = false;
3148     int ret;
3149 
3150     if (!drv) {
3151         return -ENOMEDIUM;
3152     }
3153     if (bdrv_check_byte_request(bs, offset, bytes)) {
3154         return -EIO;
3155     }
3156 
3157     if (bs->copy_on_read) {
3158         flags |= BDRV_REQ_COPY_ON_READ;
3159     }
3160 
3161     /* throttling disk I/O */
3162     if (bs->io_limits_enabled) {
3163         bdrv_io_limits_intercept(bs, bytes, false);
3164     }
3165 
3166     /* Align read if necessary by padding qiov */
3167     if (offset & (align - 1)) {
3168         head_buf = qemu_blockalign(bs, align);
3169         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3170         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3171         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3172         use_local_qiov = true;
3173 
3174         bytes += offset & (align - 1);
3175         offset = offset & ~(align - 1);
3176     }
3177 
3178     if ((offset + bytes) & (align - 1)) {
3179         if (!use_local_qiov) {
3180             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3181             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3182             use_local_qiov = true;
3183         }
3184         tail_buf = qemu_blockalign(bs, align);
3185         qemu_iovec_add(&local_qiov, tail_buf,
3186                        align - ((offset + bytes) & (align - 1)));
3187 
3188         bytes = ROUND_UP(bytes, align);
3189     }
3190 
3191     tracked_request_begin(&req, bs, offset, bytes, false);
3192     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3193                               use_local_qiov ? &local_qiov : qiov,
3194                               flags);
3195     tracked_request_end(&req);
3196 
3197     if (use_local_qiov) {
3198         qemu_iovec_destroy(&local_qiov);
3199         qemu_vfree(head_buf);
3200         qemu_vfree(tail_buf);
3201     }
3202 
3203     return ret;
3204 }
3205 
3206 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3207     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3208     BdrvRequestFlags flags)
3209 {
3210     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3211         return -EINVAL;
3212     }
3213 
3214     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3215                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3216 }
3217 
3218 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3219     int nb_sectors, QEMUIOVector *qiov)
3220 {
3221     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3222 
3223     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3224 }
3225 
3226 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3227     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3228 {
3229     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3230 
3231     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3232                             BDRV_REQ_COPY_ON_READ);
3233 }
3234 
3235 /* if no limit is specified in the BlockLimits use a default
3236  * of 32768 512-byte sectors (16 MiB) per request.
3237  */
3238 #define MAX_WRITE_ZEROES_DEFAULT 32768
3239 
3240 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3241     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3242 {
3243     BlockDriver *drv = bs->drv;
3244     QEMUIOVector qiov;
3245     struct iovec iov = {0};
3246     int ret = 0;
3247 
3248     int max_write_zeroes = bs->bl.max_write_zeroes ?
3249                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3250 
3251     while (nb_sectors > 0 && !ret) {
3252         int num = nb_sectors;
3253 
3254         /* Align request.  Block drivers can expect the "bulk" of the request
3255          * to be aligned.
3256          */
3257         if (bs->bl.write_zeroes_alignment
3258             && num > bs->bl.write_zeroes_alignment) {
3259             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3260                 /* Make a small request up to the first aligned sector.  */
3261                 num = bs->bl.write_zeroes_alignment;
3262                 num -= sector_num % bs->bl.write_zeroes_alignment;
3263             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3264                 /* Shorten the request to the last aligned sector.  num cannot
3265                  * underflow because num > bs->bl.write_zeroes_alignment.
3266                  */
3267                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3268             }
3269         }
3270 
3271         /* limit request size */
3272         if (num > max_write_zeroes) {
3273             num = max_write_zeroes;
3274         }
3275 
3276         ret = -ENOTSUP;
3277         /* First try the efficient write zeroes operation */
3278         if (drv->bdrv_co_write_zeroes) {
3279             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3280         }
3281 
3282         if (ret == -ENOTSUP) {
3283             /* Fall back to bounce buffer if write zeroes is unsupported */
3284             iov.iov_len = num * BDRV_SECTOR_SIZE;
3285             if (iov.iov_base == NULL) {
3286                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3287                 if (iov.iov_base == NULL) {
3288                     ret = -ENOMEM;
3289                     goto fail;
3290                 }
3291                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3292             }
3293             qemu_iovec_init_external(&qiov, &iov, 1);
3294 
3295             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3296 
3297             /* Keep bounce buffer around if it is big enough for all
3298              * all future requests.
3299              */
3300             if (num < max_write_zeroes) {
3301                 qemu_vfree(iov.iov_base);
3302                 iov.iov_base = NULL;
3303             }
3304         }
3305 
3306         sector_num += num;
3307         nb_sectors -= num;
3308     }
3309 
3310 fail:
3311     qemu_vfree(iov.iov_base);
3312     return ret;
3313 }
3314 
3315 /*
3316  * Forwards an already correctly aligned write request to the BlockDriver.
3317  */
3318 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3319     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3320     QEMUIOVector *qiov, int flags)
3321 {
3322     BlockDriver *drv = bs->drv;
3323     bool waited;
3324     int ret;
3325 
3326     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3327     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3328 
3329     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3330     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3331     assert(!qiov || bytes == qiov->size);
3332 
3333     waited = wait_serialising_requests(req);
3334     assert(!waited || !req->serialising);
3335     assert(req->overlap_offset <= offset);
3336     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3337 
3338     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3339 
3340     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3341         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3342         qemu_iovec_is_zero(qiov)) {
3343         flags |= BDRV_REQ_ZERO_WRITE;
3344         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3345             flags |= BDRV_REQ_MAY_UNMAP;
3346         }
3347     }
3348 
3349     if (ret < 0) {
3350         /* Do nothing, write notifier decided to fail this request */
3351     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3352         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3353         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3354     } else {
3355         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3356         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3357     }
3358     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3359 
3360     if (ret == 0 && !bs->enable_write_cache) {
3361         ret = bdrv_co_flush(bs);
3362     }
3363 
3364     bdrv_set_dirty(bs, sector_num, nb_sectors);
3365 
3366     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3367         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3368     }
3369     if (bs->growable && ret >= 0) {
3370         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3371     }
3372 
3373     return ret;
3374 }
3375 
3376 /*
3377  * Handle a write request in coroutine context
3378  */
3379 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3380     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3381     BdrvRequestFlags flags)
3382 {
3383     BdrvTrackedRequest req;
3384     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3385     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3386     uint8_t *head_buf = NULL;
3387     uint8_t *tail_buf = NULL;
3388     QEMUIOVector local_qiov;
3389     bool use_local_qiov = false;
3390     int ret;
3391 
3392     if (!bs->drv) {
3393         return -ENOMEDIUM;
3394     }
3395     if (bs->read_only) {
3396         return -EACCES;
3397     }
3398     if (bdrv_check_byte_request(bs, offset, bytes)) {
3399         return -EIO;
3400     }
3401 
3402     /* throttling disk I/O */
3403     if (bs->io_limits_enabled) {
3404         bdrv_io_limits_intercept(bs, bytes, true);
3405     }
3406 
3407     /*
3408      * Align write if necessary by performing a read-modify-write cycle.
3409      * Pad qiov with the read parts and be sure to have a tracked request not
3410      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3411      */
3412     tracked_request_begin(&req, bs, offset, bytes, true);
3413 
3414     if (offset & (align - 1)) {
3415         QEMUIOVector head_qiov;
3416         struct iovec head_iov;
3417 
3418         mark_request_serialising(&req, align);
3419         wait_serialising_requests(&req);
3420 
3421         head_buf = qemu_blockalign(bs, align);
3422         head_iov = (struct iovec) {
3423             .iov_base   = head_buf,
3424             .iov_len    = align,
3425         };
3426         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3427 
3428         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3429         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3430                                   align, &head_qiov, 0);
3431         if (ret < 0) {
3432             goto fail;
3433         }
3434         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3435 
3436         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3437         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3438         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3439         use_local_qiov = true;
3440 
3441         bytes += offset & (align - 1);
3442         offset = offset & ~(align - 1);
3443     }
3444 
3445     if ((offset + bytes) & (align - 1)) {
3446         QEMUIOVector tail_qiov;
3447         struct iovec tail_iov;
3448         size_t tail_bytes;
3449         bool waited;
3450 
3451         mark_request_serialising(&req, align);
3452         waited = wait_serialising_requests(&req);
3453         assert(!waited || !use_local_qiov);
3454 
3455         tail_buf = qemu_blockalign(bs, align);
3456         tail_iov = (struct iovec) {
3457             .iov_base   = tail_buf,
3458             .iov_len    = align,
3459         };
3460         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3461 
3462         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3463         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3464                                   align, &tail_qiov, 0);
3465         if (ret < 0) {
3466             goto fail;
3467         }
3468         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3469 
3470         if (!use_local_qiov) {
3471             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3472             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3473             use_local_qiov = true;
3474         }
3475 
3476         tail_bytes = (offset + bytes) & (align - 1);
3477         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3478 
3479         bytes = ROUND_UP(bytes, align);
3480     }
3481 
3482     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3483                                use_local_qiov ? &local_qiov : qiov,
3484                                flags);
3485 
3486 fail:
3487     tracked_request_end(&req);
3488 
3489     if (use_local_qiov) {
3490         qemu_iovec_destroy(&local_qiov);
3491     }
3492     qemu_vfree(head_buf);
3493     qemu_vfree(tail_buf);
3494 
3495     return ret;
3496 }
3497 
3498 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3499     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3500     BdrvRequestFlags flags)
3501 {
3502     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3503         return -EINVAL;
3504     }
3505 
3506     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3507                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3508 }
3509 
3510 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3511     int nb_sectors, QEMUIOVector *qiov)
3512 {
3513     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3514 
3515     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3516 }
3517 
3518 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3519                                       int64_t sector_num, int nb_sectors,
3520                                       BdrvRequestFlags flags)
3521 {
3522     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3523 
3524     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3525         flags &= ~BDRV_REQ_MAY_UNMAP;
3526     }
3527 
3528     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3529                              BDRV_REQ_ZERO_WRITE | flags);
3530 }
3531 
3532 /**
3533  * Truncate file to 'offset' bytes (needed only for file protocols)
3534  */
3535 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3536 {
3537     BlockDriver *drv = bs->drv;
3538     int ret;
3539     if (!drv)
3540         return -ENOMEDIUM;
3541     if (!drv->bdrv_truncate)
3542         return -ENOTSUP;
3543     if (bs->read_only)
3544         return -EACCES;
3545 
3546     ret = drv->bdrv_truncate(bs, offset);
3547     if (ret == 0) {
3548         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3549         bdrv_dev_resize_cb(bs);
3550     }
3551     return ret;
3552 }
3553 
3554 /**
3555  * Length of a allocated file in bytes. Sparse files are counted by actual
3556  * allocated space. Return < 0 if error or unknown.
3557  */
3558 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3559 {
3560     BlockDriver *drv = bs->drv;
3561     if (!drv) {
3562         return -ENOMEDIUM;
3563     }
3564     if (drv->bdrv_get_allocated_file_size) {
3565         return drv->bdrv_get_allocated_file_size(bs);
3566     }
3567     if (bs->file) {
3568         return bdrv_get_allocated_file_size(bs->file);
3569     }
3570     return -ENOTSUP;
3571 }
3572 
3573 /**
3574  * Return number of sectors on success, -errno on error.
3575  */
3576 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3577 {
3578     BlockDriver *drv = bs->drv;
3579 
3580     if (!drv)
3581         return -ENOMEDIUM;
3582 
3583     if (drv->has_variable_length) {
3584         int ret = refresh_total_sectors(bs, bs->total_sectors);
3585         if (ret < 0) {
3586             return ret;
3587         }
3588     }
3589     return bs->total_sectors;
3590 }
3591 
3592 /**
3593  * Return length in bytes on success, -errno on error.
3594  * The length is always a multiple of BDRV_SECTOR_SIZE.
3595  */
3596 int64_t bdrv_getlength(BlockDriverState *bs)
3597 {
3598     int64_t ret = bdrv_nb_sectors(bs);
3599 
3600     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3601 }
3602 
3603 /* return 0 as number of sectors if no device present or error */
3604 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3605 {
3606     int64_t nb_sectors = bdrv_nb_sectors(bs);
3607 
3608     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3609 }
3610 
3611 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3612                        BlockdevOnError on_write_error)
3613 {
3614     bs->on_read_error = on_read_error;
3615     bs->on_write_error = on_write_error;
3616 }
3617 
3618 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3619 {
3620     return is_read ? bs->on_read_error : bs->on_write_error;
3621 }
3622 
3623 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3624 {
3625     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3626 
3627     switch (on_err) {
3628     case BLOCKDEV_ON_ERROR_ENOSPC:
3629         return (error == ENOSPC) ?
3630                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3631     case BLOCKDEV_ON_ERROR_STOP:
3632         return BLOCK_ERROR_ACTION_STOP;
3633     case BLOCKDEV_ON_ERROR_REPORT:
3634         return BLOCK_ERROR_ACTION_REPORT;
3635     case BLOCKDEV_ON_ERROR_IGNORE:
3636         return BLOCK_ERROR_ACTION_IGNORE;
3637     default:
3638         abort();
3639     }
3640 }
3641 
3642 static void send_qmp_error_event(BlockDriverState *bs,
3643                                  BlockErrorAction action,
3644                                  bool is_read, int error)
3645 {
3646     BlockErrorAction ac;
3647 
3648     ac = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3649     qapi_event_send_block_io_error(bdrv_get_device_name(bs), ac, action,
3650                                    bdrv_iostatus_is_enabled(bs),
3651                                    error == ENOSPC, &error_abort);
3652 }
3653 
3654 /* This is done by device models because, while the block layer knows
3655  * about the error, it does not know whether an operation comes from
3656  * the device or the block layer (from a job, for example).
3657  */
3658 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3659                        bool is_read, int error)
3660 {
3661     assert(error >= 0);
3662 
3663     if (action == BLOCK_ERROR_ACTION_STOP) {
3664         /* First set the iostatus, so that "info block" returns an iostatus
3665          * that matches the events raised so far (an additional error iostatus
3666          * is fine, but not a lost one).
3667          */
3668         bdrv_iostatus_set_err(bs, error);
3669 
3670         /* Then raise the request to stop the VM and the event.
3671          * qemu_system_vmstop_request_prepare has two effects.  First,
3672          * it ensures that the STOP event always comes after the
3673          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3674          * can observe the STOP event and do a "cont" before the STOP
3675          * event is issued, the VM will not stop.  In this case, vm_start()
3676          * also ensures that the STOP/RESUME pair of events is emitted.
3677          */
3678         qemu_system_vmstop_request_prepare();
3679         send_qmp_error_event(bs, action, is_read, error);
3680         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3681     } else {
3682         send_qmp_error_event(bs, action, is_read, error);
3683     }
3684 }
3685 
3686 int bdrv_is_read_only(BlockDriverState *bs)
3687 {
3688     return bs->read_only;
3689 }
3690 
3691 int bdrv_is_sg(BlockDriverState *bs)
3692 {
3693     return bs->sg;
3694 }
3695 
3696 int bdrv_enable_write_cache(BlockDriverState *bs)
3697 {
3698     return bs->enable_write_cache;
3699 }
3700 
3701 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3702 {
3703     bs->enable_write_cache = wce;
3704 
3705     /* so a reopen() will preserve wce */
3706     if (wce) {
3707         bs->open_flags |= BDRV_O_CACHE_WB;
3708     } else {
3709         bs->open_flags &= ~BDRV_O_CACHE_WB;
3710     }
3711 }
3712 
3713 int bdrv_is_encrypted(BlockDriverState *bs)
3714 {
3715     if (bs->backing_hd && bs->backing_hd->encrypted)
3716         return 1;
3717     return bs->encrypted;
3718 }
3719 
3720 int bdrv_key_required(BlockDriverState *bs)
3721 {
3722     BlockDriverState *backing_hd = bs->backing_hd;
3723 
3724     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3725         return 1;
3726     return (bs->encrypted && !bs->valid_key);
3727 }
3728 
3729 int bdrv_set_key(BlockDriverState *bs, const char *key)
3730 {
3731     int ret;
3732     if (bs->backing_hd && bs->backing_hd->encrypted) {
3733         ret = bdrv_set_key(bs->backing_hd, key);
3734         if (ret < 0)
3735             return ret;
3736         if (!bs->encrypted)
3737             return 0;
3738     }
3739     if (!bs->encrypted) {
3740         return -EINVAL;
3741     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3742         return -ENOMEDIUM;
3743     }
3744     ret = bs->drv->bdrv_set_key(bs, key);
3745     if (ret < 0) {
3746         bs->valid_key = 0;
3747     } else if (!bs->valid_key) {
3748         bs->valid_key = 1;
3749         /* call the change callback now, we skipped it on open */
3750         bdrv_dev_change_media_cb(bs, true);
3751     }
3752     return ret;
3753 }
3754 
3755 const char *bdrv_get_format_name(BlockDriverState *bs)
3756 {
3757     return bs->drv ? bs->drv->format_name : NULL;
3758 }
3759 
3760 static int qsort_strcmp(const void *a, const void *b)
3761 {
3762     return strcmp(a, b);
3763 }
3764 
3765 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3766                          void *opaque)
3767 {
3768     BlockDriver *drv;
3769     int count = 0;
3770     int i;
3771     const char **formats = NULL;
3772 
3773     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3774         if (drv->format_name) {
3775             bool found = false;
3776             int i = count;
3777             while (formats && i && !found) {
3778                 found = !strcmp(formats[--i], drv->format_name);
3779             }
3780 
3781             if (!found) {
3782                 formats = g_renew(const char *, formats, count + 1);
3783                 formats[count++] = drv->format_name;
3784             }
3785         }
3786     }
3787 
3788     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3789 
3790     for (i = 0; i < count; i++) {
3791         it(opaque, formats[i]);
3792     }
3793 
3794     g_free(formats);
3795 }
3796 
3797 /* This function is to find block backend bs */
3798 BlockDriverState *bdrv_find(const char *name)
3799 {
3800     BlockDriverState *bs;
3801 
3802     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3803         if (!strcmp(name, bs->device_name)) {
3804             return bs;
3805         }
3806     }
3807     return NULL;
3808 }
3809 
3810 /* This function is to find a node in the bs graph */
3811 BlockDriverState *bdrv_find_node(const char *node_name)
3812 {
3813     BlockDriverState *bs;
3814 
3815     assert(node_name);
3816 
3817     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3818         if (!strcmp(node_name, bs->node_name)) {
3819             return bs;
3820         }
3821     }
3822     return NULL;
3823 }
3824 
3825 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3826 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3827 {
3828     BlockDeviceInfoList *list, *entry;
3829     BlockDriverState *bs;
3830 
3831     list = NULL;
3832     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3833         entry = g_malloc0(sizeof(*entry));
3834         entry->value = bdrv_block_device_info(bs);
3835         entry->next = list;
3836         list = entry;
3837     }
3838 
3839     return list;
3840 }
3841 
3842 BlockDriverState *bdrv_lookup_bs(const char *device,
3843                                  const char *node_name,
3844                                  Error **errp)
3845 {
3846     BlockDriverState *bs = NULL;
3847 
3848     if (device) {
3849         bs = bdrv_find(device);
3850 
3851         if (bs) {
3852             return bs;
3853         }
3854     }
3855 
3856     if (node_name) {
3857         bs = bdrv_find_node(node_name);
3858 
3859         if (bs) {
3860             return bs;
3861         }
3862     }
3863 
3864     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3865                      device ? device : "",
3866                      node_name ? node_name : "");
3867     return NULL;
3868 }
3869 
3870 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3871  * return false.  If either argument is NULL, return false. */
3872 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3873 {
3874     while (top && top != base) {
3875         top = top->backing_hd;
3876     }
3877 
3878     return top != NULL;
3879 }
3880 
3881 BlockDriverState *bdrv_next(BlockDriverState *bs)
3882 {
3883     if (!bs) {
3884         return QTAILQ_FIRST(&bdrv_states);
3885     }
3886     return QTAILQ_NEXT(bs, device_list);
3887 }
3888 
3889 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3890 {
3891     BlockDriverState *bs;
3892 
3893     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3894         it(opaque, bs);
3895     }
3896 }
3897 
3898 const char *bdrv_get_device_name(BlockDriverState *bs)
3899 {
3900     return bs->device_name;
3901 }
3902 
3903 int bdrv_get_flags(BlockDriverState *bs)
3904 {
3905     return bs->open_flags;
3906 }
3907 
3908 int bdrv_flush_all(void)
3909 {
3910     BlockDriverState *bs;
3911     int result = 0;
3912 
3913     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3914         AioContext *aio_context = bdrv_get_aio_context(bs);
3915         int ret;
3916 
3917         aio_context_acquire(aio_context);
3918         ret = bdrv_flush(bs);
3919         if (ret < 0 && !result) {
3920             result = ret;
3921         }
3922         aio_context_release(aio_context);
3923     }
3924 
3925     return result;
3926 }
3927 
3928 int bdrv_has_zero_init_1(BlockDriverState *bs)
3929 {
3930     return 1;
3931 }
3932 
3933 int bdrv_has_zero_init(BlockDriverState *bs)
3934 {
3935     assert(bs->drv);
3936 
3937     /* If BS is a copy on write image, it is initialized to
3938        the contents of the base image, which may not be zeroes.  */
3939     if (bs->backing_hd) {
3940         return 0;
3941     }
3942     if (bs->drv->bdrv_has_zero_init) {
3943         return bs->drv->bdrv_has_zero_init(bs);
3944     }
3945 
3946     /* safe default */
3947     return 0;
3948 }
3949 
3950 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3951 {
3952     BlockDriverInfo bdi;
3953 
3954     if (bs->backing_hd) {
3955         return false;
3956     }
3957 
3958     if (bdrv_get_info(bs, &bdi) == 0) {
3959         return bdi.unallocated_blocks_are_zero;
3960     }
3961 
3962     return false;
3963 }
3964 
3965 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3966 {
3967     BlockDriverInfo bdi;
3968 
3969     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3970         return false;
3971     }
3972 
3973     if (bdrv_get_info(bs, &bdi) == 0) {
3974         return bdi.can_write_zeroes_with_unmap;
3975     }
3976 
3977     return false;
3978 }
3979 
3980 typedef struct BdrvCoGetBlockStatusData {
3981     BlockDriverState *bs;
3982     BlockDriverState *base;
3983     int64_t sector_num;
3984     int nb_sectors;
3985     int *pnum;
3986     int64_t ret;
3987     bool done;
3988 } BdrvCoGetBlockStatusData;
3989 
3990 /*
3991  * Returns true iff the specified sector is present in the disk image. Drivers
3992  * not implementing the functionality are assumed to not support backing files,
3993  * hence all their sectors are reported as allocated.
3994  *
3995  * If 'sector_num' is beyond the end of the disk image the return value is 0
3996  * and 'pnum' is set to 0.
3997  *
3998  * 'pnum' is set to the number of sectors (including and immediately following
3999  * the specified sector) that are known to be in the same
4000  * allocated/unallocated state.
4001  *
4002  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
4003  * beyond the end of the disk image it will be clamped.
4004  */
4005 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4006                                                      int64_t sector_num,
4007                                                      int nb_sectors, int *pnum)
4008 {
4009     int64_t total_sectors;
4010     int64_t n;
4011     int64_t ret, ret2;
4012 
4013     total_sectors = bdrv_nb_sectors(bs);
4014     if (total_sectors < 0) {
4015         return total_sectors;
4016     }
4017 
4018     if (sector_num >= total_sectors) {
4019         *pnum = 0;
4020         return 0;
4021     }
4022 
4023     n = total_sectors - sector_num;
4024     if (n < nb_sectors) {
4025         nb_sectors = n;
4026     }
4027 
4028     if (!bs->drv->bdrv_co_get_block_status) {
4029         *pnum = nb_sectors;
4030         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4031         if (bs->drv->protocol_name) {
4032             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4033         }
4034         return ret;
4035     }
4036 
4037     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4038     if (ret < 0) {
4039         *pnum = 0;
4040         return ret;
4041     }
4042 
4043     if (ret & BDRV_BLOCK_RAW) {
4044         assert(ret & BDRV_BLOCK_OFFSET_VALID);
4045         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4046                                      *pnum, pnum);
4047     }
4048 
4049     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4050         ret |= BDRV_BLOCK_ALLOCATED;
4051     }
4052 
4053     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4054         if (bdrv_unallocated_blocks_are_zero(bs)) {
4055             ret |= BDRV_BLOCK_ZERO;
4056         } else if (bs->backing_hd) {
4057             BlockDriverState *bs2 = bs->backing_hd;
4058             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4059             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4060                 ret |= BDRV_BLOCK_ZERO;
4061             }
4062         }
4063     }
4064 
4065     if (bs->file &&
4066         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4067         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4068         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4069                                         *pnum, pnum);
4070         if (ret2 >= 0) {
4071             /* Ignore errors.  This is just providing extra information, it
4072              * is useful but not necessary.
4073              */
4074             ret |= (ret2 & BDRV_BLOCK_ZERO);
4075         }
4076     }
4077 
4078     return ret;
4079 }
4080 
4081 /* Coroutine wrapper for bdrv_get_block_status() */
4082 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4083 {
4084     BdrvCoGetBlockStatusData *data = opaque;
4085     BlockDriverState *bs = data->bs;
4086 
4087     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4088                                          data->pnum);
4089     data->done = true;
4090 }
4091 
4092 /*
4093  * Synchronous wrapper around bdrv_co_get_block_status().
4094  *
4095  * See bdrv_co_get_block_status() for details.
4096  */
4097 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4098                               int nb_sectors, int *pnum)
4099 {
4100     Coroutine *co;
4101     BdrvCoGetBlockStatusData data = {
4102         .bs = bs,
4103         .sector_num = sector_num,
4104         .nb_sectors = nb_sectors,
4105         .pnum = pnum,
4106         .done = false,
4107     };
4108 
4109     if (qemu_in_coroutine()) {
4110         /* Fast-path if already in coroutine context */
4111         bdrv_get_block_status_co_entry(&data);
4112     } else {
4113         AioContext *aio_context = bdrv_get_aio_context(bs);
4114 
4115         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4116         qemu_coroutine_enter(co, &data);
4117         while (!data.done) {
4118             aio_poll(aio_context, true);
4119         }
4120     }
4121     return data.ret;
4122 }
4123 
4124 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4125                                    int nb_sectors, int *pnum)
4126 {
4127     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4128     if (ret < 0) {
4129         return ret;
4130     }
4131     return !!(ret & BDRV_BLOCK_ALLOCATED);
4132 }
4133 
4134 /*
4135  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4136  *
4137  * Return true if the given sector is allocated in any image between
4138  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4139  * sector is allocated in any image of the chain.  Return false otherwise.
4140  *
4141  * 'pnum' is set to the number of sectors (including and immediately following
4142  *  the specified sector) that are known to be in the same
4143  *  allocated/unallocated state.
4144  *
4145  */
4146 int bdrv_is_allocated_above(BlockDriverState *top,
4147                             BlockDriverState *base,
4148                             int64_t sector_num,
4149                             int nb_sectors, int *pnum)
4150 {
4151     BlockDriverState *intermediate;
4152     int ret, n = nb_sectors;
4153 
4154     intermediate = top;
4155     while (intermediate && intermediate != base) {
4156         int pnum_inter;
4157         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4158                                 &pnum_inter);
4159         if (ret < 0) {
4160             return ret;
4161         } else if (ret) {
4162             *pnum = pnum_inter;
4163             return 1;
4164         }
4165 
4166         /*
4167          * [sector_num, nb_sectors] is unallocated on top but intermediate
4168          * might have
4169          *
4170          * [sector_num+x, nr_sectors] allocated.
4171          */
4172         if (n > pnum_inter &&
4173             (intermediate == top ||
4174              sector_num + pnum_inter < intermediate->total_sectors)) {
4175             n = pnum_inter;
4176         }
4177 
4178         intermediate = intermediate->backing_hd;
4179     }
4180 
4181     *pnum = n;
4182     return 0;
4183 }
4184 
4185 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4186 {
4187     if (bs->backing_hd && bs->backing_hd->encrypted)
4188         return bs->backing_file;
4189     else if (bs->encrypted)
4190         return bs->filename;
4191     else
4192         return NULL;
4193 }
4194 
4195 void bdrv_get_backing_filename(BlockDriverState *bs,
4196                                char *filename, int filename_size)
4197 {
4198     pstrcpy(filename, filename_size, bs->backing_file);
4199 }
4200 
4201 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4202                           const uint8_t *buf, int nb_sectors)
4203 {
4204     BlockDriver *drv = bs->drv;
4205     if (!drv)
4206         return -ENOMEDIUM;
4207     if (!drv->bdrv_write_compressed)
4208         return -ENOTSUP;
4209     if (bdrv_check_request(bs, sector_num, nb_sectors))
4210         return -EIO;
4211 
4212     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4213 
4214     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4215 }
4216 
4217 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4218 {
4219     BlockDriver *drv = bs->drv;
4220     if (!drv)
4221         return -ENOMEDIUM;
4222     if (!drv->bdrv_get_info)
4223         return -ENOTSUP;
4224     memset(bdi, 0, sizeof(*bdi));
4225     return drv->bdrv_get_info(bs, bdi);
4226 }
4227 
4228 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4229 {
4230     BlockDriver *drv = bs->drv;
4231     if (drv && drv->bdrv_get_specific_info) {
4232         return drv->bdrv_get_specific_info(bs);
4233     }
4234     return NULL;
4235 }
4236 
4237 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4238                       int64_t pos, int size)
4239 {
4240     QEMUIOVector qiov;
4241     struct iovec iov = {
4242         .iov_base   = (void *) buf,
4243         .iov_len    = size,
4244     };
4245 
4246     qemu_iovec_init_external(&qiov, &iov, 1);
4247     return bdrv_writev_vmstate(bs, &qiov, pos);
4248 }
4249 
4250 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4251 {
4252     BlockDriver *drv = bs->drv;
4253 
4254     if (!drv) {
4255         return -ENOMEDIUM;
4256     } else if (drv->bdrv_save_vmstate) {
4257         return drv->bdrv_save_vmstate(bs, qiov, pos);
4258     } else if (bs->file) {
4259         return bdrv_writev_vmstate(bs->file, qiov, pos);
4260     }
4261 
4262     return -ENOTSUP;
4263 }
4264 
4265 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4266                       int64_t pos, int size)
4267 {
4268     BlockDriver *drv = bs->drv;
4269     if (!drv)
4270         return -ENOMEDIUM;
4271     if (drv->bdrv_load_vmstate)
4272         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4273     if (bs->file)
4274         return bdrv_load_vmstate(bs->file, buf, pos, size);
4275     return -ENOTSUP;
4276 }
4277 
4278 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4279 {
4280     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4281         return;
4282     }
4283 
4284     bs->drv->bdrv_debug_event(bs, event);
4285 }
4286 
4287 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4288                           const char *tag)
4289 {
4290     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4291         bs = bs->file;
4292     }
4293 
4294     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4295         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4296     }
4297 
4298     return -ENOTSUP;
4299 }
4300 
4301 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4302 {
4303     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4304         bs = bs->file;
4305     }
4306 
4307     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4308         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4309     }
4310 
4311     return -ENOTSUP;
4312 }
4313 
4314 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4315 {
4316     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4317         bs = bs->file;
4318     }
4319 
4320     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4321         return bs->drv->bdrv_debug_resume(bs, tag);
4322     }
4323 
4324     return -ENOTSUP;
4325 }
4326 
4327 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4328 {
4329     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4330         bs = bs->file;
4331     }
4332 
4333     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4334         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4335     }
4336 
4337     return false;
4338 }
4339 
4340 int bdrv_is_snapshot(BlockDriverState *bs)
4341 {
4342     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4343 }
4344 
4345 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4346  * relative, it must be relative to the chain.  So, passing in bs->filename
4347  * from a BDS as backing_file should not be done, as that may be relative to
4348  * the CWD rather than the chain. */
4349 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4350         const char *backing_file)
4351 {
4352     char *filename_full = NULL;
4353     char *backing_file_full = NULL;
4354     char *filename_tmp = NULL;
4355     int is_protocol = 0;
4356     BlockDriverState *curr_bs = NULL;
4357     BlockDriverState *retval = NULL;
4358 
4359     if (!bs || !bs->drv || !backing_file) {
4360         return NULL;
4361     }
4362 
4363     filename_full     = g_malloc(PATH_MAX);
4364     backing_file_full = g_malloc(PATH_MAX);
4365     filename_tmp      = g_malloc(PATH_MAX);
4366 
4367     is_protocol = path_has_protocol(backing_file);
4368 
4369     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4370 
4371         /* If either of the filename paths is actually a protocol, then
4372          * compare unmodified paths; otherwise make paths relative */
4373         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4374             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4375                 retval = curr_bs->backing_hd;
4376                 break;
4377             }
4378         } else {
4379             /* If not an absolute filename path, make it relative to the current
4380              * image's filename path */
4381             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4382                          backing_file);
4383 
4384             /* We are going to compare absolute pathnames */
4385             if (!realpath(filename_tmp, filename_full)) {
4386                 continue;
4387             }
4388 
4389             /* We need to make sure the backing filename we are comparing against
4390              * is relative to the current image filename (or absolute) */
4391             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4392                          curr_bs->backing_file);
4393 
4394             if (!realpath(filename_tmp, backing_file_full)) {
4395                 continue;
4396             }
4397 
4398             if (strcmp(backing_file_full, filename_full) == 0) {
4399                 retval = curr_bs->backing_hd;
4400                 break;
4401             }
4402         }
4403     }
4404 
4405     g_free(filename_full);
4406     g_free(backing_file_full);
4407     g_free(filename_tmp);
4408     return retval;
4409 }
4410 
4411 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4412 {
4413     if (!bs->drv) {
4414         return 0;
4415     }
4416 
4417     if (!bs->backing_hd) {
4418         return 0;
4419     }
4420 
4421     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4422 }
4423 
4424 /**************************************************************/
4425 /* async I/Os */
4426 
4427 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4428                                  QEMUIOVector *qiov, int nb_sectors,
4429                                  BlockDriverCompletionFunc *cb, void *opaque)
4430 {
4431     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4432 
4433     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4434                                  cb, opaque, false);
4435 }
4436 
4437 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4438                                   QEMUIOVector *qiov, int nb_sectors,
4439                                   BlockDriverCompletionFunc *cb, void *opaque)
4440 {
4441     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4442 
4443     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4444                                  cb, opaque, true);
4445 }
4446 
4447 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4448         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4449         BlockDriverCompletionFunc *cb, void *opaque)
4450 {
4451     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4452 
4453     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4454                                  BDRV_REQ_ZERO_WRITE | flags,
4455                                  cb, opaque, true);
4456 }
4457 
4458 
4459 typedef struct MultiwriteCB {
4460     int error;
4461     int num_requests;
4462     int num_callbacks;
4463     struct {
4464         BlockDriverCompletionFunc *cb;
4465         void *opaque;
4466         QEMUIOVector *free_qiov;
4467     } callbacks[];
4468 } MultiwriteCB;
4469 
4470 static void multiwrite_user_cb(MultiwriteCB *mcb)
4471 {
4472     int i;
4473 
4474     for (i = 0; i < mcb->num_callbacks; i++) {
4475         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4476         if (mcb->callbacks[i].free_qiov) {
4477             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4478         }
4479         g_free(mcb->callbacks[i].free_qiov);
4480     }
4481 }
4482 
4483 static void multiwrite_cb(void *opaque, int ret)
4484 {
4485     MultiwriteCB *mcb = opaque;
4486 
4487     trace_multiwrite_cb(mcb, ret);
4488 
4489     if (ret < 0 && !mcb->error) {
4490         mcb->error = ret;
4491     }
4492 
4493     mcb->num_requests--;
4494     if (mcb->num_requests == 0) {
4495         multiwrite_user_cb(mcb);
4496         g_free(mcb);
4497     }
4498 }
4499 
4500 static int multiwrite_req_compare(const void *a, const void *b)
4501 {
4502     const BlockRequest *req1 = a, *req2 = b;
4503 
4504     /*
4505      * Note that we can't simply subtract req2->sector from req1->sector
4506      * here as that could overflow the return value.
4507      */
4508     if (req1->sector > req2->sector) {
4509         return 1;
4510     } else if (req1->sector < req2->sector) {
4511         return -1;
4512     } else {
4513         return 0;
4514     }
4515 }
4516 
4517 /*
4518  * Takes a bunch of requests and tries to merge them. Returns the number of
4519  * requests that remain after merging.
4520  */
4521 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4522     int num_reqs, MultiwriteCB *mcb)
4523 {
4524     int i, outidx;
4525 
4526     // Sort requests by start sector
4527     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4528 
4529     // Check if adjacent requests touch the same clusters. If so, combine them,
4530     // filling up gaps with zero sectors.
4531     outidx = 0;
4532     for (i = 1; i < num_reqs; i++) {
4533         int merge = 0;
4534         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4535 
4536         // Handle exactly sequential writes and overlapping writes.
4537         if (reqs[i].sector <= oldreq_last) {
4538             merge = 1;
4539         }
4540 
4541         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4542             merge = 0;
4543         }
4544 
4545         if (merge) {
4546             size_t size;
4547             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4548             qemu_iovec_init(qiov,
4549                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4550 
4551             // Add the first request to the merged one. If the requests are
4552             // overlapping, drop the last sectors of the first request.
4553             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4554             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4555 
4556             // We should need to add any zeros between the two requests
4557             assert (reqs[i].sector <= oldreq_last);
4558 
4559             // Add the second request
4560             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4561 
4562             // Add tail of first request, if necessary
4563             if (qiov->size < reqs[outidx].qiov->size) {
4564                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4565                                   reqs[outidx].qiov->size - qiov->size);
4566             }
4567 
4568             reqs[outidx].nb_sectors = qiov->size >> 9;
4569             reqs[outidx].qiov = qiov;
4570 
4571             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4572         } else {
4573             outidx++;
4574             reqs[outidx].sector     = reqs[i].sector;
4575             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4576             reqs[outidx].qiov       = reqs[i].qiov;
4577         }
4578     }
4579 
4580     return outidx + 1;
4581 }
4582 
4583 /*
4584  * Submit multiple AIO write requests at once.
4585  *
4586  * On success, the function returns 0 and all requests in the reqs array have
4587  * been submitted. In error case this function returns -1, and any of the
4588  * requests may or may not be submitted yet. In particular, this means that the
4589  * callback will be called for some of the requests, for others it won't. The
4590  * caller must check the error field of the BlockRequest to wait for the right
4591  * callbacks (if error != 0, no callback will be called).
4592  *
4593  * The implementation may modify the contents of the reqs array, e.g. to merge
4594  * requests. However, the fields opaque and error are left unmodified as they
4595  * are used to signal failure for a single request to the caller.
4596  */
4597 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4598 {
4599     MultiwriteCB *mcb;
4600     int i;
4601 
4602     /* don't submit writes if we don't have a medium */
4603     if (bs->drv == NULL) {
4604         for (i = 0; i < num_reqs; i++) {
4605             reqs[i].error = -ENOMEDIUM;
4606         }
4607         return -1;
4608     }
4609 
4610     if (num_reqs == 0) {
4611         return 0;
4612     }
4613 
4614     // Create MultiwriteCB structure
4615     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4616     mcb->num_requests = 0;
4617     mcb->num_callbacks = num_reqs;
4618 
4619     for (i = 0; i < num_reqs; i++) {
4620         mcb->callbacks[i].cb = reqs[i].cb;
4621         mcb->callbacks[i].opaque = reqs[i].opaque;
4622     }
4623 
4624     // Check for mergable requests
4625     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4626 
4627     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4628 
4629     /* Run the aio requests. */
4630     mcb->num_requests = num_reqs;
4631     for (i = 0; i < num_reqs; i++) {
4632         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4633                               reqs[i].nb_sectors, reqs[i].flags,
4634                               multiwrite_cb, mcb,
4635                               true);
4636     }
4637 
4638     return 0;
4639 }
4640 
4641 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4642 {
4643     acb->aiocb_info->cancel(acb);
4644 }
4645 
4646 /**************************************************************/
4647 /* async block device emulation */
4648 
4649 typedef struct BlockDriverAIOCBSync {
4650     BlockDriverAIOCB common;
4651     QEMUBH *bh;
4652     int ret;
4653     /* vector translation state */
4654     QEMUIOVector *qiov;
4655     uint8_t *bounce;
4656     int is_write;
4657 } BlockDriverAIOCBSync;
4658 
4659 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4660 {
4661     BlockDriverAIOCBSync *acb =
4662         container_of(blockacb, BlockDriverAIOCBSync, common);
4663     qemu_bh_delete(acb->bh);
4664     acb->bh = NULL;
4665     qemu_aio_release(acb);
4666 }
4667 
4668 static const AIOCBInfo bdrv_em_aiocb_info = {
4669     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4670     .cancel             = bdrv_aio_cancel_em,
4671 };
4672 
4673 static void bdrv_aio_bh_cb(void *opaque)
4674 {
4675     BlockDriverAIOCBSync *acb = opaque;
4676 
4677     if (!acb->is_write && acb->ret >= 0) {
4678         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4679     }
4680     qemu_vfree(acb->bounce);
4681     acb->common.cb(acb->common.opaque, acb->ret);
4682     qemu_bh_delete(acb->bh);
4683     acb->bh = NULL;
4684     qemu_aio_release(acb);
4685 }
4686 
4687 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4688                                             int64_t sector_num,
4689                                             QEMUIOVector *qiov,
4690                                             int nb_sectors,
4691                                             BlockDriverCompletionFunc *cb,
4692                                             void *opaque,
4693                                             int is_write)
4694 
4695 {
4696     BlockDriverAIOCBSync *acb;
4697 
4698     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4699     acb->is_write = is_write;
4700     acb->qiov = qiov;
4701     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4702     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4703 
4704     if (acb->bounce == NULL) {
4705         acb->ret = -ENOMEM;
4706     } else if (is_write) {
4707         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4708         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4709     } else {
4710         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4711     }
4712 
4713     qemu_bh_schedule(acb->bh);
4714 
4715     return &acb->common;
4716 }
4717 
4718 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4719         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4720         BlockDriverCompletionFunc *cb, void *opaque)
4721 {
4722     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4723 }
4724 
4725 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4726         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4727         BlockDriverCompletionFunc *cb, void *opaque)
4728 {
4729     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4730 }
4731 
4732 
4733 typedef struct BlockDriverAIOCBCoroutine {
4734     BlockDriverAIOCB common;
4735     BlockRequest req;
4736     bool is_write;
4737     bool *done;
4738     QEMUBH* bh;
4739 } BlockDriverAIOCBCoroutine;
4740 
4741 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4742 {
4743     AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
4744     BlockDriverAIOCBCoroutine *acb =
4745         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4746     bool done = false;
4747 
4748     acb->done = &done;
4749     while (!done) {
4750         aio_poll(aio_context, true);
4751     }
4752 }
4753 
4754 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4755     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4756     .cancel             = bdrv_aio_co_cancel_em,
4757 };
4758 
4759 static void bdrv_co_em_bh(void *opaque)
4760 {
4761     BlockDriverAIOCBCoroutine *acb = opaque;
4762 
4763     acb->common.cb(acb->common.opaque, acb->req.error);
4764 
4765     if (acb->done) {
4766         *acb->done = true;
4767     }
4768 
4769     qemu_bh_delete(acb->bh);
4770     qemu_aio_release(acb);
4771 }
4772 
4773 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4774 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4775 {
4776     BlockDriverAIOCBCoroutine *acb = opaque;
4777     BlockDriverState *bs = acb->common.bs;
4778 
4779     if (!acb->is_write) {
4780         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4781             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4782     } else {
4783         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4784             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4785     }
4786 
4787     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4788     qemu_bh_schedule(acb->bh);
4789 }
4790 
4791 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4792                                                int64_t sector_num,
4793                                                QEMUIOVector *qiov,
4794                                                int nb_sectors,
4795                                                BdrvRequestFlags flags,
4796                                                BlockDriverCompletionFunc *cb,
4797                                                void *opaque,
4798                                                bool is_write)
4799 {
4800     Coroutine *co;
4801     BlockDriverAIOCBCoroutine *acb;
4802 
4803     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4804     acb->req.sector = sector_num;
4805     acb->req.nb_sectors = nb_sectors;
4806     acb->req.qiov = qiov;
4807     acb->req.flags = flags;
4808     acb->is_write = is_write;
4809     acb->done = NULL;
4810 
4811     co = qemu_coroutine_create(bdrv_co_do_rw);
4812     qemu_coroutine_enter(co, acb);
4813 
4814     return &acb->common;
4815 }
4816 
4817 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4818 {
4819     BlockDriverAIOCBCoroutine *acb = opaque;
4820     BlockDriverState *bs = acb->common.bs;
4821 
4822     acb->req.error = bdrv_co_flush(bs);
4823     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4824     qemu_bh_schedule(acb->bh);
4825 }
4826 
4827 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4828         BlockDriverCompletionFunc *cb, void *opaque)
4829 {
4830     trace_bdrv_aio_flush(bs, opaque);
4831 
4832     Coroutine *co;
4833     BlockDriverAIOCBCoroutine *acb;
4834 
4835     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4836     acb->done = NULL;
4837 
4838     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4839     qemu_coroutine_enter(co, acb);
4840 
4841     return &acb->common;
4842 }
4843 
4844 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4845 {
4846     BlockDriverAIOCBCoroutine *acb = opaque;
4847     BlockDriverState *bs = acb->common.bs;
4848 
4849     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4850     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4851     qemu_bh_schedule(acb->bh);
4852 }
4853 
4854 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4855         int64_t sector_num, int nb_sectors,
4856         BlockDriverCompletionFunc *cb, void *opaque)
4857 {
4858     Coroutine *co;
4859     BlockDriverAIOCBCoroutine *acb;
4860 
4861     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4862 
4863     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4864     acb->req.sector = sector_num;
4865     acb->req.nb_sectors = nb_sectors;
4866     acb->done = NULL;
4867     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4868     qemu_coroutine_enter(co, acb);
4869 
4870     return &acb->common;
4871 }
4872 
4873 void bdrv_init(void)
4874 {
4875     module_call_init(MODULE_INIT_BLOCK);
4876 }
4877 
4878 void bdrv_init_with_whitelist(void)
4879 {
4880     use_bdrv_whitelist = 1;
4881     bdrv_init();
4882 }
4883 
4884 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4885                    BlockDriverCompletionFunc *cb, void *opaque)
4886 {
4887     BlockDriverAIOCB *acb;
4888 
4889     acb = g_slice_alloc(aiocb_info->aiocb_size);
4890     acb->aiocb_info = aiocb_info;
4891     acb->bs = bs;
4892     acb->cb = cb;
4893     acb->opaque = opaque;
4894     return acb;
4895 }
4896 
4897 void qemu_aio_release(void *p)
4898 {
4899     BlockDriverAIOCB *acb = p;
4900     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4901 }
4902 
4903 /**************************************************************/
4904 /* Coroutine block device emulation */
4905 
4906 typedef struct CoroutineIOCompletion {
4907     Coroutine *coroutine;
4908     int ret;
4909 } CoroutineIOCompletion;
4910 
4911 static void bdrv_co_io_em_complete(void *opaque, int ret)
4912 {
4913     CoroutineIOCompletion *co = opaque;
4914 
4915     co->ret = ret;
4916     qemu_coroutine_enter(co->coroutine, NULL);
4917 }
4918 
4919 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4920                                       int nb_sectors, QEMUIOVector *iov,
4921                                       bool is_write)
4922 {
4923     CoroutineIOCompletion co = {
4924         .coroutine = qemu_coroutine_self(),
4925     };
4926     BlockDriverAIOCB *acb;
4927 
4928     if (is_write) {
4929         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4930                                        bdrv_co_io_em_complete, &co);
4931     } else {
4932         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4933                                       bdrv_co_io_em_complete, &co);
4934     }
4935 
4936     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4937     if (!acb) {
4938         return -EIO;
4939     }
4940     qemu_coroutine_yield();
4941 
4942     return co.ret;
4943 }
4944 
4945 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4946                                          int64_t sector_num, int nb_sectors,
4947                                          QEMUIOVector *iov)
4948 {
4949     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4950 }
4951 
4952 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4953                                          int64_t sector_num, int nb_sectors,
4954                                          QEMUIOVector *iov)
4955 {
4956     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4957 }
4958 
4959 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4960 {
4961     RwCo *rwco = opaque;
4962 
4963     rwco->ret = bdrv_co_flush(rwco->bs);
4964 }
4965 
4966 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4967 {
4968     int ret;
4969 
4970     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4971         return 0;
4972     }
4973 
4974     /* Write back cached data to the OS even with cache=unsafe */
4975     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4976     if (bs->drv->bdrv_co_flush_to_os) {
4977         ret = bs->drv->bdrv_co_flush_to_os(bs);
4978         if (ret < 0) {
4979             return ret;
4980         }
4981     }
4982 
4983     /* But don't actually force it to the disk with cache=unsafe */
4984     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4985         goto flush_parent;
4986     }
4987 
4988     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4989     if (bs->drv->bdrv_co_flush_to_disk) {
4990         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4991     } else if (bs->drv->bdrv_aio_flush) {
4992         BlockDriverAIOCB *acb;
4993         CoroutineIOCompletion co = {
4994             .coroutine = qemu_coroutine_self(),
4995         };
4996 
4997         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4998         if (acb == NULL) {
4999             ret = -EIO;
5000         } else {
5001             qemu_coroutine_yield();
5002             ret = co.ret;
5003         }
5004     } else {
5005         /*
5006          * Some block drivers always operate in either writethrough or unsafe
5007          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
5008          * know how the server works (because the behaviour is hardcoded or
5009          * depends on server-side configuration), so we can't ensure that
5010          * everything is safe on disk. Returning an error doesn't work because
5011          * that would break guests even if the server operates in writethrough
5012          * mode.
5013          *
5014          * Let's hope the user knows what he's doing.
5015          */
5016         ret = 0;
5017     }
5018     if (ret < 0) {
5019         return ret;
5020     }
5021 
5022     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
5023      * in the case of cache=unsafe, so there are no useless flushes.
5024      */
5025 flush_parent:
5026     return bdrv_co_flush(bs->file);
5027 }
5028 
5029 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5030 {
5031     Error *local_err = NULL;
5032     int ret;
5033 
5034     if (!bs->drv)  {
5035         return;
5036     }
5037 
5038     if (bs->drv->bdrv_invalidate_cache) {
5039         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5040     } else if (bs->file) {
5041         bdrv_invalidate_cache(bs->file, &local_err);
5042     }
5043     if (local_err) {
5044         error_propagate(errp, local_err);
5045         return;
5046     }
5047 
5048     ret = refresh_total_sectors(bs, bs->total_sectors);
5049     if (ret < 0) {
5050         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5051         return;
5052     }
5053 }
5054 
5055 void bdrv_invalidate_cache_all(Error **errp)
5056 {
5057     BlockDriverState *bs;
5058     Error *local_err = NULL;
5059 
5060     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5061         AioContext *aio_context = bdrv_get_aio_context(bs);
5062 
5063         aio_context_acquire(aio_context);
5064         bdrv_invalidate_cache(bs, &local_err);
5065         aio_context_release(aio_context);
5066         if (local_err) {
5067             error_propagate(errp, local_err);
5068             return;
5069         }
5070     }
5071 }
5072 
5073 void bdrv_clear_incoming_migration_all(void)
5074 {
5075     BlockDriverState *bs;
5076 
5077     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5078         AioContext *aio_context = bdrv_get_aio_context(bs);
5079 
5080         aio_context_acquire(aio_context);
5081         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
5082         aio_context_release(aio_context);
5083     }
5084 }
5085 
5086 int bdrv_flush(BlockDriverState *bs)
5087 {
5088     Coroutine *co;
5089     RwCo rwco = {
5090         .bs = bs,
5091         .ret = NOT_DONE,
5092     };
5093 
5094     if (qemu_in_coroutine()) {
5095         /* Fast-path if already in coroutine context */
5096         bdrv_flush_co_entry(&rwco);
5097     } else {
5098         AioContext *aio_context = bdrv_get_aio_context(bs);
5099 
5100         co = qemu_coroutine_create(bdrv_flush_co_entry);
5101         qemu_coroutine_enter(co, &rwco);
5102         while (rwco.ret == NOT_DONE) {
5103             aio_poll(aio_context, true);
5104         }
5105     }
5106 
5107     return rwco.ret;
5108 }
5109 
5110 typedef struct DiscardCo {
5111     BlockDriverState *bs;
5112     int64_t sector_num;
5113     int nb_sectors;
5114     int ret;
5115 } DiscardCo;
5116 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5117 {
5118     DiscardCo *rwco = opaque;
5119 
5120     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5121 }
5122 
5123 /* if no limit is specified in the BlockLimits use a default
5124  * of 32768 512-byte sectors (16 MiB) per request.
5125  */
5126 #define MAX_DISCARD_DEFAULT 32768
5127 
5128 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5129                                  int nb_sectors)
5130 {
5131     int max_discard;
5132 
5133     if (!bs->drv) {
5134         return -ENOMEDIUM;
5135     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5136         return -EIO;
5137     } else if (bs->read_only) {
5138         return -EROFS;
5139     }
5140 
5141     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5142 
5143     /* Do nothing if disabled.  */
5144     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5145         return 0;
5146     }
5147 
5148     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5149         return 0;
5150     }
5151 
5152     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5153     while (nb_sectors > 0) {
5154         int ret;
5155         int num = nb_sectors;
5156 
5157         /* align request */
5158         if (bs->bl.discard_alignment &&
5159             num >= bs->bl.discard_alignment &&
5160             sector_num % bs->bl.discard_alignment) {
5161             if (num > bs->bl.discard_alignment) {
5162                 num = bs->bl.discard_alignment;
5163             }
5164             num -= sector_num % bs->bl.discard_alignment;
5165         }
5166 
5167         /* limit request size */
5168         if (num > max_discard) {
5169             num = max_discard;
5170         }
5171 
5172         if (bs->drv->bdrv_co_discard) {
5173             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5174         } else {
5175             BlockDriverAIOCB *acb;
5176             CoroutineIOCompletion co = {
5177                 .coroutine = qemu_coroutine_self(),
5178             };
5179 
5180             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5181                                             bdrv_co_io_em_complete, &co);
5182             if (acb == NULL) {
5183                 return -EIO;
5184             } else {
5185                 qemu_coroutine_yield();
5186                 ret = co.ret;
5187             }
5188         }
5189         if (ret && ret != -ENOTSUP) {
5190             return ret;
5191         }
5192 
5193         sector_num += num;
5194         nb_sectors -= num;
5195     }
5196     return 0;
5197 }
5198 
5199 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5200 {
5201     Coroutine *co;
5202     DiscardCo rwco = {
5203         .bs = bs,
5204         .sector_num = sector_num,
5205         .nb_sectors = nb_sectors,
5206         .ret = NOT_DONE,
5207     };
5208 
5209     if (qemu_in_coroutine()) {
5210         /* Fast-path if already in coroutine context */
5211         bdrv_discard_co_entry(&rwco);
5212     } else {
5213         AioContext *aio_context = bdrv_get_aio_context(bs);
5214 
5215         co = qemu_coroutine_create(bdrv_discard_co_entry);
5216         qemu_coroutine_enter(co, &rwco);
5217         while (rwco.ret == NOT_DONE) {
5218             aio_poll(aio_context, true);
5219         }
5220     }
5221 
5222     return rwco.ret;
5223 }
5224 
5225 /**************************************************************/
5226 /* removable device support */
5227 
5228 /**
5229  * Return TRUE if the media is present
5230  */
5231 int bdrv_is_inserted(BlockDriverState *bs)
5232 {
5233     BlockDriver *drv = bs->drv;
5234 
5235     if (!drv)
5236         return 0;
5237     if (!drv->bdrv_is_inserted)
5238         return 1;
5239     return drv->bdrv_is_inserted(bs);
5240 }
5241 
5242 /**
5243  * Return whether the media changed since the last call to this
5244  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5245  */
5246 int bdrv_media_changed(BlockDriverState *bs)
5247 {
5248     BlockDriver *drv = bs->drv;
5249 
5250     if (drv && drv->bdrv_media_changed) {
5251         return drv->bdrv_media_changed(bs);
5252     }
5253     return -ENOTSUP;
5254 }
5255 
5256 /**
5257  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5258  */
5259 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5260 {
5261     BlockDriver *drv = bs->drv;
5262 
5263     if (drv && drv->bdrv_eject) {
5264         drv->bdrv_eject(bs, eject_flag);
5265     }
5266 
5267     if (bs->device_name[0] != '\0') {
5268         qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
5269                                           eject_flag, &error_abort);
5270     }
5271 }
5272 
5273 /**
5274  * Lock or unlock the media (if it is locked, the user won't be able
5275  * to eject it manually).
5276  */
5277 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5278 {
5279     BlockDriver *drv = bs->drv;
5280 
5281     trace_bdrv_lock_medium(bs, locked);
5282 
5283     if (drv && drv->bdrv_lock_medium) {
5284         drv->bdrv_lock_medium(bs, locked);
5285     }
5286 }
5287 
5288 /* needed for generic scsi interface */
5289 
5290 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5291 {
5292     BlockDriver *drv = bs->drv;
5293 
5294     if (drv && drv->bdrv_ioctl)
5295         return drv->bdrv_ioctl(bs, req, buf);
5296     return -ENOTSUP;
5297 }
5298 
5299 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5300         unsigned long int req, void *buf,
5301         BlockDriverCompletionFunc *cb, void *opaque)
5302 {
5303     BlockDriver *drv = bs->drv;
5304 
5305     if (drv && drv->bdrv_aio_ioctl)
5306         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5307     return NULL;
5308 }
5309 
5310 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5311 {
5312     bs->guest_block_size = align;
5313 }
5314 
5315 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5316 {
5317     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5318 }
5319 
5320 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5321 {
5322     size_t align = bdrv_opt_mem_align(bs);
5323 
5324     /* Ensure that NULL is never returned on success */
5325     assert(align > 0);
5326     if (size == 0) {
5327         size = align;
5328     }
5329 
5330     return qemu_try_memalign(align, size);
5331 }
5332 
5333 /*
5334  * Check if all memory in this vector is sector aligned.
5335  */
5336 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5337 {
5338     int i;
5339     size_t alignment = bdrv_opt_mem_align(bs);
5340 
5341     for (i = 0; i < qiov->niov; i++) {
5342         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5343             return false;
5344         }
5345         if (qiov->iov[i].iov_len % alignment) {
5346             return false;
5347         }
5348     }
5349 
5350     return true;
5351 }
5352 
5353 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5354                                           Error **errp)
5355 {
5356     int64_t bitmap_size;
5357     BdrvDirtyBitmap *bitmap;
5358 
5359     assert((granularity & (granularity - 1)) == 0);
5360 
5361     granularity >>= BDRV_SECTOR_BITS;
5362     assert(granularity);
5363     bitmap_size = bdrv_nb_sectors(bs);
5364     if (bitmap_size < 0) {
5365         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5366         errno = -bitmap_size;
5367         return NULL;
5368     }
5369     bitmap = g_new0(BdrvDirtyBitmap, 1);
5370     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5371     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5372     return bitmap;
5373 }
5374 
5375 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5376 {
5377     BdrvDirtyBitmap *bm, *next;
5378     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5379         if (bm == bitmap) {
5380             QLIST_REMOVE(bitmap, list);
5381             hbitmap_free(bitmap->bitmap);
5382             g_free(bitmap);
5383             return;
5384         }
5385     }
5386 }
5387 
5388 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5389 {
5390     BdrvDirtyBitmap *bm;
5391     BlockDirtyInfoList *list = NULL;
5392     BlockDirtyInfoList **plist = &list;
5393 
5394     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5395         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5396         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5397         info->count = bdrv_get_dirty_count(bs, bm);
5398         info->granularity =
5399             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5400         entry->value = info;
5401         *plist = entry;
5402         plist = &entry->next;
5403     }
5404 
5405     return list;
5406 }
5407 
5408 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5409 {
5410     if (bitmap) {
5411         return hbitmap_get(bitmap->bitmap, sector);
5412     } else {
5413         return 0;
5414     }
5415 }
5416 
5417 void bdrv_dirty_iter_init(BlockDriverState *bs,
5418                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5419 {
5420     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5421 }
5422 
5423 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5424                     int nr_sectors)
5425 {
5426     BdrvDirtyBitmap *bitmap;
5427     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5428         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5429     }
5430 }
5431 
5432 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5433 {
5434     BdrvDirtyBitmap *bitmap;
5435     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5436         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5437     }
5438 }
5439 
5440 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5441 {
5442     return hbitmap_count(bitmap->bitmap);
5443 }
5444 
5445 /* Get a reference to bs */
5446 void bdrv_ref(BlockDriverState *bs)
5447 {
5448     bs->refcnt++;
5449 }
5450 
5451 /* Release a previously grabbed reference to bs.
5452  * If after releasing, reference count is zero, the BlockDriverState is
5453  * deleted. */
5454 void bdrv_unref(BlockDriverState *bs)
5455 {
5456     if (!bs) {
5457         return;
5458     }
5459     assert(bs->refcnt > 0);
5460     if (--bs->refcnt == 0) {
5461         bdrv_delete(bs);
5462     }
5463 }
5464 
5465 struct BdrvOpBlocker {
5466     Error *reason;
5467     QLIST_ENTRY(BdrvOpBlocker) list;
5468 };
5469 
5470 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5471 {
5472     BdrvOpBlocker *blocker;
5473     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5474     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5475         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5476         if (errp) {
5477             error_setg(errp, "Device '%s' is busy: %s",
5478                        bs->device_name, error_get_pretty(blocker->reason));
5479         }
5480         return true;
5481     }
5482     return false;
5483 }
5484 
5485 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5486 {
5487     BdrvOpBlocker *blocker;
5488     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5489 
5490     blocker = g_new0(BdrvOpBlocker, 1);
5491     blocker->reason = reason;
5492     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5493 }
5494 
5495 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5496 {
5497     BdrvOpBlocker *blocker, *next;
5498     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5499     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5500         if (blocker->reason == reason) {
5501             QLIST_REMOVE(blocker, list);
5502             g_free(blocker);
5503         }
5504     }
5505 }
5506 
5507 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5508 {
5509     int i;
5510     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5511         bdrv_op_block(bs, i, reason);
5512     }
5513 }
5514 
5515 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5516 {
5517     int i;
5518     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5519         bdrv_op_unblock(bs, i, reason);
5520     }
5521 }
5522 
5523 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5524 {
5525     int i;
5526 
5527     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5528         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5529             return false;
5530         }
5531     }
5532     return true;
5533 }
5534 
5535 void bdrv_iostatus_enable(BlockDriverState *bs)
5536 {
5537     bs->iostatus_enabled = true;
5538     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5539 }
5540 
5541 /* The I/O status is only enabled if the drive explicitly
5542  * enables it _and_ the VM is configured to stop on errors */
5543 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5544 {
5545     return (bs->iostatus_enabled &&
5546            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5547             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5548             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5549 }
5550 
5551 void bdrv_iostatus_disable(BlockDriverState *bs)
5552 {
5553     bs->iostatus_enabled = false;
5554 }
5555 
5556 void bdrv_iostatus_reset(BlockDriverState *bs)
5557 {
5558     if (bdrv_iostatus_is_enabled(bs)) {
5559         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5560         if (bs->job) {
5561             block_job_iostatus_reset(bs->job);
5562         }
5563     }
5564 }
5565 
5566 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5567 {
5568     assert(bdrv_iostatus_is_enabled(bs));
5569     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5570         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5571                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5572     }
5573 }
5574 
5575 void
5576 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5577         enum BlockAcctType type)
5578 {
5579     assert(type < BDRV_MAX_IOTYPE);
5580 
5581     cookie->bytes = bytes;
5582     cookie->start_time_ns = get_clock();
5583     cookie->type = type;
5584 }
5585 
5586 void
5587 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5588 {
5589     assert(cookie->type < BDRV_MAX_IOTYPE);
5590 
5591     bs->nr_bytes[cookie->type] += cookie->bytes;
5592     bs->nr_ops[cookie->type]++;
5593     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5594 }
5595 
5596 void bdrv_img_create(const char *filename, const char *fmt,
5597                      const char *base_filename, const char *base_fmt,
5598                      char *options, uint64_t img_size, int flags,
5599                      Error **errp, bool quiet)
5600 {
5601     QemuOptsList *create_opts = NULL;
5602     QemuOpts *opts = NULL;
5603     const char *backing_fmt, *backing_file;
5604     int64_t size;
5605     BlockDriver *drv, *proto_drv;
5606     BlockDriver *backing_drv = NULL;
5607     Error *local_err = NULL;
5608     int ret = 0;
5609 
5610     /* Find driver and parse its options */
5611     drv = bdrv_find_format(fmt);
5612     if (!drv) {
5613         error_setg(errp, "Unknown file format '%s'", fmt);
5614         return;
5615     }
5616 
5617     proto_drv = bdrv_find_protocol(filename, true);
5618     if (!proto_drv) {
5619         error_setg(errp, "Unknown protocol '%s'", filename);
5620         return;
5621     }
5622 
5623     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5624     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5625 
5626     /* Create parameter list with default values */
5627     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5628     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5629 
5630     /* Parse -o options */
5631     if (options) {
5632         if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5633             error_setg(errp, "Invalid options for file format '%s'", fmt);
5634             goto out;
5635         }
5636     }
5637 
5638     if (base_filename) {
5639         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5640             error_setg(errp, "Backing file not supported for file format '%s'",
5641                        fmt);
5642             goto out;
5643         }
5644     }
5645 
5646     if (base_fmt) {
5647         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5648             error_setg(errp, "Backing file format not supported for file "
5649                              "format '%s'", fmt);
5650             goto out;
5651         }
5652     }
5653 
5654     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5655     if (backing_file) {
5656         if (!strcmp(filename, backing_file)) {
5657             error_setg(errp, "Error: Trying to create an image with the "
5658                              "same filename as the backing file");
5659             goto out;
5660         }
5661     }
5662 
5663     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5664     if (backing_fmt) {
5665         backing_drv = bdrv_find_format(backing_fmt);
5666         if (!backing_drv) {
5667             error_setg(errp, "Unknown backing file format '%s'",
5668                        backing_fmt);
5669             goto out;
5670         }
5671     }
5672 
5673     // The size for the image must always be specified, with one exception:
5674     // If we are using a backing file, we can obtain the size from there
5675     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5676     if (size == -1) {
5677         if (backing_file) {
5678             BlockDriverState *bs;
5679             int64_t size;
5680             int back_flags;
5681 
5682             /* backing files always opened read-only */
5683             back_flags =
5684                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5685 
5686             bs = NULL;
5687             ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5688                             backing_drv, &local_err);
5689             if (ret < 0) {
5690                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5691                                  backing_file,
5692                                  error_get_pretty(local_err));
5693                 error_free(local_err);
5694                 local_err = NULL;
5695                 goto out;
5696             }
5697             size = bdrv_getlength(bs);
5698             if (size < 0) {
5699                 error_setg_errno(errp, -size, "Could not get size of '%s'",
5700                                  backing_file);
5701                 bdrv_unref(bs);
5702                 goto out;
5703             }
5704 
5705             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5706 
5707             bdrv_unref(bs);
5708         } else {
5709             error_setg(errp, "Image creation needs a size parameter");
5710             goto out;
5711         }
5712     }
5713 
5714     if (!quiet) {
5715         printf("Formatting '%s', fmt=%s ", filename, fmt);
5716         qemu_opts_print(opts);
5717         puts("");
5718     }
5719 
5720     ret = bdrv_create(drv, filename, opts, &local_err);
5721 
5722     if (ret == -EFBIG) {
5723         /* This is generally a better message than whatever the driver would
5724          * deliver (especially because of the cluster_size_hint), since that
5725          * is most probably not much different from "image too large". */
5726         const char *cluster_size_hint = "";
5727         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5728             cluster_size_hint = " (try using a larger cluster size)";
5729         }
5730         error_setg(errp, "The image size is too large for file format '%s'"
5731                    "%s", fmt, cluster_size_hint);
5732         error_free(local_err);
5733         local_err = NULL;
5734     }
5735 
5736 out:
5737     qemu_opts_del(opts);
5738     qemu_opts_free(create_opts);
5739     if (local_err) {
5740         error_propagate(errp, local_err);
5741     }
5742 }
5743 
5744 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5745 {
5746     return bs->aio_context;
5747 }
5748 
5749 void bdrv_detach_aio_context(BlockDriverState *bs)
5750 {
5751     BdrvAioNotifier *baf;
5752 
5753     if (!bs->drv) {
5754         return;
5755     }
5756 
5757     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5758         baf->detach_aio_context(baf->opaque);
5759     }
5760 
5761     if (bs->io_limits_enabled) {
5762         throttle_detach_aio_context(&bs->throttle_state);
5763     }
5764     if (bs->drv->bdrv_detach_aio_context) {
5765         bs->drv->bdrv_detach_aio_context(bs);
5766     }
5767     if (bs->file) {
5768         bdrv_detach_aio_context(bs->file);
5769     }
5770     if (bs->backing_hd) {
5771         bdrv_detach_aio_context(bs->backing_hd);
5772     }
5773 
5774     bs->aio_context = NULL;
5775 }
5776 
5777 void bdrv_attach_aio_context(BlockDriverState *bs,
5778                              AioContext *new_context)
5779 {
5780     BdrvAioNotifier *ban;
5781 
5782     if (!bs->drv) {
5783         return;
5784     }
5785 
5786     bs->aio_context = new_context;
5787 
5788     if (bs->backing_hd) {
5789         bdrv_attach_aio_context(bs->backing_hd, new_context);
5790     }
5791     if (bs->file) {
5792         bdrv_attach_aio_context(bs->file, new_context);
5793     }
5794     if (bs->drv->bdrv_attach_aio_context) {
5795         bs->drv->bdrv_attach_aio_context(bs, new_context);
5796     }
5797     if (bs->io_limits_enabled) {
5798         throttle_attach_aio_context(&bs->throttle_state, new_context);
5799     }
5800 
5801     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5802         ban->attached_aio_context(new_context, ban->opaque);
5803     }
5804 }
5805 
5806 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5807 {
5808     bdrv_drain_all(); /* ensure there are no in-flight requests */
5809 
5810     bdrv_detach_aio_context(bs);
5811 
5812     /* This function executes in the old AioContext so acquire the new one in
5813      * case it runs in a different thread.
5814      */
5815     aio_context_acquire(new_context);
5816     bdrv_attach_aio_context(bs, new_context);
5817     aio_context_release(new_context);
5818 }
5819 
5820 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5821         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5822         void (*detach_aio_context)(void *opaque), void *opaque)
5823 {
5824     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5825     *ban = (BdrvAioNotifier){
5826         .attached_aio_context = attached_aio_context,
5827         .detach_aio_context   = detach_aio_context,
5828         .opaque               = opaque
5829     };
5830 
5831     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5832 }
5833 
5834 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5835                                       void (*attached_aio_context)(AioContext *,
5836                                                                    void *),
5837                                       void (*detach_aio_context)(void *),
5838                                       void *opaque)
5839 {
5840     BdrvAioNotifier *ban, *ban_next;
5841 
5842     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5843         if (ban->attached_aio_context == attached_aio_context &&
5844             ban->detach_aio_context   == detach_aio_context   &&
5845             ban->opaque               == opaque)
5846         {
5847             QLIST_REMOVE(ban, list);
5848             g_free(ban);
5849 
5850             return;
5851         }
5852     }
5853 
5854     abort();
5855 }
5856 
5857 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5858                                     NotifierWithReturn *notifier)
5859 {
5860     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5861 }
5862 
5863 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
5864 {
5865     if (!bs->drv->bdrv_amend_options) {
5866         return -ENOTSUP;
5867     }
5868     return bs->drv->bdrv_amend_options(bs, opts);
5869 }
5870 
5871 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5872  * of block filter and by bdrv_is_first_non_filter.
5873  * It is used to test if the given bs is the candidate or recurse more in the
5874  * node graph.
5875  */
5876 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5877                                       BlockDriverState *candidate)
5878 {
5879     /* return false if basic checks fails */
5880     if (!bs || !bs->drv) {
5881         return false;
5882     }
5883 
5884     /* the code reached a non block filter driver -> check if the bs is
5885      * the same as the candidate. It's the recursion termination condition.
5886      */
5887     if (!bs->drv->is_filter) {
5888         return bs == candidate;
5889     }
5890     /* Down this path the driver is a block filter driver */
5891 
5892     /* If the block filter recursion method is defined use it to recurse down
5893      * the node graph.
5894      */
5895     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5896         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5897     }
5898 
5899     /* the driver is a block filter but don't allow to recurse -> return false
5900      */
5901     return false;
5902 }
5903 
5904 /* This function checks if the candidate is the first non filter bs down it's
5905  * bs chain. Since we don't have pointers to parents it explore all bs chains
5906  * from the top. Some filters can choose not to pass down the recursion.
5907  */
5908 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5909 {
5910     BlockDriverState *bs;
5911 
5912     /* walk down the bs forest recursively */
5913     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5914         bool perm;
5915 
5916         /* try to recurse in this top level bs */
5917         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5918 
5919         /* candidate is the first non filter */
5920         if (perm) {
5921             return true;
5922         }
5923     }
5924 
5925     return false;
5926 }
5927 
5928 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5929 {
5930     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5931     if (!to_replace_bs) {
5932         error_setg(errp, "Node name '%s' not found", node_name);
5933         return NULL;
5934     }
5935 
5936     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5937         return NULL;
5938     }
5939 
5940     /* We don't want arbitrary node of the BDS chain to be replaced only the top
5941      * most non filter in order to prevent data corruption.
5942      * Another benefit is that this tests exclude backing files which are
5943      * blocked by the backing blockers.
5944      */
5945     if (!bdrv_is_first_non_filter(to_replace_bs)) {
5946         error_setg(errp, "Only top most non filter can be replaced");
5947         return NULL;
5948     }
5949 
5950     return to_replace_bs;
5951 }
5952 
5953 void bdrv_io_plug(BlockDriverState *bs)
5954 {
5955     BlockDriver *drv = bs->drv;
5956     if (drv && drv->bdrv_io_plug) {
5957         drv->bdrv_io_plug(bs);
5958     } else if (bs->file) {
5959         bdrv_io_plug(bs->file);
5960     }
5961 }
5962 
5963 void bdrv_io_unplug(BlockDriverState *bs)
5964 {
5965     BlockDriver *drv = bs->drv;
5966     if (drv && drv->bdrv_io_unplug) {
5967         drv->bdrv_io_unplug(bs);
5968     } else if (bs->file) {
5969         bdrv_io_unplug(bs->file);
5970     }
5971 }
5972 
5973 void bdrv_flush_io_queue(BlockDriverState *bs)
5974 {
5975     BlockDriver *drv = bs->drv;
5976     if (drv && drv->bdrv_flush_io_queue) {
5977         drv->bdrv_flush_io_queue(bs);
5978     } else if (bs->file) {
5979         bdrv_flush_io_queue(bs->file);
5980     }
5981 }
5982 
5983 static bool append_open_options(QDict *d, BlockDriverState *bs)
5984 {
5985     const QDictEntry *entry;
5986     bool found_any = false;
5987 
5988     for (entry = qdict_first(bs->options); entry;
5989          entry = qdict_next(bs->options, entry))
5990     {
5991         /* Only take options for this level and exclude all non-driver-specific
5992          * options */
5993         if (!strchr(qdict_entry_key(entry), '.') &&
5994             strcmp(qdict_entry_key(entry), "node-name"))
5995         {
5996             qobject_incref(qdict_entry_value(entry));
5997             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5998             found_any = true;
5999         }
6000     }
6001 
6002     return found_any;
6003 }
6004 
6005 /* Updates the following BDS fields:
6006  *  - exact_filename: A filename which may be used for opening a block device
6007  *                    which (mostly) equals the given BDS (even without any
6008  *                    other options; so reading and writing must return the same
6009  *                    results, but caching etc. may be different)
6010  *  - full_open_options: Options which, when given when opening a block device
6011  *                       (without a filename), result in a BDS (mostly)
6012  *                       equalling the given one
6013  *  - filename: If exact_filename is set, it is copied here. Otherwise,
6014  *              full_open_options is converted to a JSON object, prefixed with
6015  *              "json:" (for use through the JSON pseudo protocol) and put here.
6016  */
6017 void bdrv_refresh_filename(BlockDriverState *bs)
6018 {
6019     BlockDriver *drv = bs->drv;
6020     QDict *opts;
6021 
6022     if (!drv) {
6023         return;
6024     }
6025 
6026     /* This BDS's file name will most probably depend on its file's name, so
6027      * refresh that first */
6028     if (bs->file) {
6029         bdrv_refresh_filename(bs->file);
6030     }
6031 
6032     if (drv->bdrv_refresh_filename) {
6033         /* Obsolete information is of no use here, so drop the old file name
6034          * information before refreshing it */
6035         bs->exact_filename[0] = '\0';
6036         if (bs->full_open_options) {
6037             QDECREF(bs->full_open_options);
6038             bs->full_open_options = NULL;
6039         }
6040 
6041         drv->bdrv_refresh_filename(bs);
6042     } else if (bs->file) {
6043         /* Try to reconstruct valid information from the underlying file */
6044         bool has_open_options;
6045 
6046         bs->exact_filename[0] = '\0';
6047         if (bs->full_open_options) {
6048             QDECREF(bs->full_open_options);
6049             bs->full_open_options = NULL;
6050         }
6051 
6052         opts = qdict_new();
6053         has_open_options = append_open_options(opts, bs);
6054 
6055         /* If no specific options have been given for this BDS, the filename of
6056          * the underlying file should suffice for this one as well */
6057         if (bs->file->exact_filename[0] && !has_open_options) {
6058             strcpy(bs->exact_filename, bs->file->exact_filename);
6059         }
6060         /* Reconstructing the full options QDict is simple for most format block
6061          * drivers, as long as the full options are known for the underlying
6062          * file BDS. The full options QDict of that file BDS should somehow
6063          * contain a representation of the filename, therefore the following
6064          * suffices without querying the (exact_)filename of this BDS. */
6065         if (bs->file->full_open_options) {
6066             qdict_put_obj(opts, "driver",
6067                           QOBJECT(qstring_from_str(drv->format_name)));
6068             QINCREF(bs->file->full_open_options);
6069             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6070 
6071             bs->full_open_options = opts;
6072         } else {
6073             QDECREF(opts);
6074         }
6075     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6076         /* There is no underlying file BDS (at least referenced by BDS.file),
6077          * so the full options QDict should be equal to the options given
6078          * specifically for this block device when it was opened (plus the
6079          * driver specification).
6080          * Because those options don't change, there is no need to update
6081          * full_open_options when it's already set. */
6082 
6083         opts = qdict_new();
6084         append_open_options(opts, bs);
6085         qdict_put_obj(opts, "driver",
6086                       QOBJECT(qstring_from_str(drv->format_name)));
6087 
6088         if (bs->exact_filename[0]) {
6089             /* This may not work for all block protocol drivers (some may
6090              * require this filename to be parsed), but we have to find some
6091              * default solution here, so just include it. If some block driver
6092              * does not support pure options without any filename at all or
6093              * needs some special format of the options QDict, it needs to
6094              * implement the driver-specific bdrv_refresh_filename() function.
6095              */
6096             qdict_put_obj(opts, "filename",
6097                           QOBJECT(qstring_from_str(bs->exact_filename)));
6098         }
6099 
6100         bs->full_open_options = opts;
6101     }
6102 
6103     if (bs->exact_filename[0]) {
6104         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6105     } else if (bs->full_open_options) {
6106         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6107         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6108                  qstring_get_str(json));
6109         QDECREF(json);
6110     }
6111 }
6112