/* xref: /openbmc/qemu/block.c (revision d34682cd4a06efe9ee3fc8cb7e8a0ea445299989) */
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

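/*
 * Usage sketch (illustrative only; the ThrottleConfig bucket layout is
 * assumed from qemu/throttle.h of this era, not defined in this file):
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;  // 10 MB/s
 *     bdrv_io_limits_enable(bs);     // initialize throttle state and timers
 *     bdrv_set_io_limits(bs, &cfg);  // then apply the configured limits
 */
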
/* This function makes an I/O wait if needed
 *
 * @nb_sectors: the number of sectors of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     int nb_sectors,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state,
                     is_write,
                     nb_sectors * BDRV_SECTOR_SIZE);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

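/*
 * Examples of the above (derived from the strcspn() scan):
 *   "nbd:exportname=disk"  -> 1  (a ':' appears before any path separator)
 *   "/path/to/image.raw"   -> 0  ('/' is seen before any ':')
 *   "c:\foo.img" (Win32)   -> 0  (drive prefixes are filtered out first)
 */
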
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

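/*
 * Examples (illustrative, following the logic above):
 *   path_combine(buf, n, "/img/base.qcow2", "snap.qcow2")
 *       -> "/img/snap.qcow2"
 *   path_combine(buf, n, "http://host/dir/base.img", "other.img")
 *       -> "http://host/dir/other.img"
 *   path_combine(buf, n, "/img/base.qcow2", "/abs/other.img")
 *       -> "/abs/other.img"      (absolute names are copied verbatim)
 */
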
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (error_is_set(&cco.err)) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

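/*
 * Note: the NOT_DONE sentinel plus the qemu_aio_wait() loop above is a
 * common block-layer idiom for running a coroutine-only operation from
 * synchronous code: kick the coroutine, then service the event loop until
 * it stores its result.  A caller-side sketch (opts assumed to be a
 * previously built QEMUOptionParameter list):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     Error *err = NULL;
 *     if (bdrv_create(drv, "overlay.qcow2", opts, &err) < 0) {
 *         // report and free err
 *     }
 */
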
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

static int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (drv && drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

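/*
 * Example: for "nbd://localhost:10809/export" everything up to the first
 * ':' is copied into protocol[], so the scheme becomes "nbd" and the
 * driver whose protocol_name matches it is returned.  Plain names such as
 * "disk.qcow2" contain no protocol prefix and fall back to the "file"
 * driver above.
 */
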
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

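/*
 * Each driver's bdrv_probe() scores the first 2048 bytes of the image and
 * the highest score wins.  For example, the qcow2 probe is expected to
 * recognize its magic ("QFI\xfb") at offset 0 and return a high score,
 * while the raw driver returns a low catch-all score.  (The exact score
 * values are driver implementation details, not fixed by this file.)
 */
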
/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

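/*
 * Summary of the cache-mode mapping implemented above:
 *
 *   mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *   off/none           set              set               -
 *   directsync         set               -                -
 *   writeback           -               set               -
 *   unsafe              -               set              set
 *   writethrough        -                -                -
 */
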
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* If bdrv_open() was called directly with a protocol driver as drv, this
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->buffer_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   const char *reference, QDict *options, int flags,
                   Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (reference) {
        if (filename || qdict_size(options)) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }
        QDECREF(options);

        bs = bdrv_find(reference);
        if (!bs) {
            error_setg(errp, "Cannot find block device '%s'", reference);
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    } else if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail;
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    bs->backing_hd = bdrv_new("");

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    ret = bdrv_open(bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(bs->backing_hd);
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        BlockDriverState *bs;

        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        bs = bdrv_new("");
        ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
        if (ret < 0) {
            bdrv_unref(bs);
        } else {
            *pbs = bs;
        }
    } else {
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
                             errp);
    }

done:
    qdict_del(options, bdref_key);
    return ret;
}

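/*
 * Example (illustrative): with bdref_key "file" and a flattened options
 * QDict containing
 *     "file.driver"   -> "file"
 *     "file.filename" -> "disk.img"
 * qdict_extract_subqdict() moves both entries into image_options as
 * "driver" and "filename", which bdrv_file_open()/bdrv_open() then
 * consume.  A plain string entry "file" -> "<device-name>" would instead
 * be picked up as a reference to an existing block device.
 */
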
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL;
    const char *drvname;
    Error *local_err = NULL;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        bs1 = bdrv_new("");
        QINCREF(options);
        ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            bdrv_unref(bs1);
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    bdrv_close(bs);
    QDECREF(options);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
    }
    return ret;
}

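/*
 * Summary of the BDRV_O_SNAPSHOT path above: the named image is opened once
 * just to learn its size, a temporary qcow2 overlay of that size is created
 * via get_tmp_filename() (TMPDIR or /tmp on POSIX, GetTempPath() on
 * Windows), the user's options are re-rooted under "backing." so the
 * original image becomes the read-only backing file, and the overlay is
 * marked is_temporary so its file is unlinked (right after opening on
 * POSIX, at bdrv_close() time on Windows) and leaves no residue.
 */
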
typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

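/*
 * Sketch of a transactional multi-device reopen (bdrv_reopen() below does
 * exactly this for a single device):
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, flags_a);
 *     queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *     ret = bdrv_reopen_multiple(queue, &err);  // consumes and frees queue
 */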

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}


/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

1703 /*
1704  * Wait for pending requests to complete across all BlockDriverStates
1705  *
1706  * This function does not flush data to disk, use bdrv_flush_all() for that
1707  * after calling this function.
1708  *
1709  * Note that completion of an asynchronous I/O operation can trigger any
1710  * number of other I/O operations on other devices---for example a coroutine
1711  * can be arbitrarily complex and a constant flow of I/O can come until the
1712  * coroutine is complete.  Because of this, it is not possible to have a
1713  * function to drain a single device's I/O queue.
1714  */
1715 void bdrv_drain_all(void)
1716 {
1717     /* Always run first iteration so any pending completion BHs run */
1718     bool busy = true;
1719     BlockDriverState *bs;
1720 
1721     while (busy) {
1722         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1723             bdrv_start_throttled_reqs(bs);
1724         }
1725 
1726         busy = bdrv_requests_pending_all();
1727         busy |= aio_poll(qemu_get_aio_context(), busy);
1728     }
1729 }
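
/* A minimal usage sketch (hypothetical helper; the pairing with
 * bdrv_flush_all() is the pattern described above, not a fixed API):
 */
static void example_quiesce_all_devices(void)
{
    bdrv_drain_all();   /* wait for pending and throttled requests */
    bdrv_flush_all();   /* then push the completed writes to disk */
}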
1730 
1731 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
1732  * graph_bdrv_states lists.  Also clear the device_name and node_name so a
1733  * second removal attempt becomes a harmless no-op. */
1734 void bdrv_make_anon(BlockDriverState *bs)
1735 {
1736     if (bs->device_name[0] != '\0') {
1737         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1738     }
1739     bs->device_name[0] = '\0';
1740     if (bs->node_name[0] != '\0') {
1741         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1742     }
1743     bs->node_name[0] = '\0';
1744 }
1745 
1746 static void bdrv_rebind(BlockDriverState *bs)
1747 {
1748     if (bs->drv && bs->drv->bdrv_rebind) {
1749         bs->drv->bdrv_rebind(bs);
1750     }
1751 }
1752 
1753 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1754                                      BlockDriverState *bs_src)
1755 {
1756     /* move some fields that need to stay attached to the device */
1757     bs_dest->open_flags         = bs_src->open_flags;
1758 
1759     /* dev info */
1760     bs_dest->dev_ops            = bs_src->dev_ops;
1761     bs_dest->dev_opaque         = bs_src->dev_opaque;
1762     bs_dest->dev                = bs_src->dev;
1763     bs_dest->buffer_alignment   = bs_src->buffer_alignment;
1764     bs_dest->copy_on_read       = bs_src->copy_on_read;
1765 
1766     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1767 
1768     /* i/o throttled req */
1769     memcpy(&bs_dest->throttle_state,
1770            &bs_src->throttle_state,
1771            sizeof(ThrottleState));
1772     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1773     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1774     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1775 
1776     /* r/w error */
1777     bs_dest->on_read_error      = bs_src->on_read_error;
1778     bs_dest->on_write_error     = bs_src->on_write_error;
1779 
1780     /* i/o status */
1781     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1782     bs_dest->iostatus           = bs_src->iostatus;
1783 
1784     /* dirty bitmap */
1785     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1786 
1787     /* reference count */
1788     bs_dest->refcnt             = bs_src->refcnt;
1789 
1790     /* job */
1791     bs_dest->in_use             = bs_src->in_use;
1792     bs_dest->job                = bs_src->job;
1793 
1794     /* keep the same entry in bdrv_states */
1795     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1796             bs_src->device_name);
1797     bs_dest->device_list = bs_src->device_list;
1798 
1799     /* keep the same entry in graph_bdrv_states:
1800      * we want to swap the node name but not the linked-list entries
1801      */
1802     bs_dest->node_list   = bs_src->node_list;
1803 }
1804 
1805 /*
1806  * Swap bs contents for two image chains while they are live,
1807  * while keeping required fields on the BlockDriverState that is
1808  * actually attached to a device.
1809  *
1810  * This will modify the BlockDriverState fields, and swap contents
1811  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1812  *
1813  * bs_new is required to be anonymous.
1814  *
1815  * This function does not create any image files.
1816  */
1817 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1818 {
1819     BlockDriverState tmp;
1820 
1821     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1822     assert(bs_new->device_name[0] == '\0');
1823     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1824     assert(bs_new->job == NULL);
1825     assert(bs_new->dev == NULL);
1826     assert(bs_new->in_use == 0);
1827     assert(bs_new->io_limits_enabled == false);
1828     assert(!throttle_have_timer(&bs_new->throttle_state));
1829 
1830     tmp = *bs_new;
1831     *bs_new = *bs_old;
1832     *bs_old = tmp;
1833 
1834     /* there are some fields that should not be swapped, move them back */
1835     bdrv_move_feature_fields(&tmp, bs_old);
1836     bdrv_move_feature_fields(bs_old, bs_new);
1837     bdrv_move_feature_fields(bs_new, &tmp);
1838 
1839     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1840     assert(bs_new->device_name[0] == '\0');
1841 
1842     /* Check a few fields that should remain attached to the device */
1843     assert(bs_new->dev == NULL);
1844     assert(bs_new->job == NULL);
1845     assert(bs_new->in_use == 0);
1846     assert(bs_new->io_limits_enabled == false);
1847     assert(!throttle_have_timer(&bs_new->throttle_state));
1848 
1849     bdrv_rebind(bs_new);
1850     bdrv_rebind(bs_old);
1851 }
1852 
1853 /*
1854  * Add new bs contents at the top of an image chain while the chain is
1855  * live, while keeping required fields on the top layer.
1856  *
1857  * This will modify the BlockDriverState fields, and swap contents
1858  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1859  *
1860  * bs_new is required to be anonymous.
1861  *
1862  * This function does not create any image files.
1863  */
1864 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1865 {
1866     bdrv_swap(bs_new, bs_top);
1867 
1868     /* bs_new and bs_top have swapped contents: bs_top now holds the new
1869      * image, so chain the old top (now in bs_new) in as its backing file. */
1870     bs_top->backing_hd = bs_new;
1871     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1872     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1873             bs_new->filename);
1874     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1875             bs_new->drv ? bs_new->drv->format_name : "");
1876 }
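
/* A minimal sketch of the external-snapshot pattern bdrv_append() serves
 * (hypothetical helper; the overlay is assumed to be freshly opened and
 * still anonymous):
 */
static void example_take_external_snapshot(BlockDriverState *active,
                                           BlockDriverState *overlay)
{
    bdrv_append(overlay, active);
    /* 'active' now presents the overlay's contents, and the old top
     * image continues to serve as its backing file. */
}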
1877 
1878 static void bdrv_delete(BlockDriverState *bs)
1879 {
1880     assert(!bs->dev);
1881     assert(!bs->job);
1882     assert(!bs->in_use);
1883     assert(!bs->refcnt);
1884     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1885 
1886     bdrv_close(bs);
1887 
1888     /* remove from list, if necessary */
1889     bdrv_make_anon(bs);
1890 
1891     g_free(bs);
1892 }
1893 
1894 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1895 /* TODO change to DeviceState *dev when all users are qdevified */
1896 {
1897     if (bs->dev) {
1898         return -EBUSY;
1899     }
1900     bs->dev = dev;
1901     bdrv_iostatus_reset(bs);
1902     return 0;
1903 }
1904 
1905 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1906 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1907 {
1908     if (bdrv_attach_dev(bs, dev) < 0) {
1909         abort();
1910     }
1911 }
1912 
1913 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1914 /* TODO change to DeviceState *dev when all users are qdevified */
1915 {
1916     assert(bs->dev == dev);
1917     bs->dev = NULL;
1918     bs->dev_ops = NULL;
1919     bs->dev_opaque = NULL;
1920     bs->buffer_alignment = 512;
1921 }
1922 
1923 /* TODO change to return DeviceState * when all users are qdevified */
1924 void *bdrv_get_attached_dev(BlockDriverState *bs)
1925 {
1926     return bs->dev;
1927 }
1928 
1929 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1930                       void *opaque)
1931 {
1932     bs->dev_ops = ops;
1933     bs->dev_opaque = opaque;
1934 }
1935 
1936 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1937                                enum MonitorEvent ev,
1938                                BlockErrorAction action, bool is_read)
1939 {
1940     QObject *data;
1941     const char *action_str;
1942 
1943     switch (action) {
1944     case BDRV_ACTION_REPORT:
1945         action_str = "report";
1946         break;
1947     case BDRV_ACTION_IGNORE:
1948         action_str = "ignore";
1949         break;
1950     case BDRV_ACTION_STOP:
1951         action_str = "stop";
1952         break;
1953     default:
1954         abort();
1955     }
1956 
1957     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1958                               bdrv->device_name,
1959                               action_str,
1960                               is_read ? "read" : "write");
1961     monitor_protocol_event(ev, data);
1962 
1963     qobject_decref(data);
1964 }
1965 
1966 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1967 {
1968     QObject *data;
1969 
1970     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1971                               bdrv_get_device_name(bs), ejected);
1972     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1973 
1974     qobject_decref(data);
1975 }
1976 
1977 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1978 {
1979     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1980         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1981         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1982         if (tray_was_closed) {
1983             /* tray open */
1984             bdrv_emit_qmp_eject_event(bs, true);
1985         }
1986         if (load) {
1987             /* tray close */
1988             bdrv_emit_qmp_eject_event(bs, false);
1989         }
1990     }
1991 }
1992 
1993 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1994 {
1995     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1996 }
1997 
1998 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1999 {
2000     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2001         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2002     }
2003 }
2004 
2005 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2006 {
2007     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2008         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2009     }
2010     return false;
2011 }
2012 
2013 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2014 {
2015     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2016         bs->dev_ops->resize_cb(bs->dev_opaque);
2017     }
2018 }
2019 
2020 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2021 {
2022     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2023         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2024     }
2025     return false;
2026 }
2027 
2028 /*
2029  * Run consistency checks on an image
2030  *
2031  * Returns 0 if the check could be completed (which does not mean that the
2032  * image is free of errors) or -errno when an internal error occurred.  The
2033  * results of the check are stored in res.
2034  */
2035 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2036 {
2037     if (bs->drv->bdrv_check == NULL) {
2038         return -ENOTSUP;
2039     }
2040 
2041     memset(res, 0, sizeof(*res));
2042     return bs->drv->bdrv_check(bs, res, fix);
2043 }
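
/* A minimal caller sketch (hypothetical helper; the BdrvCheckResult
 * field names are assumed from block.h):
 */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, BDRV_FIX_LEAKS);

    if (ret < 0) {
        return ret;                     /* the check itself failed */
    }
    return res.corruptions ? -EIO : 0;  /* completed; image may be bad */
}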
2044 
2045 #define COMMIT_BUF_SECTORS 2048
2046 
2047 /* commit the COW (top) image into its backing file */
2048 int bdrv_commit(BlockDriverState *bs)
2049 {
2050     BlockDriver *drv = bs->drv;
2051     int64_t sector, total_sectors, length, backing_length;
2052     int n, ro, open_flags;
2053     int ret = 0;
2054     uint8_t *buf = NULL;
2055     char filename[PATH_MAX];
2056 
2057     if (!drv)
2058         return -ENOMEDIUM;
2059 
2060     if (!bs->backing_hd) {
2061         return -ENOTSUP;
2062     }
2063 
2064     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2065         return -EBUSY;
2066     }
2067 
2068     ro = bs->backing_hd->read_only;
2069     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2070     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2071     open_flags =  bs->backing_hd->open_flags;
2072     open_flags = bs->backing_hd->open_flags;
2073     if (ro) {
2074         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2075             return -EACCES;
2076         }
2077     }
2078 
2079     length = bdrv_getlength(bs);
2080     if (length < 0) {
2081         ret = length;
2082         goto ro_cleanup;
2083     }
2084 
2085     backing_length = bdrv_getlength(bs->backing_hd);
2086     if (backing_length < 0) {
2087         ret = backing_length;
2088         goto ro_cleanup;
2089     }
2090 
2091     /* If our top snapshot is larger than the backing file image,
2092      * grow the backing file image if possible.  If not possible,
2093      * we must return an error */
2094     if (length > backing_length) {
2095         ret = bdrv_truncate(bs->backing_hd, length);
2096         if (ret < 0) {
2097             goto ro_cleanup;
2098         }
2099     }
2100 
2101     total_sectors = length >> BDRV_SECTOR_BITS;
2102     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2103 
2104     for (sector = 0; sector < total_sectors; sector += n) {
2105         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2106         if (ret < 0) {
2107             goto ro_cleanup;
2108         }
2109         if (ret) {
2110             ret = bdrv_read(bs, sector, buf, n);
2111             if (ret < 0) {
2112                 goto ro_cleanup;
2113             }
2114 
2115             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2116             if (ret < 0) {
2117                 goto ro_cleanup;
2118             }
2119         }
2120     }
2121 
2122     if (drv->bdrv_make_empty) {
2123         ret = drv->bdrv_make_empty(bs);
2124         if (ret < 0) {
2125             goto ro_cleanup;
2126         }
2127         bdrv_flush(bs);
2128     }
2129 
2130     /*
2131      * Make sure all data we wrote to the backing device is actually
2132      * stable on disk.
2133      */
2134     if (bs->backing_hd) {
2135         bdrv_flush(bs->backing_hd);
2136     }
2137 
2138     ret = 0;
2139 ro_cleanup:
2140     g_free(buf);
2141 
2142     if (ro) {
2143         /* ignoring error return here */
2144         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2145     }
2146 
2147     return ret;
2148 }
2149 
2150 int bdrv_commit_all(void)
2151 {
2152     BlockDriverState *bs;
2153 
2154     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2155         if (bs->drv && bs->backing_hd) {
2156             int ret = bdrv_commit(bs);
2157             if (ret < 0) {
2158                 return ret;
2159             }
2160         }
2161     }
2162     return 0;
2163 }
2164 
2165 /**
2166  * Remove an active request from the tracked requests list
2167  *
2168  * This function should be called when a tracked request is completing.
2169  */
2170 static void tracked_request_end(BdrvTrackedRequest *req)
2171 {
2172     QLIST_REMOVE(req, list);
2173     qemu_co_queue_restart_all(&req->wait_queue);
2174 }
2175 
2176 /**
2177  * Add an active request to the tracked requests list
2178  */
2179 static void tracked_request_begin(BdrvTrackedRequest *req,
2180                                   BlockDriverState *bs,
2181                                   int64_t sector_num,
2182                                   int nb_sectors, bool is_write)
2183 {
2184     *req = (BdrvTrackedRequest){
2185         .bs = bs,
2186         .sector_num = sector_num,
2187         .nb_sectors = nb_sectors,
2188         .is_write = is_write,
2189         .co = qemu_coroutine_self(),
2190     };
2191 
2192     qemu_co_queue_init(&req->wait_queue);
2193 
2194     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2195 }
2196 
2197 /**
2198  * Round a region to cluster boundaries
2199  */
2200 void bdrv_round_to_clusters(BlockDriverState *bs,
2201                             int64_t sector_num, int nb_sectors,
2202                             int64_t *cluster_sector_num,
2203                             int *cluster_nb_sectors)
2204 {
2205     BlockDriverInfo bdi;
2206 
2207     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2208         *cluster_sector_num = sector_num;
2209         *cluster_nb_sectors = nb_sectors;
2210     } else {
2211         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2212         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2213         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2214                                             nb_sectors, c);
2215     }
2216 }
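
/* Worked example: with a 64 KiB cluster size (128 sectors), a request for
 * sectors [130, 140) rounds down to cluster start 128 and up to cluster
 * end 256, so *cluster_sector_num = 128 and *cluster_nb_sectors = 128.
 */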
2217 
2218 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2219                                      int64_t sector_num, int nb_sectors) {
2220     /*        aaaa   bbbb */
2221     if (sector_num >= req->sector_num + req->nb_sectors) {
2222         return false;
2223     }
2224     /* bbbb   aaaa        */
2225     if (req->sector_num >= sector_num + nb_sectors) {
2226         return false;
2227     }
2228     return true;
2229 }
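
/* Example: a tracked request covering sectors [8, 16) overlaps a request
 * for [12, 20) but not one for [16, 24); the ranges are half-open, so
 * requests that merely touch at an endpoint do not conflict.
 */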
2230 
2231 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
2232         int64_t sector_num, int nb_sectors)
2233 {
2234     BdrvTrackedRequest *req;
2235     int64_t cluster_sector_num;
2236     int cluster_nb_sectors;
2237     bool retry;
2238 
2239     /* If we touch the same cluster it counts as an overlap.  This guarantees
2240      * that allocating writes will be serialized and not race with each other
2241      * for the same cluster.  For example, in copy-on-read it ensures that the
2242      * CoR read and write operations are atomic and guest writes cannot
2243      * interleave between them.
2244      */
2245     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2246                            &cluster_sector_num, &cluster_nb_sectors);
2247 
2248     do {
2249         retry = false;
2250         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2251             if (tracked_request_overlaps(req, cluster_sector_num,
2252                                          cluster_nb_sectors)) {
2253                 /* Hitting this means there was a reentrant request, for
2254                  * example, a block driver issuing nested requests.  This must
2255                  * never happen since it means deadlock.
2256                  */
2257                 assert(qemu_coroutine_self() != req->co);
2258 
2259                 qemu_co_queue_wait(&req->wait_queue);
2260                 retry = true;
2261                 break;
2262             }
2263         }
2264     } while (retry);
2265 }
2266 
2267 /*
2268  * Return values:
2269  * 0        - success
2270  * -EINVAL  - backing format specified, but no file
2271  * -ENOSPC  - can't update the backing file because no space is left in the
2272  *            image file header
2273  * -ENOTSUP - format driver doesn't support changing the backing file
2274  */
2275 int bdrv_change_backing_file(BlockDriverState *bs,
2276     const char *backing_file, const char *backing_fmt)
2277 {
2278     BlockDriver *drv = bs->drv;
2279     int ret;
2280 
2281     /* Backing file format doesn't make sense without a backing file */
2282     if (backing_fmt && !backing_file) {
2283         return -EINVAL;
2284     }
2285 
2286     if (drv->bdrv_change_backing_file != NULL) {
2287         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2288     } else {
2289         ret = -ENOTSUP;
2290     }
2291 
2292     if (ret == 0) {
2293         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2294         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2295     }
2296     return ret;
2297 }
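
/* A minimal usage sketch (hypothetical helper; the file and format names
 * are illustrative only):
 */
static int example_retarget_backing(BlockDriverState *bs)
{
    /* record 'new-base.qcow2' as the backing file in bs's image header */
    return bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");
}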
2298 
2299 /*
2300  * Finds the image layer in the chain that has 'bs' as its backing file.
2301  *
2302  * active is the current topmost image.
2303  *
2304  * Returns NULL if bs is not found in active's image chain,
2305  * or if active == bs.
2306  */
2307 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2308                                     BlockDriverState *bs)
2309 {
2310     BlockDriverState *overlay = NULL;
2311     BlockDriverState *intermediate;
2312 
2313     assert(active != NULL);
2314     assert(bs != NULL);
2315 
2316     /* if bs is the same as active, then by definition it has no overlay
2317      */
2318     if (active == bs) {
2319         return NULL;
2320     }
2321 
2322     intermediate = active;
2323     while (intermediate->backing_hd) {
2324         if (intermediate->backing_hd == bs) {
2325             overlay = intermediate;
2326             break;
2327         }
2328         intermediate = intermediate->backing_hd;
2329     }
2330 
2331     return overlay;
2332 }
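
/* Example: with the chain base <- mid <- top and top as the active layer,
 * bdrv_find_overlay(top, mid) returns top, bdrv_find_overlay(top, base)
 * returns mid, and bdrv_find_overlay(top, top) returns NULL.
 */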
2333 
2334 typedef struct BlkIntermediateStates {
2335     BlockDriverState *bs;
2336     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2337 } BlkIntermediateStates;
2338 
2339 
2340 /*
2341  * Drops images above 'base' up to and including 'top', and sets the image
2342  * above 'top' to have base as its backing file.
2343  *
2344  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2345  * information in 'bs' can be properly updated.
2346  *
2347  * E.g., this will convert the following chain:
2348  * bottom <- base <- intermediate <- top <- active
2349  *
2350  * to
2351  *
2352  * bottom <- base <- active
2353  *
2354  * It is allowed for bottom==base, in which case it converts:
2355  *
2356  * base <- intermediate <- top <- active
2357  *
2358  * to
2359  *
2360  * base <- active
2361  *
2362  * Error conditions:
2363  *  if active == top, that is considered an error
2364  *
2365  */
2366 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2367                            BlockDriverState *base)
2368 {
2369     BlockDriverState *intermediate;
2370     BlockDriverState *base_bs = NULL;
2371     BlockDriverState *new_top_bs = NULL;
2372     BlkIntermediateStates *intermediate_state, *next;
2373     int ret = -EIO;
2374 
2375     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2376     QSIMPLEQ_INIT(&states_to_delete);
2377 
2378     if (!top->drv || !base->drv) {
2379         goto exit;
2380     }
2381 
2382     new_top_bs = bdrv_find_overlay(active, top);
2383 
2384     if (new_top_bs == NULL) {
2385         /* we could not find the image above 'top', this is an error */
2386         goto exit;
2387     }
2388 
2389     /* special case: new_top_bs->backing_hd already points to base -- nothing
2390      * to do, there are no intermediate images */
2391     if (new_top_bs->backing_hd == base) {
2392         ret = 0;
2393         goto exit;
2394     }
2395 
2396     intermediate = top;
2397 
2398     /* now walk down the chain, adding each BDS we find to the deletion
2399      * queue, until we reach 'base'
2400      */
2401     while (intermediate) {
2402         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2403         intermediate_state->bs = intermediate;
2404         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2405 
2406         if (intermediate->backing_hd == base) {
2407             base_bs = intermediate->backing_hd;
2408             break;
2409         }
2410         intermediate = intermediate->backing_hd;
2411     }
2412     if (base_bs == NULL) {
2413         /* something went wrong: we did not end up at the base.  Safely
2414          * unwind everything and exit with an error */
2415         goto exit;
2416     }
2417 
2418     /* success - we can delete the intermediate states, and link top->base */
2419     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2420                                    base_bs->drv ? base_bs->drv->format_name : "");
2421     if (ret) {
2422         goto exit;
2423     }
2424     new_top_bs->backing_hd = base_bs;
2425 
2426 
2427     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2428         /* so that bdrv_close() does not recursively close the chain */
2429         intermediate_state->bs->backing_hd = NULL;
2430         bdrv_unref(intermediate_state->bs);
2431     }
2432     ret = 0;
2433 
2434 exit:
2435     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2436         g_free(intermediate_state);
2437     }
2438     return ret;
2439 }
2440 
2441 
2442 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2443                                    size_t size)
2444 {
2445     int64_t len;
2446 
2447     if (!bdrv_is_inserted(bs))
2448         return -ENOMEDIUM;
2449 
2450     if (bs->growable)
2451         return 0;
2452 
2453     len = bdrv_getlength(bs);
2454 
2455     if (offset < 0)
2456         return -EIO;
2457 
2458     if ((offset > len) || (len - offset < size))
2459         return -EIO;
2460 
2461     return 0;
2462 }
2463 
2464 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2465                               int nb_sectors)
2466 {
2467     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2468                                    nb_sectors * BDRV_SECTOR_SIZE);
2469 }
2470 
2471 typedef struct RwCo {
2472     BlockDriverState *bs;
2473     int64_t sector_num;
2474     int nb_sectors;
2475     QEMUIOVector *qiov;
2476     bool is_write;
2477     int ret;
2478     BdrvRequestFlags flags;
2479 } RwCo;
2480 
2481 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2482 {
2483     RwCo *rwco = opaque;
2484 
2485     if (!rwco->is_write) {
2486         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
2487                                      rwco->nb_sectors, rwco->qiov,
2488                                      rwco->flags);
2489     } else {
2490         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
2491                                       rwco->nb_sectors, rwco->qiov,
2492                                       rwco->flags);
2493     }
2494 }
2495 
2496 /*
2497  * Process a vectored synchronous request using coroutines
2498  */
2499 static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
2500                        QEMUIOVector *qiov, bool is_write,
2501                        BdrvRequestFlags flags)
2502 {
2503     Coroutine *co;
2504     RwCo rwco = {
2505         .bs = bs,
2506         .sector_num = sector_num,
2507         .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
2508         .qiov = qiov,
2509         .is_write = is_write,
2510         .ret = NOT_DONE,
2511         .flags = flags,
2512     };
2513     assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
2514 
2515     /**
2516      * In a synchronous call context the vcpu is blocked, so the throttling
2517      * timer will never fire; I/O throttling therefore has to be disabled
2518      * here if it has been enabled.
2519      */
2520     if (bs->io_limits_enabled) {
2521         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2522                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2523         bdrv_io_limits_disable(bs);
2524     }
2525 
2526     if (qemu_in_coroutine()) {
2527         /* Fast-path if already in coroutine context */
2528         bdrv_rw_co_entry(&rwco);
2529     } else {
2530         co = qemu_coroutine_create(bdrv_rw_co_entry);
2531         qemu_coroutine_enter(co, &rwco);
2532         while (rwco.ret == NOT_DONE) {
2533             qemu_aio_wait();
2534         }
2535     }
2536     return rwco.ret;
2537 }
2538 
2539 /*
2540  * Process a synchronous request using coroutines
2541  */
2542 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2543                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2544 {
2545     QEMUIOVector qiov;
2546     struct iovec iov = {
2547         .iov_base = (void *)buf,
2548         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2549     };
2550 
2551     qemu_iovec_init_external(&qiov, &iov, 1);
2552     return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
2553 }
2554 
2555 /* return < 0 if error. See bdrv_write() for the return codes */
2556 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2557               uint8_t *buf, int nb_sectors)
2558 {
2559     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2560 }
2561 
2562 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2563 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2564                           uint8_t *buf, int nb_sectors)
2565 {
2566     bool enabled;
2567     int ret;
2568 
2569     enabled = bs->io_limits_enabled;
2570     bs->io_limits_enabled = false;
2571     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2572     bs->io_limits_enabled = enabled;
2573     return ret;
2574 }
2575 
2576 /* Return < 0 if error. Important errors are:
2577   -EIO         generic I/O error (may happen for all errors)
2578   -ENOMEDIUM   no media inserted
2579   -EINVAL      invalid sector number or nb_sectors
2580   -EACCES      trying to write to a read-only device
2581 */
2582 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2583                const uint8_t *buf, int nb_sectors)
2584 {
2585     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2586 }
2587 
2588 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
2589 {
2590     return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
2591 }
2592 
2593 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2594                       int nb_sectors, BdrvRequestFlags flags)
2595 {
2596     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2597                       BDRV_REQ_ZERO_WRITE | flags);
2598 }
2599 
2600 /*
2601  * Completely zero out a block device with the help of bdrv_write_zeroes.
2602  * The operation is sped up by checking the block status and only writing
2603  * zeroes to regions that do not already read back as zeroes. Optional
2604  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2605  *
2606  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2607  */
2608 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2609 {
2610     int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2611     int64_t ret, nb_sectors, sector_num = 0;
2612     int n;
2613 
2614     for (;;) {
2615         nb_sectors = target_size - sector_num;
2616         if (nb_sectors <= 0) {
2617             return 0;
2618         }
2619         if (nb_sectors > INT_MAX) {
2620             nb_sectors = INT_MAX;
2621         }
2622         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2623         if (ret < 0) {
2624             error_report("error getting block status at sector %" PRId64 ": %s",
2625                          sector_num, strerror(-ret));
2626             return ret;
2627         }
2628         if (ret & BDRV_BLOCK_ZERO) {
2629             sector_num += n;
2630             continue;
2631         }
2632         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2633         if (ret < 0) {
2634             error_report("error writing zeroes at sector %" PRId64 ": %s",
2635                          sector_num, strerror(-ret));
2636             return ret;
2637         }
2638         sector_num += n;
2639     }
2640 }
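
/* A minimal usage sketch (hypothetical helper): blank an entire device,
 * letting drivers unmap clusters instead of writing literal zeroes where
 * they can.
 */
static int example_blank_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}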
2641 
2642 int bdrv_pread(BlockDriverState *bs, int64_t offset,
2643                void *buf, int count1)
2644 {
2645     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2646     int len, nb_sectors, count;
2647     int64_t sector_num;
2648     int ret;
2649 
2650     count = count1;
2651     /* first read to align to sector start */
2652     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2653     if (len > count)
2654         len = count;
2655     sector_num = offset >> BDRV_SECTOR_BITS;
2656     if (len > 0) {
2657         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2658             return ret;
2659         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
2660         count -= len;
2661         if (count == 0)
2662             return count1;
2663         sector_num++;
2664         buf += len;
2665     }
2666 
2667     /* read the sectors "in place" */
2668     nb_sectors = count >> BDRV_SECTOR_BITS;
2669     if (nb_sectors > 0) {
2670         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
2671             return ret;
2672         sector_num += nb_sectors;
2673         len = nb_sectors << BDRV_SECTOR_BITS;
2674         buf += len;
2675         count -= len;
2676     }
2677 
2678     /* add data from the last sector */
2679     if (count > 0) {
2680         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2681             return ret;
2682         memcpy(buf, tmp_buf, count);
2683     }
2684     return count1;
2685 }
2686 
2687 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2688 {
2689     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2690     int len, nb_sectors, count;
2691     int64_t sector_num;
2692     int ret;
2693 
2694     count = qiov->size;
2695 
2696     /* first write to align to sector start */
2697     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2698     if (len > count)
2699         len = count;
2700     sector_num = offset >> BDRV_SECTOR_BITS;
2701     if (len > 0) {
2702         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2703             return ret;
2704         qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
2705                           len);
2706         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2707             return ret;
2708         count -= len;
2709         if (count == 0)
2710             return qiov->size;
2711         sector_num++;
2712     }
2713 
2714     /* write the sectors "in place" */
2715     nb_sectors = count >> BDRV_SECTOR_BITS;
2716     if (nb_sectors > 0) {
2717         QEMUIOVector qiov_inplace;
2718 
2719         qemu_iovec_init(&qiov_inplace, qiov->niov);
2720         qemu_iovec_concat(&qiov_inplace, qiov, len,
2721                           nb_sectors << BDRV_SECTOR_BITS);
2722         ret = bdrv_writev(bs, sector_num, &qiov_inplace);
2723         qemu_iovec_destroy(&qiov_inplace);
2724         if (ret < 0) {
2725             return ret;
2726         }
2727 
2728         sector_num += nb_sectors;
2729         len = nb_sectors << BDRV_SECTOR_BITS;
2730         count -= len;
2731     }
2732 
2733     /* add data from the last sector */
2734     if (count > 0) {
2735         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2736             return ret;
2737         qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
2738         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2739             return ret;
2740     }
2741     return qiov->size;
2742 }
2743 
2744 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2745                 const void *buf, int count1)
2746 {
2747     QEMUIOVector qiov;
2748     struct iovec iov = {
2749         .iov_base   = (void *) buf,
2750         .iov_len    = count1,
2751     };
2752 
2753     qemu_iovec_init_external(&qiov, &iov, 1);
2754     return bdrv_pwritev(bs, offset, &qiov);
2755 }
2756 
2757 /*
2758  * Writes to the file and ensures that no writes are reordered across this
2759  * request (acts as a barrier)
2760  *
2761  * Returns 0 on success, -errno in error cases.
2762  */
2763 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2764     const void *buf, int count)
2765 {
2766     int ret;
2767 
2768     ret = bdrv_pwrite(bs, offset, buf, count);
2769     if (ret < 0) {
2770         return ret;
2771     }
2772 
2773     /* No flush needed for cache modes that already do it */
2774     if (bs->enable_write_cache) {
2775         bdrv_flush(bs);
2776     }
2777 
2778     return 0;
2779 }
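
/* A sketch of the metadata-update pattern this enables (hypothetical
 * helper; a format driver would write its header through bs->file so the
 * update is durable before any dependent writes are issued):
 */
static int example_write_header(BlockDriverState *bs,
                                const void *header, int len)
{
    return bdrv_pwrite_sync(bs->file, 0, header, len);
}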
2780 
2781 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2782         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2783 {
2784     /* Perform I/O through a temporary buffer so that users who scribble over
2785      * their read buffer while the operation is in progress do not end up
2786      * modifying the image file.  This is critical for zero-copy guest I/O
2787      * where anything might happen inside guest memory.
2788      */
2789     void *bounce_buffer;
2790 
2791     BlockDriver *drv = bs->drv;
2792     struct iovec iov;
2793     QEMUIOVector bounce_qiov;
2794     int64_t cluster_sector_num;
2795     int cluster_nb_sectors;
2796     size_t skip_bytes;
2797     int ret;
2798 
2799     /* Cover entire cluster so no additional backing file I/O is required when
2800     /* Cover the entire cluster so no additional backing file I/O is required
2801      * when allocating the cluster in the image file.
2802     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2803                            &cluster_sector_num, &cluster_nb_sectors);
2804 
2805     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2806                                    cluster_sector_num, cluster_nb_sectors);
2807 
2808     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2809     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2810     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2811 
2812     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2813                              &bounce_qiov);
2814     if (ret < 0) {
2815         goto err;
2816     }
2817 
2818     if (drv->bdrv_co_write_zeroes &&
2819         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2820         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2821                                       cluster_nb_sectors, 0);
2822     } else {
2823         /* This does not change the data on the disk, so it is not necessary
2824          * to flush even in cache=writethrough mode.
2825          */
2826         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2827                                   &bounce_qiov);
2828     }
2829 
2830     if (ret < 0) {
2831         /* It might be okay to ignore write errors for guest requests.  If this
2832          * is a deliberate copy-on-read then we don't want to ignore the error.
2833          * Simply report it in all cases.
2834          */
2835         goto err;
2836     }
2837 
2838     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2839     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2840                         nb_sectors * BDRV_SECTOR_SIZE);
2841 
2842 err:
2843     qemu_vfree(bounce_buffer);
2844     return ret;
2845 }
2846 
2847 /*
2848  * Handle a read request in coroutine context
2849  */
2850 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2851     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2852     BdrvRequestFlags flags)
2853 {
2854     BlockDriver *drv = bs->drv;
2855     BdrvTrackedRequest req;
2856     int ret;
2857 
2858     if (!drv) {
2859         return -ENOMEDIUM;
2860     }
2861     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2862         return -EIO;
2863     }
2864 
2865     if (bs->copy_on_read) {
2866         flags |= BDRV_REQ_COPY_ON_READ;
2867     }
2868     if (flags & BDRV_REQ_COPY_ON_READ) {
2869         bs->copy_on_read_in_flight++;
2870     }
2871 
2872     if (bs->copy_on_read_in_flight) {
2873         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2874     }
2875 
2876     /* throttling disk I/O */
2877     if (bs->io_limits_enabled) {
2878         bdrv_io_limits_intercept(bs, nb_sectors, false);
2879     }
2880 
2881     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2882 
2883     if (flags & BDRV_REQ_COPY_ON_READ) {
2884         int pnum;
2885 
2886         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2887         if (ret < 0) {
2888             goto out;
2889         }
2890 
2891         if (!ret || pnum != nb_sectors) {
2892             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2893             goto out;
2894         }
2895     }
2896 
2897     if (!(bs->zero_beyond_eof && bs->growable)) {
2898         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2899     } else {
2900         /* Read zeroes after EOF of growable BDSes */
2901         int64_t len, total_sectors, max_nb_sectors;
2902 
2903         len = bdrv_getlength(bs);
2904         if (len < 0) {
2905             ret = len;
2906             goto out;
2907         }
2908 
2909         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2910         max_nb_sectors = MAX(0, total_sectors - sector_num);
2911         if (max_nb_sectors > 0) {
2912             ret = drv->bdrv_co_readv(bs, sector_num,
2913                                      MIN(nb_sectors, max_nb_sectors), qiov);
2914         } else {
2915             ret = 0;
2916         }
2917 
2918         /* Reading beyond end of file is supposed to produce zeroes */
2919         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2920             uint64_t offset = MAX(0, total_sectors - sector_num);
2921             uint64_t bytes = (sector_num + nb_sectors - offset) *
2922                               BDRV_SECTOR_SIZE;
2923             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2924         }
2925     }
2926 
2927 out:
2928     tracked_request_end(&req);
2929 
2930     if (flags & BDRV_REQ_COPY_ON_READ) {
2931         bs->copy_on_read_in_flight--;
2932     }
2933 
2934     return ret;
2935 }
2936 
2937 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
2938     int nb_sectors, QEMUIOVector *qiov)
2939 {
2940     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
2941 
2942     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
2943 }
2944 
2945 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
2946     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2947 {
2948     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
2949 
2950     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
2951                             BDRV_REQ_COPY_ON_READ);
2952 }
2953 
2954 /* If no limit is specified in the BlockLimits, use a default
2955  * of 32768 512-byte sectors (16 MiB) per request.
2956  */
2957 #define MAX_WRITE_ZEROES_DEFAULT 32768
2958 
2959 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
2960     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
2961 {
2962     BlockDriver *drv = bs->drv;
2963     QEMUIOVector qiov;
2964     struct iovec iov = {0};
2965     int ret = 0;
2966 
2967     int max_write_zeroes = bs->bl.max_write_zeroes ?
2968                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
2969 
2970     while (nb_sectors > 0 && !ret) {
2971         int num = nb_sectors;
2972 
2973         /* Align request.  Block drivers can expect the "bulk" of the request
2974          * to be aligned.
2975          */
2976         if (bs->bl.write_zeroes_alignment
2977             && num > bs->bl.write_zeroes_alignment) {
2978             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
2979                 /* Make a small request up to the first aligned sector.  */
2980                 num = bs->bl.write_zeroes_alignment;
2981                 num -= sector_num % bs->bl.write_zeroes_alignment;
2982             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
2983                 /* Shorten the request to the last aligned sector.  num cannot
2984                  * underflow because num > bs->bl.write_zeroes_alignment.
2985                  */
2986                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
2987             }
2988         }
2989 
2990         /* limit request size */
2991         if (num > max_write_zeroes) {
2992             num = max_write_zeroes;
2993         }
2994 
2995         ret = -ENOTSUP;
2996         /* First try the efficient write zeroes operation */
2997         if (drv->bdrv_co_write_zeroes) {
2998             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
2999         }
3000 
3001         if (ret == -ENOTSUP) {
3002             /* Fall back to bounce buffer if write zeroes is unsupported */
3003             iov.iov_len = num * BDRV_SECTOR_SIZE;
3004             if (iov.iov_base == NULL) {
3005                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3006                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3007             }
3008             qemu_iovec_init_external(&qiov, &iov, 1);
3009 
3010             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3011 
3012             /* Keep the bounce buffer around if it is big enough for
3013              * all future requests.
3014              */
3015             if (num < max_write_zeroes) {
3016                 qemu_vfree(iov.iov_base);
3017                 iov.iov_base = NULL;
3018             }
3019         }
3020 
3021         sector_num += num;
3022         nb_sectors -= num;
3023     }
3024 
3025     qemu_vfree(iov.iov_base);
3026     return ret;
3027 }
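
/* Worked example of the alignment logic above: with a reported
 * write_zeroes_alignment of 128 sectors and a request for sectors
 * [100, 1000), the first iteration shortens to [100, 128), the second
 * covers the aligned middle [128, 896), and the last handles the
 * unaligned tail [896, 1000).
 */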
3028 
3029 /*
3030  * Handle a write request in coroutine context
3031  */
3032 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3033     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3034     BdrvRequestFlags flags)
3035 {
3036     BlockDriver *drv = bs->drv;
3037     BdrvTrackedRequest req;
3038     int ret;
3039 
3040     if (!bs->drv) {
3041         return -ENOMEDIUM;
3042     }
3043     if (bs->read_only) {
3044         return -EACCES;
3045     }
3046     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3047         return -EIO;
3048     }
3049 
3050     if (bs->copy_on_read_in_flight) {
3051         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
3052     }
3053 
3054     /* throttling disk I/O */
3055     if (bs->io_limits_enabled) {
3056         bdrv_io_limits_intercept(bs, nb_sectors, true);
3057     }
3058 
3059     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
3060 
3061     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
3062 
3063     if (ret < 0) {
3064         /* Do nothing, write notifier decided to fail this request */
3065     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3066         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3067     } else {
3068         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3069     }
3070 
3071     if (ret == 0 && !bs->enable_write_cache) {
3072         ret = bdrv_co_flush(bs);
3073     }
3074 
3075     bdrv_set_dirty(bs, sector_num, nb_sectors);
3076 
3077     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3078         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3079     }
3080     if (bs->growable && ret >= 0) {
3081         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3082     }
3083 
3084     tracked_request_end(&req);
3085 
3086     return ret;
3087 }
3088 
3089 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3090     int nb_sectors, QEMUIOVector *qiov)
3091 {
3092     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3093 
3094     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3095 }
3096 
3097 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3098                                       int64_t sector_num, int nb_sectors,
3099                                       BdrvRequestFlags flags)
3100 {
3101     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3102 
3103     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3104         flags &= ~BDRV_REQ_MAY_UNMAP;
3105     }
3106 
3107     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3108                              BDRV_REQ_ZERO_WRITE | flags);
3109 }
3110 
3111 /**
3112  * Truncate file to 'offset' bytes (needed only for file protocols)
3113  */
3114 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3115 {
3116     BlockDriver *drv = bs->drv;
3117     int ret;
3118     if (!drv)
3119         return -ENOMEDIUM;
3120     if (!drv->bdrv_truncate)
3121         return -ENOTSUP;
3122     if (bs->read_only)
3123         return -EACCES;
3124     if (bdrv_in_use(bs))
3125         return -EBUSY;
3126     ret = drv->bdrv_truncate(bs, offset);
3127     if (ret == 0) {
3128         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3129         bdrv_dev_resize_cb(bs);
3130     }
3131     return ret;
3132 }
3133 
3134 /**
3135  * Length of an allocated file in bytes. Sparse files are counted by actual
3136  * allocated space. Return < 0 if error or unknown.
3137  */
3138 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3139 {
3140     BlockDriver *drv = bs->drv;
3141     if (!drv) {
3142         return -ENOMEDIUM;
3143     }
3144     if (drv->bdrv_get_allocated_file_size) {
3145         return drv->bdrv_get_allocated_file_size(bs);
3146     }
3147     if (bs->file) {
3148         return bdrv_get_allocated_file_size(bs->file);
3149     }
3150     return -ENOTSUP;
3151 }
3152 
3153 /**
3154  * Length of a file in bytes. Return < 0 if error or unknown.
3155  */
3156 int64_t bdrv_getlength(BlockDriverState *bs)
3157 {
3158     BlockDriver *drv = bs->drv;
3159     if (!drv)
3160         return -ENOMEDIUM;
3161 
3162     if (drv->has_variable_length) {
3163         int ret = refresh_total_sectors(bs, bs->total_sectors);
3164         if (ret < 0) {
3165             return ret;
3166         }
3167     }
3168     return bs->total_sectors * BDRV_SECTOR_SIZE;
3169 }
3170 
3171 /* return 0 as the number of sectors if no device is present or on error */
3172 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3173 {
3174     int64_t length;
3175     length = bdrv_getlength(bs);
3176     if (length < 0)
3177         length = 0;
3178     else
3179         length = length >> BDRV_SECTOR_BITS;
3180     *nb_sectors_ptr = length;
3181 }
3182 
3183 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3184                        BlockdevOnError on_write_error)
3185 {
3186     bs->on_read_error = on_read_error;
3187     bs->on_write_error = on_write_error;
3188 }
3189 
3190 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3191 {
3192     return is_read ? bs->on_read_error : bs->on_write_error;
3193 }
3194 
3195 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3196 {
3197     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3198 
3199     switch (on_err) {
3200     case BLOCKDEV_ON_ERROR_ENOSPC:
3201         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3202     case BLOCKDEV_ON_ERROR_STOP:
3203         return BDRV_ACTION_STOP;
3204     case BLOCKDEV_ON_ERROR_REPORT:
3205         return BDRV_ACTION_REPORT;
3206     case BLOCKDEV_ON_ERROR_IGNORE:
3207         return BDRV_ACTION_IGNORE;
3208     default:
3209         abort();
3210     }
3211 }
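
/* A sketch of a device model's error path (hypothetical helper): decide
 * on the configured policy action, then apply and report it through
 * bdrv_error_action() below.
 */
static void example_handle_io_error(BlockDriverState *bs, bool is_read,
                                    int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    bdrv_error_action(bs, action, is_read, error);
    /* on BDRV_ACTION_STOP the VM is now stopped and the request should
     * be retried after the guest is resumed */
}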
3212 
3213 /* This is done by device models because, while the block layer knows
3214  * about the error, it does not know whether an operation comes from
3215  * the device or the block layer (from a job, for example).
3216  */
3217 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3218                        bool is_read, int error)
3219 {
3220     assert(error >= 0);
3221     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3222     if (action == BDRV_ACTION_STOP) {
3223         vm_stop(RUN_STATE_IO_ERROR);
3224         bdrv_iostatus_set_err(bs, error);
3225     }
3226 }
3227 
3228 int bdrv_is_read_only(BlockDriverState *bs)
3229 {
3230     return bs->read_only;
3231 }
3232 
3233 int bdrv_is_sg(BlockDriverState *bs)
3234 {
3235     return bs->sg;
3236 }
3237 
3238 int bdrv_enable_write_cache(BlockDriverState *bs)
3239 {
3240     return bs->enable_write_cache;
3241 }
3242 
3243 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3244 {
3245     bs->enable_write_cache = wce;
3246 
3247     /* so a reopen() will preserve wce */
3248     if (wce) {
3249         bs->open_flags |= BDRV_O_CACHE_WB;
3250     } else {
3251         bs->open_flags &= ~BDRV_O_CACHE_WB;
3252     }
3253 }
3254 
3255 int bdrv_is_encrypted(BlockDriverState *bs)
3256 {
3257     if (bs->backing_hd && bs->backing_hd->encrypted)
3258         return 1;
3259     return bs->encrypted;
3260 }
3261 
3262 int bdrv_key_required(BlockDriverState *bs)
3263 {
3264     BlockDriverState *backing_hd = bs->backing_hd;
3265 
3266     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3267         return 1;
3268     return (bs->encrypted && !bs->valid_key);
3269 }
3270 
3271 int bdrv_set_key(BlockDriverState *bs, const char *key)
3272 {
3273     int ret;
3274     if (bs->backing_hd && bs->backing_hd->encrypted) {
3275         ret = bdrv_set_key(bs->backing_hd, key);
3276         if (ret < 0)
3277             return ret;
3278         if (!bs->encrypted)
3279             return 0;
3280     }
3281     if (!bs->encrypted) {
3282         return -EINVAL;
3283     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3284         return -ENOMEDIUM;
3285     }
3286     ret = bs->drv->bdrv_set_key(bs, key);
3287     if (ret < 0) {
3288         bs->valid_key = 0;
3289     } else if (!bs->valid_key) {
3290         bs->valid_key = 1;
3291         /* call the change callback now, we skipped it on open */
3292         bdrv_dev_change_media_cb(bs, true);
3293     }
3294     return ret;
3295 }
3296 
3297 const char *bdrv_get_format_name(BlockDriverState *bs)
3298 {
3299     return bs->drv ? bs->drv->format_name : NULL;
3300 }
3301 
3302 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3303                          void *opaque)
3304 {
3305     BlockDriver *drv;
3306 
3307     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3308         it(opaque, drv->format_name);
3309     }
3310 }
3311 
3312 /* Find a block backend (a BDS with a device name) by name */
3313 BlockDriverState *bdrv_find(const char *name)
3314 {
3315     BlockDriverState *bs;
3316 
3317     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3318         if (!strcmp(name, bs->device_name)) {
3319             return bs;
3320         }
3321     }
3322     return NULL;
3323 }
3324 
3325 /* Find a named node in the BDS graph */
3326 BlockDriverState *bdrv_find_node(const char *node_name)
3327 {
3328     BlockDriverState *bs;
3329 
3330     assert(node_name);
3331 
3332     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3333         if (!strcmp(node_name, bs->node_name)) {
3334             return bs;
3335         }
3336     }
3337     return NULL;
3338 }
3339 
3340 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3341 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3342 {
3343     BlockDeviceInfoList *list, *entry;
3344     BlockDriverState *bs;
3345 
3346     list = NULL;
3347     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3348         entry = g_malloc0(sizeof(*entry));
3349         entry->value = bdrv_block_device_info(bs);
3350         entry->next = list;
3351         list = entry;
3352     }
3353 
3354     return list;
3355 }
3356 
3357 BlockDriverState *bdrv_lookup_bs(const char *device,
3358                                  const char *node_name,
3359                                  Error **errp)
3360 {
3361     BlockDriverState *bs = NULL;
3362 
3363     if ((!device && !node_name) || (device && node_name)) {
3364         error_setg(errp, "Use either device or node-name but not both");
3365         return NULL;
3366     }
3367 
3368     if (device) {
3369         bs = bdrv_find(device);
3370 
3371         if (!bs) {
3372             error_set(errp, QERR_DEVICE_NOT_FOUND, device);
3373             return NULL;
3374         }
3375 
3376         return bs;
3377     }
3378 
3379     bs = bdrv_find_node(node_name);
3380 
3381     if (!bs) {
3382         error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
3383         return NULL;
3384     }
3385 
3386     return bs;
3387 }
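
/* A minimal usage sketch (hypothetical helper): exactly one of 'device'
 * and 'node_name' may be non-NULL, as enforced above.
 */
static int64_t example_query_length(const char *device,
                                    const char *node_name, Error **errp)
{
    BlockDriverState *bs = bdrv_lookup_bs(device, node_name, errp);

    if (!bs) {
        return -ENODEV;
    }
    return bdrv_getlength(bs);
}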
3388 
3389 BlockDriverState *bdrv_next(BlockDriverState *bs)
3390 {
3391     if (!bs) {
3392         return QTAILQ_FIRST(&bdrv_states);
3393     }
3394     return QTAILQ_NEXT(bs, device_list);
3395 }
3396 
3397 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3398 {
3399     BlockDriverState *bs;
3400 
3401     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3402         it(opaque, bs);
3403     }
3404 }
3405 
3406 const char *bdrv_get_device_name(BlockDriverState *bs)
3407 {
3408     return bs->device_name;
3409 }
3410 
3411 int bdrv_get_flags(BlockDriverState *bs)
3412 {
3413     return bs->open_flags;
3414 }
3415 
3416 int bdrv_flush_all(void)
3417 {
3418     BlockDriverState *bs;
3419     int result = 0;
3420 
3421     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3422         int ret = bdrv_flush(bs);
3423         if (ret < 0 && !result) {
3424             result = ret;
3425         }
3426     }
3427 
3428     return result;
3429 }
3430 
3431 int bdrv_has_zero_init_1(BlockDriverState *bs)
3432 {
3433     return 1;
3434 }
3435 
3436 int bdrv_has_zero_init(BlockDriverState *bs)
3437 {
3438     assert(bs->drv);
3439 
3440     /* If BS is a copy-on-write image, it is initialized to
3441        the contents of the base image, which may not be zeroes.  */
3442     if (bs->backing_hd) {
3443         return 0;
3444     }
3445     if (bs->drv->bdrv_has_zero_init) {
3446         return bs->drv->bdrv_has_zero_init(bs);
3447     }
3448 
3449     /* safe default */
3450     return 0;
3451 }
3452 
3453 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3454 {
3455     BlockDriverInfo bdi;
3456 
3457     if (bs->backing_hd) {
3458         return false;
3459     }
3460 
3461     if (bdrv_get_info(bs, &bdi) == 0) {
3462         return bdi.unallocated_blocks_are_zero;
3463     }
3464 
3465     return false;
3466 }
3467 
3468 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3469 {
3470     BlockDriverInfo bdi;
3471 
3472     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3473         return false;
3474     }
3475 
3476     if (bdrv_get_info(bs, &bdi) == 0) {
3477         return bdi.can_write_zeroes_with_unmap;
3478     }
3479 
3480     return false;
3481 }
3482 
3483 typedef struct BdrvCoGetBlockStatusData {
3484     BlockDriverState *bs;
3485     BlockDriverState *base;
3486     int64_t sector_num;
3487     int nb_sectors;
3488     int *pnum;
3489     int64_t ret;
3490     bool done;
3491 } BdrvCoGetBlockStatusData;
3492 
3493 /*
3494  * Returns the allocation status (BDRV_BLOCK_* flags) of the specified sector.
3495  * Drivers not implementing bdrv_co_get_block_status are assumed to not support
3496  * backing files, hence all their sectors are reported as allocated.
3497  *
3498  * If 'sector_num' is beyond the end of the disk image the return value is 0
3499  * and 'pnum' is set to 0.
3500  *
3501  * 'pnum' is set to the number of sectors (including and immediately following
3502  * the specified sector) that are known to be in the same
3503  * allocated/unallocated state.
3504  *
3505  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3506  * beyond the end of the disk image it will be clamped.
3507  */
3508 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3509                                                      int64_t sector_num,
3510                                                      int nb_sectors, int *pnum)
3511 {
3512     int64_t length;
3513     int64_t n;
3514     int64_t ret, ret2;
3515 
3516     length = bdrv_getlength(bs);
3517     if (length < 0) {
3518         return length;
3519     }
3520 
3521     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3522         *pnum = 0;
3523         return 0;
3524     }
3525 
3526     n = bs->total_sectors - sector_num;
3527     if (n < nb_sectors) {
3528         nb_sectors = n;
3529     }
3530 
3531     if (!bs->drv->bdrv_co_get_block_status) {
3532         *pnum = nb_sectors;
3533         ret = BDRV_BLOCK_DATA;
3534         if (bs->drv->protocol_name) {
3535             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3536         }
3537         return ret;
3538     }
3539 
3540     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3541     if (ret < 0) {
3542         *pnum = 0;
3543         return ret;
3544     }
3545 
3546     if (ret & BDRV_BLOCK_RAW) {
3547         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3548         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3549                                      *pnum, pnum);
3550     }
3551 
3552     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3553         if (bdrv_unallocated_blocks_are_zero(bs)) {
3554             ret |= BDRV_BLOCK_ZERO;
3555         } else if (bs->backing_hd) {
3556             BlockDriverState *bs2 = bs->backing_hd;
3557             int64_t length2 = bdrv_getlength(bs2);
3558             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3559                 ret |= BDRV_BLOCK_ZERO;
3560             }
3561         }
3562     }
3563 
3564     if (bs->file &&
3565         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3566         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3567         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3568                                         *pnum, pnum);
3569         if (ret2 >= 0) {
3570             /* Ignore errors.  This is just providing extra information, it
3571              * is useful but not necessary.
3572              */
3573             ret |= (ret2 & BDRV_BLOCK_ZERO);
3574         }
3575     }
3576 
3577     return ret;
3578 }
3579 
3580 /* Coroutine wrapper for bdrv_get_block_status() */
3581 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3582 {
3583     BdrvCoGetBlockStatusData *data = opaque;
3584     BlockDriverState *bs = data->bs;
3585 
3586     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3587                                          data->pnum);
3588     data->done = true;
3589 }
3590 
3591 /*
3592  * Synchronous wrapper around bdrv_co_get_block_status().
3593  *
3594  * See bdrv_co_get_block_status() for details.
3595  */
3596 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3597                               int nb_sectors, int *pnum)
3598 {
3599     Coroutine *co;
3600     BdrvCoGetBlockStatusData data = {
3601         .bs = bs,
3602         .sector_num = sector_num,
3603         .nb_sectors = nb_sectors,
3604         .pnum = pnum,
3605         .done = false,
3606     };
3607 
3608     if (qemu_in_coroutine()) {
3609         /* Fast-path if already in coroutine context */
3610         bdrv_get_block_status_co_entry(&data);
3611     } else {
3612         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3613         qemu_coroutine_enter(co, &data);
3614         while (!data.done) {
3615             qemu_aio_wait();
3616         }
3617     }
3618     return data.ret;
3619 }
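
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * scanning an image and decoding the BDRV_BLOCK_* flags returned by
 * bdrv_get_block_status().  The function clamps nb_sectors to the end
 * of the image, so a fixed chunk size is safe.
 */
#if 0
static void example_dump_allocation(BlockDriverState *bs)
{
    int64_t sector = 0;

    while (sector < bs->total_sectors) {
        int num;
        int64_t ret = bdrv_get_block_status(bs, sector, 65536, &num);
        if (ret < 0 || num == 0) {
            break;
        }
        printf("%" PRId64 "+%d: data=%d zero=%d\n", sector, num,
               !!(ret & BDRV_BLOCK_DATA), !!(ret & BDRV_BLOCK_ZERO));
        sector += num;
    }
}
#endif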
3620 
3621 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3622                                    int nb_sectors, int *pnum)
3623 {
3624     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3625     if (ret < 0) {
3626         return ret;
3627     }
3628     return
3629         (ret & BDRV_BLOCK_DATA) ||
3630         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3631 }
3632 
3633 /*
3634  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3635  *
3636  * Return true if the given sector is allocated in any image between
3637  * BASE and TOP (BASE itself is excluded from the search).  BASE can be
3638  * NULL to check the whole chain.  Return false otherwise.
3639  *
3640  * 'pnum' is set to the number of sectors (including and immediately following
3641  *  the specified sector) that are known to be in the same
3642  *  allocated/unallocated state.
3643  *
3644  */
3645 int bdrv_is_allocated_above(BlockDriverState *top,
3646                             BlockDriverState *base,
3647                             int64_t sector_num,
3648                             int nb_sectors, int *pnum)
3649 {
3650     BlockDriverState *intermediate;
3651     int ret, n = nb_sectors;
3652 
3653     intermediate = top;
3654     while (intermediate && intermediate != base) {
3655         int pnum_inter;
3656         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3657                                 &pnum_inter);
3658         if (ret < 0) {
3659             return ret;
3660         } else if (ret) {
3661             *pnum = pnum_inter;
3662             return 1;
3663         }
3664 
3665         /*
3666          * [sector_num, nb_sectors] is unallocated on top but intermediate
3667          * might have
3668          *
3669          * [sector_num+x, nb_sectors] allocated.
3670          */
3671         if (n > pnum_inter &&
3672             (intermediate == top ||
3673              sector_num + pnum_inter < intermediate->total_sectors)) {
3674             n = pnum_inter;
3675         }
3676 
3677         intermediate = intermediate->backing_hd;
3678     }
3679 
3680     *pnum = n;
3681     return 0;
3682 }
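
/*
 * Worked example (editor's note): for a chain base <- inter <- top, a
 * sector allocated only in 'inter' makes bdrv_is_allocated_above(top,
 * base, ...) return 1, while a sector allocated only in 'base' yields 0,
 * since the walk stops before examining 'base' itself.
 */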
3683 
3684 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3685 {
3686     if (bs->backing_hd && bs->backing_hd->encrypted)
3687         return bs->backing_file;
3688     else if (bs->encrypted)
3689         return bs->filename;
3690     else
3691         return NULL;
3692 }
3693 
3694 void bdrv_get_backing_filename(BlockDriverState *bs,
3695                                char *filename, int filename_size)
3696 {
3697     pstrcpy(filename, filename_size, bs->backing_file);
3698 }
3699 
3700 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3701                           const uint8_t *buf, int nb_sectors)
3702 {
3703     BlockDriver *drv = bs->drv;
3704     if (!drv)
3705         return -ENOMEDIUM;
3706     if (!drv->bdrv_write_compressed)
3707         return -ENOTSUP;
3708     if (bdrv_check_request(bs, sector_num, nb_sectors))
3709         return -EIO;
3710 
3711     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3712 
3713     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3714 }
3715 
3716 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3717 {
3718     BlockDriver *drv = bs->drv;
3719     if (!drv)
3720         return -ENOMEDIUM;
3721     if (!drv->bdrv_get_info)
3722         return -ENOTSUP;
3723     memset(bdi, 0, sizeof(*bdi));
3724     return drv->bdrv_get_info(bs, bdi);
3725 }
3726 
3727 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3728 {
3729     BlockDriver *drv = bs->drv;
3730     if (drv && drv->bdrv_get_specific_info) {
3731         return drv->bdrv_get_specific_info(bs);
3732     }
3733     return NULL;
3734 }
3735 
3736 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3737                       int64_t pos, int size)
3738 {
3739     QEMUIOVector qiov;
3740     struct iovec iov = {
3741         .iov_base   = (void *) buf,
3742         .iov_len    = size,
3743     };
3744 
3745     qemu_iovec_init_external(&qiov, &iov, 1);
3746     return bdrv_writev_vmstate(bs, &qiov, pos);
3747 }
3748 
3749 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3750 {
3751     BlockDriver *drv = bs->drv;
3752 
3753     if (!drv) {
3754         return -ENOMEDIUM;
3755     } else if (drv->bdrv_save_vmstate) {
3756         return drv->bdrv_save_vmstate(bs, qiov, pos);
3757     } else if (bs->file) {
3758         return bdrv_writev_vmstate(bs->file, qiov, pos);
3759     }
3760 
3761     return -ENOTSUP;
3762 }
3763 
3764 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3765                       int64_t pos, int size)
3766 {
3767     BlockDriver *drv = bs->drv;
3768     if (!drv)
3769         return -ENOMEDIUM;
3770     if (drv->bdrv_load_vmstate)
3771         return drv->bdrv_load_vmstate(bs, buf, pos, size);
3772     if (bs->file)
3773         return bdrv_load_vmstate(bs->file, buf, pos, size);
3774     return -ENOTSUP;
3775 }
3776 
3777 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
3778 {
3779     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
3780         return;
3781     }
3782 
3783     bs->drv->bdrv_debug_event(bs, event);
3784 }
3785 
3786 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
3787                           const char *tag)
3788 {
3789     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
3790         bs = bs->file;
3791     }
3792 
3793     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
3794         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
3795     }
3796 
3797     return -ENOTSUP;
3798 }
3799 
3800 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
3801 {
3802     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
3803         bs = bs->file;
3804     }
3805 
3806     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
3807         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
3808     }
3809 
3810     return -ENOTSUP;
3811 }
3812 
3813 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
3814 {
3815     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
3816         bs = bs->file;
3817     }
3818 
3819     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
3820         return bs->drv->bdrv_debug_resume(bs, tag);
3821     }
3822 
3823     return -ENOTSUP;
3824 }
3825 
3826 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
3827 {
3828     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
3829         bs = bs->file;
3830     }
3831 
3832     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
3833         return bs->drv->bdrv_debug_is_suspended(bs, tag);
3834     }
3835 
3836     return false;
3837 }
3838 
3839 int bdrv_is_snapshot(BlockDriverState *bs)
3840 {
3841     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
3842 }
3843 
3844 /* backing_file can either be relative, or absolute, or a protocol.  If it is
3845  * relative, it must be relative to the chain.  So, passing in bs->filename
3846  * from a BDS as backing_file should not be done, as that may be relative to
3847  * the CWD rather than the chain. */
3848 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3849         const char *backing_file)
3850 {
3851     char *filename_full = NULL;
3852     char *backing_file_full = NULL;
3853     char *filename_tmp = NULL;
3854     int is_protocol = 0;
3855     BlockDriverState *curr_bs = NULL;
3856     BlockDriverState *retval = NULL;
3857 
3858     if (!bs || !bs->drv || !backing_file) {
3859         return NULL;
3860     }
3861 
3862     filename_full     = g_malloc(PATH_MAX);
3863     backing_file_full = g_malloc(PATH_MAX);
3864     filename_tmp      = g_malloc(PATH_MAX);
3865 
3866     is_protocol = path_has_protocol(backing_file);
3867 
3868     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3869 
3870         /* If either of the filename paths is actually a protocol, then
3871          * compare unmodified paths; otherwise make paths relative */
3872         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3873             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3874                 retval = curr_bs->backing_hd;
3875                 break;
3876             }
3877         } else {
3878             /* If not an absolute filename path, make it relative to the current
3879              * image's filename path */
3880             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3881                          backing_file);
3882 
3883             /* We are going to compare absolute pathnames */
3884             if (!realpath(filename_tmp, filename_full)) {
3885                 continue;
3886             }
3887 
3888             /* We need to make sure the backing filename we are comparing against
3889              * is relative to the current image filename (or absolute) */
3890             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3891                          curr_bs->backing_file);
3892 
3893             if (!realpath(filename_tmp, backing_file_full)) {
3894                 continue;
3895             }
3896 
3897             if (strcmp(backing_file_full, filename_full) == 0) {
3898                 retval = curr_bs->backing_hd;
3899                 break;
3900             }
3901         }
3902     }
3903 
3904     g_free(filename_full);
3905     g_free(backing_file_full);
3906     g_free(filename_tmp);
3907     return retval;
3908 }
3909 
3910 int bdrv_get_backing_file_depth(BlockDriverState *bs)
3911 {
3912     if (!bs->drv) {
3913         return 0;
3914     }
3915 
3916     if (!bs->backing_hd) {
3917         return 0;
3918     }
3919 
3920     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
3921 }
3922 
3923 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
3924 {
3925     BlockDriverState *curr_bs = NULL;
3926 
3927     if (!bs) {
3928         return NULL;
3929     }
3930 
3931     curr_bs = bs;
3932 
3933     while (curr_bs->backing_hd) {
3934         curr_bs = curr_bs->backing_hd;
3935     }
3936     return curr_bs;
3937 }
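
/*
 * Example (editor's note): for a chain base <- snap <- active,
 * bdrv_find_base(active) returns 'base' and
 * bdrv_get_backing_file_depth(active) returns 2.
 */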
3938 
3939 /**************************************************************/
3940 /* async I/Os */
3941 
3942 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3943                                  QEMUIOVector *qiov, int nb_sectors,
3944                                  BlockDriverCompletionFunc *cb, void *opaque)
3945 {
3946     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3947 
3948     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3949                                  cb, opaque, false);
3950 }
3951 
3952 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3953                                   QEMUIOVector *qiov, int nb_sectors,
3954                                   BlockDriverCompletionFunc *cb, void *opaque)
3955 {
3956     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3957 
3958     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3959                                  cb, opaque, true);
3960 }
3961 
3962 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
3963         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
3964         BlockDriverCompletionFunc *cb, void *opaque)
3965 {
3966     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
3967 
3968     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
3969                                  BDRV_REQ_ZERO_WRITE | flags,
3970                                  cb, opaque, true);
3971 }
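
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * submitting an asynchronous read and waiting for its completion
 * callback.  The helper names are hypothetical.
 */
#if 0
static void example_read_done(void *opaque, int ret)
{
    bool *done = opaque;

    if (ret < 0) {
        fprintf(stderr, "read failed: %s\n", strerror(-ret));
    }
    *done = true;
}

static void example_aio_read(BlockDriverState *bs, uint8_t *buf)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,
    };
    bool done = false;

    qemu_iovec_init_external(&qiov, &iov, 1);
    bdrv_aio_readv(bs, 0, &qiov, 1, example_read_done, &done);
    while (!done) {
        qemu_aio_wait();
    }
}
#endif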
3972 
3973 
3974 typedef struct MultiwriteCB {
3975     int error;
3976     int num_requests;
3977     int num_callbacks;
3978     struct {
3979         BlockDriverCompletionFunc *cb;
3980         void *opaque;
3981         QEMUIOVector *free_qiov;
3982     } callbacks[];
3983 } MultiwriteCB;
3984 
3985 static void multiwrite_user_cb(MultiwriteCB *mcb)
3986 {
3987     int i;
3988 
3989     for (i = 0; i < mcb->num_callbacks; i++) {
3990         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3991         if (mcb->callbacks[i].free_qiov) {
3992             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3993         }
3994         g_free(mcb->callbacks[i].free_qiov);
3995     }
3996 }
3997 
3998 static void multiwrite_cb(void *opaque, int ret)
3999 {
4000     MultiwriteCB *mcb = opaque;
4001 
4002     trace_multiwrite_cb(mcb, ret);
4003 
4004     if (ret < 0 && !mcb->error) {
4005         mcb->error = ret;
4006     }
4007 
4008     mcb->num_requests--;
4009     if (mcb->num_requests == 0) {
4010         multiwrite_user_cb(mcb);
4011         g_free(mcb);
4012     }
4013 }
4014 
4015 static int multiwrite_req_compare(const void *a, const void *b)
4016 {
4017     const BlockRequest *req1 = a, *req2 = b;
4018 
4019     /*
4020      * Note that we can't simply subtract req2->sector from req1->sector
4021      * here as that could overflow the return value.
4022      */
4023     if (req1->sector > req2->sector) {
4024         return 1;
4025     } else if (req1->sector < req2->sector) {
4026         return -1;
4027     } else {
4028         return 0;
4029     }
4030 }
4031 
4032 /*
4033  * Takes a bunch of requests and tries to merge them. Returns the number of
4034  * requests that remain after merging.
4035  */
4036 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4037     int num_reqs, MultiwriteCB *mcb)
4038 {
4039     int i, outidx;
4040 
4041     // Sort requests by start sector
4042     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4043 
4044     // Check if adjacent requests touch the same clusters. If so, combine them,
4045     // filling up gaps with zero sectors.
4046     outidx = 0;
4047     for (i = 1; i < num_reqs; i++) {
4048         int merge = 0;
4049         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4050 
4051         // Handle exactly sequential writes and overlapping writes.
4052         if (reqs[i].sector <= oldreq_last) {
4053             merge = 1;
4054         }
4055 
4056         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4057             merge = 0;
4058         }
4059 
4060         if (merge) {
4061             size_t size;
4062             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4063             qemu_iovec_init(qiov,
4064                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4065 
4066             // Add the first request to the merged one. If the requests are
4067             // overlapping, drop the last sectors of the first request.
4068             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4069             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4070 
4071             // We should not need to add any zeros between the two requests
4072             assert(reqs[i].sector <= oldreq_last);
4073 
4074             // Add the second request
4075             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4076 
4077             reqs[outidx].nb_sectors = qiov->size >> 9;
4078             reqs[outidx].qiov = qiov;
4079 
4080             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4081         } else {
4082             outidx++;
4083             reqs[outidx].sector     = reqs[i].sector;
4084             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4085             reqs[outidx].qiov       = reqs[i].qiov;
4086         }
4087     }
4088 
4089     return outidx + 1;
4090 }
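
/*
 * Worked example (editor's note): requests A = {sector 0, 8 sectors} and
 * B = {sector 8, 8 sectors} satisfy B.sector <= oldreq_last (8 <= 8), so
 * they merge: size = (8 - 0) << 9 copies all of A's qiov, B's qiov is
 * appended, and the merged request covers sectors 0..15 with
 * nb_sectors = qiov->size >> 9 = 16.
 */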
4091 
4092 /*
4093  * Submit multiple AIO write requests at once.
4094  *
4095  * On success, the function returns 0 and all requests in the reqs array have
4096  * been submitted. In error case this function returns -1, and any of the
4097  * requests may or may not be submitted yet. In particular, this means that the
4098  * callback will be called for some of the requests, for others it won't. The
4099  * caller must check the error field of the BlockRequest to wait for the right
4100  * callbacks (if error != 0, no callback will be called).
4101  *
4102  * The implementation may modify the contents of the reqs array, e.g. to merge
4103  * requests. However, the fields opaque and error are left unmodified as they
4104  * are used to signal failure for a single request to the caller.
4105  */
4106 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4107 {
4108     MultiwriteCB *mcb;
4109     int i;
4110 
4111     /* don't submit writes if we don't have a medium */
4112     if (bs->drv == NULL) {
4113         for (i = 0; i < num_reqs; i++) {
4114             reqs[i].error = -ENOMEDIUM;
4115         }
4116         return -1;
4117     }
4118 
4119     if (num_reqs == 0) {
4120         return 0;
4121     }
4122 
4123     // Create MultiwriteCB structure
4124     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4125     mcb->num_requests = 0;
4126     mcb->num_callbacks = num_reqs;
4127 
4128     for (i = 0; i < num_reqs; i++) {
4129         mcb->callbacks[i].cb = reqs[i].cb;
4130         mcb->callbacks[i].opaque = reqs[i].opaque;
4131     }
4132 
4133     // Check for mergeable requests
4134     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4135 
4136     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4137 
4138     /* Run the aio requests. */
4139     mcb->num_requests = num_reqs;
4140     for (i = 0; i < num_reqs; i++) {
4141         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4142                               reqs[i].nb_sectors, reqs[i].flags,
4143                               multiwrite_cb, mcb,
4144                               true);
4145     }
4146 
4147     return 0;
4148 }
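
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * a caller fills one BlockRequest per write and submits them as a single
 * batch; per-request failure is reported through reqs[i].error.
 */
#if 0
static void example_multiwrite(BlockDriverState *bs,
                               QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                               BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0,  .nb_sectors = 8, .qiov = qiov0,
          .cb = cb, .opaque = opaque },
        { .sector = 16, .nb_sectors = 8, .qiov = qiov1,
          .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* Requests with reqs[i].error != 0 will not get a callback. */
    }
}
#endif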
4149 
4150 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4151 {
4152     acb->aiocb_info->cancel(acb);
4153 }
4154 
4155 /**************************************************************/
4156 /* async block device emulation */
4157 
4158 typedef struct BlockDriverAIOCBSync {
4159     BlockDriverAIOCB common;
4160     QEMUBH *bh;
4161     int ret;
4162     /* vector translation state */
4163     QEMUIOVector *qiov;
4164     uint8_t *bounce;
4165     int is_write;
4166 } BlockDriverAIOCBSync;
4167 
4168 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4169 {
4170     BlockDriverAIOCBSync *acb =
4171         container_of(blockacb, BlockDriverAIOCBSync, common);
4172     qemu_bh_delete(acb->bh);
4173     acb->bh = NULL;
4174     qemu_aio_release(acb);
4175 }
4176 
4177 static const AIOCBInfo bdrv_em_aiocb_info = {
4178     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4179     .cancel             = bdrv_aio_cancel_em,
4180 };
4181 
4182 static void bdrv_aio_bh_cb(void *opaque)
4183 {
4184     BlockDriverAIOCBSync *acb = opaque;
4185 
4186     if (!acb->is_write)
4187         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4188     qemu_vfree(acb->bounce);
4189     acb->common.cb(acb->common.opaque, acb->ret);
4190     qemu_bh_delete(acb->bh);
4191     acb->bh = NULL;
4192     qemu_aio_release(acb);
4193 }
4194 
4195 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4196                                             int64_t sector_num,
4197                                             QEMUIOVector *qiov,
4198                                             int nb_sectors,
4199                                             BlockDriverCompletionFunc *cb,
4200                                             void *opaque,
4201                                             int is_write)
4202 
4203 {
4204     BlockDriverAIOCBSync *acb;
4205 
4206     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4207     acb->is_write = is_write;
4208     acb->qiov = qiov;
4209     acb->bounce = qemu_blockalign(bs, qiov->size);
4210     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4211 
4212     if (is_write) {
4213         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4214         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4215     } else {
4216         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4217     }
4218 
4219     qemu_bh_schedule(acb->bh);
4220 
4221     return &acb->common;
4222 }
4223 
4224 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4225         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4226         BlockDriverCompletionFunc *cb, void *opaque)
4227 {
4228     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4229 }
4230 
4231 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4232         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4233         BlockDriverCompletionFunc *cb, void *opaque)
4234 {
4235     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4236 }
4237 
4238 
4239 typedef struct BlockDriverAIOCBCoroutine {
4240     BlockDriverAIOCB common;
4241     BlockRequest req;
4242     bool is_write;
4243     bool *done;
4244     QEMUBH* bh;
4245 } BlockDriverAIOCBCoroutine;
4246 
4247 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4248 {
4249     BlockDriverAIOCBCoroutine *acb =
4250         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4251     bool done = false;
4252 
4253     acb->done = &done;
4254     while (!done) {
4255         qemu_aio_wait();
4256     }
4257 }
4258 
4259 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4260     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4261     .cancel             = bdrv_aio_co_cancel_em,
4262 };
4263 
4264 static void bdrv_co_em_bh(void *opaque)
4265 {
4266     BlockDriverAIOCBCoroutine *acb = opaque;
4267 
4268     acb->common.cb(acb->common.opaque, acb->req.error);
4269 
4270     if (acb->done) {
4271         *acb->done = true;
4272     }
4273 
4274     qemu_bh_delete(acb->bh);
4275     qemu_aio_release(acb);
4276 }
4277 
4278 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4279 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4280 {
4281     BlockDriverAIOCBCoroutine *acb = opaque;
4282     BlockDriverState *bs = acb->common.bs;
4283 
4284     if (!acb->is_write) {
4285         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4286             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4287     } else {
4288         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4289             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4290     }
4291 
4292     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4293     qemu_bh_schedule(acb->bh);
4294 }
4295 
4296 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4297                                                int64_t sector_num,
4298                                                QEMUIOVector *qiov,
4299                                                int nb_sectors,
4300                                                BdrvRequestFlags flags,
4301                                                BlockDriverCompletionFunc *cb,
4302                                                void *opaque,
4303                                                bool is_write)
4304 {
4305     Coroutine *co;
4306     BlockDriverAIOCBCoroutine *acb;
4307 
4308     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4309     acb->req.sector = sector_num;
4310     acb->req.nb_sectors = nb_sectors;
4311     acb->req.qiov = qiov;
4312     acb->req.flags = flags;
4313     acb->is_write = is_write;
4314     acb->done = NULL;
4315 
4316     co = qemu_coroutine_create(bdrv_co_do_rw);
4317     qemu_coroutine_enter(co, acb);
4318 
4319     return &acb->common;
4320 }
4321 
4322 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4323 {
4324     BlockDriverAIOCBCoroutine *acb = opaque;
4325     BlockDriverState *bs = acb->common.bs;
4326 
4327     acb->req.error = bdrv_co_flush(bs);
4328     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4329     qemu_bh_schedule(acb->bh);
4330 }
4331 
4332 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4333         BlockDriverCompletionFunc *cb, void *opaque)
4334 {
4335     trace_bdrv_aio_flush(bs, opaque);
4336 
4337     Coroutine *co;
4338     BlockDriverAIOCBCoroutine *acb;
4339 
4340     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4341     acb->done = NULL;
4342 
4343     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4344     qemu_coroutine_enter(co, acb);
4345 
4346     return &acb->common;
4347 }
4348 
4349 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4350 {
4351     BlockDriverAIOCBCoroutine *acb = opaque;
4352     BlockDriverState *bs = acb->common.bs;
4353 
4354     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4355     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4356     qemu_bh_schedule(acb->bh);
4357 }
4358 
4359 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4360         int64_t sector_num, int nb_sectors,
4361         BlockDriverCompletionFunc *cb, void *opaque)
4362 {
4363     Coroutine *co;
4364     BlockDriverAIOCBCoroutine *acb;
4365 
4366     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4367 
4368     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4369     acb->req.sector = sector_num;
4370     acb->req.nb_sectors = nb_sectors;
4371     acb->done = NULL;
4372     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4373     qemu_coroutine_enter(co, acb);
4374 
4375     return &acb->common;
4376 }
4377 
4378 void bdrv_init(void)
4379 {
4380     module_call_init(MODULE_INIT_BLOCK);
4381 }
4382 
4383 void bdrv_init_with_whitelist(void)
4384 {
4385     use_bdrv_whitelist = 1;
4386     bdrv_init();
4387 }
4388 
4389 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4390                    BlockDriverCompletionFunc *cb, void *opaque)
4391 {
4392     BlockDriverAIOCB *acb;
4393 
4394     acb = g_slice_alloc(aiocb_info->aiocb_size);
4395     acb->aiocb_info = aiocb_info;
4396     acb->bs = bs;
4397     acb->cb = cb;
4398     acb->opaque = opaque;
4399     return acb;
4400 }
4401 
4402 void qemu_aio_release(void *p)
4403 {
4404     BlockDriverAIOCB *acb = p;
4405     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4406 }
4407 
4408 /**************************************************************/
4409 /* Coroutine block device emulation */
4410 
4411 typedef struct CoroutineIOCompletion {
4412     Coroutine *coroutine;
4413     int ret;
4414 } CoroutineIOCompletion;
4415 
4416 static void bdrv_co_io_em_complete(void *opaque, int ret)
4417 {
4418     CoroutineIOCompletion *co = opaque;
4419 
4420     co->ret = ret;
4421     qemu_coroutine_enter(co->coroutine, NULL);
4422 }
4423 
4424 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4425                                       int nb_sectors, QEMUIOVector *iov,
4426                                       bool is_write)
4427 {
4428     CoroutineIOCompletion co = {
4429         .coroutine = qemu_coroutine_self(),
4430     };
4431     BlockDriverAIOCB *acb;
4432 
4433     if (is_write) {
4434         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4435                                        bdrv_co_io_em_complete, &co);
4436     } else {
4437         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4438                                       bdrv_co_io_em_complete, &co);
4439     }
4440 
4441     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4442     if (!acb) {
4443         return -EIO;
4444     }
4445     qemu_coroutine_yield();
4446 
4447     return co.ret;
4448 }
4449 
4450 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4451                                          int64_t sector_num, int nb_sectors,
4452                                          QEMUIOVector *iov)
4453 {
4454     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4455 }
4456 
4457 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4458                                          int64_t sector_num, int nb_sectors,
4459                                          QEMUIOVector *iov)
4460 {
4461     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4462 }
4463 
4464 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4465 {
4466     RwCo *rwco = opaque;
4467 
4468     rwco->ret = bdrv_co_flush(rwco->bs);
4469 }
4470 
4471 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4472 {
4473     int ret;
4474 
4475     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4476         return 0;
4477     }
4478 
4479     /* Write back cached data to the OS even with cache=unsafe */
4480     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4481     if (bs->drv->bdrv_co_flush_to_os) {
4482         ret = bs->drv->bdrv_co_flush_to_os(bs);
4483         if (ret < 0) {
4484             return ret;
4485         }
4486     }
4487 
4488     /* But don't actually force it to the disk with cache=unsafe */
4489     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4490         goto flush_parent;
4491     }
4492 
4493     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4494     if (bs->drv->bdrv_co_flush_to_disk) {
4495         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4496     } else if (bs->drv->bdrv_aio_flush) {
4497         BlockDriverAIOCB *acb;
4498         CoroutineIOCompletion co = {
4499             .coroutine = qemu_coroutine_self(),
4500         };
4501 
4502         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4503         if (acb == NULL) {
4504             ret = -EIO;
4505         } else {
4506             qemu_coroutine_yield();
4507             ret = co.ret;
4508         }
4509     } else {
4510         /*
4511          * Some block drivers always operate in either writethrough or unsafe
4512          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4513          * know how the server works (because the behaviour is hardcoded or
4514          * depends on server-side configuration), so we can't ensure that
4515          * everything is safe on disk. Returning an error doesn't work because
4516          * that would break guests even if the server operates in writethrough
4517          * mode.
4518          *
4519          * Let's hope the user knows what he's doing.
4520          */
4521         ret = 0;
4522     }
4523     if (ret < 0) {
4524         return ret;
4525     }
4526 
4527     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4528      * in the case of cache=unsafe, so there are no useless flushes.
4529      */
4530 flush_parent:
4531     return bdrv_co_flush(bs->file);
4532 }
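
/*
 * Sketch of a driver-side flush callback (editor's illustration; this
 * exact driver and its state struct are hypothetical).  bdrv_co_flush()
 * above invokes it after BLKDBG_FLUSH_TO_DISK unless BDRV_O_NO_FLUSH is
 * set.
 */
#if 0
static int coroutine_fn example_co_flush_to_disk(BlockDriverState *bs)
{
    ExampleDriverState *s = bs->opaque;  /* hypothetical driver state */

    if (fdatasync(s->fd) < 0) {
        return -errno;
    }
    return 0;
}
#endif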
4533 
4534 void bdrv_invalidate_cache(BlockDriverState *bs)
4535 {
4536     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4537         bs->drv->bdrv_invalidate_cache(bs);
4538     }
4539 }
4540 
4541 void bdrv_invalidate_cache_all(void)
4542 {
4543     BlockDriverState *bs;
4544 
4545     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4546         bdrv_invalidate_cache(bs);
4547     }
4548 }
4549 
4550 void bdrv_clear_incoming_migration_all(void)
4551 {
4552     BlockDriverState *bs;
4553 
4554     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4555         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4556     }
4557 }
4558 
4559 int bdrv_flush(BlockDriverState *bs)
4560 {
4561     Coroutine *co;
4562     RwCo rwco = {
4563         .bs = bs,
4564         .ret = NOT_DONE,
4565     };
4566 
4567     if (qemu_in_coroutine()) {
4568         /* Fast-path if already in coroutine context */
4569         bdrv_flush_co_entry(&rwco);
4570     } else {
4571         co = qemu_coroutine_create(bdrv_flush_co_entry);
4572         qemu_coroutine_enter(co, &rwco);
4573         while (rwco.ret == NOT_DONE) {
4574             qemu_aio_wait();
4575         }
4576     }
4577 
4578     return rwco.ret;
4579 }
4580 
4581 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4582 {
4583     RwCo *rwco = opaque;
4584 
4585     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4586 }
4587 
4588 /* if no limit is specified in the BlockLimits use a default
4589  * of 32768 512-byte sectors (16 MiB) per request.
4590  */
4591 #define MAX_DISCARD_DEFAULT 32768
4592 
4593 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4594                                  int nb_sectors)
4595 {
4596     int max_discard;
4597 
4598     if (!bs->drv) {
4599         return -ENOMEDIUM;
4600     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4601         return -EIO;
4602     } else if (bs->read_only) {
4603         return -EROFS;
4604     }
4605 
4606     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4607 
4608     /* Do nothing if disabled.  */
4609     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4610         return 0;
4611     }
4612 
4613     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4614         return 0;
4615     }
4616 
4617     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4618     while (nb_sectors > 0) {
4619         int ret;
4620         int num = nb_sectors;
4621 
4622         /* align request */
4623         if (bs->bl.discard_alignment &&
4624             num >= bs->bl.discard_alignment &&
4625             sector_num % bs->bl.discard_alignment) {
4626             if (num > bs->bl.discard_alignment) {
4627                 num = bs->bl.discard_alignment;
4628             }
4629             num -= sector_num % bs->bl.discard_alignment;
4630         }
4631 
4632         /* limit request size */
4633         if (num > max_discard) {
4634             num = max_discard;
4635         }
4636 
4637         if (bs->drv->bdrv_co_discard) {
4638             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4639         } else {
4640             BlockDriverAIOCB *acb;
4641             CoroutineIOCompletion co = {
4642                 .coroutine = qemu_coroutine_self(),
4643             };
4644 
4645             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4646                                             bdrv_co_io_em_complete, &co);
4647             if (acb == NULL) {
4648                 return -EIO;
4649             } else {
4650                 qemu_coroutine_yield();
4651                 ret = co.ret;
4652             }
4653         }
4654         if (ret && ret != -ENOTSUP) {
4655             return ret;
4656         }
4657 
4658         sector_num += num;
4659         nb_sectors -= num;
4660     }
4661     return 0;
4662 }
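
/*
 * Worked example (editor's note): with bl.discard_alignment = 8,
 * sector_num = 5 and nb_sectors = 32, the first pass clamps num to the
 * alignment (8) and subtracts the misalignment (5 % 8 = 5), issuing a
 * 3-sector discard for sectors 5..7; the next pass starts aligned at
 * sector 8 and is limited only by max_discard.
 */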
4663 
4664 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4665 {
4666     Coroutine *co;
4667     RwCo rwco = {
4668         .bs = bs,
4669         .sector_num = sector_num,
4670         .nb_sectors = nb_sectors,
4671         .ret = NOT_DONE,
4672     };
4673 
4674     if (qemu_in_coroutine()) {
4675         /* Fast-path if already in coroutine context */
4676         bdrv_discard_co_entry(&rwco);
4677     } else {
4678         co = qemu_coroutine_create(bdrv_discard_co_entry);
4679         qemu_coroutine_enter(co, &rwco);
4680         while (rwco.ret == NOT_DONE) {
4681             qemu_aio_wait();
4682         }
4683     }
4684 
4685     return rwco.ret;
4686 }
4687 
4688 /**************************************************************/
4689 /* removable device support */
4690 
4691 /**
4692  * Return TRUE if the media is present
4693  */
4694 int bdrv_is_inserted(BlockDriverState *bs)
4695 {
4696     BlockDriver *drv = bs->drv;
4697 
4698     if (!drv)
4699         return 0;
4700     if (!drv->bdrv_is_inserted)
4701         return 1;
4702     return drv->bdrv_is_inserted(bs);
4703 }
4704 
4705 /**
4706  * Return whether the media changed since the last call to this
4707  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4708  */
4709 int bdrv_media_changed(BlockDriverState *bs)
4710 {
4711     BlockDriver *drv = bs->drv;
4712 
4713     if (drv && drv->bdrv_media_changed) {
4714         return drv->bdrv_media_changed(bs);
4715     }
4716     return -ENOTSUP;
4717 }
4718 
4719 /**
4720  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4721  */
4722 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4723 {
4724     BlockDriver *drv = bs->drv;
4725 
4726     if (drv && drv->bdrv_eject) {
4727         drv->bdrv_eject(bs, eject_flag);
4728     }
4729 
4730     if (bs->device_name[0] != '\0') {
4731         bdrv_emit_qmp_eject_event(bs, eject_flag);
4732     }
4733 }
4734 
4735 /**
4736  * Lock or unlock the media (if it is locked, the user won't be able
4737  * to eject it manually).
4738  */
4739 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4740 {
4741     BlockDriver *drv = bs->drv;
4742 
4743     trace_bdrv_lock_medium(bs, locked);
4744 
4745     if (drv && drv->bdrv_lock_medium) {
4746         drv->bdrv_lock_medium(bs, locked);
4747     }
4748 }
4749 
4750 /* needed for generic scsi interface */
4751 
4752 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4753 {
4754     BlockDriver *drv = bs->drv;
4755 
4756     if (drv && drv->bdrv_ioctl)
4757         return drv->bdrv_ioctl(bs, req, buf);
4758     return -ENOTSUP;
4759 }
4760 
4761 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4762         unsigned long int req, void *buf,
4763         BlockDriverCompletionFunc *cb, void *opaque)
4764 {
4765     BlockDriver *drv = bs->drv;
4766 
4767     if (drv && drv->bdrv_aio_ioctl)
4768         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4769     return NULL;
4770 }
4771 
4772 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
4773 {
4774     bs->buffer_alignment = align;
4775 }
4776 
4777 void *qemu_blockalign(BlockDriverState *bs, size_t size)
4778 {
4779     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
4780 }
4781 
4782 /*
4783  * Check if all memory in this vector is sector aligned.
4784  */
4785 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4786 {
4787     int i;
4788 
4789     for (i = 0; i < qiov->niov; i++) {
4790         if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4791             return false;
4792         }
4793     }
4794 
4795     return true;
4796 }
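
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * building a vector that satisfies bdrv_qiov_is_aligned() by allocating
 * the buffer with qemu_blockalign().
 */
#if 0
static bool example_aligned_qiov(BlockDriverState *bs, QEMUIOVector *qiov,
                                 struct iovec *iov, size_t len)
{
    iov->iov_base = qemu_blockalign(bs, len);
    iov->iov_len  = len;
    qemu_iovec_init_external(qiov, iov, 1);
    return bdrv_qiov_is_aligned(bs, qiov);  /* true by construction */
}
#endif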
4797 
4798 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
4799 {
4800     int64_t bitmap_size;
4801     BdrvDirtyBitmap *bitmap;
4802 
4803     assert((granularity & (granularity - 1)) == 0);
4804 
4805     granularity >>= BDRV_SECTOR_BITS;
4806     assert(granularity);
4807     bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
4808     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
4809     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
4810     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
4811     return bitmap;
4812 }
4813 
4814 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4815 {
4816     BdrvDirtyBitmap *bm, *next;
4817     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
4818         if (bm == bitmap) {
4819             QLIST_REMOVE(bitmap, list);
4820             hbitmap_free(bitmap->bitmap);
4821             g_free(bitmap);
4822             return;
4823         }
4824     }
4825 }
4826 
4827 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
4828 {
4829     BdrvDirtyBitmap *bm;
4830     BlockDirtyInfoList *list = NULL;
4831     BlockDirtyInfoList **plist = &list;
4832 
4833     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
4834         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
4835         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
4836         info->count = bdrv_get_dirty_count(bs, bm);
4837         info->granularity =
4838             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
4839         entry->value = info;
4840         *plist = entry;
4841         plist = &entry->next;
4842     }
4843 
4844     return list;
4845 }
4846 
4847 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
4848 {
4849     if (bitmap) {
4850         return hbitmap_get(bitmap->bitmap, sector);
4851     } else {
4852         return 0;
4853     }
4854 }
4855 
4856 void bdrv_dirty_iter_init(BlockDriverState *bs,
4857                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
4858 {
4859     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
4860 }
4861 
4862 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
4863                     int nr_sectors)
4864 {
4865     BdrvDirtyBitmap *bitmap;
4866     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4867         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
4868     }
4869 }
4870 
4871 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
4872 {
4873     BdrvDirtyBitmap *bitmap;
4874     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4875         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
4876     }
4877 }
4878 
4879 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4880 {
4881     return hbitmap_count(bitmap->bitmap);
4882 }
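
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * typical dirty-bitmap lifecycle.  With a 64 KiB granularity the bitmap
 * keeps one bit per 128 sectors (65536 >> BDRV_SECTOR_BITS = 128,
 * ffs(128) - 1 = 7).
 */
#if 0
static void example_track_writes(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536);
    HBitmapIter hbi;

    /* ... writes mark sectors dirty via bdrv_set_dirty() ... */

    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    printf("dirty sectors: %" PRId64 "\n", bdrv_get_dirty_count(bs, bitmap));

    bdrv_release_dirty_bitmap(bs, bitmap);
}
#endif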
4883 
4884 /* Get a reference to bs */
4885 void bdrv_ref(BlockDriverState *bs)
4886 {
4887     bs->refcnt++;
4888 }
4889 
4890 /* Release a previously grabbed reference to bs.
4891  * If after releasing, reference count is zero, the BlockDriverState is
4892  * deleted. */
4893 void bdrv_unref(BlockDriverState *bs)
4894 {
4895     assert(bs->refcnt > 0);
4896     if (--bs->refcnt == 0) {
4897         bdrv_delete(bs);
4898     }
4899 }
4900 
4901 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4902 {
4903     assert(bs->in_use != in_use);
4904     bs->in_use = in_use;
4905 }
4906 
4907 int bdrv_in_use(BlockDriverState *bs)
4908 {
4909     return bs->in_use;
4910 }
4911 
4912 void bdrv_iostatus_enable(BlockDriverState *bs)
4913 {
4914     bs->iostatus_enabled = true;
4915     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4916 }
4917 
4918 /* The I/O status is only enabled if the drive explicitly
4919  * enables it _and_ the VM is configured to stop on errors */
4920 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4921 {
4922     return (bs->iostatus_enabled &&
4923            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4924             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4925             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4926 }
4927 
4928 void bdrv_iostatus_disable(BlockDriverState *bs)
4929 {
4930     bs->iostatus_enabled = false;
4931 }
4932 
4933 void bdrv_iostatus_reset(BlockDriverState *bs)
4934 {
4935     if (bdrv_iostatus_is_enabled(bs)) {
4936         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4937         if (bs->job) {
4938             block_job_iostatus_reset(bs->job);
4939         }
4940     }
4941 }
4942 
4943 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4944 {
4945     assert(bdrv_iostatus_is_enabled(bs));
4946     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4947         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4948                                          BLOCK_DEVICE_IO_STATUS_FAILED;
4949     }
4950 }
4951 
4952 void
4953 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4954         enum BlockAcctType type)
4955 {
4956     assert(type < BDRV_MAX_IOTYPE);
4957 
4958     cookie->bytes = bytes;
4959     cookie->start_time_ns = get_clock();
4960     cookie->type = type;
4961 }
4962 
4963 void
4964 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4965 {
4966     assert(cookie->type < BDRV_MAX_IOTYPE);
4967 
4968     bs->nr_bytes[cookie->type] += cookie->bytes;
4969     bs->nr_ops[cookie->type]++;
4970     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4971 }
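
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * bracketing a synchronous read with accounting.  BDRV_ACCT_READ is
 * assumed to be one of the BlockAcctType values below BDRV_MAX_IOTYPE.
 */
#if 0
static void example_accounted_read(BlockDriverState *bs, uint8_t *buf,
                                   int64_t sector_num, int nb_sectors)
{
    BlockAcctCookie cookie;

    bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
                    BDRV_ACCT_READ);
    bdrv_read(bs, sector_num, buf, nb_sectors);
    bdrv_acct_done(bs, &cookie);
}
#endif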
4972 
4973 void bdrv_img_create(const char *filename, const char *fmt,
4974                      const char *base_filename, const char *base_fmt,
4975                      char *options, uint64_t img_size, int flags,
4976                      Error **errp, bool quiet)
4977 {
4978     QEMUOptionParameter *param = NULL, *create_options = NULL;
4979     QEMUOptionParameter *backing_fmt, *backing_file, *size;
4980     BlockDriver *drv, *proto_drv;
4981     BlockDriver *backing_drv = NULL;
4982     Error *local_err = NULL;
4983     int ret = 0;
4984 
4985     /* Find driver and parse its options */
4986     drv = bdrv_find_format(fmt);
4987     if (!drv) {
4988         error_setg(errp, "Unknown file format '%s'", fmt);
4989         return;
4990     }
4991 
4992     proto_drv = bdrv_find_protocol(filename, true);
4993     if (!proto_drv) {
4994         error_setg(errp, "Unknown protocol '%s'", filename);
4995         return;
4996     }
4997 
4998     create_options = append_option_parameters(create_options,
4999                                               drv->create_options);
5000     create_options = append_option_parameters(create_options,
5001                                               proto_drv->create_options);
5002 
5003     /* Create parameter list with default values */
5004     param = parse_option_parameters("", create_options, param);
5005 
5006     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5007 
5008     /* Parse -o options */
5009     if (options) {
5010         param = parse_option_parameters(options, create_options, param);
5011         if (param == NULL) {
5012             error_setg(errp, "Invalid options for file format '%s'.", fmt);
5013             goto out;
5014         }
5015     }
5016 
5017     if (base_filename) {
5018         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5019                                  base_filename)) {
5020             error_setg(errp, "Backing file not supported for file format '%s'",
5021                        fmt);
5022             goto out;
5023         }
5024     }
5025 
5026     if (base_fmt) {
5027         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5028             error_setg(errp, "Backing file format not supported for file "
5029                              "format '%s'", fmt);
5030             goto out;
5031         }
5032     }
5033 
5034     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5035     if (backing_file && backing_file->value.s) {
5036         if (!strcmp(filename, backing_file->value.s)) {
5037             error_setg(errp, "Error: Trying to create an image with the "
5038                              "same filename as the backing file");
5039             goto out;
5040         }
5041     }
5042 
5043     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5044     if (backing_fmt && backing_fmt->value.s) {
5045         backing_drv = bdrv_find_format(backing_fmt->value.s);
5046         if (!backing_drv) {
5047             error_setg(errp, "Unknown backing file format '%s'",
5048                        backing_fmt->value.s);
5049             goto out;
5050         }
5051     }
5052 
5053     // The size for the image must always be specified, with one exception:
5054     // If we are using a backing file, we can obtain the size from there
5055     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5056     if (size && size->value.n == -1) {
5057         if (backing_file && backing_file->value.s) {
5058             BlockDriverState *bs;
5059             uint64_t size;
5060             char buf[32];
5061             int back_flags;
5062 
5063             /* backing files always opened read-only */
5064             back_flags =
5065                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5066 
5067             bs = bdrv_new("");
5068 
5069             ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
5070                             backing_drv, &local_err);
5071             if (ret < 0) {
5072                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5073                                  backing_file->value.s,
5074                                  error_get_pretty(local_err));
5075                 error_free(local_err);
5076                 local_err = NULL;
5077                 bdrv_unref(bs);
5078                 goto out;
5079             }
5080             bdrv_get_geometry(bs, &size);
5081             size *= 512;
5082 
5083             snprintf(buf, sizeof(buf), "%" PRId64, size);
5084             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5085 
5086             bdrv_unref(bs);
5087         } else {
5088             error_setg(errp, "Image creation needs a size parameter");
5089             goto out;
5090         }
5091     }
5092 
5093     if (!quiet) {
5094         printf("Formatting '%s', fmt=%s ", filename, fmt);
5095         print_option_parameters(param);
5096         puts("");
5097     }
5098     ret = bdrv_create(drv, filename, param, &local_err);
5099     if (ret == -EFBIG) {
5100         /* This is generally a better message than whatever the driver would
5101          * deliver (especially because of the cluster_size_hint), since that
5102          * is most probably not much different from "image too large". */
5103         const char *cluster_size_hint = "";
5104         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5105             cluster_size_hint = " (try using a larger cluster size)";
5106         }
5107         error_setg(errp, "The image size is too large for file format '%s'"
5108                    "%s", fmt, cluster_size_hint);
5109         error_free(local_err);
5110         local_err = NULL;
5111     }
5112 
5113 out:
5114     free_option_parameters(create_options);
5115     free_option_parameters(param);
5116 
5117     if (error_is_set(&local_err)) {
5118         error_propagate(errp, local_err);
5119     }
5120 }
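
/*
 * Usage sketch (editor's illustration, not part of the original file):
 * creating a 1 GiB qcow2 image with no backing file and default options,
 * roughly what 'qemu-img create' does.
 */
#if 0
static void example_create_image(Error **errp)
{
    bdrv_img_create("test.qcow2", "qcow2",
                    NULL, NULL,              /* no backing file / format */
                    NULL,                    /* no -o options */
                    1024 * 1024 * 1024,      /* 1 GiB */
                    BDRV_O_RDWR, errp, true  /* quiet */);
}
#endif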
5121 
5122 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5123 {
5124     /* Currently BlockDriverState always uses the main loop AioContext */
5125     return qemu_get_aio_context();
5126 }
5127 
5128 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5129                                     NotifierWithReturn *notifier)
5130 {
5131     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5132 }
5133 
5134 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5135 {
5136     if (bs->drv->bdrv_amend_options == NULL) {
5137         return -ENOTSUP;
5138     }
5139     return bs->drv->bdrv_amend_options(bs, options);
5140 }
5141 
5142 /* Used to recurse on single child block filters.
5143  * A single-child block filter stores its child in bs->file.
5144  */
5145 bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5146                                       BlockDriverState *candidate)
5147 {
5148     if (!bs->drv) {
5149         return false;
5150     }
5151 
5152     if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5153         if (bs == candidate) {
5154             return true;
5155         } else {
5156             return false;
5157         }
5158     }
5159 
5160     if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5161         return false;
5162     }
5163 
5164     if (!bs->file) {
5165         return false;
5166     }
5167 
5168     return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5169 }
5170 
5171 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5172                                       BlockDriverState *candidate)
5173 {
5174     if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5175         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5176     }
5177 
5178     return bdrv_generic_is_first_non_filter(bs, candidate);
5179 }
5180 
5181 /* This function checks if the candidate is the first non-filter bs down its
5182  * bs chain. Since we don't have pointers to parents it explores all bs chains
5183  * from the top. Some filters can choose not to pass down the recursion.
5184  */
5185 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5186 {
5187     BlockDriverState *bs;
5188 
5189     /* walk down the bs forest recursively */
5190     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5191         bool perm;
5192 
5193         if (!bs->file) {
5194             continue;
5195         }
5196 
5197         perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);
5198 
5199         /* candidate is the first non filter */
5200         if (perm) {
5201             return true;
5202         }
5203     }
5204 
5205     return false;
5206 }
5207