1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "qmp-commands.h"
36 #include "qemu/timer.h"
37 
38 #ifdef CONFIG_BSD
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <sys/ioctl.h>
42 #include <sys/queue.h>
43 #ifndef __DragonFly__
44 #include <sys/disk.h>
45 #endif
46 #endif
47 
48 #ifdef _WIN32
49 #include <windows.h>
50 #endif
51 
52 struct BdrvDirtyBitmap {
53     HBitmap *bitmap;
54     QLIST_ENTRY(BdrvDirtyBitmap) list;
55 };
56 
57 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
58 
59 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
60 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62         BlockDriverCompletionFunc *cb, void *opaque);
63 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65         BlockDriverCompletionFunc *cb, void *opaque);
66 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67                                          int64_t sector_num, int nb_sectors,
68                                          QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70                                          int64_t sector_num, int nb_sectors,
71                                          QEMUIOVector *iov);
72 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
73     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74     BdrvRequestFlags flags);
75 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
76     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77     BdrvRequestFlags flags);
78 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79                                                int64_t sector_num,
80                                                QEMUIOVector *qiov,
81                                                int nb_sectors,
82                                                BdrvRequestFlags flags,
83                                                BlockDriverCompletionFunc *cb,
84                                                void *opaque,
85                                                bool is_write);
86 static void coroutine_fn bdrv_co_do_rw(void *opaque);
87 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
88     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
89 
90 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
91     QTAILQ_HEAD_INITIALIZER(bdrv_states);
92 
93 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
94     QLIST_HEAD_INITIALIZER(bdrv_drivers);
95 
96 /* If non-zero, use only whitelisted block drivers */
97 static int use_bdrv_whitelist;
98 
99 #ifdef _WIN32
100 static int is_windows_drive_prefix(const char *filename)
101 {
102     return (((filename[0] >= 'a' && filename[0] <= 'z') ||
103              (filename[0] >= 'A' && filename[0] <= 'Z')) &&
104             filename[1] == ':');
105 }
106 
107 int is_windows_drive(const char *filename)
108 {
109     if (is_windows_drive_prefix(filename) &&
110         filename[2] == '\0')
111         return 1;
112     if (strstart(filename, "\\\\.\\", NULL) ||
113         strstart(filename, "//./", NULL))
114         return 1;
115     return 0;
116 }
117 #endif
118 
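/* Illustrative examples (not from the original source): "c:" and "Z:" are
 * bare Windows drives; "\\.\PhysicalDrive0" and "//./CdRom0" match the
 * device namespace prefixes; "c:\foo" has a drive prefix but is not a bare
 * drive, so is_windows_drive() returns 0 for it. */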
119 /* throttling disk I/O limits */
120 void bdrv_set_io_limits(BlockDriverState *bs,
121                         ThrottleConfig *cfg)
122 {
123     int i;
124 
125     throttle_config(&bs->throttle_state, cfg);
126 
127     for (i = 0; i < 2; i++) {
128         qemu_co_enter_next(&bs->throttled_reqs[i]);
129     }
130 }
131 
132 /* this function drains all the throttled I/O requests */
133 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
134 {
135     bool drained = false;
136     bool enabled = bs->io_limits_enabled;
137     int i;
138 
139     bs->io_limits_enabled = false;
140 
141     for (i = 0; i < 2; i++) {
142         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
143             drained = true;
144         }
145     }
146 
147     bs->io_limits_enabled = enabled;
148 
149     return drained;
150 }
151 
152 void bdrv_io_limits_disable(BlockDriverState *bs)
153 {
154     bs->io_limits_enabled = false;
155 
156     bdrv_start_throttled_reqs(bs);
157 
158     throttle_destroy(&bs->throttle_state);
159 }
160 
161 static void bdrv_throttle_read_timer_cb(void *opaque)
162 {
163     BlockDriverState *bs = opaque;
164     qemu_co_enter_next(&bs->throttled_reqs[0]);
165 }
166 
167 static void bdrv_throttle_write_timer_cb(void *opaque)
168 {
169     BlockDriverState *bs = opaque;
170     qemu_co_enter_next(&bs->throttled_reqs[1]);
171 }
172 
173 /* should be called before bdrv_set_io_limits if a limit is set */
174 void bdrv_io_limits_enable(BlockDriverState *bs)
175 {
176     assert(!bs->io_limits_enabled);
177     throttle_init(&bs->throttle_state,
178                   QEMU_CLOCK_VIRTUAL,
179                   bdrv_throttle_read_timer_cb,
180                   bdrv_throttle_write_timer_cb,
181                   bs);
182     bs->io_limits_enabled = true;
183 }
184 
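/* Illustrative sketch (not compiled): enabling a 1 MB/s total bandwidth
 * limit.  The ThrottleConfig field names follow util/throttle.h in this
 * tree; treat them as assumptions rather than a definitive recipe. */
#if 0
static void example_enable_limits(BlockDriverState *bs)
{
    ThrottleConfig cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1 * 1024 * 1024; /* bytes/second */

    bdrv_io_limits_enable(bs);     /* must come first, per the note above */
    bdrv_set_io_limits(bs, &cfg);
}
#endif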
185 /* This function makes an I/O request wait if needed
186  *
187  * @nb_sectors: the number of sectors of the I/O
188  * @is_write:   whether the I/O is a write
189  */
190 static void bdrv_io_limits_intercept(BlockDriverState *bs,
191                                      int nb_sectors,
192                                      bool is_write)
193 {
194     /* does this I/O have to wait? */
195     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
196 
197     /* if it must wait, or any request of this type is already throttled, queue the I/O */
198     if (must_wait ||
199         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
200         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
201     }
202 
203     /* the IO will be executed, do the accounting */
204     throttle_account(&bs->throttle_state,
205                      is_write,
206                      nb_sectors * BDRV_SECTOR_SIZE);
207 
208     /* if the next request must wait -> do nothing */
209     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
210         return;
211     }
212 
213     /* else queue next request for execution */
214     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
215 }
216 
217 /* check if the path starts with "<protocol>:" */
218 static int path_has_protocol(const char *path)
219 {
220     const char *p;
221 
222 #ifdef _WIN32
223     if (is_windows_drive(path) ||
224         is_windows_drive_prefix(path)) {
225         return 0;
226     }
227     p = path + strcspn(path, ":/\\");
228 #else
229     p = path + strcspn(path, ":/");
230 #endif
231 
232     return *p == ':';
233 }
234 
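/* Illustrative examples: "nbd:localhost:10809" has a protocol ("nbd");
 * "/dev/sda" and "relative/path:colon" do not, because a '/' is seen before
 * any ':'.  On Windows, "c:" is excluded up front by the drive checks. */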
235 int path_is_absolute(const char *path)
236 {
237 #ifdef _WIN32
238     /* specific case for names like: "\\.\d:" */
239     if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
240         return 1;
241     }
242     return (*path == '/' || *path == '\\');
243 #else
244     return (*path == '/');
245 #endif
246 }
247 
248 /* If filename is absolute, just copy it to dest. Otherwise, build a
249    path to it by treating it as relative to base_path. URLs are
250    supported. */
251 void path_combine(char *dest, int dest_size,
252                   const char *base_path,
253                   const char *filename)
254 {
255     const char *p, *p1;
256     int len;
257 
258     if (dest_size <= 0)
259         return;
260     if (path_is_absolute(filename)) {
261         pstrcpy(dest, dest_size, filename);
262     } else {
263         p = strchr(base_path, ':');
264         if (p)
265             p++;
266         else
267             p = base_path;
268         p1 = strrchr(base_path, '/');
269 #ifdef _WIN32
270         {
271             const char *p2;
272             p2 = strrchr(base_path, '\\');
273             if (!p1 || p2 > p1)
274                 p1 = p2;
275         }
276 #endif
277         if (p1)
278             p1++;
279         else
280             p1 = base_path;
281         if (p1 > p)
282             p = p1;
283         len = p - base_path;
284         if (len > dest_size - 1)
285             len = dest_size - 1;
286         memcpy(dest, base_path, len);
287         dest[len] = '\0';
288         pstrcat(dest, dest_size, filename);
289     }
290 }
291 
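/* Illustrative sketch (not compiled) of how path_combine() resolves names. */
#if 0
static void example_path_combine(void)
{
    char dest[PATH_MAX];

    /* relative name: resolved against the directory of base_path */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.qcow2");
    /* dest is now "/images/backing.qcow2" */

    /* absolute name: copied through unchanged */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "/mnt/other.raw");
    /* dest is now "/mnt/other.raw" */
}
#endif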
292 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
293 {
294     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
295         pstrcpy(dest, sz, bs->backing_file);
296     } else {
297         path_combine(dest, sz, bs->filename, bs->backing_file);
298     }
299 }
300 
301 void bdrv_register(BlockDriver *bdrv)
302 {
303     /* Block drivers without coroutine functions need emulation */
304     if (!bdrv->bdrv_co_readv) {
305         bdrv->bdrv_co_readv = bdrv_co_readv_em;
306         bdrv->bdrv_co_writev = bdrv_co_writev_em;
307 
308         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
309          * the block driver lacks aio we need to emulate that too.
310          */
311         if (!bdrv->bdrv_aio_readv) {
312             /* add AIO emulation layer */
313             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
314             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
315         }
316     }
317 
318     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
319 }
320 
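/* Illustrative sketch (not compiled): a minimal driver registration.  The
 * "example" driver and its callbacks are hypothetical; real drivers live in
 * block/ and register themselves via block_init() from qemu/module.h. */
#if 0
static BlockDriver bdrv_example = {
    .format_name   = "example",
    .instance_size = sizeof(BDRVExampleState),  /* hypothetical state struct */
    .bdrv_open     = example_open,              /* hypothetical callback */
    .bdrv_close    = example_close,             /* hypothetical callback */
    /* no .bdrv_co_readv: bdrv_register() installs the emulation above */
};

static void bdrv_example_init(void)
{
    bdrv_register(&bdrv_example);
}

block_init(bdrv_example_init);
#endif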
321 /* create a new block device (by default it is empty) */
322 BlockDriverState *bdrv_new(const char *device_name)
323 {
324     BlockDriverState *bs;
325 
326     bs = g_malloc0(sizeof(BlockDriverState));
327     QLIST_INIT(&bs->dirty_bitmaps);
328     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
329     if (device_name[0] != '\0') {
330         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
331     }
332     bdrv_iostatus_disable(bs);
333     notifier_list_init(&bs->close_notifiers);
334     notifier_with_return_list_init(&bs->before_write_notifiers);
335     qemu_co_queue_init(&bs->throttled_reqs[0]);
336     qemu_co_queue_init(&bs->throttled_reqs[1]);
337     bs->refcnt = 1;
338 
339     return bs;
340 }
341 
342 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
343 {
344     notifier_list_add(&bs->close_notifiers, notify);
345 }
346 
347 BlockDriver *bdrv_find_format(const char *format_name)
348 {
349     BlockDriver *drv1;
350     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
351         if (!strcmp(drv1->format_name, format_name)) {
352             return drv1;
353         }
354     }
355     return NULL;
356 }
357 
358 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
359 {
360     static const char *whitelist_rw[] = {
361         CONFIG_BDRV_RW_WHITELIST
362     };
363     static const char *whitelist_ro[] = {
364         CONFIG_BDRV_RO_WHITELIST
365     };
366     const char **p;
367 
368     if (!whitelist_rw[0] && !whitelist_ro[0]) {
369         return 1;               /* no whitelist, anything goes */
370     }
371 
372     for (p = whitelist_rw; *p; p++) {
373         if (!strcmp(drv->format_name, *p)) {
374             return 1;
375         }
376     }
377     if (read_only) {
378         for (p = whitelist_ro; *p; p++) {
379             if (!strcmp(drv->format_name, *p)) {
380                 return 1;
381             }
382         }
383     }
384     return 0;
385 }
386 
387 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
388                                           bool read_only)
389 {
390     BlockDriver *drv = bdrv_find_format(format_name);
391     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
392 }
393 
394 typedef struct CreateCo {
395     BlockDriver *drv;
396     char *filename;
397     QEMUOptionParameter *options;
398     int ret;
399     Error *err;
400 } CreateCo;
401 
402 static void coroutine_fn bdrv_create_co_entry(void *opaque)
403 {
404     Error *local_err = NULL;
405     int ret;
406 
407     CreateCo *cco = opaque;
408     assert(cco->drv);
409 
410     ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
411     if (error_is_set(&local_err)) {
412         error_propagate(&cco->err, local_err);
413     }
414     cco->ret = ret;
415 }
416 
417 int bdrv_create(BlockDriver *drv, const char* filename,
418     QEMUOptionParameter *options, Error **errp)
419 {
420     int ret;
421 
422     Coroutine *co;
423     CreateCo cco = {
424         .drv = drv,
425         .filename = g_strdup(filename),
426         .options = options,
427         .ret = NOT_DONE,
428         .err = NULL,
429     };
430 
431     if (!drv->bdrv_create) {
432         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
433         ret = -ENOTSUP;
434         goto out;
435     }
436 
437     if (qemu_in_coroutine()) {
438         /* Fast-path if already in coroutine context */
439         bdrv_create_co_entry(&cco);
440     } else {
441         co = qemu_coroutine_create(bdrv_create_co_entry);
442         qemu_coroutine_enter(co, &cco);
443         while (cco.ret == NOT_DONE) {
444             qemu_aio_wait();
445         }
446     }
447 
448     ret = cco.ret;
449     if (ret < 0) {
450         if (error_is_set(&cco.err)) {
451             error_propagate(errp, cco.err);
452         } else {
453             error_setg_errno(errp, -ret, "Could not create image");
454         }
455     }
456 
457 out:
458     g_free(cco.filename);
459     return ret;
460 }
461 
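/* Illustrative sketch (not compiled): creating a 1 GiB qcow2 image.  It
 * mirrors the snapshot-overlay creation code in bdrv_open() below; the file
 * name is an arbitrary example. */
#if 0
static int example_create_qcow2(Error **errp)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *opts;
    int ret;

    opts = parse_option_parameters("", drv->create_options, NULL);
    set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);

    ret = bdrv_create(drv, "/tmp/example.qcow2", opts, errp);
    free_option_parameters(opts);
    return ret;
}
#endif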
462 int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
463                      Error **errp)
464 {
465     BlockDriver *drv;
466     Error *local_err = NULL;
467     int ret;
468 
469     drv = bdrv_find_protocol(filename, true);
470     if (drv == NULL) {
471         error_setg(errp, "Could not find protocol for file '%s'", filename);
472         return -ENOENT;
473     }
474 
475     ret = bdrv_create(drv, filename, options, &local_err);
476     if (error_is_set(&local_err)) {
477         error_propagate(errp, local_err);
478     }
479     return ret;
480 }
481 
482 /*
483  * Create a uniquely-named empty temporary file.
484  * Return 0 upon success, otherwise a negative errno value.
485  */
486 int get_tmp_filename(char *filename, int size)
487 {
488 #ifdef _WIN32
489     char temp_dir[MAX_PATH];
490     /* GetTempFileName requires that its output buffer (4th param)
491        have length MAX_PATH or greater.  */
492     assert(size >= MAX_PATH);
493     return (GetTempPath(MAX_PATH, temp_dir)
494             && GetTempFileName(temp_dir, "qem", 0, filename)
495             ? 0 : -GetLastError());
496 #else
497     int fd;
498     const char *tmpdir;
499     tmpdir = getenv("TMPDIR");
500     if (!tmpdir)
501         tmpdir = "/tmp";
502     if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
503         return -EOVERFLOW;
504     }
505     fd = mkstemp(filename);
506     if (fd < 0) {
507         return -errno;
508     }
509     if (close(fd) != 0) {
510         unlink(filename);
511         return -errno;
512     }
513     return 0;
514 #endif
515 }
516 
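/* Illustrative sketch (not compiled): typical use of get_tmp_filename(). */
#if 0
char tmp_filename[PATH_MAX];
int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
if (ret < 0) {
    /* no temporary file was left behind; -ret is the errno value */
}
#endif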
517 /*
518  * Detect host devices. By convention, /dev/cdrom[N] is always
519  * recognized as a host CDROM.
520  */
521 static BlockDriver *find_hdev_driver(const char *filename)
522 {
523     int score_max = 0, score;
524     BlockDriver *drv = NULL, *d;
525 
526     QLIST_FOREACH(d, &bdrv_drivers, list) {
527         if (d->bdrv_probe_device) {
528             score = d->bdrv_probe_device(filename);
529             if (score > score_max) {
530                 score_max = score;
531                 drv = d;
532             }
533         }
534     }
535 
536     return drv;
537 }
538 
539 BlockDriver *bdrv_find_protocol(const char *filename,
540                                 bool allow_protocol_prefix)
541 {
542     BlockDriver *drv1;
543     char protocol[128];
544     int len;
545     const char *p;
546 
547     /* TODO Drivers without bdrv_file_open must be specified explicitly */
548 
549     /*
550      * XXX(hch): we really should not let host device detection
551      * override an explicit protocol specification, but moving this
552      * later breaks access to device names with colons in them.
553      * Thanks to the brain-dead persistent naming schemes on udev-
554      * based Linux systems those actually are quite common.
555      */
556     drv1 = find_hdev_driver(filename);
557     if (drv1) {
558         return drv1;
559     }
560 
561     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
562         return bdrv_find_format("file");
563     }
564 
565     p = strchr(filename, ':');
566     assert(p != NULL);
567     len = p - filename;
568     if (len > sizeof(protocol) - 1)
569         len = sizeof(protocol) - 1;
570     memcpy(protocol, filename, len);
571     protocol[len] = '\0';
572     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
573         if (drv1->protocol_name &&
574             !strcmp(drv1->protocol_name, protocol)) {
575             return drv1;
576         }
577     }
578     return NULL;
579 }
580 
581 static int find_image_format(BlockDriverState *bs, const char *filename,
582                              BlockDriver **pdrv, Error **errp)
583 {
584     int score, score_max;
585     BlockDriver *drv1, *drv;
586     uint8_t buf[2048];
587     int ret = 0;
588 
589     /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
590     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
591         drv = bdrv_find_format("raw");
592         if (!drv) {
593             error_setg(errp, "Could not find raw image format");
594             ret = -ENOENT;
595         }
596         *pdrv = drv;
597         return ret;
598     }
599 
600     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
601     if (ret < 0) {
602         error_setg_errno(errp, -ret, "Could not read image for determining its "
603                          "format");
604         *pdrv = NULL;
605         return ret;
606     }
607 
608     score_max = 0;
609     drv = NULL;
610     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
611         if (drv1->bdrv_probe) {
612             score = drv1->bdrv_probe(buf, ret, filename);
613             if (score > score_max) {
614                 score_max = score;
615                 drv = drv1;
616             }
617         }
618     }
619     if (!drv) {
620         error_setg(errp, "Could not determine image format: No compatible "
621                    "driver found");
622         ret = -ENOENT;
623     }
624     *pdrv = drv;
625     return ret;
626 }
627 
628 /**
629  * Set the current 'total_sectors' value
630  */
631 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
632 {
633     BlockDriver *drv = bs->drv;
634 
635     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
636     if (bs->sg)
637         return 0;
638 
639     /* query actual device if possible, otherwise just trust the hint */
640     if (drv->bdrv_getlength) {
641         int64_t length = drv->bdrv_getlength(bs);
642         if (length < 0) {
643             return length;
644         }
645         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
646     }
647 
648     bs->total_sectors = hint;
649     return 0;
650 }
651 
652 /**
653  * Set open flags for a given discard mode
654  *
655  * Return 0 on success, -1 if the discard mode was invalid.
656  */
657 int bdrv_parse_discard_flags(const char *mode, int *flags)
658 {
659     *flags &= ~BDRV_O_UNMAP;
660 
661     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
662         /* do nothing */
663     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
664         *flags |= BDRV_O_UNMAP;
665     } else {
666         return -1;
667     }
668 
669     return 0;
670 }
671 
672 /**
673  * Set open flags for a given cache mode
674  *
675  * Return 0 on success, -1 if the cache mode was invalid.
676  */
677 int bdrv_parse_cache_flags(const char *mode, int *flags)
678 {
679     *flags &= ~BDRV_O_CACHE_MASK;
680 
681     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
682         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
683     } else if (!strcmp(mode, "directsync")) {
684         *flags |= BDRV_O_NOCACHE;
685     } else if (!strcmp(mode, "writeback")) {
686         *flags |= BDRV_O_CACHE_WB;
687     } else if (!strcmp(mode, "unsafe")) {
688         *flags |= BDRV_O_CACHE_WB;
689         *flags |= BDRV_O_NO_FLUSH;
690     } else if (!strcmp(mode, "writethrough")) {
691         /* this is the default */
692     } else {
693         return -1;
694     }
695 
696     return 0;
697 }
698 
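/* Illustrative example: parsing command-line cache/discard modes into open
 * flags, as -drive handling does. */
#if 0
int flags = 0;
bdrv_parse_cache_flags("none", &flags);    /* BDRV_O_NOCACHE | BDRV_O_CACHE_WB */
bdrv_parse_discard_flags("unmap", &flags); /* adds BDRV_O_UNMAP */
#endif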
699 /**
700  * The copy-on-read flag is actually a reference count so multiple users may
701  * use the feature without worrying about clobbering its previous state.
702  * Copy-on-read stays enabled until all users have called to disable it.
703  */
704 void bdrv_enable_copy_on_read(BlockDriverState *bs)
705 {
706     bs->copy_on_read++;
707 }
708 
709 void bdrv_disable_copy_on_read(BlockDriverState *bs)
710 {
711     assert(bs->copy_on_read > 0);
712     bs->copy_on_read--;
713 }
714 
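/* Illustrative example of the reference-count semantics described above. */
#if 0
bdrv_enable_copy_on_read(bs);   /* first user: copy-on-read is now on */
bdrv_enable_copy_on_read(bs);   /* second user: count is 2 */
bdrv_disable_copy_on_read(bs);  /* first user done: still enabled */
bdrv_disable_copy_on_read(bs);  /* last user done: disabled again */
#endif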
715 static int bdrv_open_flags(BlockDriverState *bs, int flags)
716 {
717     int open_flags = flags | BDRV_O_CACHE_WB;
718 
719     /*
720      * Clear flags that are internal to the block layer before opening the
721      * image.
722      */
723     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
724 
725     /*
726      * Snapshots should be writable.
727      */
728     if (bs->is_temporary) {
729         open_flags |= BDRV_O_RDWR;
730     }
731 
732     return open_flags;
733 }
734 
735 /*
736  * Common part for opening disk images and files
737  *
738  * Removes all processed options from *options.
739  */
740 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
741     QDict *options, int flags, BlockDriver *drv, Error **errp)
742 {
743     int ret, open_flags;
744     const char *filename;
745     Error *local_err = NULL;
746 
747     assert(drv != NULL);
748     assert(bs->file == NULL);
749     assert(options != NULL && bs->options != options);
750 
751     if (file != NULL) {
752         filename = file->filename;
753     } else {
754         filename = qdict_get_try_str(options, "filename");
755     }
756 
757     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
758 
759     /* bdrv_open() was called directly with a protocol driver as drv. This
760      * layer is already opened, so assign it to bs (while file becomes a
761      * closed BlockDriverState) and return immediately. */
762     if (file != NULL && drv->bdrv_file_open) {
763         bdrv_swap(file, bs);
764         return 0;
765     }
766 
767     bs->open_flags = flags;
768     bs->buffer_alignment = 512;
769     bs->zero_beyond_eof = true;
770     open_flags = bdrv_open_flags(bs, flags);
771     bs->read_only = !(open_flags & BDRV_O_RDWR);
772 
773     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
774         error_setg(errp,
775                    !bs->read_only && bdrv_is_whitelisted(drv, true)
776                         ? "Driver '%s' can only be used for read-only devices"
777                         : "Driver '%s' is not whitelisted",
778                    drv->format_name);
779         return -ENOTSUP;
780     }
781 
782     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
783     if (flags & BDRV_O_COPY_ON_READ) {
784         if (!bs->read_only) {
785             bdrv_enable_copy_on_read(bs);
786         } else {
787             error_setg(errp, "Can't use copy-on-read on read-only device");
788             return -EINVAL;
789         }
790     }
791 
792     if (filename != NULL) {
793         pstrcpy(bs->filename, sizeof(bs->filename), filename);
794     } else {
795         bs->filename[0] = '\0';
796     }
797 
798     bs->drv = drv;
799     bs->opaque = g_malloc0(drv->instance_size);
800 
801     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
802 
803     /* Open the image, either directly or using a protocol */
804     if (drv->bdrv_file_open) {
805         assert(file == NULL);
806         assert(!drv->bdrv_needs_filename || filename != NULL);
807         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
808     } else {
809         if (file == NULL) {
810             error_setg(errp, "Can't use '%s' as a block driver for the "
811                        "protocol level", drv->format_name);
812             ret = -EINVAL;
813             goto free_and_fail;
814         }
815         bs->file = file;
816         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
817     }
818 
819     if (ret < 0) {
820         if (error_is_set(&local_err)) {
821             error_propagate(errp, local_err);
822         } else if (bs->filename[0]) {
823             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
824         } else {
825             error_setg_errno(errp, -ret, "Could not open image");
826         }
827         goto free_and_fail;
828     }
829 
830     ret = refresh_total_sectors(bs, bs->total_sectors);
831     if (ret < 0) {
832         error_setg_errno(errp, -ret, "Could not refresh total sector count");
833         goto free_and_fail;
834     }
835 
836 #ifndef _WIN32
837     if (bs->is_temporary) {
838         assert(bs->filename[0] != '\0');
839         unlink(bs->filename);
840     }
841 #endif
842     return 0;
843 
844 free_and_fail:
845     bs->file = NULL;
846     g_free(bs->opaque);
847     bs->opaque = NULL;
848     bs->drv = NULL;
849     return ret;
850 }
851 
852 /*
853  * Opens a file using a protocol (file, host_device, nbd, ...)
854  *
855  * options is a QDict of options to pass to the block drivers, or NULL for an
856  * empty set of options. The reference to the QDict belongs to the block layer
857  * after the call (even on failure), so if the caller intends to reuse the
858  * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
859  */
860 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
861                    QDict *options, int flags, Error **errp)
862 {
863     BlockDriverState *bs;
864     BlockDriver *drv;
865     const char *drvname;
866     bool allow_protocol_prefix = false;
867     Error *local_err = NULL;
868     int ret;
869 
870     /* NULL means an empty set of options */
871     if (options == NULL) {
872         options = qdict_new();
873     }
874 
875     bs = bdrv_new("");
876     bs->options = options;
877     options = qdict_clone_shallow(options);
878 
879     /* Fetch the file name from the options QDict if necessary */
880     if (!filename) {
881         filename = qdict_get_try_str(options, "filename");
882     } else if (filename && !qdict_haskey(options, "filename")) {
883         qdict_put(options, "filename", qstring_from_str(filename));
884         allow_protocol_prefix = true;
885     } else {
886         error_setg(errp, "Can't specify 'file' and 'filename' options at the "
887                    "same time");
888         ret = -EINVAL;
889         goto fail;
890     }
891 
892     /* Find the right block driver */
893     drvname = qdict_get_try_str(options, "driver");
894     if (drvname) {
895         drv = bdrv_find_format(drvname);
896         if (!drv) {
897             error_setg(errp, "Unknown driver '%s'", drvname);
898         }
899         qdict_del(options, "driver");
900     } else if (filename) {
901         drv = bdrv_find_protocol(filename, allow_protocol_prefix);
902         if (!drv) {
903             error_setg(errp, "Unknown protocol");
904         }
905     } else {
906         error_setg(errp, "Must specify either driver or file");
907         drv = NULL;
908     }
909 
910     if (!drv) {
911         /* errp has been set already */
912         ret = -ENOENT;
913         goto fail;
914     }
915 
916     /* Parse the filename and open it */
917     if (drv->bdrv_parse_filename && filename) {
918         drv->bdrv_parse_filename(filename, options, &local_err);
919         if (error_is_set(&local_err)) {
920             error_propagate(errp, local_err);
921             ret = -EINVAL;
922             goto fail;
923         }
924         qdict_del(options, "filename");
925     } else if (drv->bdrv_needs_filename && !filename) {
926         error_setg(errp, "The '%s' block driver requires a file name",
927                    drv->format_name);
928         ret = -EINVAL;
929         goto fail;
930     }
931 
932     ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
933     if (ret < 0) {
934         error_propagate(errp, local_err);
935         goto fail;
936     }
937 
938     /* Check if any unknown options were used */
939     if (qdict_size(options) != 0) {
940         const QDictEntry *entry = qdict_first(options);
941         error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
942                    drv->format_name, entry->key);
943         ret = -EINVAL;
944         goto fail;
945     }
946     QDECREF(options);
947 
948     bs->growable = 1;
949     *pbs = bs;
950     return 0;
951 
952 fail:
953     QDECREF(options);
954     if (!bs->drv) {
955         QDECREF(bs->options);
956     }
957     bdrv_unref(bs);
958     return ret;
959 }
960 
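/* Illustrative sketch (not compiled) of the QDict reference convention for
 * bdrv_file_open(): QINCREF() keeps a caller reference alive, because the
 * function consumes one reference even on failure. */
#if 0
QDict *opts = qdict_new();
BlockDriverState *bs;
Error *err = NULL;
int ret;

qdict_put(opts, "filename", qstring_from_str("/tmp/disk.img"));
QINCREF(opts);                                 /* we intend to reuse opts */
ret = bdrv_file_open(&bs, NULL, opts, BDRV_O_RDWR, &err);
/* ... reuse opts here ... */
QDECREF(opts);                                 /* drop our extra reference */
#endif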
961 /*
962  * Opens the backing file for a BlockDriverState if not yet open
963  *
964  * options is a QDict of options to pass to the block drivers, or NULL for an
965  * empty set of options. The reference to the QDict is transferred to this
966  * function (even on failure), so if the caller intends to reuse the dictionary,
967  * it needs to use QINCREF() before calling bdrv_open_backing_file().
968  */
969 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
970 {
971     char backing_filename[PATH_MAX];
972     int back_flags, ret;
973     BlockDriver *back_drv = NULL;
974     Error *local_err = NULL;
975 
976     if (bs->backing_hd != NULL) {
977         QDECREF(options);
978         return 0;
979     }
980 
981     /* NULL means an empty set of options */
982     if (options == NULL) {
983         options = qdict_new();
984     }
985 
986     bs->open_flags &= ~BDRV_O_NO_BACKING;
987     if (qdict_haskey(options, "file.filename")) {
988         backing_filename[0] = '\0';
989     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
990         QDECREF(options);
991         return 0;
992     } else {
993         bdrv_get_full_backing_filename(bs, backing_filename,
994                                        sizeof(backing_filename));
995     }
996 
997     bs->backing_hd = bdrv_new("");
998 
999     if (bs->backing_format[0] != '\0') {
1000         back_drv = bdrv_find_format(bs->backing_format);
1001     }
1002 
1003     /* backing files always opened read-only */
1004     back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1005                                     BDRV_O_COPY_ON_READ);
1006 
1007     ret = bdrv_open(bs->backing_hd,
1008                     *backing_filename ? backing_filename : NULL, options,
1009                     back_flags, back_drv, &local_err);
1010     if (ret < 0) {
1011         bdrv_unref(bs->backing_hd);
1012         bs->backing_hd = NULL;
1013         bs->open_flags |= BDRV_O_NO_BACKING;
1014         error_setg(errp, "Could not open backing file: %s",
1015                    error_get_pretty(local_err));
1016         error_free(local_err);
1017         return ret;
1018     }
1019     pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1020             bs->backing_hd->file->filename);
1021     return 0;
1022 }
1023 
1024 /*
1025  * Opens a disk image (raw, qcow2, vmdk, ...)
1026  *
1027  * options is a QDict of options to pass to the block drivers, or NULL for an
1028  * empty set of options. The reference to the QDict belongs to the block layer
1029  * after the call (even on failure), so if the caller intends to reuse the
1030  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1031  */
1032 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
1033               int flags, BlockDriver *drv, Error **errp)
1034 {
1035     int ret;
1036     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1037     char tmp_filename[PATH_MAX + 1];
1038     BlockDriverState *file = NULL;
1039     QDict *file_options = NULL;
1040     const char *drvname;
1041     Error *local_err = NULL;
1042 
1043     /* NULL means an empty set of options */
1044     if (options == NULL) {
1045         options = qdict_new();
1046     }
1047 
1048     bs->options = options;
1049     options = qdict_clone_shallow(options);
1050 
1051     /* For snapshot=on, create a temporary qcow2 overlay */
1052     if (flags & BDRV_O_SNAPSHOT) {
1053         BlockDriverState *bs1;
1054         int64_t total_size;
1055         BlockDriver *bdrv_qcow2;
1056         QEMUOptionParameter *create_options;
1057         QDict *snapshot_options;
1058 
1059         /* if snapshot, we create a temporary backing file and open it
1060            instead of opening 'filename' directly */
1061 
1062         /* Get the required size from the image */
1063         bs1 = bdrv_new("");
1064         QINCREF(options);
1065         ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
1066                         drv, &local_err);
1067         if (ret < 0) {
1068             bdrv_unref(bs1);
1069             goto fail;
1070         }
1071         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1072 
1073         bdrv_unref(bs1);
1074 
1075         /* Create the temporary image */
1076         ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1077         if (ret < 0) {
1078             error_setg_errno(errp, -ret, "Could not get temporary filename");
1079             goto fail;
1080         }
1081 
1082         bdrv_qcow2 = bdrv_find_format("qcow2");
1083         create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1084                                                  NULL);
1085 
1086         set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1087 
1088         ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1089         free_option_parameters(create_options);
1090         if (ret < 0) {
1091             error_setg_errno(errp, -ret, "Could not create temporary overlay "
1092                              "'%s': %s", tmp_filename,
1093                              error_get_pretty(local_err));
1094             error_free(local_err);
1095             local_err = NULL;
1096             goto fail;
1097         }
1098 
1099         /* Prepare a new options QDict for the temporary file, where user
1100          * options refer to the backing file */
1101         if (filename) {
1102             qdict_put(options, "file.filename", qstring_from_str(filename));
1103         }
1104         if (drv) {
1105             qdict_put(options, "driver", qstring_from_str(drv->format_name));
1106         }
1107 
1108         snapshot_options = qdict_new();
1109         qdict_put(snapshot_options, "backing", options);
1110         qdict_flatten(snapshot_options);
1111 
1112         bs->options = snapshot_options;
1113         options = qdict_clone_shallow(bs->options);
1114 
1115         filename = tmp_filename;
1116         drv = bdrv_qcow2;
1117         bs->is_temporary = 1;
1118     }
1119 
1120     /* Open image file without format layer */
1121     if (flags & BDRV_O_RDWR) {
1122         flags |= BDRV_O_ALLOW_RDWR;
1123     }
1124 
1125     qdict_extract_subqdict(options, &file_options, "file.");
1126 
1127     ret = bdrv_file_open(&file, filename, file_options,
1128                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
1129     if (ret < 0) {
1130         goto fail;
1131     }
1132 
1133     /* Find the right image format driver */
1134     drvname = qdict_get_try_str(options, "driver");
1135     if (drvname) {
1136         drv = bdrv_find_format(drvname);
1137         qdict_del(options, "driver");
1138         if (!drv) {
1139             error_setg(errp, "Invalid driver: '%s'", drvname);
1140             ret = -EINVAL;
1141             goto unlink_and_fail;
1142         }
1143     }
1144 
1145     if (!drv) {
1146         ret = find_image_format(file, filename, &drv, &local_err);
1147     }
1148 
1149     if (!drv) {
1150         goto unlink_and_fail;
1151     }
1152 
1153     /* Open the image */
1154     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1155     if (ret < 0) {
1156         goto unlink_and_fail;
1157     }
1158 
1159     if (bs->file != file) {
1160         bdrv_unref(file);
1161         file = NULL;
1162     }
1163 
1164     /* If there is a backing file, use it */
1165     if ((flags & BDRV_O_NO_BACKING) == 0) {
1166         QDict *backing_options;
1167 
1168         qdict_extract_subqdict(options, &backing_options, "backing.");
1169         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1170         if (ret < 0) {
1171             goto close_and_fail;
1172         }
1173     }
1174 
1175     /* Check if any unknown options were used */
1176     if (qdict_size(options) != 0) {
1177         const QDictEntry *entry = qdict_first(options);
1178         error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1179                    "support the option '%s'", drv->format_name, bs->device_name,
1180                    entry->key);
1181 
1182         ret = -EINVAL;
1183         goto close_and_fail;
1184     }
1185     QDECREF(options);
1186 
1187     if (!bdrv_key_required(bs)) {
1188         bdrv_dev_change_media_cb(bs, true);
1189     }
1190 
1191     return 0;
1192 
1193 unlink_and_fail:
1194     if (file != NULL) {
1195         bdrv_unref(file);
1196     }
1197     if (bs->is_temporary) {
1198         unlink(filename);
1199     }
1200 fail:
1201     QDECREF(bs->options);
1202     QDECREF(options);
1203     bs->options = NULL;
1204     if (error_is_set(&local_err)) {
1205         error_propagate(errp, local_err);
1206     }
1207     return ret;
1208 
1209 close_and_fail:
1210     bdrv_close(bs);
1211     QDECREF(options);
1212     if (error_is_set(&local_err)) {
1213         error_propagate(errp, local_err);
1214     }
1215     return ret;
1216 }
1217 
1218 typedef struct BlockReopenQueueEntry {
1219      bool prepared;
1220      BDRVReopenState state;
1221      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1222 } BlockReopenQueueEntry;
1223 
1224 /*
1225  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1226  * reopen of multiple devices.
1227  *
1228  * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
1229  * has already been performed, or it may be NULL, in which case a new
1230  * BlockReopenQueue will be created and initialized. This newly created
1231  * BlockReopenQueue should be passed back in for subsequent calls that are
1232  * intended to be part of the same atomic 'set'.
1233  *
1234  * bs is the BlockDriverState to add to the reopen queue.
1235  *
1236  * flags contains the open flags for the associated bs
1237  *
1238  * returns a pointer to bs_queue, which is either the newly allocated
1239  * bs_queue, or the existing bs_queue being used.
1240  *
1241  */
1242 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1243                                     BlockDriverState *bs, int flags)
1244 {
1245     assert(bs != NULL);
1246 
1247     BlockReopenQueueEntry *bs_entry;
1248     if (bs_queue == NULL) {
1249         bs_queue = g_new0(BlockReopenQueue, 1);
1250         QSIMPLEQ_INIT(bs_queue);
1251     }
1252 
1253     if (bs->file) {
1254         bdrv_reopen_queue(bs_queue, bs->file, flags);
1255     }
1256 
1257     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1258     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1259 
1260     bs_entry->state.bs = bs;
1261     bs_entry->state.flags = flags;
1262 
1263     return bs_queue;
1264 }
1265 
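/* Illustrative sketch (not compiled): queueing two devices for one atomic
 * reopen.  bdrv_reopen() below is the single-device convenience wrapper. */
#if 0
BlockReopenQueue *queue = NULL;
Error *local_err = NULL;

queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
if (bdrv_reopen_multiple(queue, &local_err) < 0) {
    /* all devices were rolled back; local_err describes the failure */
}
#endif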
1266 /*
1267  * Reopen multiple BlockDriverStates atomically & transactionally.
1268  *
1269  * The queue passed in (bs_queue) must have been built up previously
1270  * via bdrv_reopen_queue().
1271  *
1272  * Reopens all BDS specified in the queue, with the appropriate
1273  * flags.  All devices are prepared for reopen, and failure of any
1274  * device will cause all device changes to be abandoned, and intermediate
1275  * data cleaned up.
1276  *
1277  * If all devices prepare successfully, then the changes are committed
1278  * to all devices.
1279  *
1280  */
1281 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1282 {
1283     int ret = -1;
1284     BlockReopenQueueEntry *bs_entry, *next;
1285     Error *local_err = NULL;
1286 
1287     assert(bs_queue != NULL);
1288 
1289     bdrv_drain_all();
1290 
1291     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1292         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1293             error_propagate(errp, local_err);
1294             goto cleanup;
1295         }
1296         bs_entry->prepared = true;
1297     }
1298 
1299     /* If we reach this point, we have success and just need to apply the
1300      * changes
1301      */
1302     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1303         bdrv_reopen_commit(&bs_entry->state);
1304     }
1305 
1306     ret = 0;
1307 
1308 cleanup:
1309     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1310         if (ret && bs_entry->prepared) {
1311             bdrv_reopen_abort(&bs_entry->state);
1312         }
1313         g_free(bs_entry);
1314     }
1315     g_free(bs_queue);
1316     return ret;
1317 }
1318 
1319 
1320 /* Reopen a single BlockDriverState with the specified flags. */
1321 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1322 {
1323     int ret = -1;
1324     Error *local_err = NULL;
1325     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1326 
1327     ret = bdrv_reopen_multiple(queue, &local_err);
1328     if (local_err != NULL) {
1329         error_propagate(errp, local_err);
1330     }
1331     return ret;
1332 }
1333 
1334 
1335 /*
1336  * Prepares a BlockDriverState for reopen. All changes are staged in the
1337  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1338  * the block driver's .bdrv_reopen_prepare() handler.
1339  *
1340  * bs is the BlockDriverState to reopen
1341  * flags are the new open flags
1342  * queue is the reopen queue
1343  *
1344  * Returns 0 on success, non-zero on error.  On error, errp will be set
1345  * as well.
1346  *
1347  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1348  * It is the responsibility of the caller to then call the abort() or
1349  * commit() for any other BDS that have been left in a prepare() state
1350  *
1351  */
1352 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1353                         Error **errp)
1354 {
1355     int ret = -1;
1356     Error *local_err = NULL;
1357     BlockDriver *drv;
1358 
1359     assert(reopen_state != NULL);
1360     assert(reopen_state->bs->drv != NULL);
1361     drv = reopen_state->bs->drv;
1362 
1363     /* if we are to stay read-only, do not allow permission change
1364      * to r/w */
1365     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1366         reopen_state->flags & BDRV_O_RDWR) {
1367         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1368                   reopen_state->bs->device_name);
1369         goto error;
1370     }
1371 
1372 
1373     ret = bdrv_flush(reopen_state->bs);
1374     if (ret) {
1375         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1376                   strerror(-ret));
1377         goto error;
1378     }
1379 
1380     if (drv->bdrv_reopen_prepare) {
1381         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1382         if (ret) {
1383             if (local_err != NULL) {
1384                 error_propagate(errp, local_err);
1385             } else {
1386                 error_setg(errp, "failed while preparing to reopen image '%s'",
1387                            reopen_state->bs->filename);
1388             }
1389             goto error;
1390         }
1391     } else {
1392         /* It is currently mandatory to have a bdrv_reopen_prepare()
1393          * handler for each supported drv. */
1394         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1395                   drv->format_name, reopen_state->bs->device_name,
1396                  "reopening of file");
1397         ret = -1;
1398         goto error;
1399     }
1400 
1401     ret = 0;
1402 
1403 error:
1404     return ret;
1405 }
1406 
1407 /*
1408  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1409  * makes them final by swapping the staging BlockDriverState contents into
1410  * the active BlockDriverState contents.
1411  */
1412 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1413 {
1414     BlockDriver *drv;
1415 
1416     assert(reopen_state != NULL);
1417     drv = reopen_state->bs->drv;
1418     assert(drv != NULL);
1419 
1420     /* If there are any driver level actions to take */
1421     if (drv->bdrv_reopen_commit) {
1422         drv->bdrv_reopen_commit(reopen_state);
1423     }
1424 
1425     /* set BDS specific flags now */
1426     reopen_state->bs->open_flags         = reopen_state->flags;
1427     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1428                                               BDRV_O_CACHE_WB);
1429     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1430 }
1431 
1432 /*
1433  * Abort the reopen, and delete and free the staged changes in
1434  * reopen_state
1435  */
1436 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1437 {
1438     BlockDriver *drv;
1439 
1440     assert(reopen_state != NULL);
1441     drv = reopen_state->bs->drv;
1442     assert(drv != NULL);
1443 
1444     if (drv->bdrv_reopen_abort) {
1445         drv->bdrv_reopen_abort(reopen_state);
1446     }
1447 }
1448 
1449 
1450 void bdrv_close(BlockDriverState *bs)
1451 {
1452     if (bs->job) {
1453         block_job_cancel_sync(bs->job);
1454     }
1455     bdrv_drain_all(); /* complete I/O */
1456     bdrv_flush(bs);
1457     bdrv_drain_all(); /* in case flush left pending I/O */
1458     notifier_list_notify(&bs->close_notifiers, bs);
1459 
1460     if (bs->drv) {
1461         if (bs->backing_hd) {
1462             bdrv_unref(bs->backing_hd);
1463             bs->backing_hd = NULL;
1464         }
1465         bs->drv->bdrv_close(bs);
1466         g_free(bs->opaque);
1467 #ifdef _WIN32
1468         if (bs->is_temporary) {
1469             unlink(bs->filename);
1470         }
1471 #endif
1472         bs->opaque = NULL;
1473         bs->drv = NULL;
1474         bs->copy_on_read = 0;
1475         bs->backing_file[0] = '\0';
1476         bs->backing_format[0] = '\0';
1477         bs->total_sectors = 0;
1478         bs->encrypted = 0;
1479         bs->valid_key = 0;
1480         bs->sg = 0;
1481         bs->growable = 0;
1482         bs->zero_beyond_eof = false;
1483         QDECREF(bs->options);
1484         bs->options = NULL;
1485 
1486         if (bs->file != NULL) {
1487             bdrv_unref(bs->file);
1488             bs->file = NULL;
1489         }
1490     }
1491 
1492     bdrv_dev_change_media_cb(bs, false);
1493 
1494     /* throttling disk I/O limits */
1495     if (bs->io_limits_enabled) {
1496         bdrv_io_limits_disable(bs);
1497     }
1498 }
1499 
1500 void bdrv_close_all(void)
1501 {
1502     BlockDriverState *bs;
1503 
1504     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1505         bdrv_close(bs);
1506     }
1507 }
1508 
1509 /* Check if any requests are in-flight (including throttled requests) */
1510 static bool bdrv_requests_pending(BlockDriverState *bs)
1511 {
1512     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1513         return true;
1514     }
1515     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1516         return true;
1517     }
1518     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1519         return true;
1520     }
1521     if (bs->file && bdrv_requests_pending(bs->file)) {
1522         return true;
1523     }
1524     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1525         return true;
1526     }
1527     return false;
1528 }
1529 
1530 static bool bdrv_requests_pending_all(void)
1531 {
1532     BlockDriverState *bs;
1533     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1534         if (bdrv_requests_pending(bs)) {
1535             return true;
1536         }
1537     }
1538     return false;
1539 }
1540 
1541 /*
1542  * Wait for pending requests to complete across all BlockDriverStates
1543  *
1544  * This function does not flush data to disk, use bdrv_flush_all() for that
1545  * after calling this function.
1546  *
1547  * Note that completion of an asynchronous I/O operation can trigger any
1548  * number of other I/O operations on other devices---for example a coroutine
1549  * can be arbitrarily complex and a constant flow of I/O can come until the
1550  * coroutine is complete.  Because of this, it is not possible to have a
1551  * function to drain a single device's I/O queue.
1552  */
1553 void bdrv_drain_all(void)
1554 {
1555     /* Always run first iteration so any pending completion BHs run */
1556     bool busy = true;
1557     BlockDriverState *bs;
1558 
1559     while (busy) {
1560         /* FIXME: We do not have timer support here, so this is effectively
1561          * a busy wait.
1562          */
1563         QTAILQ_FOREACH(bs, &bdrv_states, list) {
1564             if (bdrv_start_throttled_reqs(bs)) {
1565                 busy = true;
1566             }
1567         }
1568 
1569         busy = bdrv_requests_pending_all();
1570         busy |= aio_poll(qemu_get_aio_context(), busy);
1571     }
1572 }
1573 
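/* Illustrative ordering, per the note above (bdrv_flush_all() is defined
 * elsewhere in this file): */
#if 0
bdrv_drain_all();  /* wait for in-flight and throttled requests */
bdrv_flush_all();  /* then flush the data to disk */
#endif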
1574 /* make a BlockDriverState anonymous by removing it from the bdrv_states list.
1575    Also, NUL-terminate the device_name to prevent a double remove */
1576 void bdrv_make_anon(BlockDriverState *bs)
1577 {
1578     if (bs->device_name[0] != '\0') {
1579         QTAILQ_REMOVE(&bdrv_states, bs, list);
1580     }
1581     bs->device_name[0] = '\0';
1582 }
1583 
1584 static void bdrv_rebind(BlockDriverState *bs)
1585 {
1586     if (bs->drv && bs->drv->bdrv_rebind) {
1587         bs->drv->bdrv_rebind(bs);
1588     }
1589 }
1590 
1591 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1592                                      BlockDriverState *bs_src)
1593 {
1594     /* move some fields that need to stay attached to the device */
1595     bs_dest->open_flags         = bs_src->open_flags;
1596 
1597     /* dev info */
1598     bs_dest->dev_ops            = bs_src->dev_ops;
1599     bs_dest->dev_opaque         = bs_src->dev_opaque;
1600     bs_dest->dev                = bs_src->dev;
1601     bs_dest->buffer_alignment   = bs_src->buffer_alignment;
1602     bs_dest->copy_on_read       = bs_src->copy_on_read;
1603 
1604     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1605 
1606     /* i/o throttled req */
1607     memcpy(&bs_dest->throttle_state,
1608            &bs_src->throttle_state,
1609            sizeof(ThrottleState));
1610     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1611     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1612     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1613 
1614     /* r/w error */
1615     bs_dest->on_read_error      = bs_src->on_read_error;
1616     bs_dest->on_write_error     = bs_src->on_write_error;
1617 
1618     /* i/o status */
1619     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1620     bs_dest->iostatus           = bs_src->iostatus;
1621 
1622     /* dirty bitmap */
1623     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1624 
1625     /* reference count */
1626     bs_dest->refcnt             = bs_src->refcnt;
1627 
1628     /* job */
1629     bs_dest->in_use             = bs_src->in_use;
1630     bs_dest->job                = bs_src->job;
1631 
1632     /* keep the same entry in bdrv_states */
1633     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1634             bs_src->device_name);
1635     bs_dest->list = bs_src->list;
1636 }
1637 
1638 /*
1639  * Swap bs contents for two image chains while they are live,
1640  * while keeping required fields on the BlockDriverState that is
1641  * actually attached to a device.
1642  *
1643  * This will modify the BlockDriverState fields, and swap contents
1644  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1645  *
1646  * bs_new is required to be anonymous.
1647  *
1648  * This function does not create any image files.
1649  */
1650 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1651 {
1652     BlockDriverState tmp;
1653 
1654     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1655     assert(bs_new->device_name[0] == '\0');
1656     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1657     assert(bs_new->job == NULL);
1658     assert(bs_new->dev == NULL);
1659     assert(bs_new->in_use == 0);
1660     assert(bs_new->io_limits_enabled == false);
1661     assert(!throttle_have_timer(&bs_new->throttle_state));
1662 
1663     tmp = *bs_new;
1664     *bs_new = *bs_old;
1665     *bs_old = tmp;
1666 
1667     /* there are some fields that should not be swapped, move them back */
1668     bdrv_move_feature_fields(&tmp, bs_old);
1669     bdrv_move_feature_fields(bs_old, bs_new);
1670     bdrv_move_feature_fields(bs_new, &tmp);
1671 
1672     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1673     assert(bs_new->device_name[0] == '\0');
1674 
1675     /* Check a few fields that should remain attached to the device */
1676     assert(bs_new->dev == NULL);
1677     assert(bs_new->job == NULL);
1678     assert(bs_new->in_use == 0);
1679     assert(bs_new->io_limits_enabled == false);
1680     assert(!throttle_have_timer(&bs_new->throttle_state));
1681 
1682     bdrv_rebind(bs_new);
1683     bdrv_rebind(bs_old);
1684 }
1685 
1686 /*
1687  * Add new bs contents at the top of an image chain while the chain is
1688  * live, while keeping required fields on the top layer.
1689  *
1690  * This will modify the BlockDriverState fields, and swap contents
1691  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1692  *
1693  * bs_new is required to be anonymous.
1694  *
1695  * This function does not create any image files.
1696  */
1697 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1698 {
1699     bdrv_swap(bs_new, bs_top);
1700 
1701     /* The contents of 'tmp' will become bs_top, as we are
1702      * swapping bs_new and bs_top contents. */
1703     bs_top->backing_hd = bs_new;
1704     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1705     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1706             bs_new->filename);
1707     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1708             bs_new->drv ? bs_new->drv->format_name : "");
1709 }
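/* Illustrative picture of bdrv_append(): used for live snapshots, where a
 * freshly opened overlay becomes the new active layer of the device.
 *
 *   before:  [backing] <- [active (attached to device)]
 *   after:   [backing] <- [old active] <- [new overlay (attached to device)]
 *
 * The device keeps pointing at the same BlockDriverState (bs_top); only the
 * contents are swapped, as described above. */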
1710 
1711 static void bdrv_delete(BlockDriverState *bs)
1712 {
1713     assert(!bs->dev);
1714     assert(!bs->job);
1715     assert(!bs->in_use);
1716     assert(!bs->refcnt);
1717     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1718 
1719     bdrv_close(bs);
1720 
1721     /* remove from list, if necessary */
1722     bdrv_make_anon(bs);
1723 
1724     g_free(bs);
1725 }
1726 
1727 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1728 /* TODO change to DeviceState *dev when all users are qdevified */
1729 {
1730     if (bs->dev) {
1731         return -EBUSY;
1732     }
1733     bs->dev = dev;
1734     bdrv_iostatus_reset(bs);
1735     return 0;
1736 }
1737 
1738 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1739 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1740 {
1741     if (bdrv_attach_dev(bs, dev) < 0) {
1742         abort();
1743     }
1744 }
1745 
1746 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1747 /* TODO change to DeviceState *dev when all users are qdevified */
1748 {
1749     assert(bs->dev == dev);
1750     bs->dev = NULL;
1751     bs->dev_ops = NULL;
1752     bs->dev_opaque = NULL;
1753     bs->buffer_alignment = 512;
1754 }
1755 
1756 /* TODO change to return DeviceState * when all users are qdevified */
1757 void *bdrv_get_attached_dev(BlockDriverState *bs)
1758 {
1759     return bs->dev;
1760 }
1761 
1762 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1763                       void *opaque)
1764 {
1765     bs->dev_ops = ops;
1766     bs->dev_opaque = opaque;
1767 }
1768 
1769 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1770                                enum MonitorEvent ev,
1771                                BlockErrorAction action, bool is_read)
1772 {
1773     QObject *data;
1774     const char *action_str;
1775 
1776     switch (action) {
1777     case BDRV_ACTION_REPORT:
1778         action_str = "report";
1779         break;
1780     case BDRV_ACTION_IGNORE:
1781         action_str = "ignore";
1782         break;
1783     case BDRV_ACTION_STOP:
1784         action_str = "stop";
1785         break;
1786     default:
1787         abort();
1788     }
1789 
1790     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1791                               bdrv->device_name,
1792                               action_str,
1793                               is_read ? "read" : "write");
1794     monitor_protocol_event(ev, data);
1795 
1796     qobject_decref(data);
1797 }
1798 
1799 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1800 {
1801     QObject *data;
1802 
1803     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1804                               bdrv_get_device_name(bs), ejected);
1805     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1806 
1807     qobject_decref(data);
1808 }
1809 
1810 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1811 {
1812     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1813         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1814         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1815         if (tray_was_closed) {
1816             /* tray open */
1817             bdrv_emit_qmp_eject_event(bs, true);
1818         }
1819         if (load) {
1820             /* tray close */
1821             bdrv_emit_qmp_eject_event(bs, false);
1822         }
1823     }
1824 }
1825 
1826 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1827 {
1828     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1829 }
1830 
1831 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1832 {
1833     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1834         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1835     }
1836 }
1837 
1838 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1839 {
1840     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1841         return bs->dev_ops->is_tray_open(bs->dev_opaque);
1842     }
1843     return false;
1844 }
1845 
1846 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1847 {
1848     if (bs->dev_ops && bs->dev_ops->resize_cb) {
1849         bs->dev_ops->resize_cb(bs->dev_opaque);
1850     }
1851 }
1852 
1853 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1854 {
1855     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1856         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1857     }
1858     return false;
1859 }
1860 
1861 /*
1862  * Run consistency checks on an image
1863  *
1864  * Returns 0 if the check could be completed (it doesn't mean that the image is
1865  * free of errors) or -errno when an internal error occurred. The results of the
1866  * check are stored in res.
1867  */
1868 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1869 {
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }
1873 
1874     memset(res, 0, sizeof(*res));
1875     return bs->drv->bdrv_check(bs, res, fix);
1876 }
1877 
1878 #define COMMIT_BUF_SECTORS 2048
1879 
1880 /* commit COW file into the raw image */
1881 int bdrv_commit(BlockDriverState *bs)
1882 {
1883     BlockDriver *drv = bs->drv;
1884     int64_t sector, total_sectors;
1885     int n, ro, open_flags;
1886     int ret = 0;
1887     uint8_t *buf;
1888     char filename[PATH_MAX];
1889 
    if (!drv) {
        return -ENOMEDIUM;
    }
1892 
1893     if (!bs->backing_hd) {
1894         return -ENOTSUP;
1895     }
1896 
1897     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1898         return -EBUSY;
1899     }
1900 
1901     ro = bs->backing_hd->read_only;
1902     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1903     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1904     open_flags =  bs->backing_hd->open_flags;
1905 
1906     if (ro) {
1907         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1908             return -EACCES;
1909         }
1910     }
1911 
1912     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1913     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1914 
1915     for (sector = 0; sector < total_sectors; sector += n) {
1916         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
1917         if (ret < 0) {
1918             goto ro_cleanup;
1919         }
1920         if (ret) {
1921             if (bdrv_read(bs, sector, buf, n) != 0) {
1922                 ret = -EIO;
1923                 goto ro_cleanup;
1924             }
1925 
1926             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1927                 ret = -EIO;
1928                 goto ro_cleanup;
1929             }
1930         }
1931     }
1932 
1933     if (drv->bdrv_make_empty) {
1934         ret = drv->bdrv_make_empty(bs);
1935         bdrv_flush(bs);
1936     }
1937 
1938     /*
1939      * Make sure all data we wrote to the backing device is actually
1940      * stable on disk.
1941      */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }
1944 
1945 ro_cleanup:
1946     g_free(buf);
1947 
1948     if (ro) {
1949         /* ignoring error return here */
1950         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1951     }
1952 
1953     return ret;
1954 }
1955 
1956 int bdrv_commit_all(void)
1957 {
1958     BlockDriverState *bs;
1959 
1960     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1961         if (bs->drv && bs->backing_hd) {
1962             int ret = bdrv_commit(bs);
1963             if (ret < 0) {
1964                 return ret;
1965             }
1966         }
1967     }
1968     return 0;
1969 }
1970 
1971 /**
1972  * Remove an active request from the tracked requests list
1973  *
1974  * This function should be called when a tracked request is completing.
1975  */
1976 static void tracked_request_end(BdrvTrackedRequest *req)
1977 {
1978     QLIST_REMOVE(req, list);
1979     qemu_co_queue_restart_all(&req->wait_queue);
1980 }
1981 
1982 /**
1983  * Add an active request to the tracked requests list
1984  */
1985 static void tracked_request_begin(BdrvTrackedRequest *req,
1986                                   BlockDriverState *bs,
1987                                   int64_t sector_num,
1988                                   int nb_sectors, bool is_write)
1989 {
1990     *req = (BdrvTrackedRequest){
1991         .bs = bs,
1992         .sector_num = sector_num,
1993         .nb_sectors = nb_sectors,
1994         .is_write = is_write,
1995         .co = qemu_coroutine_self(),
1996     };
1997 
1998     qemu_co_queue_init(&req->wait_queue);
1999 
2000     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2001 }
2002 
2003 /**
2004  * Round a region to cluster boundaries
2005  */
2006 void bdrv_round_to_clusters(BlockDriverState *bs,
2007                             int64_t sector_num, int nb_sectors,
2008                             int64_t *cluster_sector_num,
2009                             int *cluster_nb_sectors)
2010 {
2011     BlockDriverInfo bdi;
2012 
2013     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2014         *cluster_sector_num = sector_num;
2015         *cluster_nb_sectors = nb_sectors;
2016     } else {
2017         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2018         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2019         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2020                                             nb_sectors, c);
2021     }
2022 }
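/*
 * Example: with a 64 KiB cluster size, c = 128 sectors.  The region
 * sector_num = 130, nb_sectors = 10 rounds to cluster_sector_num = 128
 * and cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128) = 128,
 * i.e. exactly the one cluster the request touches.
 */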
2023 
2024 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2025                                      int64_t sector_num, int nb_sectors) {
2026     /*        aaaa   bbbb */
2027     if (sector_num >= req->sector_num + req->nb_sectors) {
2028         return false;
2029     }
2030     /* bbbb   aaaa        */
2031     if (req->sector_num >= sector_num + nb_sectors) {
2032         return false;
2033     }
2034     return true;
2035 }
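/* The half-open interval test above means, e.g., that requests covering
 * sectors [8, 12) and [10, 16) overlap, while [8, 10) and [10, 16) merely
 * touch and do not. */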
2036 
2037 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
2038         int64_t sector_num, int nb_sectors)
2039 {
2040     BdrvTrackedRequest *req;
2041     int64_t cluster_sector_num;
2042     int cluster_nb_sectors;
2043     bool retry;
2044 
2045     /* If we touch the same cluster it counts as an overlap.  This guarantees
2046      * that allocating writes will be serialized and not race with each other
2047      * for the same cluster.  For example, in copy-on-read it ensures that the
2048      * CoR read and write operations are atomic and guest writes cannot
2049      * interleave between them.
2050      */
2051     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2052                            &cluster_sector_num, &cluster_nb_sectors);
2053 
2054     do {
2055         retry = false;
2056         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2057             if (tracked_request_overlaps(req, cluster_sector_num,
2058                                          cluster_nb_sectors)) {
2059                 /* Hitting this means there was a reentrant request, for
2060                  * example, a block driver issuing nested requests.  This must
2061                  * never happen since it means deadlock.
2062                  */
2063                 assert(qemu_coroutine_self() != req->co);
2064 
2065                 qemu_co_queue_wait(&req->wait_queue);
2066                 retry = true;
2067                 break;
2068             }
2069         }
2070     } while (retry);
2071 }
2072 
2073 /*
2074  * Return values:
2075  * 0        - success
2076  * -EINVAL  - backing format specified, but no file
2077  * -ENOSPC  - can't update the backing file because no space is left in the
2078  *            image file header
2079  * -ENOTSUP - format driver doesn't support changing the backing file
2080  */
2081 int bdrv_change_backing_file(BlockDriverState *bs,
2082     const char *backing_file, const char *backing_fmt)
2083 {
2084     BlockDriver *drv = bs->drv;
2085     int ret;
2086 
2087     /* Backing file format doesn't make sense without a backing file */
2088     if (backing_fmt && !backing_file) {
2089         return -EINVAL;
2090     }
2091 
2092     if (drv->bdrv_change_backing_file != NULL) {
2093         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2094     } else {
2095         ret = -ENOTSUP;
2096     }
2097 
2098     if (ret == 0) {
2099         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2100         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2101     }
2102     return ret;
2103 }
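/*
 * Illustrative call (this mirrors what bdrv_drop_intermediate() below
 * does after collapsing a chain):
 *
 *     ret = bdrv_change_backing_file(overlay, base->filename,
 *                                    base->drv ? base->drv->format_name
 *                                              : "");
 *
 * The new backing file name is written into the overlay's image header,
 * so the call only succeeds for formats whose driver implements
 * .bdrv_change_backing_file (e.g. qcow2).
 */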
2104 
2105 /*
2106  * Finds the image layer in the chain that has 'bs' as its backing file.
2107  *
2108  * active is the current topmost image.
2109  *
2110  * Returns NULL if bs is not found in active's image chain,
2111  * or if active == bs.
2112  */
2113 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2114                                     BlockDriverState *bs)
2115 {
2116     BlockDriverState *overlay = NULL;
2117     BlockDriverState *intermediate;
2118 
2119     assert(active != NULL);
2120     assert(bs != NULL);
2121 
2122     /* if bs is the same as active, then by definition it has no overlay
2123      */
2124     if (active == bs) {
2125         return NULL;
2126     }
2127 
2128     intermediate = active;
2129     while (intermediate->backing_hd) {
2130         if (intermediate->backing_hd == bs) {
2131             overlay = intermediate;
2132             break;
2133         }
2134         intermediate = intermediate->backing_hd;
2135     }
2136 
2137     return overlay;
2138 }
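/*
 * Example: for the chain base <- sn1 <- sn2 <- active,
 * bdrv_find_overlay(active, sn1) returns sn2, while
 * bdrv_find_overlay(active, active) returns NULL.
 */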
2139 
2140 typedef struct BlkIntermediateStates {
2141     BlockDriverState *bs;
2142     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2143 } BlkIntermediateStates;
2144 
2145 
2146 /*
2147  * Drops images above 'base' up to and including 'top', and sets the image
2148  * above 'top' to have base as its backing file.
2149  *
2150  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2151  * information in 'bs' can be properly updated.
2152  *
2153  * E.g., this will convert the following chain:
2154  * bottom <- base <- intermediate <- top <- active
2155  *
2156  * to
2157  *
2158  * bottom <- base <- active
2159  *
2160  * It is allowed for bottom==base, in which case it converts:
2161  *
2162  * base <- intermediate <- top <- active
2163  *
2164  * to
2165  *
2166  * base <- active
2167  *
2168  * Error conditions:
2169  *  if active == top, that is considered an error
2170  *
2171  */
2172 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2173                            BlockDriverState *base)
2174 {
2175     BlockDriverState *intermediate;
2176     BlockDriverState *base_bs = NULL;
2177     BlockDriverState *new_top_bs = NULL;
2178     BlkIntermediateStates *intermediate_state, *next;
2179     int ret = -EIO;
2180 
2181     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2182     QSIMPLEQ_INIT(&states_to_delete);
2183 
2184     if (!top->drv || !base->drv) {
2185         goto exit;
2186     }
2187 
2188     new_top_bs = bdrv_find_overlay(active, top);
2189 
2190     if (new_top_bs == NULL) {
2191         /* we could not find the image above 'top', this is an error */
2192         goto exit;
2193     }
2194 
2195     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2196      * to do, no intermediate images */
2197     if (new_top_bs->backing_hd == base) {
2198         ret = 0;
2199         goto exit;
2200     }
2201 
2202     intermediate = top;
2203 
2204     /* now we will go down through the list, and add each BDS we find
2205      * into our deletion queue, until we hit the 'base'
2206      */
2207     while (intermediate) {
2208         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2209         intermediate_state->bs = intermediate;
2210         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2211 
2212         if (intermediate->backing_hd == base) {
2213             base_bs = intermediate->backing_hd;
2214             break;
2215         }
2216         intermediate = intermediate->backing_hd;
2217     }
2218     if (base_bs == NULL) {
        /* Something went wrong; we did not end at the base.  Safely
         * unravel everything, and exit with an error. */
2221         goto exit;
2222     }
2223 
2224     /* success - we can delete the intermediate states, and link top->base */
2225     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2226                                    base_bs->drv ? base_bs->drv->format_name : "");
2227     if (ret) {
2228         goto exit;
2229     }
2230     new_top_bs->backing_hd = base_bs;
2231 
2232 
2233     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2234         /* so that bdrv_close() does not recursively close the chain */
2235         intermediate_state->bs->backing_hd = NULL;
2236         bdrv_unref(intermediate_state->bs);
2237     }
2238     ret = 0;
2239 
2240 exit:
2241     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2242         g_free(intermediate_state);
2243     }
2244     return ret;
2245 }
2246 
2247 
2248 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2249                                    size_t size)
2250 {
2251     int64_t len;
2252 
    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }
2266 
2267     return 0;
2268 }
2269 
2270 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2271                               int nb_sectors)
2272 {
2273     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2274                                    nb_sectors * BDRV_SECTOR_SIZE);
2275 }
2276 
2277 typedef struct RwCo {
2278     BlockDriverState *bs;
2279     int64_t sector_num;
2280     int nb_sectors;
2281     QEMUIOVector *qiov;
2282     bool is_write;
2283     int ret;
2284     BdrvRequestFlags flags;
2285 } RwCo;
2286 
2287 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2288 {
2289     RwCo *rwco = opaque;
2290 
2291     if (!rwco->is_write) {
2292         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
2293                                      rwco->nb_sectors, rwco->qiov,
2294                                      rwco->flags);
2295     } else {
2296         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
2297                                       rwco->nb_sectors, rwco->qiov,
2298                                       rwco->flags);
2299     }
2300 }
2301 
2302 /*
2303  * Process a vectored synchronous request using coroutines
2304  */
2305 static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
2306                        QEMUIOVector *qiov, bool is_write,
2307                        BdrvRequestFlags flags)
2308 {
2309     Coroutine *co;
2310     RwCo rwco = {
2311         .bs = bs,
2312         .sector_num = sector_num,
2313         .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
2314         .qiov = qiov,
2315         .is_write = is_write,
2316         .ret = NOT_DONE,
2317         .flags = flags,
2318     };
2319     assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
2320 
    /**
     * In synchronous call context the vcpu is blocked, so the throttling
     * timer can never fire; I/O throttling therefore has to be disabled
     * here if it has been enabled.
     */
2326     if (bs->io_limits_enabled) {
2327         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2328                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2329         bdrv_io_limits_disable(bs);
2330     }
2331 
2332     if (qemu_in_coroutine()) {
2333         /* Fast-path if already in coroutine context */
2334         bdrv_rw_co_entry(&rwco);
2335     } else {
2336         co = qemu_coroutine_create(bdrv_rw_co_entry);
2337         qemu_coroutine_enter(co, &rwco);
2338         while (rwco.ret == NOT_DONE) {
2339             qemu_aio_wait();
2340         }
2341     }
2342     return rwco.ret;
2343 }
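/*
 * The function above is the block layer's standard trick for running
 * coroutine code from a synchronous caller: rwco.ret starts out as the
 * NOT_DONE sentinel, the coroutine is entered, and qemu_aio_wait() pumps
 * the event loop until bdrv_rw_co_entry() stores a real return value.
 * Callers already executing inside a coroutine take the fast path and
 * call the entry function directly.
 */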
2344 
2345 /*
2346  * Process a synchronous request using coroutines
2347  */
2348 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2349                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2350 {
2351     QEMUIOVector qiov;
2352     struct iovec iov = {
2353         .iov_base = (void *)buf,
2354         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2355     };
2356 
2357     qemu_iovec_init_external(&qiov, &iov, 1);
2358     return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
2359 }
2360 
2361 /* return < 0 if error. See bdrv_write() for the return codes */
2362 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2363               uint8_t *buf, int nb_sectors)
2364 {
2365     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2366 }
2367 
2368 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2369 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2370                           uint8_t *buf, int nb_sectors)
2371 {
2372     bool enabled;
2373     int ret;
2374 
2375     enabled = bs->io_limits_enabled;
2376     bs->io_limits_enabled = false;
2377     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2378     bs->io_limits_enabled = enabled;
2379     return ret;
2380 }
2381 
2382 /* Return < 0 if error. Important errors are:
2383   -EIO         generic I/O error (may happen for all errors)
2384   -ENOMEDIUM   No media inserted.
2385   -EINVAL      Invalid sector number or nb_sectors
2386   -EACCES      Trying to write a read-only device
2387 */
2388 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2389                const uint8_t *buf, int nb_sectors)
2390 {
2391     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2392 }
2393 
2394 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
2395 {
2396     return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
2397 }
2398 
2399 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2400                       int nb_sectors, BdrvRequestFlags flags)
2401 {
2402     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2403                       BDRV_REQ_ZERO_WRITE | flags);
2404 }
2405 
2406 /*
2407  * Completely zero out a block device with the help of bdrv_write_zeroes.
2408  * The operation is sped up by checking the block status and only writing
2409  * zeroes to the device if they currently do not return zeroes. Optional
2410  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2411  *
2412  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2413  */
2414 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2415 {
    int64_t target_size;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }
    target_size /= BDRV_SECTOR_SIZE;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64
                         ": %s", sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
2441 }
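/*
 * The loop above advances in runs reported by bdrv_get_block_status():
 * runs that already read back as zeroes (BDRV_BLOCK_ZERO) are skipped,
 * and only the remaining runs are explicitly zeroed, so a mostly-sparse
 * image is zeroed with very little actual I/O.
 */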
2442 
2443 int bdrv_pread(BlockDriverState *bs, int64_t offset,
2444                void *buf, int count1)
2445 {
2446     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2447     int len, nb_sectors, count;
2448     int64_t sector_num;
2449     int ret;
2450 
    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count) {
        len = count;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0) {
            return count1;
        }
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        ret = bdrv_read(bs, sector_num, buf, nb_sectors);
        if (ret < 0) {
            return ret;
        }
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        memcpy(buf, tmp_buf, count);
    }
    return count1;
2486 }
2487 
2488 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2489 {
2490     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2491     int len, nb_sectors, count;
2492     int64_t sector_num;
2493     int ret;
2494 
2495     count = qiov->size;
2496 
2497     /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count) {
        len = count;
    }
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
                          len);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        count -= len;
        if (count == 0) {
            return qiov->size;
        }
        sector_num++;
    }
2514 
2515     /* write the sectors "in place" */
2516     nb_sectors = count >> BDRV_SECTOR_BITS;
2517     if (nb_sectors > 0) {
2518         QEMUIOVector qiov_inplace;
2519 
2520         qemu_iovec_init(&qiov_inplace, qiov->niov);
2521         qemu_iovec_concat(&qiov_inplace, qiov, len,
2522                           nb_sectors << BDRV_SECTOR_BITS);
2523         ret = bdrv_writev(bs, sector_num, &qiov_inplace);
2524         qemu_iovec_destroy(&qiov_inplace);
2525         if (ret < 0) {
2526             return ret;
2527         }
2528 
2529         sector_num += nb_sectors;
2530         len = nb_sectors << BDRV_SECTOR_BITS;
2531         count -= len;
2532     }
2533 
2534     /* add data from the last sector */
    if (count > 0) {
        ret = bdrv_read(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
        ret = bdrv_write(bs, sector_num, tmp_buf, 1);
        if (ret < 0) {
            return ret;
        }
    }
2542     return qiov->size;
2543 }
2544 
2545 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2546                 const void *buf, int count1)
2547 {
2548     QEMUIOVector qiov;
2549     struct iovec iov = {
2550         .iov_base   = (void *) buf,
2551         .iov_len    = count1,
2552     };
2553 
2554     qemu_iovec_init_external(&qiov, &iov, 1);
2555     return bdrv_pwritev(bs, offset, &qiov);
2556 }
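/*
 * Example: bdrv_pwrite(bs, 510, sig, 2) rewrites only the last two bytes
 * of sector 0.  bdrv_pwritev() above implements this as a read-modify-write
 * of the partial leading/trailing sectors, with only the whole sectors in
 * the middle written "in place".
 */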
2557 
2558 /*
2559  * Writes to the file and ensures that no writes are reordered across this
2560  * request (acts as a barrier)
2561  *
2562  * Returns 0 on success, -errno in error cases.
2563  */
2564 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2565     const void *buf, int count)
2566 {
2567     int ret;
2568 
2569     ret = bdrv_pwrite(bs, offset, buf, count);
2570     if (ret < 0) {
2571         return ret;
2572     }
2573 
2574     /* No flush needed for cache modes that already do it */
2575     if (bs->enable_write_cache) {
2576         bdrv_flush(bs);
2577     }
2578 
2579     return 0;
2580 }
2581 
2582 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2583         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2584 {
2585     /* Perform I/O through a temporary buffer so that users who scribble over
2586      * their read buffer while the operation is in progress do not end up
2587      * modifying the image file.  This is critical for zero-copy guest I/O
2588      * where anything might happen inside guest memory.
2589      */
2590     void *bounce_buffer;
2591 
2592     BlockDriver *drv = bs->drv;
2593     struct iovec iov;
2594     QEMUIOVector bounce_qiov;
2595     int64_t cluster_sector_num;
2596     int cluster_nb_sectors;
2597     size_t skip_bytes;
2598     int ret;
2599 
    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating a cluster in the image file.
     */
2603     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2604                            &cluster_sector_num, &cluster_nb_sectors);
2605 
2606     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2607                                    cluster_sector_num, cluster_nb_sectors);
2608 
2609     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2610     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2611     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2612 
2613     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2614                              &bounce_qiov);
2615     if (ret < 0) {
2616         goto err;
2617     }
2618 
2619     if (drv->bdrv_co_write_zeroes &&
2620         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2621         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2622                                       cluster_nb_sectors, 0);
2623     } else {
2624         /* This does not change the data on the disk, it is not necessary
2625          * to flush even in cache=writethrough mode.
2626          */
2627         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2628                                   &bounce_qiov);
2629     }
2630 
2631     if (ret < 0) {
2632         /* It might be okay to ignore write errors for guest requests.  If this
2633          * is a deliberate copy-on-read then we don't want to ignore the error.
2634          * Simply report it in all cases.
2635          */
2636         goto err;
2637     }
2638 
2639     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2640     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2641                         nb_sectors * BDRV_SECTOR_SIZE);
2642 
2643 err:
2644     qemu_vfree(bounce_buffer);
2645     return ret;
2646 }
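/*
 * Continuing the bdrv_round_to_clusters() example: a copy-on-read request
 * for sectors 130..139 of an image with 128 sectors per cluster reads the
 * whole cluster 128..255 into bounce_buffer, and skip_bytes = 2 * 512
 * selects the requested window when copying back into the caller's qiov.
 */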
2647 
2648 /*
2649  * Handle a read request in coroutine context
2650  */
2651 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2652     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2653     BdrvRequestFlags flags)
2654 {
2655     BlockDriver *drv = bs->drv;
2656     BdrvTrackedRequest req;
2657     int ret;
2658 
2659     if (!drv) {
2660         return -ENOMEDIUM;
2661     }
2662     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2663         return -EIO;
2664     }
2665 
2666     if (bs->copy_on_read) {
2667         flags |= BDRV_REQ_COPY_ON_READ;
2668     }
2669     if (flags & BDRV_REQ_COPY_ON_READ) {
2670         bs->copy_on_read_in_flight++;
2671     }
2672 
2673     if (bs->copy_on_read_in_flight) {
2674         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2675     }
2676 
2677     /* throttling disk I/O */
2678     if (bs->io_limits_enabled) {
2679         bdrv_io_limits_intercept(bs, nb_sectors, false);
2680     }
2681 
2682     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2683 
2684     if (flags & BDRV_REQ_COPY_ON_READ) {
2685         int pnum;
2686 
2687         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2688         if (ret < 0) {
2689             goto out;
2690         }
2691 
2692         if (!ret || pnum != nb_sectors) {
2693             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2694             goto out;
2695         }
2696     }
2697 
2698     if (!(bs->zero_beyond_eof && bs->growable)) {
2699         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2700     } else {
2701         /* Read zeros after EOF of growable BDSes */
2702         int64_t len, total_sectors, max_nb_sectors;
2703 
2704         len = bdrv_getlength(bs);
2705         if (len < 0) {
2706             ret = len;
2707             goto out;
2708         }
2709 
2710         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2711         max_nb_sectors = MAX(0, total_sectors - sector_num);
2712         if (max_nb_sectors > 0) {
2713             ret = drv->bdrv_co_readv(bs, sector_num,
2714                                      MIN(nb_sectors, max_nb_sectors), qiov);
2715         } else {
2716             ret = 0;
2717         }
2718 
2719         /* Reading beyond end of file is supposed to produce zeroes */
2720         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2721             uint64_t offset = MAX(0, total_sectors - sector_num);
2722             uint64_t bytes = (sector_num + nb_sectors - offset) *
2723                               BDRV_SECTOR_SIZE;
2724             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2725         }
2726     }
2727 
2728 out:
2729     tracked_request_end(&req);
2730 
2731     if (flags & BDRV_REQ_COPY_ON_READ) {
2732         bs->copy_on_read_in_flight--;
2733     }
2734 
2735     return ret;
2736 }
2737 
2738 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
2739     int nb_sectors, QEMUIOVector *qiov)
2740 {
2741     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
2742 
2743     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
2744 }
2745 
2746 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
2747     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2748 {
2749     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
2750 
2751     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
2752                             BDRV_REQ_COPY_ON_READ);
2753 }
2754 
/* If no limit is specified in the BlockLimits, use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
2758 #define MAX_WRITE_ZEROES_DEFAULT 32768
2759 
2760 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
2761     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
2762 {
2763     BlockDriver *drv = bs->drv;
2764     QEMUIOVector qiov;
2765     struct iovec iov = {0};
2766     int ret = 0;
2767 
2768     int max_write_zeroes = bs->bl.max_write_zeroes ?
2769                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
2770 
2771     while (nb_sectors > 0 && !ret) {
2772         int num = nb_sectors;
2773 
2774         /* align request */
2775         if (bs->bl.write_zeroes_alignment &&
2776             num >= bs->bl.write_zeroes_alignment &&
2777             sector_num % bs->bl.write_zeroes_alignment) {
2778             if (num > bs->bl.write_zeroes_alignment) {
2779                 num = bs->bl.write_zeroes_alignment;
2780             }
2781             num -= sector_num % bs->bl.write_zeroes_alignment;
2782         }
2783 
2784         /* limit request size */
2785         if (num > max_write_zeroes) {
2786             num = max_write_zeroes;
2787         }
2788 
2789         ret = -ENOTSUP;
2790         /* First try the efficient write zeroes operation */
2791         if (drv->bdrv_co_write_zeroes) {
2792             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
2793         }
2794 
2795         if (ret == -ENOTSUP) {
2796             /* Fall back to bounce buffer if write zeroes is unsupported */
2797             iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                /* Allocate the bounce buffer only once, and make it big
                 * enough for this and all future iterations: a later,
                 * already-aligned iteration may be larger than this
                 * (possibly alignment-shortened) one.
                 */
                size_t bufsize = MIN(nb_sectors, max_write_zeroes);
                iov.iov_base = qemu_blockalign(bs, bufsize * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, bufsize * BDRV_SECTOR_SIZE);
            }
2806             qemu_iovec_init_external(&qiov, &iov, 1);
2807 
2808             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
2809         }
2810 
2811         sector_num += num;
2812         nb_sectors -= num;
2813     }
2814 
2815     qemu_vfree(iov.iov_base);
2816     return ret;
2817 }
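/*
 * Example of the alignment handling above: with write_zeroes_alignment =
 * 128 and a request for sectors [130, 430), the first pass is trimmed to
 * 126 sectors so that it ends on the 256-sector boundary; the following
 * passes start aligned and are only limited by max_write_zeroes.
 */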
2818 
2819 /*
2820  * Handle a write request in coroutine context
2821  */
2822 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2823     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2824     BdrvRequestFlags flags)
2825 {
2826     BlockDriver *drv = bs->drv;
2827     BdrvTrackedRequest req;
2828     int ret;
2829 
2830     if (!bs->drv) {
2831         return -ENOMEDIUM;
2832     }
2833     if (bs->read_only) {
2834         return -EACCES;
2835     }
2836     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2837         return -EIO;
2838     }
2839 
2840     if (bs->copy_on_read_in_flight) {
2841         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2842     }
2843 
2844     /* throttling disk I/O */
2845     if (bs->io_limits_enabled) {
2846         bdrv_io_limits_intercept(bs, nb_sectors, true);
2847     }
2848 
2849     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2850 
2851     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2852 
2853     if (ret < 0) {
2854         /* Do nothing, write notifier decided to fail this request */
2855     } else if (flags & BDRV_REQ_ZERO_WRITE) {
2856         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
2857     } else {
2858         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2859     }
2860 
2861     if (ret == 0 && !bs->enable_write_cache) {
2862         ret = bdrv_co_flush(bs);
2863     }
2864 
2865     bdrv_set_dirty(bs, sector_num, nb_sectors);
2866 
2867     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2868         bs->wr_highest_sector = sector_num + nb_sectors - 1;
2869     }
2870     if (bs->growable && ret >= 0) {
2871         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
2872     }
2873 
2874     tracked_request_end(&req);
2875 
2876     return ret;
2877 }
2878 
2879 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2880     int nb_sectors, QEMUIOVector *qiov)
2881 {
2882     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2883 
2884     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2885 }
2886 
2887 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2888                                       int64_t sector_num, int nb_sectors,
2889                                       BdrvRequestFlags flags)
2890 {
2891     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
2892 
2893     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2894         flags &= ~BDRV_REQ_MAY_UNMAP;
2895     }
2896 
2897     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2898                              BDRV_REQ_ZERO_WRITE | flags);
2899 }
2900 
2901 /**
2902  * Truncate file to 'offset' bytes (needed only for file protocols)
2903  */
2904 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2905 {
2906     BlockDriver *drv = bs->drv;
2907     int ret;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_truncate) {
        return -ENOTSUP;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_in_use(bs)) {
        return -EBUSY;
    }
2916     ret = drv->bdrv_truncate(bs, offset);
2917     if (ret == 0) {
2918         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2919         bdrv_dev_resize_cb(bs);
2920     }
2921     return ret;
2922 }
2923 
/**
 * Length of an allocated file in bytes.  Sparse files are counted by actual
 * allocated space.  Return < 0 if error or unknown.
 */
2928 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2929 {
2930     BlockDriver *drv = bs->drv;
2931     if (!drv) {
2932         return -ENOMEDIUM;
2933     }
2934     if (drv->bdrv_get_allocated_file_size) {
2935         return drv->bdrv_get_allocated_file_size(bs);
2936     }
2937     if (bs->file) {
2938         return bdrv_get_allocated_file_size(bs->file);
2939     }
2940     return -ENOTSUP;
2941 }
2942 
2943 /**
2944  * Length of a file in bytes. Return < 0 if error or unknown.
2945  */
2946 int64_t bdrv_getlength(BlockDriverState *bs)
2947 {
2948     BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
2951 
2952     if (drv->has_variable_length) {
2953         int ret = refresh_total_sectors(bs, bs->total_sectors);
2954         if (ret < 0) {
2955             return ret;
2956         }
2957     }
2958     return bs->total_sectors * BDRV_SECTOR_SIZE;
2959 }
2960 
2961 /* return 0 as number of sectors if no device present or error */
2962 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2963 {
2964     int64_t length;
2965     length = bdrv_getlength(bs);
    if (length < 0) {
        length = 0;
    } else {
        length = length >> BDRV_SECTOR_BITS;
    }
    *nb_sectors_ptr = length;
2971 }
2972 
2973 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
2974                        BlockdevOnError on_write_error)
2975 {
2976     bs->on_read_error = on_read_error;
2977     bs->on_write_error = on_write_error;
2978 }
2979 
2980 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
2981 {
2982     return is_read ? bs->on_read_error : bs->on_write_error;
2983 }
2984 
2985 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
2986 {
2987     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
2988 
2989     switch (on_err) {
2990     case BLOCKDEV_ON_ERROR_ENOSPC:
2991         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
2992     case BLOCKDEV_ON_ERROR_STOP:
2993         return BDRV_ACTION_STOP;
2994     case BLOCKDEV_ON_ERROR_REPORT:
2995         return BDRV_ACTION_REPORT;
2996     case BLOCKDEV_ON_ERROR_IGNORE:
2997         return BDRV_ACTION_IGNORE;
2998     default:
2999         abort();
3000     }
3001 }
3002 
3003 /* This is done by device models because, while the block layer knows
3004  * about the error, it does not know whether an operation comes from
3005  * the device or the block layer (from a job, for example).
3006  */
3007 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3008                        bool is_read, int error)
3009 {
3010     assert(error >= 0);
3011     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3012     if (action == BDRV_ACTION_STOP) {
3013         vm_stop(RUN_STATE_IO_ERROR);
3014         bdrv_iostatus_set_err(bs, error);
3015     }
3016 }
3017 
3018 int bdrv_is_read_only(BlockDriverState *bs)
3019 {
3020     return bs->read_only;
3021 }
3022 
3023 int bdrv_is_sg(BlockDriverState *bs)
3024 {
3025     return bs->sg;
3026 }
3027 
3028 int bdrv_enable_write_cache(BlockDriverState *bs)
3029 {
3030     return bs->enable_write_cache;
3031 }
3032 
3033 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3034 {
3035     bs->enable_write_cache = wce;
3036 
3037     /* so a reopen() will preserve wce */
3038     if (wce) {
3039         bs->open_flags |= BDRV_O_CACHE_WB;
3040     } else {
3041         bs->open_flags &= ~BDRV_O_CACHE_WB;
3042     }
3043 }
3044 
3045 int bdrv_is_encrypted(BlockDriverState *bs)
3046 {
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return 1;
    }
    return bs->encrypted;
3050 }
3051 
3052 int bdrv_key_required(BlockDriverState *bs)
3053 {
3054     BlockDriverState *backing_hd = bs->backing_hd;
3055 
    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) {
        return 1;
    }
    return (bs->encrypted && !bs->valid_key);
3059 }
3060 
3061 int bdrv_set_key(BlockDriverState *bs, const char *key)
3062 {
3063     int ret;
3064     if (bs->backing_hd && bs->backing_hd->encrypted) {
3065         ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0) {
            return ret;
        }
        if (!bs->encrypted) {
            return 0;
        }
3070     }
3071     if (!bs->encrypted) {
3072         return -EINVAL;
3073     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3074         return -ENOMEDIUM;
3075     }
3076     ret = bs->drv->bdrv_set_key(bs, key);
3077     if (ret < 0) {
3078         bs->valid_key = 0;
3079     } else if (!bs->valid_key) {
3080         bs->valid_key = 1;
3081         /* call the change callback now, we skipped it on open */
3082         bdrv_dev_change_media_cb(bs, true);
3083     }
3084     return ret;
3085 }
3086 
3087 const char *bdrv_get_format_name(BlockDriverState *bs)
3088 {
3089     return bs->drv ? bs->drv->format_name : NULL;
3090 }
3091 
3092 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3093                          void *opaque)
3094 {
3095     BlockDriver *drv;
3096 
3097     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3098         it(opaque, drv->format_name);
3099     }
3100 }
3101 
3102 BlockDriverState *bdrv_find(const char *name)
3103 {
3104     BlockDriverState *bs;
3105 
3106     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3107         if (!strcmp(name, bs->device_name)) {
3108             return bs;
3109         }
3110     }
3111     return NULL;
3112 }
3113 
3114 BlockDriverState *bdrv_next(BlockDriverState *bs)
3115 {
3116     if (!bs) {
3117         return QTAILQ_FIRST(&bdrv_states);
3118     }
3119     return QTAILQ_NEXT(bs, list);
3120 }
3121 
3122 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3123 {
3124     BlockDriverState *bs;
3125 
3126     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3127         it(opaque, bs);
3128     }
3129 }
3130 
3131 const char *bdrv_get_device_name(BlockDriverState *bs)
3132 {
3133     return bs->device_name;
3134 }
3135 
3136 int bdrv_get_flags(BlockDriverState *bs)
3137 {
3138     return bs->open_flags;
3139 }
3140 
3141 int bdrv_flush_all(void)
3142 {
3143     BlockDriverState *bs;
3144     int result = 0;
3145 
3146     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3147         int ret = bdrv_flush(bs);
3148         if (ret < 0 && !result) {
3149             result = ret;
3150         }
3151     }
3152 
3153     return result;
3154 }
3155 
3156 int bdrv_has_zero_init_1(BlockDriverState *bs)
3157 {
3158     return 1;
3159 }
3160 
3161 int bdrv_has_zero_init(BlockDriverState *bs)
3162 {
3163     assert(bs->drv);
3164 
    /* If bs is a copy-on-write image, it is initialized to
       the contents of the base image, which may not be zeroes.  */
3167     if (bs->backing_hd) {
3168         return 0;
3169     }
3170     if (bs->drv->bdrv_has_zero_init) {
3171         return bs->drv->bdrv_has_zero_init(bs);
3172     }
3173 
3174     /* safe default */
3175     return 0;
3176 }
3177 
3178 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3179 {
3180     BlockDriverInfo bdi;
3181 
3182     if (bs->backing_hd) {
3183         return false;
3184     }
3185 
3186     if (bdrv_get_info(bs, &bdi) == 0) {
3187         return bdi.unallocated_blocks_are_zero;
3188     }
3189 
3190     return false;
3191 }
3192 
3193 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3194 {
3195     BlockDriverInfo bdi;
3196 
3197     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3198         return false;
3199     }
3200 
3201     if (bdrv_get_info(bs, &bdi) == 0) {
3202         return bdi.can_write_zeroes_with_unmap;
3203     }
3204 
3205     return false;
3206 }
3207 
3208 typedef struct BdrvCoGetBlockStatusData {
3209     BlockDriverState *bs;
3210     BlockDriverState *base;
3211     int64_t sector_num;
3212     int nb_sectors;
3213     int *pnum;
3214     int64_t ret;
3215     bool done;
3216 } BdrvCoGetBlockStatusData;
3217 
/*
 * Returns the allocation status (BDRV_BLOCK_* flags) of the specified
 * sectors, or a negative errno on failure.  Drivers not implementing the
 * functionality are assumed to not support backing files, hence all their
 * sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
3233 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3234                                                      int64_t sector_num,
3235                                                      int nb_sectors, int *pnum)
3236 {
3237     int64_t length;
3238     int64_t n;
3239     int64_t ret, ret2;
3240 
3241     length = bdrv_getlength(bs);
3242     if (length < 0) {
3243         return length;
3244     }
3245 
3246     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3247         *pnum = 0;
3248         return 0;
3249     }
3250 
3251     n = bs->total_sectors - sector_num;
3252     if (n < nb_sectors) {
3253         nb_sectors = n;
3254     }
3255 
3256     if (!bs->drv->bdrv_co_get_block_status) {
3257         *pnum = nb_sectors;
3258         ret = BDRV_BLOCK_DATA;
3259         if (bs->drv->protocol_name) {
3260             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3261         }
3262         return ret;
3263     }
3264 
3265     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3266     if (ret < 0) {
3267         *pnum = 0;
3268         return ret;
3269     }
3270 
3271     if (ret & BDRV_BLOCK_RAW) {
3272         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3273         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3274                                      *pnum, pnum);
3275     }
3276 
3277     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3278         if (bdrv_unallocated_blocks_are_zero(bs)) {
3279             ret |= BDRV_BLOCK_ZERO;
3280         } else if (bs->backing_hd) {
3281             BlockDriverState *bs2 = bs->backing_hd;
3282             int64_t length2 = bdrv_getlength(bs2);
3283             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3284                 ret |= BDRV_BLOCK_ZERO;
3285             }
3286         }
3287     }
3288 
3289     if (bs->file &&
3290         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3291         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3292         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3293                                         *pnum, pnum);
3294         if (ret2 >= 0) {
3295             /* Ignore errors.  This is just providing extra information, it
3296              * is useful but not necessary.
3297              */
3298             ret |= (ret2 & BDRV_BLOCK_ZERO);
3299         }
3300     }
3301 
3302     return ret;
3303 }
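/*
 * The returned int64_t packs BDRV_BLOCK_* flags and, when
 * BDRV_BLOCK_OFFSET_VALID is set, the byte offset of the data within
 * bs->file in its upper bits.  BDRV_BLOCK_DATA means the sectors read as
 * stored data, and BDRV_BLOCK_ZERO means they read back as zeroes; if
 * neither is set, the data for these sectors comes from the backing
 * chain, if any.
 */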
3304 
3305 /* Coroutine wrapper for bdrv_get_block_status() */
3306 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3307 {
3308     BdrvCoGetBlockStatusData *data = opaque;
3309     BlockDriverState *bs = data->bs;
3310 
3311     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3312                                          data->pnum);
3313     data->done = true;
3314 }
3315 
3316 /*
3317  * Synchronous wrapper around bdrv_co_get_block_status().
3318  *
3319  * See bdrv_co_get_block_status() for details.
3320  */
3321 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3322                               int nb_sectors, int *pnum)
3323 {
3324     Coroutine *co;
3325     BdrvCoGetBlockStatusData data = {
3326         .bs = bs,
3327         .sector_num = sector_num,
3328         .nb_sectors = nb_sectors,
3329         .pnum = pnum,
3330         .done = false,
3331     };
3332 
3333     if (qemu_in_coroutine()) {
3334         /* Fast-path if already in coroutine context */
3335         bdrv_get_block_status_co_entry(&data);
3336     } else {
3337         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3338         qemu_coroutine_enter(co, &data);
3339         while (!data.done) {
3340             qemu_aio_wait();
3341         }
3342     }
3343     return data.ret;
3344 }
3345 
3346 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3347                                    int nb_sectors, int *pnum)
3348 {
3349     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3350     if (ret < 0) {
3351         return ret;
3352     }
3353     return
3354         (ret & BDRV_BLOCK_DATA) ||
3355         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3356 }
3357 
3358 /*
3359  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3360  *
3361  * Return true if the given sector is allocated in any image between
3362  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3363  * sector is allocated in any image of the chain.  Return false otherwise.
3364  *
3365  * 'pnum' is set to the number of sectors (including and immediately following
3366  *  the specified sector) that are known to be in the same
3367  *  allocated/unallocated state.
3368  *
3369  */
3370 int bdrv_is_allocated_above(BlockDriverState *top,
3371                             BlockDriverState *base,
3372                             int64_t sector_num,
3373                             int nb_sectors, int *pnum)
3374 {
3375     BlockDriverState *intermediate;
3376     int ret, n = nb_sectors;
3377 
3378     intermediate = top;
3379     while (intermediate && intermediate != base) {
3380         int pnum_inter;
3381         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3382                                 &pnum_inter);
3383         if (ret < 0) {
3384             return ret;
3385         } else if (ret) {
3386             *pnum = pnum_inter;
3387             return 1;
3388         }
3389 
        /*
         * [sector_num, sector_num + nb_sectors) is unallocated on top but
         * an intermediate image might have
         * [sector_num + x, sector_num + nb_sectors) allocated.
         */
3396         if (n > pnum_inter &&
3397             (intermediate == top ||
3398              sector_num + pnum_inter < intermediate->total_sectors)) {
3399             n = pnum_inter;
3400         }
3401 
3402         intermediate = intermediate->backing_hd;
3403     }
3404 
3405     *pnum = n;
3406     return 0;
3407 }
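/*
 * Example: for the chain base <- mid <- top, with a sector written only
 * in mid, bdrv_is_allocated_above(top, base, ...) returns 1 because the
 * probe descends from top through mid, while
 * bdrv_is_allocated_above(top, mid, ...) returns 0 because only top
 * itself is inspected.
 */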
3408 
3409 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3410 {
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        return bs->backing_file;
    } else if (bs->encrypted) {
        return bs->filename;
    } else {
        return NULL;
    }
3417 }
3418 
3419 void bdrv_get_backing_filename(BlockDriverState *bs,
3420                                char *filename, int filename_size)
3421 {
3422     pstrcpy(filename, filename_size, bs->backing_file);
3423 }
3424 
3425 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3426                           const uint8_t *buf, int nb_sectors)
3427 {
3428     BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_write_compressed) {
        return -ENOTSUP;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }
3435 
3436     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3437 
3438     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3439 }
3440 
3441 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3442 {
3443     BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_get_info) {
        return -ENOTSUP;
    }
3448     memset(bdi, 0, sizeof(*bdi));
3449     return drv->bdrv_get_info(bs, bdi);
3450 }
3451 
3452 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3453 {
3454     BlockDriver *drv = bs->drv;
3455     if (drv && drv->bdrv_get_specific_info) {
3456         return drv->bdrv_get_specific_info(bs);
3457     }
3458     return NULL;
3459 }
3460 
3461 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3462                       int64_t pos, int size)
3463 {
3464     QEMUIOVector qiov;
3465     struct iovec iov = {
3466         .iov_base   = (void *) buf,
3467         .iov_len    = size,
3468     };
3469 
3470     qemu_iovec_init_external(&qiov, &iov, 1);
3471     return bdrv_writev_vmstate(bs, &qiov, pos);
3472 }
3473 
3474 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3475 {
3476     BlockDriver *drv = bs->drv;
3477 
3478     if (!drv) {
3479         return -ENOMEDIUM;
3480     } else if (drv->bdrv_save_vmstate) {
3481         return drv->bdrv_save_vmstate(bs, qiov, pos);
3482     } else if (bs->file) {
3483         return bdrv_writev_vmstate(bs->file, qiov, pos);
3484     }
3485 
3486     return -ENOTSUP;
3487 }
3488 
3489 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3490                       int64_t pos, int size)
3491 {
3492     BlockDriver *drv = bs->drv;
3493     if (!drv)
3494         return -ENOMEDIUM;
3495     if (drv->bdrv_load_vmstate)
3496         return drv->bdrv_load_vmstate(bs, buf, pos, size);
3497     if (bs->file)
3498         return bdrv_load_vmstate(bs->file, buf, pos, size);
3499     return -ENOTSUP;
3500 }
3501 
3502 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
3503 {
3504     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
3505         return;
3506     }
3507 
3508     bs->drv->bdrv_debug_event(bs, event);
3509 }
3510 
3511 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
3512                           const char *tag)
3513 {
3514     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
3515         bs = bs->file;
3516     }
3517 
3518     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
3519         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
3520     }
3521 
3522     return -ENOTSUP;
3523 }
3524 
3525 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
3526 {
3527     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
3528         bs = bs->file;
3529     }
3530 
3531     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
3532         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
3533     }
3534 
3535     return -ENOTSUP;
3536 }
3537 
3538 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
3539 {
3540     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
3541         bs = bs->file;
3542     }
3543 
3544     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
3545         return bs->drv->bdrv_debug_resume(bs, tag);
3546     }
3547 
3548     return -ENOTSUP;
3549 }
3550 
3551 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
3552 {
3553     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
3554         bs = bs->file;
3555     }
3556 
3557     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
3558         return bs->drv->bdrv_debug_is_suspended(bs, tag);
3559     }
3560 
3561     return false;
3562 }
3563 
3564 int bdrv_is_snapshot(BlockDriverState *bs)
3565 {
3566     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
3567 }
3568 
3569 /* backing_file can either be relative, or absolute, or a protocol.  If it is
3570  * relative, it must be relative to the chain.  So, passing in bs->filename
3571  * from a BDS as backing_file should not be done, as that may be relative to
3572  * the CWD rather than the chain. */
3573 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3574         const char *backing_file)
3575 {
3576     char *filename_full = NULL;
3577     char *backing_file_full = NULL;
3578     char *filename_tmp = NULL;
3579     int is_protocol = 0;
3580     BlockDriverState *curr_bs = NULL;
3581     BlockDriverState *retval = NULL;
3582 
3583     if (!bs || !bs->drv || !backing_file) {
3584         return NULL;
3585     }
3586 
3587     filename_full     = g_malloc(PATH_MAX);
3588     backing_file_full = g_malloc(PATH_MAX);
3589     filename_tmp      = g_malloc(PATH_MAX);
3590 
3591     is_protocol = path_has_protocol(backing_file);
3592 
3593     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3594 
3595         /* If either of the filename paths is actually a protocol, then
3596          * compare unmodified paths; otherwise make paths relative */
3597         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3598             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3599                 retval = curr_bs->backing_hd;
3600                 break;
3601             }
3602         } else {
3603             /* If not an absolute filename path, make it relative to the current
3604              * image's filename path */
3605             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3606                          backing_file);
3607 
3608             /* We are going to compare absolute pathnames */
3609             if (!realpath(filename_tmp, filename_full)) {
3610                 continue;
3611             }
3612 
3613             /* We need to make sure the backing filename we are comparing against
3614              * is relative to the current image filename (or absolute) */
3615             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3616                          curr_bs->backing_file);
3617 
3618             if (!realpath(filename_tmp, backing_file_full)) {
3619                 continue;
3620             }
3621 
3622             if (strcmp(backing_file_full, filename_full) == 0) {
3623                 retval = curr_bs->backing_hd;
3624                 break;
3625             }
3626         }
3627     }
3628 
3629     g_free(filename_full);
3630     g_free(backing_file_full);
3631     g_free(filename_tmp);
3632     return retval;
3633 }
3634 
3635 int bdrv_get_backing_file_depth(BlockDriverState *bs)
3636 {
3637     if (!bs->drv) {
3638         return 0;
3639     }
3640 
3641     if (!bs->backing_hd) {
3642         return 0;
3643     }
3644 
3645     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
3646 }
3647 
3648 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
3649 {
3650     BlockDriverState *curr_bs = NULL;
3651 
3652     if (!bs) {
3653         return NULL;
3654     }
3655 
3656     curr_bs = bs;
3657 
3658     while (curr_bs->backing_hd) {
3659         curr_bs = curr_bs->backing_hd;
3660     }
3661     return curr_bs;
3662 }
3663 
3664 /**************************************************************/
3665 /* async I/Os */
3666 
3667 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3668                                  QEMUIOVector *qiov, int nb_sectors,
3669                                  BlockDriverCompletionFunc *cb, void *opaque)
3670 {
3671     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3672 
3673     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3674                                  cb, opaque, false);
3675 }
3676 
3677 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3678                                   QEMUIOVector *qiov, int nb_sectors,
3679                                   BlockDriverCompletionFunc *cb, void *opaque)
3680 {
3681     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3682 
3683     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3684                                  cb, opaque, true);
3685 }
3686 
3687 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
3688         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
3689         BlockDriverCompletionFunc *cb, void *opaque)
3690 {
3691     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
3692 
3693     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
3694                                  BDRV_REQ_ZERO_WRITE | flags,
3695                                  cb, opaque, true);
3696 }
3697 
3698 
3699 typedef struct MultiwriteCB {
3700     int error;
3701     int num_requests;
3702     int num_callbacks;
3703     struct {
3704         BlockDriverCompletionFunc *cb;
3705         void *opaque;
3706         QEMUIOVector *free_qiov;
3707     } callbacks[];
3708 } MultiwriteCB;
3709 
3710 static void multiwrite_user_cb(MultiwriteCB *mcb)
3711 {
3712     int i;
3713 
3714     for (i = 0; i < mcb->num_callbacks; i++) {
3715         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3716         if (mcb->callbacks[i].free_qiov) {
3717             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3718         }
3719         g_free(mcb->callbacks[i].free_qiov);
3720     }
3721 }
3722 
3723 static void multiwrite_cb(void *opaque, int ret)
3724 {
3725     MultiwriteCB *mcb = opaque;
3726 
3727     trace_multiwrite_cb(mcb, ret);
3728 
3729     if (ret < 0 && !mcb->error) {
3730         mcb->error = ret;
3731     }
3732 
3733     mcb->num_requests--;
3734     if (mcb->num_requests == 0) {
3735         multiwrite_user_cb(mcb);
3736         g_free(mcb);
3737     }
3738 }
3739 
3740 static int multiwrite_req_compare(const void *a, const void *b)
3741 {
3742     const BlockRequest *req1 = a, *req2 = b;
3743 
3744     /*
3745      * Note that we can't simply subtract req2->sector from req1->sector
3746      * here as that could overflow the return value.
3747      */
3748     if (req1->sector > req2->sector) {
3749         return 1;
3750     } else if (req1->sector < req2->sector) {
3751         return -1;
3752     } else {
3753         return 0;
3754     }
3755 }
3756 
3757 /*
3758  * Takes a bunch of requests and tries to merge them. Returns the number of
3759  * requests that remain after merging.
3760  */
3761 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3762     int num_reqs, MultiwriteCB *mcb)
3763 {
3764     int i, outidx;
3765 
3766     // Sort requests by start sector
3767     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3768 
3769     // Check if adjacent requests touch the same clusters. If so, combine them,
3770     // filling up gaps with zero sectors.
3771     outidx = 0;
3772     for (i = 1; i < num_reqs; i++) {
3773         int merge = 0;
3774         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3775 
3776         // Handle exactly sequential writes and overlapping writes.
3777         if (reqs[i].sector <= oldreq_last) {
3778             merge = 1;
3779         }
3780 
3781         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3782             merge = 0;
3783         }
3784 
3785         if (merge) {
3786             size_t size;
3787             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3788             qemu_iovec_init(qiov,
3789                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3790 
3791             // Add the first request to the merged one. If the requests are
3792             // overlapping, drop the last sectors of the first request.
3793             size = (reqs[i].sector - reqs[outidx].sector) << 9;
3794             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3795 
3796             // We should need to add any zeros between the two requests
3797             assert (reqs[i].sector <= oldreq_last);
3798 
3799             // Add the second request
3800             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3801 
3802             reqs[outidx].nb_sectors = qiov->size >> 9;
3803             reqs[outidx].qiov = qiov;
3804 
3805             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3806         } else {
3807             outidx++;
3808             reqs[outidx].sector     = reqs[i].sector;
3809             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3810             reqs[outidx].qiov       = reqs[i].qiov;
3811         }
3812     }
3813 
3814     return outidx + 1;
3815 }
3816 
3817 /*
3818  * Submit multiple AIO write requests at once.
3819  *
3820  * On success, the function returns 0 and all requests in the reqs array have
3821  * been submitted. In error case this function returns -1, and any of the
3822  * requests may or may not be submitted yet. In particular, this means that the
3823  * callback will be called for some of the requests, for others it won't. The
3824  * caller must check the error field of the BlockRequest to wait for the right
3825  * callbacks (if error != 0, no callback will be called).
3826  *
3827  * The implementation may modify the contents of the reqs array, e.g. to merge
3828  * requests. However, the fields opaque and error are left unmodified as they
3829  * are used to signal failure for a single request to the caller.
3830  */
3831 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3832 {
3833     MultiwriteCB *mcb;
3834     int i;
3835 
3836     /* don't submit writes if we don't have a medium */
3837     if (bs->drv == NULL) {
3838         for (i = 0; i < num_reqs; i++) {
3839             reqs[i].error = -ENOMEDIUM;
3840         }
3841         return -1;
3842     }
3843 
3844     if (num_reqs == 0) {
3845         return 0;
3846     }
3847 
3848     // Create MultiwriteCB structure
3849     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3850     mcb->num_requests = 0;
3851     mcb->num_callbacks = num_reqs;
3852 
3853     for (i = 0; i < num_reqs; i++) {
3854         mcb->callbacks[i].cb = reqs[i].cb;
3855         mcb->callbacks[i].opaque = reqs[i].opaque;
3856     }
3857 
3858     // Check for mergable requests
3859     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3860 
3861     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3862 
3863     /* Run the aio requests. */
3864     mcb->num_requests = num_reqs;
3865     for (i = 0; i < num_reqs; i++) {
3866         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
3867                               reqs[i].nb_sectors, reqs[i].flags,
3868                               multiwrite_cb, mcb,
3869                               true);
3870     }
3871 
3872     return 0;
3873 }
3874 
3875 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3876 {
3877     acb->aiocb_info->cancel(acb);
3878 }
3879 
3880 /**************************************************************/
3881 /* async block device emulation */
3882 
3883 typedef struct BlockDriverAIOCBSync {
3884     BlockDriverAIOCB common;
3885     QEMUBH *bh;
3886     int ret;
3887     /* vector translation state */
3888     QEMUIOVector *qiov;
3889     uint8_t *bounce;
3890     int is_write;
3891 } BlockDriverAIOCBSync;
3892 
3893 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3894 {
3895     BlockDriverAIOCBSync *acb =
3896         container_of(blockacb, BlockDriverAIOCBSync, common);
3897     qemu_bh_delete(acb->bh);
3898     acb->bh = NULL;
3899     qemu_aio_release(acb);
3900 }
3901 
3902 static const AIOCBInfo bdrv_em_aiocb_info = {
3903     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3904     .cancel             = bdrv_aio_cancel_em,
3905 };
3906 
3907 static void bdrv_aio_bh_cb(void *opaque)
3908 {
3909     BlockDriverAIOCBSync *acb = opaque;
3910 
3911     if (!acb->is_write)
3912         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3913     qemu_vfree(acb->bounce);
3914     acb->common.cb(acb->common.opaque, acb->ret);
3915     qemu_bh_delete(acb->bh);
3916     acb->bh = NULL;
3917     qemu_aio_release(acb);
3918 }
3919 
3920 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3921                                             int64_t sector_num,
3922                                             QEMUIOVector *qiov,
3923                                             int nb_sectors,
3924                                             BlockDriverCompletionFunc *cb,
3925                                             void *opaque,
3926                                             int is_write)
3927 
3928 {
3929     BlockDriverAIOCBSync *acb;
3930 
3931     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
3932     acb->is_write = is_write;
3933     acb->qiov = qiov;
3934     acb->bounce = qemu_blockalign(bs, qiov->size);
3935     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3936 
3937     if (is_write) {
3938         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3939         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3940     } else {
3941         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3942     }
3943 
3944     qemu_bh_schedule(acb->bh);
3945 
3946     return &acb->common;
3947 }
3948 
3949 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3950         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3951         BlockDriverCompletionFunc *cb, void *opaque)
3952 {
3953     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3954 }
3955 
3956 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3957         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3958         BlockDriverCompletionFunc *cb, void *opaque)
3959 {
3960     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3961 }
3962 
3963 
3964 typedef struct BlockDriverAIOCBCoroutine {
3965     BlockDriverAIOCB common;
3966     BlockRequest req;
3967     bool is_write;
3968     bool *done;
3969     QEMUBH* bh;
3970 } BlockDriverAIOCBCoroutine;
3971 
3972 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3973 {
3974     BlockDriverAIOCBCoroutine *acb =
3975         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
3976     bool done = false;
3977 
3978     acb->done = &done;
3979     while (!done) {
3980         qemu_aio_wait();
3981     }
3982 }
3983 
3984 static const AIOCBInfo bdrv_em_co_aiocb_info = {
3985     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3986     .cancel             = bdrv_aio_co_cancel_em,
3987 };
3988 
3989 static void bdrv_co_em_bh(void *opaque)
3990 {
3991     BlockDriverAIOCBCoroutine *acb = opaque;
3992 
3993     acb->common.cb(acb->common.opaque, acb->req.error);
3994 
3995     if (acb->done) {
3996         *acb->done = true;
3997     }
3998 
3999     qemu_bh_delete(acb->bh);
4000     qemu_aio_release(acb);
4001 }
4002 
4003 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4004 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4005 {
4006     BlockDriverAIOCBCoroutine *acb = opaque;
4007     BlockDriverState *bs = acb->common.bs;
4008 
4009     if (!acb->is_write) {
4010         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4011             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4012     } else {
4013         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4014             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4015     }
4016 
4017     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4018     qemu_bh_schedule(acb->bh);
4019 }
4020 
4021 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4022                                                int64_t sector_num,
4023                                                QEMUIOVector *qiov,
4024                                                int nb_sectors,
4025                                                BdrvRequestFlags flags,
4026                                                BlockDriverCompletionFunc *cb,
4027                                                void *opaque,
4028                                                bool is_write)
4029 {
4030     Coroutine *co;
4031     BlockDriverAIOCBCoroutine *acb;
4032 
4033     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4034     acb->req.sector = sector_num;
4035     acb->req.nb_sectors = nb_sectors;
4036     acb->req.qiov = qiov;
4037     acb->req.flags = flags;
4038     acb->is_write = is_write;
4039     acb->done = NULL;
4040 
4041     co = qemu_coroutine_create(bdrv_co_do_rw);
4042     qemu_coroutine_enter(co, acb);
4043 
4044     return &acb->common;
4045 }
4046 
4047 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4048 {
4049     BlockDriverAIOCBCoroutine *acb = opaque;
4050     BlockDriverState *bs = acb->common.bs;
4051 
4052     acb->req.error = bdrv_co_flush(bs);
4053     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4054     qemu_bh_schedule(acb->bh);
4055 }
4056 
4057 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4058         BlockDriverCompletionFunc *cb, void *opaque)
4059 {
4060     trace_bdrv_aio_flush(bs, opaque);
4061 
4062     Coroutine *co;
4063     BlockDriverAIOCBCoroutine *acb;
4064 
4065     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4066     acb->done = NULL;
4067 
4068     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4069     qemu_coroutine_enter(co, acb);
4070 
4071     return &acb->common;
4072 }
4073 
4074 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4075 {
4076     BlockDriverAIOCBCoroutine *acb = opaque;
4077     BlockDriverState *bs = acb->common.bs;
4078 
4079     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4080     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4081     qemu_bh_schedule(acb->bh);
4082 }
4083 
4084 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4085         int64_t sector_num, int nb_sectors,
4086         BlockDriverCompletionFunc *cb, void *opaque)
4087 {
4088     Coroutine *co;
4089     BlockDriverAIOCBCoroutine *acb;
4090 
4091     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4092 
4093     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4094     acb->req.sector = sector_num;
4095     acb->req.nb_sectors = nb_sectors;
4096     acb->done = NULL;
4097     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4098     qemu_coroutine_enter(co, acb);
4099 
4100     return &acb->common;
4101 }
4102 
4103 void bdrv_init(void)
4104 {
4105     module_call_init(MODULE_INIT_BLOCK);
4106 }
4107 
4108 void bdrv_init_with_whitelist(void)
4109 {
4110     use_bdrv_whitelist = 1;
4111     bdrv_init();
4112 }
4113 
4114 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4115                    BlockDriverCompletionFunc *cb, void *opaque)
4116 {
4117     BlockDriverAIOCB *acb;
4118 
4119     acb = g_slice_alloc(aiocb_info->aiocb_size);
4120     acb->aiocb_info = aiocb_info;
4121     acb->bs = bs;
4122     acb->cb = cb;
4123     acb->opaque = opaque;
4124     return acb;
4125 }
4126 
4127 void qemu_aio_release(void *p)
4128 {
4129     BlockDriverAIOCB *acb = p;
4130     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4131 }
4132 
4133 /**************************************************************/
4134 /* Coroutine block device emulation */
4135 
4136 typedef struct CoroutineIOCompletion {
4137     Coroutine *coroutine;
4138     int ret;
4139 } CoroutineIOCompletion;
4140 
4141 static void bdrv_co_io_em_complete(void *opaque, int ret)
4142 {
4143     CoroutineIOCompletion *co = opaque;
4144 
4145     co->ret = ret;
4146     qemu_coroutine_enter(co->coroutine, NULL);
4147 }
4148 
4149 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4150                                       int nb_sectors, QEMUIOVector *iov,
4151                                       bool is_write)
4152 {
4153     CoroutineIOCompletion co = {
4154         .coroutine = qemu_coroutine_self(),
4155     };
4156     BlockDriverAIOCB *acb;
4157 
4158     if (is_write) {
4159         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4160                                        bdrv_co_io_em_complete, &co);
4161     } else {
4162         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4163                                       bdrv_co_io_em_complete, &co);
4164     }
4165 
4166     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4167     if (!acb) {
4168         return -EIO;
4169     }
4170     qemu_coroutine_yield();
4171 
4172     return co.ret;
4173 }
4174 
4175 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4176                                          int64_t sector_num, int nb_sectors,
4177                                          QEMUIOVector *iov)
4178 {
4179     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4180 }
4181 
4182 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4183                                          int64_t sector_num, int nb_sectors,
4184                                          QEMUIOVector *iov)
4185 {
4186     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4187 }
4188 
4189 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4190 {
4191     RwCo *rwco = opaque;
4192 
4193     rwco->ret = bdrv_co_flush(rwco->bs);
4194 }
4195 
4196 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4197 {
4198     int ret;
4199 
4200     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4201         return 0;
4202     }
4203 
4204     /* Write back cached data to the OS even with cache=unsafe */
4205     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4206     if (bs->drv->bdrv_co_flush_to_os) {
4207         ret = bs->drv->bdrv_co_flush_to_os(bs);
4208         if (ret < 0) {
4209             return ret;
4210         }
4211     }
4212 
4213     /* But don't actually force it to the disk with cache=unsafe */
4214     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4215         goto flush_parent;
4216     }
4217 
4218     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4219     if (bs->drv->bdrv_co_flush_to_disk) {
4220         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4221     } else if (bs->drv->bdrv_aio_flush) {
4222         BlockDriverAIOCB *acb;
4223         CoroutineIOCompletion co = {
4224             .coroutine = qemu_coroutine_self(),
4225         };
4226 
4227         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4228         if (acb == NULL) {
4229             ret = -EIO;
4230         } else {
4231             qemu_coroutine_yield();
4232             ret = co.ret;
4233         }
4234     } else {
4235         /*
4236          * Some block drivers always operate in either writethrough or unsafe
4237          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4238          * know how the server works (because the behaviour is hardcoded or
4239          * depends on server-side configuration), so we can't ensure that
4240          * everything is safe on disk. Returning an error doesn't work because
4241          * that would break guests even if the server operates in writethrough
4242          * mode.
4243          *
4244          * Let's hope the user knows what he's doing.
4245          */
4246         ret = 0;
4247     }
4248     if (ret < 0) {
4249         return ret;
4250     }
4251 
4252     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4253      * in the case of cache=unsafe, so there are no useless flushes.
4254      */
4255 flush_parent:
4256     return bdrv_co_flush(bs->file);
4257 }
4258 
4259 void bdrv_invalidate_cache(BlockDriverState *bs)
4260 {
4261     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4262         bs->drv->bdrv_invalidate_cache(bs);
4263     }
4264 }
4265 
4266 void bdrv_invalidate_cache_all(void)
4267 {
4268     BlockDriverState *bs;
4269 
4270     QTAILQ_FOREACH(bs, &bdrv_states, list) {
4271         bdrv_invalidate_cache(bs);
4272     }
4273 }
4274 
4275 void bdrv_clear_incoming_migration_all(void)
4276 {
4277     BlockDriverState *bs;
4278 
4279     QTAILQ_FOREACH(bs, &bdrv_states, list) {
4280         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4281     }
4282 }
4283 
4284 int bdrv_flush(BlockDriverState *bs)
4285 {
4286     Coroutine *co;
4287     RwCo rwco = {
4288         .bs = bs,
4289         .ret = NOT_DONE,
4290     };
4291 
4292     if (qemu_in_coroutine()) {
4293         /* Fast-path if already in coroutine context */
4294         bdrv_flush_co_entry(&rwco);
4295     } else {
4296         co = qemu_coroutine_create(bdrv_flush_co_entry);
4297         qemu_coroutine_enter(co, &rwco);
4298         while (rwco.ret == NOT_DONE) {
4299             qemu_aio_wait();
4300         }
4301     }
4302 
4303     return rwco.ret;
4304 }
4305 
4306 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4307 {
4308     RwCo *rwco = opaque;
4309 
4310     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4311 }
4312 
4313 /* if no limit is specified in the BlockLimits use a default
4314  * of 32768 512-byte sectors (16 MiB) per request.
4315  */
4316 #define MAX_DISCARD_DEFAULT 32768
4317 
4318 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4319                                  int nb_sectors)
4320 {
4321     int max_discard;
4322 
4323     if (!bs->drv) {
4324         return -ENOMEDIUM;
4325     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4326         return -EIO;
4327     } else if (bs->read_only) {
4328         return -EROFS;
4329     }
4330 
4331     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4332 
4333     /* Do nothing if disabled.  */
4334     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4335         return 0;
4336     }
4337 
4338     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4339         return 0;
4340     }
4341 
4342     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4343     while (nb_sectors > 0) {
4344         int ret;
4345         int num = nb_sectors;
4346 
4347         /* align request */
4348         if (bs->bl.discard_alignment &&
4349             num >= bs->bl.discard_alignment &&
4350             sector_num % bs->bl.discard_alignment) {
4351             if (num > bs->bl.discard_alignment) {
4352                 num = bs->bl.discard_alignment;
4353             }
4354             num -= sector_num % bs->bl.discard_alignment;
4355         }
4356 
4357         /* limit request size */
4358         if (num > max_discard) {
4359             num = max_discard;
4360         }
4361 
4362         if (bs->drv->bdrv_co_discard) {
4363             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4364         } else {
4365             BlockDriverAIOCB *acb;
4366             CoroutineIOCompletion co = {
4367                 .coroutine = qemu_coroutine_self(),
4368             };
4369 
4370             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
4371                                             bdrv_co_io_em_complete, &co);
4372             if (acb == NULL) {
4373                 return -EIO;
4374             } else {
4375                 qemu_coroutine_yield();
4376                 ret = co.ret;
4377             }
4378         }
4379         if (ret) {
4380             return ret;
4381         }
4382 
4383         sector_num += num;
4384         nb_sectors -= num;
4385     }
4386     return 0;
4387 }
4388 
4389 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4390 {
4391     Coroutine *co;
4392     RwCo rwco = {
4393         .bs = bs,
4394         .sector_num = sector_num,
4395         .nb_sectors = nb_sectors,
4396         .ret = NOT_DONE,
4397     };
4398 
4399     if (qemu_in_coroutine()) {
4400         /* Fast-path if already in coroutine context */
4401         bdrv_discard_co_entry(&rwco);
4402     } else {
4403         co = qemu_coroutine_create(bdrv_discard_co_entry);
4404         qemu_coroutine_enter(co, &rwco);
4405         while (rwco.ret == NOT_DONE) {
4406             qemu_aio_wait();
4407         }
4408     }
4409 
4410     return rwco.ret;
4411 }
4412 
4413 /**************************************************************/
4414 /* removable device support */
4415 
4416 /**
4417  * Return TRUE if the media is present
4418  */
4419 int bdrv_is_inserted(BlockDriverState *bs)
4420 {
4421     BlockDriver *drv = bs->drv;
4422 
4423     if (!drv)
4424         return 0;
4425     if (!drv->bdrv_is_inserted)
4426         return 1;
4427     return drv->bdrv_is_inserted(bs);
4428 }
4429 
4430 /**
4431  * Return whether the media changed since the last call to this
4432  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4433  */
4434 int bdrv_media_changed(BlockDriverState *bs)
4435 {
4436     BlockDriver *drv = bs->drv;
4437 
4438     if (drv && drv->bdrv_media_changed) {
4439         return drv->bdrv_media_changed(bs);
4440     }
4441     return -ENOTSUP;
4442 }
4443 
4444 /**
4445  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4446  */
4447 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4448 {
4449     BlockDriver *drv = bs->drv;
4450 
4451     if (drv && drv->bdrv_eject) {
4452         drv->bdrv_eject(bs, eject_flag);
4453     }
4454 
4455     if (bs->device_name[0] != '\0') {
4456         bdrv_emit_qmp_eject_event(bs, eject_flag);
4457     }
4458 }
4459 
4460 /**
4461  * Lock or unlock the media (if it is locked, the user won't be able
4462  * to eject it manually).
4463  */
4464 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4465 {
4466     BlockDriver *drv = bs->drv;
4467 
4468     trace_bdrv_lock_medium(bs, locked);
4469 
4470     if (drv && drv->bdrv_lock_medium) {
4471         drv->bdrv_lock_medium(bs, locked);
4472     }
4473 }
4474 
4475 /* needed for generic scsi interface */
4476 
4477 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4478 {
4479     BlockDriver *drv = bs->drv;
4480 
4481     if (drv && drv->bdrv_ioctl)
4482         return drv->bdrv_ioctl(bs, req, buf);
4483     return -ENOTSUP;
4484 }
4485 
4486 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4487         unsigned long int req, void *buf,
4488         BlockDriverCompletionFunc *cb, void *opaque)
4489 {
4490     BlockDriver *drv = bs->drv;
4491 
4492     if (drv && drv->bdrv_aio_ioctl)
4493         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4494     return NULL;
4495 }
4496 
4497 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
4498 {
4499     bs->buffer_alignment = align;
4500 }
4501 
4502 void *qemu_blockalign(BlockDriverState *bs, size_t size)
4503 {
4504     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
4505 }
4506 
4507 /*
4508  * Check if all memory in this vector is sector aligned.
4509  */
4510 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4511 {
4512     int i;
4513 
4514     for (i = 0; i < qiov->niov; i++) {
4515         if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4516             return false;
4517         }
4518     }
4519 
4520     return true;
4521 }
4522 
4523 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
4524 {
4525     int64_t bitmap_size;
4526     BdrvDirtyBitmap *bitmap;
4527 
4528     assert((granularity & (granularity - 1)) == 0);
4529 
4530     granularity >>= BDRV_SECTOR_BITS;
4531     assert(granularity);
4532     bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
4533     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
4534     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
4535     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
4536     return bitmap;
4537 }
4538 
4539 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4540 {
4541     BdrvDirtyBitmap *bm, *next;
4542     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
4543         if (bm == bitmap) {
4544             QLIST_REMOVE(bitmap, list);
4545             hbitmap_free(bitmap->bitmap);
4546             g_free(bitmap);
4547             return;
4548         }
4549     }
4550 }
4551 
4552 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
4553 {
4554     BdrvDirtyBitmap *bm;
4555     BlockDirtyInfoList *list = NULL;
4556     BlockDirtyInfoList **plist = &list;
4557 
4558     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
4559         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
4560         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
4561         info->count = bdrv_get_dirty_count(bs, bm);
4562         info->granularity =
4563             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
4564         entry->value = info;
4565         *plist = entry;
4566         plist = &entry->next;
4567     }
4568 
4569     return list;
4570 }
4571 
4572 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
4573 {
4574     if (bitmap) {
4575         return hbitmap_get(bitmap->bitmap, sector);
4576     } else {
4577         return 0;
4578     }
4579 }
4580 
4581 void bdrv_dirty_iter_init(BlockDriverState *bs,
4582                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
4583 {
4584     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
4585 }
4586 
4587 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
4588                     int nr_sectors)
4589 {
4590     BdrvDirtyBitmap *bitmap;
4591     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4592         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
4593     }
4594 }
4595 
4596 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
4597 {
4598     BdrvDirtyBitmap *bitmap;
4599     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4600         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
4601     }
4602 }
4603 
4604 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4605 {
4606     return hbitmap_count(bitmap->bitmap);
4607 }
4608 
4609 /* Get a reference to bs */
4610 void bdrv_ref(BlockDriverState *bs)
4611 {
4612     bs->refcnt++;
4613 }
4614 
4615 /* Release a previously grabbed reference to bs.
4616  * If after releasing, reference count is zero, the BlockDriverState is
4617  * deleted. */
4618 void bdrv_unref(BlockDriverState *bs)
4619 {
4620     assert(bs->refcnt > 0);
4621     if (--bs->refcnt == 0) {
4622         bdrv_delete(bs);
4623     }
4624 }
4625 
4626 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4627 {
4628     assert(bs->in_use != in_use);
4629     bs->in_use = in_use;
4630 }
4631 
4632 int bdrv_in_use(BlockDriverState *bs)
4633 {
4634     return bs->in_use;
4635 }
4636 
4637 void bdrv_iostatus_enable(BlockDriverState *bs)
4638 {
4639     bs->iostatus_enabled = true;
4640     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4641 }
4642 
4643 /* The I/O status is only enabled if the drive explicitly
4644  * enables it _and_ the VM is configured to stop on errors */
4645 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4646 {
4647     return (bs->iostatus_enabled &&
4648            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4649             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4650             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4651 }
4652 
4653 void bdrv_iostatus_disable(BlockDriverState *bs)
4654 {
4655     bs->iostatus_enabled = false;
4656 }
4657 
4658 void bdrv_iostatus_reset(BlockDriverState *bs)
4659 {
4660     if (bdrv_iostatus_is_enabled(bs)) {
4661         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4662         if (bs->job) {
4663             block_job_iostatus_reset(bs->job);
4664         }
4665     }
4666 }
4667 
4668 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4669 {
4670     assert(bdrv_iostatus_is_enabled(bs));
4671     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4672         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4673                                          BLOCK_DEVICE_IO_STATUS_FAILED;
4674     }
4675 }
4676 
4677 void
4678 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4679         enum BlockAcctType type)
4680 {
4681     assert(type < BDRV_MAX_IOTYPE);
4682 
4683     cookie->bytes = bytes;
4684     cookie->start_time_ns = get_clock();
4685     cookie->type = type;
4686 }
4687 
4688 void
4689 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4690 {
4691     assert(cookie->type < BDRV_MAX_IOTYPE);
4692 
4693     bs->nr_bytes[cookie->type] += cookie->bytes;
4694     bs->nr_ops[cookie->type]++;
4695     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4696 }
4697 
4698 void bdrv_img_create(const char *filename, const char *fmt,
4699                      const char *base_filename, const char *base_fmt,
4700                      char *options, uint64_t img_size, int flags,
4701                      Error **errp, bool quiet)
4702 {
4703     QEMUOptionParameter *param = NULL, *create_options = NULL;
4704     QEMUOptionParameter *backing_fmt, *backing_file, *size;
4705     BlockDriverState *bs = NULL;
4706     BlockDriver *drv, *proto_drv;
4707     BlockDriver *backing_drv = NULL;
4708     Error *local_err = NULL;
4709     int ret = 0;
4710 
4711     /* Find driver and parse its options */
4712     drv = bdrv_find_format(fmt);
4713     if (!drv) {
4714         error_setg(errp, "Unknown file format '%s'", fmt);
4715         return;
4716     }
4717 
4718     proto_drv = bdrv_find_protocol(filename, true);
4719     if (!proto_drv) {
4720         error_setg(errp, "Unknown protocol '%s'", filename);
4721         return;
4722     }
4723 
4724     create_options = append_option_parameters(create_options,
4725                                               drv->create_options);
4726     create_options = append_option_parameters(create_options,
4727                                               proto_drv->create_options);
4728 
4729     /* Create parameter list with default values */
4730     param = parse_option_parameters("", create_options, param);
4731 
4732     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4733 
4734     /* Parse -o options */
4735     if (options) {
4736         param = parse_option_parameters(options, create_options, param);
4737         if (param == NULL) {
4738             error_setg(errp, "Invalid options for file format '%s'.", fmt);
4739             goto out;
4740         }
4741     }
4742 
4743     if (base_filename) {
4744         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4745                                  base_filename)) {
4746             error_setg(errp, "Backing file not supported for file format '%s'",
4747                        fmt);
4748             goto out;
4749         }
4750     }
4751 
4752     if (base_fmt) {
4753         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4754             error_setg(errp, "Backing file format not supported for file "
4755                              "format '%s'", fmt);
4756             goto out;
4757         }
4758     }
4759 
4760     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4761     if (backing_file && backing_file->value.s) {
4762         if (!strcmp(filename, backing_file->value.s)) {
4763             error_setg(errp, "Error: Trying to create an image with the "
4764                              "same filename as the backing file");
4765             goto out;
4766         }
4767     }
4768 
4769     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4770     if (backing_fmt && backing_fmt->value.s) {
4771         backing_drv = bdrv_find_format(backing_fmt->value.s);
4772         if (!backing_drv) {
4773             error_setg(errp, "Unknown backing file format '%s'",
4774                        backing_fmt->value.s);
4775             goto out;
4776         }
4777     }
4778 
4779     // The size for the image must always be specified, with one exception:
4780     // If we are using a backing file, we can obtain the size from there
4781     size = get_option_parameter(param, BLOCK_OPT_SIZE);
4782     if (size && size->value.n == -1) {
4783         if (backing_file && backing_file->value.s) {
4784             uint64_t size;
4785             char buf[32];
4786             int back_flags;
4787 
4788             /* backing files always opened read-only */
4789             back_flags =
4790                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4791 
4792             bs = bdrv_new("");
4793 
4794             ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
4795                             backing_drv, &local_err);
4796             if (ret < 0) {
4797                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
4798                                  backing_file->value.s,
4799                                  error_get_pretty(local_err));
4800                 error_free(local_err);
4801                 local_err = NULL;
4802                 goto out;
4803             }
4804             bdrv_get_geometry(bs, &size);
4805             size *= 512;
4806 
4807             snprintf(buf, sizeof(buf), "%" PRId64, size);
4808             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4809         } else {
4810             error_setg(errp, "Image creation needs a size parameter");
4811             goto out;
4812         }
4813     }
4814 
4815     if (!quiet) {
4816         printf("Formatting '%s', fmt=%s ", filename, fmt);
4817         print_option_parameters(param);
4818         puts("");
4819     }
4820     ret = bdrv_create(drv, filename, param, &local_err);
4821     if (ret == -EFBIG) {
4822         /* This is generally a better message than whatever the driver would
4823          * deliver (especially because of the cluster_size_hint), since that
4824          * is most probably not much different from "image too large". */
4825         const char *cluster_size_hint = "";
4826         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
4827             cluster_size_hint = " (try using a larger cluster size)";
4828         }
4829         error_setg(errp, "The image size is too large for file format '%s'"
4830                    "%s", fmt, cluster_size_hint);
4831         error_free(local_err);
4832         local_err = NULL;
4833     }
4834 
4835 out:
4836     free_option_parameters(create_options);
4837     free_option_parameters(param);
4838 
4839     if (bs) {
4840         bdrv_unref(bs);
4841     }
4842     if (error_is_set(&local_err)) {
4843         error_propagate(errp, local_err);
4844     }
4845 }
4846 
4847 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
4848 {
4849     /* Currently BlockDriverState always uses the main loop AioContext */
4850     return qemu_get_aio_context();
4851 }
4852 
4853 void bdrv_add_before_write_notifier(BlockDriverState *bs,
4854                                     NotifierWithReturn *notifier)
4855 {
4856     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
4857 }
4858 
4859 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
4860 {
4861     if (bs->drv->bdrv_amend_options == NULL) {
4862         return -ENOTSUP;
4863     }
4864     return bs->drv->bdrv_amend_options(bs, options);
4865 }
4866 
4867 ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs)
4868 {
4869     if (bs->drv->bdrv_check_ext_snapshot) {
4870         return bs->drv->bdrv_check_ext_snapshot(bs);
4871     }
4872 
4873     if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) {
4874         return bs->file->drv->bdrv_check_ext_snapshot(bs);
4875     }
4876 
4877     /* external snapshots are allowed by default */
4878     return EXT_SNAPSHOT_ALLOWED;
4879 }
4880 
4881 ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs)
4882 {
4883     return EXT_SNAPSHOT_FORBIDDEN;
4884 }
4885