xref: /openbmc/qemu/block.c (revision 1b111dc1)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "qmp-commands.h"
36 #include "qemu/timer.h"
37 
38 #ifdef CONFIG_BSD
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <sys/ioctl.h>
42 #include <sys/queue.h>
43 #ifndef __DragonFly__
44 #include <sys/disk.h>
45 #endif
46 #endif
47 
48 #ifdef _WIN32
49 #include <windows.h>
50 #endif
51 
52 struct BdrvDirtyBitmap {
53     HBitmap *bitmap;
54     QLIST_ENTRY(BdrvDirtyBitmap) list;
55 };
56 
57 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
58 
59 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
60 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62         BlockDriverCompletionFunc *cb, void *opaque);
63 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65         BlockDriverCompletionFunc *cb, void *opaque);
66 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67                                          int64_t sector_num, int nb_sectors,
68                                          QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70                                          int64_t sector_num, int nb_sectors,
71                                          QEMUIOVector *iov);
72 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
73     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74     BdrvRequestFlags flags);
75 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
76     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77     BdrvRequestFlags flags);
78 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79                                                int64_t sector_num,
80                                                QEMUIOVector *qiov,
81                                                int nb_sectors,
82                                                BdrvRequestFlags flags,
83                                                BlockDriverCompletionFunc *cb,
84                                                void *opaque,
85                                                bool is_write);
86 static void coroutine_fn bdrv_co_do_rw(void *opaque);
87 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
88     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
89 
90 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
91     QTAILQ_HEAD_INITIALIZER(bdrv_states);
92 
93 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
94     QLIST_HEAD_INITIALIZER(bdrv_drivers);
95 
96 /* If non-zero, use only whitelisted block drivers */
97 static int use_bdrv_whitelist;
98 
99 #ifdef _WIN32
/*
 * Return 1 if filename starts with a drive specification, i.e. one ASCII
 * letter ('a'-'z' or 'A'-'Z') immediately followed by ':'; 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    int is_lower = letter >= 'a' && letter <= 'z';
    int is_upper = letter >= 'A' && letter <= 'Z';

    /* the second character is only inspected once the first is a letter */
    if (!is_lower && !is_upper) {
        return 0;
    }
    return filename[1] == ':';
}
106 
/*
 * Return 1 if filename is exactly a drive specification such as "d:", or
 * if it starts with one of the two prefixes tested below; 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* a drive letter, a colon and nothing else */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    /* the same prefix spelled with backslashes or with forward slashes */
    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    return strstart(filename, "//./", NULL) ? 1 : 0;
}
117 #endif
118 
119 /* throttling disk I/O limits */
120 void bdrv_set_io_limits(BlockDriverState *bs,
121                         ThrottleConfig *cfg)
122 {
123     int i;
124 
125     throttle_config(&bs->throttle_state, cfg);
126 
127     for (i = 0; i < 2; i++) {
128         qemu_co_enter_next(&bs->throttled_reqs[i]);
129     }
130 }
131 
132 /* this function drain all the throttled IOs */
133 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
134 {
135     bool drained = false;
136     bool enabled = bs->io_limits_enabled;
137     int i;
138 
139     bs->io_limits_enabled = false;
140 
141     for (i = 0; i < 2; i++) {
142         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
143             drained = true;
144         }
145     }
146 
147     bs->io_limits_enabled = enabled;
148 
149     return drained;
150 }
151 
152 void bdrv_io_limits_disable(BlockDriverState *bs)
153 {
154     bs->io_limits_enabled = false;
155 
156     bdrv_start_throttled_reqs(bs);
157 
158     throttle_destroy(&bs->throttle_state);
159 }
160 
161 static void bdrv_throttle_read_timer_cb(void *opaque)
162 {
163     BlockDriverState *bs = opaque;
164     qemu_co_enter_next(&bs->throttled_reqs[0]);
165 }
166 
167 static void bdrv_throttle_write_timer_cb(void *opaque)
168 {
169     BlockDriverState *bs = opaque;
170     qemu_co_enter_next(&bs->throttled_reqs[1]);
171 }
172 
173 /* should be called before bdrv_set_io_limits if a limit is set */
174 void bdrv_io_limits_enable(BlockDriverState *bs)
175 {
176     assert(!bs->io_limits_enabled);
177     throttle_init(&bs->throttle_state,
178                   QEMU_CLOCK_VIRTUAL,
179                   bdrv_throttle_read_timer_cb,
180                   bdrv_throttle_write_timer_cb,
181                   bs);
182     bs->io_limits_enabled = true;
183 }
184 
185 /* This function makes an IO wait if needed
186  *
187  * @nb_sectors: the number of sectors of the IO
188  * @is_write:   is the IO a write
189  */
190 static void bdrv_io_limits_intercept(BlockDriverState *bs,
191                                      int nb_sectors,
192                                      bool is_write)
193 {
194     /* does this io must wait */
195     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
196 
197     /* if must wait or any request of this type throttled queue the IO */
198     if (must_wait ||
199         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
200         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
201     }
202 
203     /* the IO will be executed, do the accounting */
204     throttle_account(&bs->throttle_state,
205                      is_write,
206                      nb_sectors * BDRV_SECTOR_SIZE);
207 
208     /* if the next request must wait -> do nothing */
209     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
210         return;
211     }
212 
213     /* else queue next request for execution */
214     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
215 }
216 
/*
 * Check whether path starts with "<protocol>:", i.e. whether a ':' shows
 * up before the first path separator.  On Windows, drive specifications
 * are never treated as a protocol and '\\' counts as a separator too.
 *
 * Returns 1 if a protocol prefix is present, 0 otherwise.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* look at the first stop character, or the terminating NUL */
    return path[strcspn(path, stop_chars)] == ':';
}
234 
/*
 * Return 1 if path is absolute, 0 otherwise.
 *
 * A path is absolute when it starts with '/'.  On Windows a leading '\\',
 * a drive specification, or a name accepted by is_windows_drive() also
 * counts as absolute.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return first == '/' || first == '\\';
#else
    return first == '/';
#endif
}
247 
/*
 * Build in dest (at most dest_size bytes including the terminating NUL)
 * the path of filename interpreted relative to base_path.
 *
 * If filename is absolute it is simply copied.  Otherwise the result is
 * the leading part of base_path, up to and including its last directory
 * separator or its first ':', whichever comes later, followed by filename.
 * URLs are supported.  Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *cut;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position right after the first ':' of base_path, if there is one */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position right after the last directory separator, if there is one */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep the longer of the two candidate prefixes */
    cut = after_sep > after_colon ? after_sep : after_colon;
    prefix_len = cut - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }

    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
291 
292 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
293 {
294     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
295         pstrcpy(dest, sz, bs->backing_file);
296     } else {
297         path_combine(dest, sz, bs->filename, bs->backing_file);
298     }
299 }
300 
301 void bdrv_register(BlockDriver *bdrv)
302 {
303     /* Block drivers without coroutine functions need emulation */
304     if (!bdrv->bdrv_co_readv) {
305         bdrv->bdrv_co_readv = bdrv_co_readv_em;
306         bdrv->bdrv_co_writev = bdrv_co_writev_em;
307 
308         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
309          * the block driver lacks aio we need to emulate that too.
310          */
311         if (!bdrv->bdrv_aio_readv) {
312             /* add AIO emulation layer */
313             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
314             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
315         }
316     }
317 
318     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
319 }
320 
321 /* create a new block device (by default it is empty) */
322 BlockDriverState *bdrv_new(const char *device_name)
323 {
324     BlockDriverState *bs;
325 
326     bs = g_malloc0(sizeof(BlockDriverState));
327     QLIST_INIT(&bs->dirty_bitmaps);
328     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
329     if (device_name[0] != '\0') {
330         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
331     }
332     bdrv_iostatus_disable(bs);
333     notifier_list_init(&bs->close_notifiers);
334     notifier_with_return_list_init(&bs->before_write_notifiers);
335     qemu_co_queue_init(&bs->throttled_reqs[0]);
336     qemu_co_queue_init(&bs->throttled_reqs[1]);
337     bs->refcnt = 1;
338 
339     return bs;
340 }
341 
342 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
343 {
344     notifier_list_add(&bs->close_notifiers, notify);
345 }
346 
347 BlockDriver *bdrv_find_format(const char *format_name)
348 {
349     BlockDriver *drv1;
350     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
351         if (!strcmp(drv1->format_name, format_name)) {
352             return drv1;
353         }
354     }
355     return NULL;
356 }
357 
358 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
359 {
360     static const char *whitelist_rw[] = {
361         CONFIG_BDRV_RW_WHITELIST
362     };
363     static const char *whitelist_ro[] = {
364         CONFIG_BDRV_RO_WHITELIST
365     };
366     const char **p;
367 
368     if (!whitelist_rw[0] && !whitelist_ro[0]) {
369         return 1;               /* no whitelist, anything goes */
370     }
371 
372     for (p = whitelist_rw; *p; p++) {
373         if (!strcmp(drv->format_name, *p)) {
374             return 1;
375         }
376     }
377     if (read_only) {
378         for (p = whitelist_ro; *p; p++) {
379             if (!strcmp(drv->format_name, *p)) {
380                 return 1;
381             }
382         }
383     }
384     return 0;
385 }
386 
387 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
388                                           bool read_only)
389 {
390     BlockDriver *drv = bdrv_find_format(format_name);
391     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
392 }
393 
394 typedef struct CreateCo {
395     BlockDriver *drv;
396     char *filename;
397     QEMUOptionParameter *options;
398     int ret;
399     Error *err;
400 } CreateCo;
401 
402 static void coroutine_fn bdrv_create_co_entry(void *opaque)
403 {
404     Error *local_err = NULL;
405     int ret;
406 
407     CreateCo *cco = opaque;
408     assert(cco->drv);
409 
410     ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
411     if (error_is_set(&local_err)) {
412         error_propagate(&cco->err, local_err);
413     }
414     cco->ret = ret;
415 }
416 
417 int bdrv_create(BlockDriver *drv, const char* filename,
418     QEMUOptionParameter *options, Error **errp)
419 {
420     int ret;
421 
422     Coroutine *co;
423     CreateCo cco = {
424         .drv = drv,
425         .filename = g_strdup(filename),
426         .options = options,
427         .ret = NOT_DONE,
428         .err = NULL,
429     };
430 
431     if (!drv->bdrv_create) {
432         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
433         ret = -ENOTSUP;
434         goto out;
435     }
436 
437     if (qemu_in_coroutine()) {
438         /* Fast-path if already in coroutine context */
439         bdrv_create_co_entry(&cco);
440     } else {
441         co = qemu_coroutine_create(bdrv_create_co_entry);
442         qemu_coroutine_enter(co, &cco);
443         while (cco.ret == NOT_DONE) {
444             qemu_aio_wait();
445         }
446     }
447 
448     ret = cco.ret;
449     if (ret < 0) {
450         if (error_is_set(&cco.err)) {
451             error_propagate(errp, cco.err);
452         } else {
453             error_setg_errno(errp, -ret, "Could not create image");
454         }
455     }
456 
457 out:
458     g_free(cco.filename);
459     return ret;
460 }
461 
462 int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
463                      Error **errp)
464 {
465     BlockDriver *drv;
466     Error *local_err = NULL;
467     int ret;
468 
469     drv = bdrv_find_protocol(filename, true);
470     if (drv == NULL) {
471         error_setg(errp, "Could not find protocol for file '%s'", filename);
472         return -ENOENT;
473     }
474 
475     ret = bdrv_create(drv, filename, options, &local_err);
476     if (error_is_set(&local_err)) {
477         error_propagate(errp, local_err);
478     }
479     return ret;
480 }
481 
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the name of the created file
 * @size:     size of @filename in bytes (at least MAX_PATH on Windows)
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /tmp when TMPDIR is
 * not set.
 *
 * Return 0 upon success, otherwise a negative errno value.  -EOVERFLOW is
 * returned when the name does not fit into @filename.  On Windows the
 * failure value is the negated result of GetLastError().
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int saved_errno;
    const char *tmpdir;

    /* a non-positive size must never reach snprintf() as a huge size_t */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so remember why close() failed */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
516 
517 /*
518  * Detect host devices. By convention, /dev/cdrom[N] is always
519  * recognized as a host CDROM.
520  */
521 static BlockDriver *find_hdev_driver(const char *filename)
522 {
523     int score_max = 0, score;
524     BlockDriver *drv = NULL, *d;
525 
526     QLIST_FOREACH(d, &bdrv_drivers, list) {
527         if (d->bdrv_probe_device) {
528             score = d->bdrv_probe_device(filename);
529             if (score > score_max) {
530                 score_max = score;
531                 drv = d;
532             }
533         }
534     }
535 
536     return drv;
537 }
538 
539 BlockDriver *bdrv_find_protocol(const char *filename,
540                                 bool allow_protocol_prefix)
541 {
542     BlockDriver *drv1;
543     char protocol[128];
544     int len;
545     const char *p;
546 
547     /* TODO Drivers without bdrv_file_open must be specified explicitly */
548 
549     /*
550      * XXX(hch): we really should not let host device detection
551      * override an explicit protocol specification, but moving this
552      * later breaks access to device names with colons in them.
553      * Thanks to the brain-dead persistent naming schemes on udev-
554      * based Linux systems those actually are quite common.
555      */
556     drv1 = find_hdev_driver(filename);
557     if (drv1) {
558         return drv1;
559     }
560 
561     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
562         return bdrv_find_format("file");
563     }
564 
565     p = strchr(filename, ':');
566     assert(p != NULL);
567     len = p - filename;
568     if (len > sizeof(protocol) - 1)
569         len = sizeof(protocol) - 1;
570     memcpy(protocol, filename, len);
571     protocol[len] = '\0';
572     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
573         if (drv1->protocol_name &&
574             !strcmp(drv1->protocol_name, protocol)) {
575             return drv1;
576         }
577     }
578     return NULL;
579 }
580 
581 static int find_image_format(BlockDriverState *bs, const char *filename,
582                              BlockDriver **pdrv, Error **errp)
583 {
584     int score, score_max;
585     BlockDriver *drv1, *drv;
586     uint8_t buf[2048];
587     int ret = 0;
588 
589     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
590     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
591         drv = bdrv_find_format("raw");
592         if (!drv) {
593             error_setg(errp, "Could not find raw image format");
594             ret = -ENOENT;
595         }
596         *pdrv = drv;
597         return ret;
598     }
599 
600     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
601     if (ret < 0) {
602         error_setg_errno(errp, -ret, "Could not read image for determining its "
603                          "format");
604         *pdrv = NULL;
605         return ret;
606     }
607 
608     score_max = 0;
609     drv = NULL;
610     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
611         if (drv1->bdrv_probe) {
612             score = drv1->bdrv_probe(buf, ret, filename);
613             if (score > score_max) {
614                 score_max = score;
615                 drv = drv1;
616             }
617         }
618     }
619     if (!drv) {
620         error_setg(errp, "Could not determine image format: No compatible "
621                    "driver found");
622         ret = -ENOENT;
623     }
624     *pdrv = drv;
625     return ret;
626 }
627 
628 /**
629  * Set the current 'total_sectors' value
630  */
631 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
632 {
633     BlockDriver *drv = bs->drv;
634 
635     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
636     if (bs->sg)
637         return 0;
638 
639     /* query actual device if possible, otherwise just trust the hint */
640     if (drv->bdrv_getlength) {
641         int64_t length = drv->bdrv_getlength(bs);
642         if (length < 0) {
643             return length;
644         }
645         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
646     }
647 
648     bs->total_sectors = hint;
649     return 0;
650 }
651 
652 /**
653  * Set open flags for a given discard mode
654  *
655  * Return 0 on success, -1 if the discard mode was invalid.
656  */
657 int bdrv_parse_discard_flags(const char *mode, int *flags)
658 {
659     *flags &= ~BDRV_O_UNMAP;
660 
661     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
662         /* do nothing */
663     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
664         *flags |= BDRV_O_UNMAP;
665     } else {
666         return -1;
667     }
668 
669     return 0;
670 }
671 
672 /**
673  * Set open flags for a given cache mode
674  *
675  * Return 0 on success, -1 if the cache mode was invalid.
676  */
677 int bdrv_parse_cache_flags(const char *mode, int *flags)
678 {
679     *flags &= ~BDRV_O_CACHE_MASK;
680 
681     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
682         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
683     } else if (!strcmp(mode, "directsync")) {
684         *flags |= BDRV_O_NOCACHE;
685     } else if (!strcmp(mode, "writeback")) {
686         *flags |= BDRV_O_CACHE_WB;
687     } else if (!strcmp(mode, "unsafe")) {
688         *flags |= BDRV_O_CACHE_WB;
689         *flags |= BDRV_O_NO_FLUSH;
690     } else if (!strcmp(mode, "writethrough")) {
691         /* this is the default */
692     } else {
693         return -1;
694     }
695 
696     return 0;
697 }
698 
699 /**
700  * The copy-on-read flag is actually a reference count so multiple users may
701  * use the feature without worrying about clobbering its previous state.
702  * Copy-on-read stays enabled until all users have called to disable it.
703  */
704 void bdrv_enable_copy_on_read(BlockDriverState *bs)
705 {
706     bs->copy_on_read++;
707 }
708 
709 void bdrv_disable_copy_on_read(BlockDriverState *bs)
710 {
711     assert(bs->copy_on_read > 0);
712     bs->copy_on_read--;
713 }
714 
715 static int bdrv_open_flags(BlockDriverState *bs, int flags)
716 {
717     int open_flags = flags | BDRV_O_CACHE_WB;
718 
719     /*
720      * Clear flags that are internal to the block layer before opening the
721      * image.
722      */
723     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
724 
725     /*
726      * Snapshots should be writable.
727      */
728     if (bs->is_temporary) {
729         open_flags |= BDRV_O_RDWR;
730     }
731 
732     return open_flags;
733 }
734 
735 /*
736  * Common part for opening disk images and files
737  *
738  * Removes all processed options from *options.
739  */
740 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
741     QDict *options, int flags, BlockDriver *drv, Error **errp)
742 {
743     int ret, open_flags;
744     const char *filename;
745     Error *local_err = NULL;
746 
747     assert(drv != NULL);
748     assert(bs->file == NULL);
749     assert(options != NULL && bs->options != options);
750 
751     if (file != NULL) {
752         filename = file->filename;
753     } else {
754         filename = qdict_get_try_str(options, "filename");
755     }
756 
757     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
758 
759     /* bdrv_open() with directly using a protocol as drv. This layer is already
760      * opened, so assign it to bs (while file becomes a closed BlockDriverState)
761      * and return immediately. */
762     if (file != NULL && drv->bdrv_file_open) {
763         bdrv_swap(file, bs);
764         return 0;
765     }
766 
767     bs->open_flags = flags;
768     bs->buffer_alignment = 512;
769     bs->zero_beyond_eof = true;
770     open_flags = bdrv_open_flags(bs, flags);
771     bs->read_only = !(open_flags & BDRV_O_RDWR);
772 
773     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
774         error_setg(errp,
775                    !bs->read_only && bdrv_is_whitelisted(drv, true)
776                         ? "Driver '%s' can only be used for read-only devices"
777                         : "Driver '%s' is not whitelisted",
778                    drv->format_name);
779         return -ENOTSUP;
780     }
781 
782     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
783     if (flags & BDRV_O_COPY_ON_READ) {
784         if (!bs->read_only) {
785             bdrv_enable_copy_on_read(bs);
786         } else {
787             error_setg(errp, "Can't use copy-on-read on read-only device");
788             return -EINVAL;
789         }
790     }
791 
792     if (filename != NULL) {
793         pstrcpy(bs->filename, sizeof(bs->filename), filename);
794     } else {
795         bs->filename[0] = '\0';
796     }
797 
798     bs->drv = drv;
799     bs->opaque = g_malloc0(drv->instance_size);
800 
801     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
802 
803     /* Open the image, either directly or using a protocol */
804     if (drv->bdrv_file_open) {
805         assert(file == NULL);
806         assert(!drv->bdrv_needs_filename || filename != NULL);
807         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
808     } else {
809         if (file == NULL) {
810             error_setg(errp, "Can't use '%s' as a block driver for the "
811                        "protocol level", drv->format_name);
812             ret = -EINVAL;
813             goto free_and_fail;
814         }
815         bs->file = file;
816         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
817     }
818 
819     if (ret < 0) {
820         if (error_is_set(&local_err)) {
821             error_propagate(errp, local_err);
822         } else if (bs->filename[0]) {
823             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
824         } else {
825             error_setg_errno(errp, -ret, "Could not open image");
826         }
827         goto free_and_fail;
828     }
829 
830     ret = refresh_total_sectors(bs, bs->total_sectors);
831     if (ret < 0) {
832         error_setg_errno(errp, -ret, "Could not refresh total sector count");
833         goto free_and_fail;
834     }
835 
836 #ifndef _WIN32
837     if (bs->is_temporary) {
838         assert(bs->filename[0] != '\0');
839         unlink(bs->filename);
840     }
841 #endif
842     return 0;
843 
844 free_and_fail:
845     bs->file = NULL;
846     g_free(bs->opaque);
847     bs->opaque = NULL;
848     bs->drv = NULL;
849     return ret;
850 }
851 
852 /*
853  * Opens a file using a protocol (file, host_device, nbd, ...)
854  *
855  * options is a QDict of options to pass to the block drivers, or NULL for an
856  * empty set of options. The reference to the QDict belongs to the block layer
857  * after the call (even on failure), so if the caller intends to reuse the
858  * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
859  */
860 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
861                    QDict *options, int flags, Error **errp)
862 {
863     BlockDriverState *bs;
864     BlockDriver *drv;
865     const char *drvname;
866     bool allow_protocol_prefix = false;
867     Error *local_err = NULL;
868     int ret;
869 
870     /* NULL means an empty set of options */
871     if (options == NULL) {
872         options = qdict_new();
873     }
874 
875     bs = bdrv_new("");
876     bs->options = options;
877     options = qdict_clone_shallow(options);
878 
879     /* Fetch the file name from the options QDict if necessary */
880     if (!filename) {
881         filename = qdict_get_try_str(options, "filename");
882     } else if (filename && !qdict_haskey(options, "filename")) {
883         qdict_put(options, "filename", qstring_from_str(filename));
884         allow_protocol_prefix = true;
885     } else {
886         error_setg(errp, "Can't specify 'file' and 'filename' options at the "
887                    "same time");
888         ret = -EINVAL;
889         goto fail;
890     }
891 
892     /* Find the right block driver */
893     drvname = qdict_get_try_str(options, "driver");
894     if (drvname) {
895         drv = bdrv_find_format(drvname);
896         if (!drv) {
897             error_setg(errp, "Unknown driver '%s'", drvname);
898         }
899         qdict_del(options, "driver");
900     } else if (filename) {
901         drv = bdrv_find_protocol(filename, allow_protocol_prefix);
902         if (!drv) {
903             error_setg(errp, "Unknown protocol");
904         }
905     } else {
906         error_setg(errp, "Must specify either driver or file");
907         drv = NULL;
908     }
909 
910     if (!drv) {
911         /* errp has been set already */
912         ret = -ENOENT;
913         goto fail;
914     }
915 
916     /* Parse the filename and open it */
917     if (drv->bdrv_parse_filename && filename) {
918         drv->bdrv_parse_filename(filename, options, &local_err);
919         if (error_is_set(&local_err)) {
920             error_propagate(errp, local_err);
921             ret = -EINVAL;
922             goto fail;
923         }
924         qdict_del(options, "filename");
925     } else if (drv->bdrv_needs_filename && !filename) {
926         error_setg(errp, "The '%s' block driver requires a file name",
927                    drv->format_name);
928         ret = -EINVAL;
929         goto fail;
930     }
931 
932     ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
933     if (ret < 0) {
934         error_propagate(errp, local_err);
935         goto fail;
936     }
937 
938     /* Check if any unknown options were used */
939     if (qdict_size(options) != 0) {
940         const QDictEntry *entry = qdict_first(options);
941         error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
942                    drv->format_name, entry->key);
943         ret = -EINVAL;
944         goto fail;
945     }
946     QDECREF(options);
947 
948     bs->growable = 1;
949     *pbs = bs;
950     return 0;
951 
952 fail:
953     QDECREF(options);
954     if (!bs->drv) {
955         QDECREF(bs->options);
956     }
957     bdrv_unref(bs);
958     return ret;
959 }
960 
961 /*
962  * Opens the backing file for a BlockDriverState if not yet open
963  *
964  * options is a QDict of options to pass to the block drivers, or NULL for an
965  * empty set of options. The reference to the QDict is transferred to this
966  * function (even on failure), so if the caller intends to reuse the dictionary,
967  * it needs to use QINCREF() before calling bdrv_file_open.
968  */
969 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
970 {
971     char backing_filename[PATH_MAX];
972     int back_flags, ret;
973     BlockDriver *back_drv = NULL;
974     Error *local_err = NULL;
975 
976     if (bs->backing_hd != NULL) {
977         QDECREF(options);
978         return 0;
979     }
980 
981     /* NULL means an empty set of options */
982     if (options == NULL) {
983         options = qdict_new();
984     }
985 
986     bs->open_flags &= ~BDRV_O_NO_BACKING;
987     if (qdict_haskey(options, "file.filename")) {
988         backing_filename[0] = '\0';
989     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
990         QDECREF(options);
991         return 0;
992     } else {
993         bdrv_get_full_backing_filename(bs, backing_filename,
994                                        sizeof(backing_filename));
995     }
996 
997     bs->backing_hd = bdrv_new("");
998 
999     if (bs->backing_format[0] != '\0') {
1000         back_drv = bdrv_find_format(bs->backing_format);
1001     }
1002 
1003     /* backing files always opened read-only */
1004     back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1005                                     BDRV_O_COPY_ON_READ);
1006 
1007     ret = bdrv_open(bs->backing_hd,
1008                     *backing_filename ? backing_filename : NULL, options,
1009                     back_flags, back_drv, &local_err);
1010     if (ret < 0) {
1011         bdrv_unref(bs->backing_hd);
1012         bs->backing_hd = NULL;
1013         bs->open_flags |= BDRV_O_NO_BACKING;
1014         error_setg(errp, "Could not open backing file: %s",
1015                    error_get_pretty(local_err));
1016         error_free(local_err);
1017         return ret;
1018     }
1019     pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1020             bs->backing_hd->file->filename);
1021     return 0;
1022 }
1023 
1024 /*
1025  * Opens a disk image (raw, qcow2, vmdk, ...)
1026  *
1027  * options is a QDict of options to pass to the block drivers, or NULL for an
1028  * empty set of options. The reference to the QDict belongs to the block layer
1029  * after the call (even on failure), so if the caller intends to reuse the
1030  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1031  */
1032 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
1033               int flags, BlockDriver *drv, Error **errp)
1034 {
1035     int ret;
1036     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1037     char tmp_filename[PATH_MAX + 1];
1038     BlockDriverState *file = NULL;
1039     QDict *file_options = NULL;
1040     const char *drvname;
1041     Error *local_err = NULL;
1042 
1043     /* NULL means an empty set of options */
1044     if (options == NULL) {
1045         options = qdict_new();
1046     }
1047 
1048     bs->options = options;
1049     options = qdict_clone_shallow(options);
1050 
1051     /* For snapshot=on, create a temporary qcow2 overlay */
1052     if (flags & BDRV_O_SNAPSHOT) {
1053         BlockDriverState *bs1;
1054         int64_t total_size;
1055         BlockDriver *bdrv_qcow2;
1056         QEMUOptionParameter *create_options;
1057         QDict *snapshot_options;
1058 
1059         /* if snapshot, we create a temporary backing file and open it
1060            instead of opening 'filename' directly */
1061 
1062         /* Get the required size from the image */
1063         bs1 = bdrv_new("");
1064         QINCREF(options);
1065         ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
1066                         drv, &local_err);
1067         if (ret < 0) {
1068             bdrv_unref(bs1);
1069             goto fail;
1070         }
1071         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1072 
1073         bdrv_unref(bs1);
1074 
1075         /* Create the temporary image */
1076         ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1077         if (ret < 0) {
1078             error_setg_errno(errp, -ret, "Could not get temporary filename");
1079             goto fail;
1080         }
1081 
1082         bdrv_qcow2 = bdrv_find_format("qcow2");
1083         create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1084                                                  NULL);
1085 
1086         set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1087 
1088         ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1089         free_option_parameters(create_options);
1090         if (ret < 0) {
1091             error_setg_errno(errp, -ret, "Could not create temporary overlay "
1092                              "'%s': %s", tmp_filename,
1093                              error_get_pretty(local_err));
1094             error_free(local_err);
1095             local_err = NULL;
1096             goto fail;
1097         }
1098 
1099         /* Prepare a new options QDict for the temporary file, where user
1100          * options refer to the backing file */
1101         if (filename) {
1102             qdict_put(options, "file.filename", qstring_from_str(filename));
1103         }
1104         if (drv) {
1105             qdict_put(options, "driver", qstring_from_str(drv->format_name));
1106         }
1107 
1108         snapshot_options = qdict_new();
1109         qdict_put(snapshot_options, "backing", options);
1110         qdict_flatten(snapshot_options);
1111 
1112         bs->options = snapshot_options;
1113         options = qdict_clone_shallow(bs->options);
1114 
1115         filename = tmp_filename;
1116         drv = bdrv_qcow2;
1117         bs->is_temporary = 1;
1118     }
1119 
1120     /* Open image file without format layer */
1121     if (flags & BDRV_O_RDWR) {
1122         flags |= BDRV_O_ALLOW_RDWR;
1123     }
1124 
1125     qdict_extract_subqdict(options, &file_options, "file.");
1126 
1127     ret = bdrv_file_open(&file, filename, file_options,
1128                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
1129     if (ret < 0) {
1130         goto fail;
1131     }
1132 
1133     /* Find the right image format driver */
1134     drvname = qdict_get_try_str(options, "driver");
1135     if (drvname) {
1136         drv = bdrv_find_format(drvname);
1137         qdict_del(options, "driver");
1138         if (!drv) {
1139             error_setg(errp, "Invalid driver: '%s'", drvname);
1140             ret = -EINVAL;
1141             goto unlink_and_fail;
1142         }
1143     }
1144 
1145     if (!drv) {
1146         ret = find_image_format(file, filename, &drv, &local_err);
1147     }
1148 
1149     if (!drv) {
1150         goto unlink_and_fail;
1151     }
1152 
1153     /* Open the image */
1154     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1155     if (ret < 0) {
1156         goto unlink_and_fail;
1157     }
1158 
1159     if (bs->file != file) {
1160         bdrv_unref(file);
1161         file = NULL;
1162     }
1163 
1164     /* If there is a backing file, use it */
1165     if ((flags & BDRV_O_NO_BACKING) == 0) {
1166         QDict *backing_options;
1167 
1168         qdict_extract_subqdict(options, &backing_options, "backing.");
1169         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1170         if (ret < 0) {
1171             goto close_and_fail;
1172         }
1173     }
1174 
1175     /* Check if any unknown options were used */
1176     if (qdict_size(options) != 0) {
1177         const QDictEntry *entry = qdict_first(options);
1178         error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1179                    "support the option '%s'", drv->format_name, bs->device_name,
1180                    entry->key);
1181 
1182         ret = -EINVAL;
1183         goto close_and_fail;
1184     }
1185     QDECREF(options);
1186 
1187     if (!bdrv_key_required(bs)) {
1188         bdrv_dev_change_media_cb(bs, true);
1189     }
1190 
1191     return 0;
1192 
1193 unlink_and_fail:
1194     if (file != NULL) {
1195         bdrv_unref(file);
1196     }
1197     if (bs->is_temporary) {
1198         unlink(filename);
1199     }
1200 fail:
1201     QDECREF(bs->options);
1202     QDECREF(options);
1203     bs->options = NULL;
1204     if (error_is_set(&local_err)) {
1205         error_propagate(errp, local_err);
1206     }
1207     return ret;
1208 
1209 close_and_fail:
1210     bdrv_close(bs);
1211     QDECREF(options);
1212     if (error_is_set(&local_err)) {
1213         error_propagate(errp, local_err);
1214     }
1215     return ret;
1216 }
1217 
1218 typedef struct BlockReopenQueueEntry {
1219      bool prepared;
1220      BDRVReopenState state;
1221      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1222 } BlockReopenQueueEntry;
1223 
1224 /*
1225  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1226  * reopen of multiple devices.
1227  *
1228  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1229  * already performed, or alternatively may be NULL a new BlockReopenQueue will
1230  * be created and initialized. This newly created BlockReopenQueue should be
1231  * passed back in for subsequent calls that are intended to be of the same
1232  * atomic 'set'.
1233  *
1234  * bs is the BlockDriverState to add to the reopen queue.
1235  *
1236  * flags contains the open flags for the associated bs
1237  *
1238  * returns a pointer to bs_queue, which is either the newly allocated
1239  * bs_queue, or the existing bs_queue being used.
1240  *
1241  */
1242 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1243                                     BlockDriverState *bs, int flags)
1244 {
1245     assert(bs != NULL);
1246 
1247     BlockReopenQueueEntry *bs_entry;
1248     if (bs_queue == NULL) {
1249         bs_queue = g_new0(BlockReopenQueue, 1);
1250         QSIMPLEQ_INIT(bs_queue);
1251     }
1252 
1253     if (bs->file) {
1254         bdrv_reopen_queue(bs_queue, bs->file, flags);
1255     }
1256 
1257     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1258     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1259 
1260     bs_entry->state.bs = bs;
1261     bs_entry->state.flags = flags;
1262 
1263     return bs_queue;
1264 }
1265 
1266 /*
1267  * Reopen multiple BlockDriverStates atomically & transactionally.
1268  *
1269  * The queue passed in (bs_queue) must have been built up previous
1270  * via bdrv_reopen_queue().
1271  *
1272  * Reopens all BDS specified in the queue, with the appropriate
1273  * flags.  All devices are prepared for reopen, and failure of any
1274  * device will cause all device changes to be abandonded, and intermediate
1275  * data cleaned up.
1276  *
1277  * If all devices prepare successfully, then the changes are committed
1278  * to all devices.
1279  *
1280  */
1281 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1282 {
1283     int ret = -1;
1284     BlockReopenQueueEntry *bs_entry, *next;
1285     Error *local_err = NULL;
1286 
1287     assert(bs_queue != NULL);
1288 
1289     bdrv_drain_all();
1290 
1291     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1292         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1293             error_propagate(errp, local_err);
1294             goto cleanup;
1295         }
1296         bs_entry->prepared = true;
1297     }
1298 
1299     /* If we reach this point, we have success and just need to apply the
1300      * changes
1301      */
1302     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1303         bdrv_reopen_commit(&bs_entry->state);
1304     }
1305 
1306     ret = 0;
1307 
1308 cleanup:
1309     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1310         if (ret && bs_entry->prepared) {
1311             bdrv_reopen_abort(&bs_entry->state);
1312         }
1313         g_free(bs_entry);
1314     }
1315     g_free(bs_queue);
1316     return ret;
1317 }
1318 
1319 
1320 /* Reopen a single BlockDriverState with the specified flags. */
1321 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1322 {
1323     int ret = -1;
1324     Error *local_err = NULL;
1325     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1326 
1327     ret = bdrv_reopen_multiple(queue, &local_err);
1328     if (local_err != NULL) {
1329         error_propagate(errp, local_err);
1330     }
1331     return ret;
1332 }
1333 
1334 
1335 /*
1336  * Prepares a BlockDriverState for reopen. All changes are staged in the
1337  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1338  * the block driver layer .bdrv_reopen_prepare()
1339  *
1340  * bs is the BlockDriverState to reopen
1341  * flags are the new open flags
1342  * queue is the reopen queue
1343  *
1344  * Returns 0 on success, non-zero on error.  On error errp will be set
1345  * as well.
1346  *
1347  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1348  * It is the responsibility of the caller to then call the abort() or
1349  * commit() for any other BDS that have been left in a prepare() state
1350  *
1351  */
1352 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1353                         Error **errp)
1354 {
1355     int ret = -1;
1356     Error *local_err = NULL;
1357     BlockDriver *drv;
1358 
1359     assert(reopen_state != NULL);
1360     assert(reopen_state->bs->drv != NULL);
1361     drv = reopen_state->bs->drv;
1362 
1363     /* if we are to stay read-only, do not allow permission change
1364      * to r/w */
1365     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1366         reopen_state->flags & BDRV_O_RDWR) {
1367         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1368                   reopen_state->bs->device_name);
1369         goto error;
1370     }
1371 
1372 
1373     ret = bdrv_flush(reopen_state->bs);
1374     if (ret) {
1375         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1376                   strerror(-ret));
1377         goto error;
1378     }
1379 
1380     if (drv->bdrv_reopen_prepare) {
1381         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1382         if (ret) {
1383             if (local_err != NULL) {
1384                 error_propagate(errp, local_err);
1385             } else {
1386                 error_setg(errp, "failed while preparing to reopen image '%s'",
1387                            reopen_state->bs->filename);
1388             }
1389             goto error;
1390         }
1391     } else {
1392         /* It is currently mandatory to have a bdrv_reopen_prepare()
1393          * handler for each supported drv. */
1394         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1395                   drv->format_name, reopen_state->bs->device_name,
1396                  "reopening of file");
1397         ret = -1;
1398         goto error;
1399     }
1400 
1401     ret = 0;
1402 
1403 error:
1404     return ret;
1405 }
1406 
1407 /*
1408  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1409  * makes them final by swapping the staging BlockDriverState contents into
1410  * the active BlockDriverState contents.
1411  */
1412 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1413 {
1414     BlockDriver *drv;
1415 
1416     assert(reopen_state != NULL);
1417     drv = reopen_state->bs->drv;
1418     assert(drv != NULL);
1419 
1420     /* If there are any driver level actions to take */
1421     if (drv->bdrv_reopen_commit) {
1422         drv->bdrv_reopen_commit(reopen_state);
1423     }
1424 
1425     /* set BDS specific flags now */
1426     reopen_state->bs->open_flags         = reopen_state->flags;
1427     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1428                                               BDRV_O_CACHE_WB);
1429     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1430 }
1431 
1432 /*
1433  * Abort the reopen, and delete and free the staged changes in
1434  * reopen_state
1435  */
1436 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1437 {
1438     BlockDriver *drv;
1439 
1440     assert(reopen_state != NULL);
1441     drv = reopen_state->bs->drv;
1442     assert(drv != NULL);
1443 
1444     if (drv->bdrv_reopen_abort) {
1445         drv->bdrv_reopen_abort(reopen_state);
1446     }
1447 }
1448 
1449 
1450 void bdrv_close(BlockDriverState *bs)
1451 {
1452     if (bs->job) {
1453         block_job_cancel_sync(bs->job);
1454     }
1455     bdrv_drain_all(); /* complete I/O */
1456     bdrv_flush(bs);
1457     bdrv_drain_all(); /* in case flush left pending I/O */
1458     notifier_list_notify(&bs->close_notifiers, bs);
1459 
1460     if (bs->drv) {
1461         if (bs->backing_hd) {
1462             bdrv_unref(bs->backing_hd);
1463             bs->backing_hd = NULL;
1464         }
1465         bs->drv->bdrv_close(bs);
1466         g_free(bs->opaque);
1467 #ifdef _WIN32
1468         if (bs->is_temporary) {
1469             unlink(bs->filename);
1470         }
1471 #endif
1472         bs->opaque = NULL;
1473         bs->drv = NULL;
1474         bs->copy_on_read = 0;
1475         bs->backing_file[0] = '\0';
1476         bs->backing_format[0] = '\0';
1477         bs->total_sectors = 0;
1478         bs->encrypted = 0;
1479         bs->valid_key = 0;
1480         bs->sg = 0;
1481         bs->growable = 0;
1482         bs->zero_beyond_eof = false;
1483         QDECREF(bs->options);
1484         bs->options = NULL;
1485 
1486         if (bs->file != NULL) {
1487             bdrv_unref(bs->file);
1488             bs->file = NULL;
1489         }
1490     }
1491 
1492     bdrv_dev_change_media_cb(bs, false);
1493 
1494     /*throttling disk I/O limits*/
1495     if (bs->io_limits_enabled) {
1496         bdrv_io_limits_disable(bs);
1497     }
1498 }
1499 
1500 void bdrv_close_all(void)
1501 {
1502     BlockDriverState *bs;
1503 
1504     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1505         bdrv_close(bs);
1506     }
1507 }
1508 
1509 /* Check if any requests are in-flight (including throttled requests) */
1510 static bool bdrv_requests_pending(BlockDriverState *bs)
1511 {
1512     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1513         return true;
1514     }
1515     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1516         return true;
1517     }
1518     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1519         return true;
1520     }
1521     if (bs->file && bdrv_requests_pending(bs->file)) {
1522         return true;
1523     }
1524     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1525         return true;
1526     }
1527     return false;
1528 }
1529 
1530 static bool bdrv_requests_pending_all(void)
1531 {
1532     BlockDriverState *bs;
1533     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1534         if (bdrv_requests_pending(bs)) {
1535             return true;
1536         }
1537     }
1538     return false;
1539 }
1540 
1541 /*
1542  * Wait for pending requests to complete across all BlockDriverStates
1543  *
1544  * This function does not flush data to disk, use bdrv_flush_all() for that
1545  * after calling this function.
1546  *
1547  * Note that completion of an asynchronous I/O operation can trigger any
1548  * number of other I/O operations on other devices---for example a coroutine
1549  * can be arbitrarily complex and a constant flow of I/O can come until the
1550  * coroutine is complete.  Because of this, it is not possible to have a
1551  * function to drain a single device's I/O queue.
1552  */
1553 void bdrv_drain_all(void)
1554 {
1555     /* Always run first iteration so any pending completion BHs run */
1556     bool busy = true;
1557     BlockDriverState *bs;
1558 
1559     while (busy) {
1560         QTAILQ_FOREACH(bs, &bdrv_states, list) {
1561             bdrv_start_throttled_reqs(bs);
1562         }
1563 
1564         busy = bdrv_requests_pending_all();
1565         busy |= aio_poll(qemu_get_aio_context(), busy);
1566     }
1567 }
1568 
1569 /* make a BlockDriverState anonymous by removing from bdrv_state list.
1570    Also, NULL terminate the device_name to prevent double remove */
1571 void bdrv_make_anon(BlockDriverState *bs)
1572 {
1573     if (bs->device_name[0] != '\0') {
1574         QTAILQ_REMOVE(&bdrv_states, bs, list);
1575     }
1576     bs->device_name[0] = '\0';
1577 }
1578 
1579 static void bdrv_rebind(BlockDriverState *bs)
1580 {
1581     if (bs->drv && bs->drv->bdrv_rebind) {
1582         bs->drv->bdrv_rebind(bs);
1583     }
1584 }
1585 
1586 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1587                                      BlockDriverState *bs_src)
1588 {
1589     /* move some fields that need to stay attached to the device */
1590     bs_dest->open_flags         = bs_src->open_flags;
1591 
1592     /* dev info */
1593     bs_dest->dev_ops            = bs_src->dev_ops;
1594     bs_dest->dev_opaque         = bs_src->dev_opaque;
1595     bs_dest->dev                = bs_src->dev;
1596     bs_dest->buffer_alignment   = bs_src->buffer_alignment;
1597     bs_dest->copy_on_read       = bs_src->copy_on_read;
1598 
1599     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1600 
1601     /* i/o throttled req */
1602     memcpy(&bs_dest->throttle_state,
1603            &bs_src->throttle_state,
1604            sizeof(ThrottleState));
1605     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1606     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1607     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1608 
1609     /* r/w error */
1610     bs_dest->on_read_error      = bs_src->on_read_error;
1611     bs_dest->on_write_error     = bs_src->on_write_error;
1612 
1613     /* i/o status */
1614     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1615     bs_dest->iostatus           = bs_src->iostatus;
1616 
1617     /* dirty bitmap */
1618     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1619 
1620     /* reference count */
1621     bs_dest->refcnt             = bs_src->refcnt;
1622 
1623     /* job */
1624     bs_dest->in_use             = bs_src->in_use;
1625     bs_dest->job                = bs_src->job;
1626 
1627     /* keep the same entry in bdrv_states */
1628     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1629             bs_src->device_name);
1630     bs_dest->list = bs_src->list;
1631 }
1632 
1633 /*
1634  * Swap bs contents for two image chains while they are live,
1635  * while keeping required fields on the BlockDriverState that is
1636  * actually attached to a device.
1637  *
1638  * This will modify the BlockDriverState fields, and swap contents
1639  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1640  *
1641  * bs_new is required to be anonymous.
1642  *
1643  * This function does not create any image files.
1644  */
1645 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1646 {
1647     BlockDriverState tmp;
1648 
1649     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1650     assert(bs_new->device_name[0] == '\0');
1651     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1652     assert(bs_new->job == NULL);
1653     assert(bs_new->dev == NULL);
1654     assert(bs_new->in_use == 0);
1655     assert(bs_new->io_limits_enabled == false);
1656     assert(!throttle_have_timer(&bs_new->throttle_state));
1657 
1658     tmp = *bs_new;
1659     *bs_new = *bs_old;
1660     *bs_old = tmp;
1661 
1662     /* there are some fields that should not be swapped, move them back */
1663     bdrv_move_feature_fields(&tmp, bs_old);
1664     bdrv_move_feature_fields(bs_old, bs_new);
1665     bdrv_move_feature_fields(bs_new, &tmp);
1666 
1667     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1668     assert(bs_new->device_name[0] == '\0');
1669 
1670     /* Check a few fields that should remain attached to the device */
1671     assert(bs_new->dev == NULL);
1672     assert(bs_new->job == NULL);
1673     assert(bs_new->in_use == 0);
1674     assert(bs_new->io_limits_enabled == false);
1675     assert(!throttle_have_timer(&bs_new->throttle_state));
1676 
1677     bdrv_rebind(bs_new);
1678     bdrv_rebind(bs_old);
1679 }
1680 
1681 /*
1682  * Add new bs contents at the top of an image chain while the chain is
1683  * live, while keeping required fields on the top layer.
1684  *
1685  * This will modify the BlockDriverState fields, and swap contents
1686  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1687  *
1688  * bs_new is required to be anonymous.
1689  *
1690  * This function does not create any image files.
1691  */
1692 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1693 {
1694     bdrv_swap(bs_new, bs_top);
1695 
1696     /* The contents of 'tmp' will become bs_top, as we are
1697      * swapping bs_new and bs_top contents. */
1698     bs_top->backing_hd = bs_new;
1699     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1700     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1701             bs_new->filename);
1702     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1703             bs_new->drv ? bs_new->drv->format_name : "");
1704 }
1705 
1706 static void bdrv_delete(BlockDriverState *bs)
1707 {
1708     assert(!bs->dev);
1709     assert(!bs->job);
1710     assert(!bs->in_use);
1711     assert(!bs->refcnt);
1712     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1713 
1714     bdrv_close(bs);
1715 
1716     /* remove from list, if necessary */
1717     bdrv_make_anon(bs);
1718 
1719     g_free(bs);
1720 }
1721 
1722 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1723 /* TODO change to DeviceState *dev when all users are qdevified */
1724 {
1725     if (bs->dev) {
1726         return -EBUSY;
1727     }
1728     bs->dev = dev;
1729     bdrv_iostatus_reset(bs);
1730     return 0;
1731 }
1732 
1733 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1734 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1735 {
1736     if (bdrv_attach_dev(bs, dev) < 0) {
1737         abort();
1738     }
1739 }
1740 
1741 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1742 /* TODO change to DeviceState *dev when all users are qdevified */
1743 {
1744     assert(bs->dev == dev);
1745     bs->dev = NULL;
1746     bs->dev_ops = NULL;
1747     bs->dev_opaque = NULL;
1748     bs->buffer_alignment = 512;
1749 }
1750 
1751 /* TODO change to return DeviceState * when all users are qdevified */
1752 void *bdrv_get_attached_dev(BlockDriverState *bs)
1753 {
1754     return bs->dev;
1755 }
1756 
1757 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1758                       void *opaque)
1759 {
1760     bs->dev_ops = ops;
1761     bs->dev_opaque = opaque;
1762 }
1763 
1764 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1765                                enum MonitorEvent ev,
1766                                BlockErrorAction action, bool is_read)
1767 {
1768     QObject *data;
1769     const char *action_str;
1770 
1771     switch (action) {
1772     case BDRV_ACTION_REPORT:
1773         action_str = "report";
1774         break;
1775     case BDRV_ACTION_IGNORE:
1776         action_str = "ignore";
1777         break;
1778     case BDRV_ACTION_STOP:
1779         action_str = "stop";
1780         break;
1781     default:
1782         abort();
1783     }
1784 
1785     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1786                               bdrv->device_name,
1787                               action_str,
1788                               is_read ? "read" : "write");
1789     monitor_protocol_event(ev, data);
1790 
1791     qobject_decref(data);
1792 }
1793 
1794 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1795 {
1796     QObject *data;
1797 
1798     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1799                               bdrv_get_device_name(bs), ejected);
1800     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1801 
1802     qobject_decref(data);
1803 }
1804 
1805 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1806 {
1807     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1808         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1809         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1810         if (tray_was_closed) {
1811             /* tray open */
1812             bdrv_emit_qmp_eject_event(bs, true);
1813         }
1814         if (load) {
1815             /* tray close */
1816             bdrv_emit_qmp_eject_event(bs, false);
1817         }
1818     }
1819 }
1820 
1821 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1822 {
1823     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1824 }
1825 
1826 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1827 {
1828     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1829         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1830     }
1831 }
1832 
1833 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1834 {
1835     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1836         return bs->dev_ops->is_tray_open(bs->dev_opaque);
1837     }
1838     return false;
1839 }
1840 
1841 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1842 {
1843     if (bs->dev_ops && bs->dev_ops->resize_cb) {
1844         bs->dev_ops->resize_cb(bs->dev_opaque);
1845     }
1846 }
1847 
1848 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1849 {
1850     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1851         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1852     }
1853     return false;
1854 }
1855 
1856 /*
1857  * Run consistency checks on an image
1858  *
1859  * Returns 0 if the check could be completed (it doesn't mean that the image is
1860  * free of errors) or -errno when an internal error occurred. The results of the
1861  * check are stored in res.
1862  */
1863 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1864 {
1865     if (bs->drv->bdrv_check == NULL) {
1866         return -ENOTSUP;
1867     }
1868 
1869     memset(res, 0, sizeof(*res));
1870     return bs->drv->bdrv_check(bs, res, fix);
1871 }
1872 
1873 #define COMMIT_BUF_SECTORS 2048
1874 
1875 /* commit COW file into the raw image */
1876 int bdrv_commit(BlockDriverState *bs)
1877 {
1878     BlockDriver *drv = bs->drv;
1879     int64_t sector, total_sectors;
1880     int n, ro, open_flags;
1881     int ret = 0;
1882     uint8_t *buf;
1883     char filename[PATH_MAX];
1884 
1885     if (!drv)
1886         return -ENOMEDIUM;
1887 
1888     if (!bs->backing_hd) {
1889         return -ENOTSUP;
1890     }
1891 
1892     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1893         return -EBUSY;
1894     }
1895 
1896     ro = bs->backing_hd->read_only;
1897     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1898     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1899     open_flags =  bs->backing_hd->open_flags;
1900 
1901     if (ro) {
1902         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1903             return -EACCES;
1904         }
1905     }
1906 
1907     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1908     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1909 
1910     for (sector = 0; sector < total_sectors; sector += n) {
1911         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
1912         if (ret < 0) {
1913             goto ro_cleanup;
1914         }
1915         if (ret) {
1916             if (bdrv_read(bs, sector, buf, n) != 0) {
1917                 ret = -EIO;
1918                 goto ro_cleanup;
1919             }
1920 
1921             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1922                 ret = -EIO;
1923                 goto ro_cleanup;
1924             }
1925         }
1926     }
1927 
1928     if (drv->bdrv_make_empty) {
1929         ret = drv->bdrv_make_empty(bs);
1930         bdrv_flush(bs);
1931     }
1932 
1933     /*
1934      * Make sure all data we wrote to the backing device is actually
1935      * stable on disk.
1936      */
1937     if (bs->backing_hd)
1938         bdrv_flush(bs->backing_hd);
1939 
1940 ro_cleanup:
1941     g_free(buf);
1942 
1943     if (ro) {
1944         /* ignoring error return here */
1945         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1946     }
1947 
1948     return ret;
1949 }
1950 
1951 int bdrv_commit_all(void)
1952 {
1953     BlockDriverState *bs;
1954 
1955     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1956         if (bs->drv && bs->backing_hd) {
1957             int ret = bdrv_commit(bs);
1958             if (ret < 0) {
1959                 return ret;
1960             }
1961         }
1962     }
1963     return 0;
1964 }
1965 
1966 /**
1967  * Remove an active request from the tracked requests list
1968  *
1969  * This function should be called when a tracked request is completing.
1970  */
1971 static void tracked_request_end(BdrvTrackedRequest *req)
1972 {
1973     QLIST_REMOVE(req, list);
1974     qemu_co_queue_restart_all(&req->wait_queue);
1975 }
1976 
1977 /**
1978  * Add an active request to the tracked requests list
1979  */
1980 static void tracked_request_begin(BdrvTrackedRequest *req,
1981                                   BlockDriverState *bs,
1982                                   int64_t sector_num,
1983                                   int nb_sectors, bool is_write)
1984 {
1985     *req = (BdrvTrackedRequest){
1986         .bs = bs,
1987         .sector_num = sector_num,
1988         .nb_sectors = nb_sectors,
1989         .is_write = is_write,
1990         .co = qemu_coroutine_self(),
1991     };
1992 
1993     qemu_co_queue_init(&req->wait_queue);
1994 
1995     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1996 }
1997 
1998 /**
1999  * Round a region to cluster boundaries
2000  */
2001 void bdrv_round_to_clusters(BlockDriverState *bs,
2002                             int64_t sector_num, int nb_sectors,
2003                             int64_t *cluster_sector_num,
2004                             int *cluster_nb_sectors)
2005 {
2006     BlockDriverInfo bdi;
2007 
2008     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2009         *cluster_sector_num = sector_num;
2010         *cluster_nb_sectors = nb_sectors;
2011     } else {
2012         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2013         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2014         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2015                                             nb_sectors, c);
2016     }
2017 }
2018 
2019 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2020                                      int64_t sector_num, int nb_sectors) {
2021     /*        aaaa   bbbb */
2022     if (sector_num >= req->sector_num + req->nb_sectors) {
2023         return false;
2024     }
2025     /* bbbb   aaaa        */
2026     if (req->sector_num >= sector_num + nb_sectors) {
2027         return false;
2028     }
2029     return true;
2030 }
2031 
2032 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
2033         int64_t sector_num, int nb_sectors)
2034 {
2035     BdrvTrackedRequest *req;
2036     int64_t cluster_sector_num;
2037     int cluster_nb_sectors;
2038     bool retry;
2039 
2040     /* If we touch the same cluster it counts as an overlap.  This guarantees
2041      * that allocating writes will be serialized and not race with each other
2042      * for the same cluster.  For example, in copy-on-read it ensures that the
2043      * CoR read and write operations are atomic and guest writes cannot
2044      * interleave between them.
2045      */
2046     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2047                            &cluster_sector_num, &cluster_nb_sectors);
2048 
2049     do {
2050         retry = false;
2051         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2052             if (tracked_request_overlaps(req, cluster_sector_num,
2053                                          cluster_nb_sectors)) {
2054                 /* Hitting this means there was a reentrant request, for
2055                  * example, a block driver issuing nested requests.  This must
2056                  * never happen since it means deadlock.
2057                  */
2058                 assert(qemu_coroutine_self() != req->co);
2059 
2060                 qemu_co_queue_wait(&req->wait_queue);
2061                 retry = true;
2062                 break;
2063             }
2064         }
2065     } while (retry);
2066 }
2067 
2068 /*
2069  * Return values:
2070  * 0        - success
2071  * -EINVAL  - backing format specified, but no file
2072  * -ENOSPC  - can't update the backing file because no space is left in the
2073  *            image file header
2074  * -ENOTSUP - format driver doesn't support changing the backing file
2075  */
2076 int bdrv_change_backing_file(BlockDriverState *bs,
2077     const char *backing_file, const char *backing_fmt)
2078 {
2079     BlockDriver *drv = bs->drv;
2080     int ret;
2081 
2082     /* Backing file format doesn't make sense without a backing file */
2083     if (backing_fmt && !backing_file) {
2084         return -EINVAL;
2085     }
2086 
2087     if (drv->bdrv_change_backing_file != NULL) {
2088         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2089     } else {
2090         ret = -ENOTSUP;
2091     }
2092 
2093     if (ret == 0) {
2094         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2095         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2096     }
2097     return ret;
2098 }
2099 
2100 /*
2101  * Finds the image layer in the chain that has 'bs' as its backing file.
2102  *
2103  * active is the current topmost image.
2104  *
2105  * Returns NULL if bs is not found in active's image chain,
2106  * or if active == bs.
2107  */
2108 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2109                                     BlockDriverState *bs)
2110 {
2111     BlockDriverState *overlay = NULL;
2112     BlockDriverState *intermediate;
2113 
2114     assert(active != NULL);
2115     assert(bs != NULL);
2116 
2117     /* if bs is the same as active, then by definition it has no overlay
2118      */
2119     if (active == bs) {
2120         return NULL;
2121     }
2122 
2123     intermediate = active;
2124     while (intermediate->backing_hd) {
2125         if (intermediate->backing_hd == bs) {
2126             overlay = intermediate;
2127             break;
2128         }
2129         intermediate = intermediate->backing_hd;
2130     }
2131 
2132     return overlay;
2133 }
2134 
2135 typedef struct BlkIntermediateStates {
2136     BlockDriverState *bs;
2137     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2138 } BlkIntermediateStates;
2139 
2140 
2141 /*
2142  * Drops images above 'base' up to and including 'top', and sets the image
2143  * above 'top' to have base as its backing file.
2144  *
2145  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2146  * information in 'bs' can be properly updated.
2147  *
2148  * E.g., this will convert the following chain:
2149  * bottom <- base <- intermediate <- top <- active
2150  *
2151  * to
2152  *
2153  * bottom <- base <- active
2154  *
2155  * It is allowed for bottom==base, in which case it converts:
2156  *
2157  * base <- intermediate <- top <- active
2158  *
2159  * to
2160  *
2161  * base <- active
2162  *
2163  * Error conditions:
2164  *  if active == top, that is considered an error
2165  *
2166  */
2167 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2168                            BlockDriverState *base)
2169 {
2170     BlockDriverState *intermediate;
2171     BlockDriverState *base_bs = NULL;
2172     BlockDriverState *new_top_bs = NULL;
2173     BlkIntermediateStates *intermediate_state, *next;
2174     int ret = -EIO;
2175 
2176     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2177     QSIMPLEQ_INIT(&states_to_delete);
2178 
2179     if (!top->drv || !base->drv) {
2180         goto exit;
2181     }
2182 
2183     new_top_bs = bdrv_find_overlay(active, top);
2184 
2185     if (new_top_bs == NULL) {
2186         /* we could not find the image above 'top', this is an error */
2187         goto exit;
2188     }
2189 
2190     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2191      * to do, no intermediate images */
2192     if (new_top_bs->backing_hd == base) {
2193         ret = 0;
2194         goto exit;
2195     }
2196 
2197     intermediate = top;
2198 
2199     /* now we will go down through the list, and add each BDS we find
2200      * into our deletion queue, until we hit the 'base'
2201      */
2202     while (intermediate) {
2203         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2204         intermediate_state->bs = intermediate;
2205         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2206 
2207         if (intermediate->backing_hd == base) {
2208             base_bs = intermediate->backing_hd;
2209             break;
2210         }
2211         intermediate = intermediate->backing_hd;
2212     }
2213     if (base_bs == NULL) {
2214         /* something went wrong, we did not end at the base. safely
2215          * unravel everything, and exit with error */
2216         goto exit;
2217     }
2218 
2219     /* success - we can delete the intermediate states, and link top->base */
2220     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2221                                    base_bs->drv ? base_bs->drv->format_name : "");
2222     if (ret) {
2223         goto exit;
2224     }
2225     new_top_bs->backing_hd = base_bs;
2226 
2227 
2228     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2229         /* so that bdrv_close() does not recursively close the chain */
2230         intermediate_state->bs->backing_hd = NULL;
2231         bdrv_unref(intermediate_state->bs);
2232     }
2233     ret = 0;
2234 
2235 exit:
2236     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2237         g_free(intermediate_state);
2238     }
2239     return ret;
2240 }
2241 
2242 
2243 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2244                                    size_t size)
2245 {
2246     int64_t len;
2247 
2248     if (!bdrv_is_inserted(bs))
2249         return -ENOMEDIUM;
2250 
2251     if (bs->growable)
2252         return 0;
2253 
2254     len = bdrv_getlength(bs);
2255 
2256     if (offset < 0)
2257         return -EIO;
2258 
2259     if ((offset > len) || (len - offset < size))
2260         return -EIO;
2261 
2262     return 0;
2263 }
2264 
2265 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2266                               int nb_sectors)
2267 {
2268     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2269                                    nb_sectors * BDRV_SECTOR_SIZE);
2270 }
2271 
2272 typedef struct RwCo {
2273     BlockDriverState *bs;
2274     int64_t sector_num;
2275     int nb_sectors;
2276     QEMUIOVector *qiov;
2277     bool is_write;
2278     int ret;
2279     BdrvRequestFlags flags;
2280 } RwCo;
2281 
2282 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2283 {
2284     RwCo *rwco = opaque;
2285 
2286     if (!rwco->is_write) {
2287         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
2288                                      rwco->nb_sectors, rwco->qiov,
2289                                      rwco->flags);
2290     } else {
2291         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
2292                                       rwco->nb_sectors, rwco->qiov,
2293                                       rwco->flags);
2294     }
2295 }
2296 
2297 /*
2298  * Process a vectored synchronous request using coroutines
2299  */
2300 static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
2301                        QEMUIOVector *qiov, bool is_write,
2302                        BdrvRequestFlags flags)
2303 {
2304     Coroutine *co;
2305     RwCo rwco = {
2306         .bs = bs,
2307         .sector_num = sector_num,
2308         .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
2309         .qiov = qiov,
2310         .is_write = is_write,
2311         .ret = NOT_DONE,
2312         .flags = flags,
2313     };
2314     assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
2315 
2316     /**
2317      * In sync call context, when the vcpu is blocked, this throttling timer
2318      * will not fire; so the I/O throttling function has to be disabled here
2319      * if it has been enabled.
2320      */
2321     if (bs->io_limits_enabled) {
2322         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2323                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2324         bdrv_io_limits_disable(bs);
2325     }
2326 
2327     if (qemu_in_coroutine()) {
2328         /* Fast-path if already in coroutine context */
2329         bdrv_rw_co_entry(&rwco);
2330     } else {
2331         co = qemu_coroutine_create(bdrv_rw_co_entry);
2332         qemu_coroutine_enter(co, &rwco);
2333         while (rwco.ret == NOT_DONE) {
2334             qemu_aio_wait();
2335         }
2336     }
2337     return rwco.ret;
2338 }
2339 
2340 /*
2341  * Process a synchronous request using coroutines
2342  */
2343 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2344                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2345 {
2346     QEMUIOVector qiov;
2347     struct iovec iov = {
2348         .iov_base = (void *)buf,
2349         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2350     };
2351 
2352     qemu_iovec_init_external(&qiov, &iov, 1);
2353     return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
2354 }
2355 
2356 /* return < 0 if error. See bdrv_write() for the return codes */
2357 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2358               uint8_t *buf, int nb_sectors)
2359 {
2360     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2361 }
2362 
2363 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2364 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2365                           uint8_t *buf, int nb_sectors)
2366 {
2367     bool enabled;
2368     int ret;
2369 
2370     enabled = bs->io_limits_enabled;
2371     bs->io_limits_enabled = false;
2372     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2373     bs->io_limits_enabled = enabled;
2374     return ret;
2375 }
2376 
2377 /* Return < 0 if error. Important errors are:
2378   -EIO         generic I/O error (may happen for all errors)
2379   -ENOMEDIUM   No media inserted.
2380   -EINVAL      Invalid sector number or nb_sectors
2381   -EACCES      Trying to write a read-only device
2382 */
2383 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2384                const uint8_t *buf, int nb_sectors)
2385 {
2386     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2387 }
2388 
2389 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
2390 {
2391     return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
2392 }
2393 
2394 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2395                       int nb_sectors, BdrvRequestFlags flags)
2396 {
2397     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2398                       BDRV_REQ_ZERO_WRITE | flags);
2399 }
2400 
2401 /*
2402  * Completely zero out a block device with the help of bdrv_write_zeroes.
2403  * The operation is sped up by checking the block status and only writing
2404  * zeroes to the device if they currently do not return zeroes. Optional
2405  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2406  *
2407  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2408  */
2409 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2410 {
2411     int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2412     int64_t ret, nb_sectors, sector_num = 0;
2413     int n;
2414 
2415     for (;;) {
2416         nb_sectors = target_size - sector_num;
2417         if (nb_sectors <= 0) {
2418             return 0;
2419         }
2420         if (nb_sectors > INT_MAX) {
2421             nb_sectors = INT_MAX;
2422         }
2423         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2424         if (ret < 0) {
2425             error_report("error getting block status at sector %" PRId64 ": %s",
2426                          sector_num, strerror(-ret));
2427             return ret;
2428         }
2429         if (ret & BDRV_BLOCK_ZERO) {
2430             sector_num += n;
2431             continue;
2432         }
2433         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2434         if (ret < 0) {
2435             error_report("error writing zeroes at sector %" PRId64 ": %s",
2436                          sector_num, strerror(-ret));
2437             return ret;
2438         }
2439         sector_num += n;
2440     }
2441 }
2442 
2443 int bdrv_pread(BlockDriverState *bs, int64_t offset,
2444                void *buf, int count1)
2445 {
2446     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2447     int len, nb_sectors, count;
2448     int64_t sector_num;
2449     int ret;
2450 
2451     count = count1;
2452     /* first read to align to sector start */
2453     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2454     if (len > count)
2455         len = count;
2456     sector_num = offset >> BDRV_SECTOR_BITS;
2457     if (len > 0) {
2458         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2459             return ret;
2460         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
2461         count -= len;
2462         if (count == 0)
2463             return count1;
2464         sector_num++;
2465         buf += len;
2466     }
2467 
2468     /* read the sectors "in place" */
2469     nb_sectors = count >> BDRV_SECTOR_BITS;
2470     if (nb_sectors > 0) {
2471         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
2472             return ret;
2473         sector_num += nb_sectors;
2474         len = nb_sectors << BDRV_SECTOR_BITS;
2475         buf += len;
2476         count -= len;
2477     }
2478 
2479     /* add data from the last sector */
2480     if (count > 0) {
2481         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2482             return ret;
2483         memcpy(buf, tmp_buf, count);
2484     }
2485     return count1;
2486 }
2487 
2488 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2489 {
2490     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2491     int len, nb_sectors, count;
2492     int64_t sector_num;
2493     int ret;
2494 
2495     count = qiov->size;
2496 
2497     /* first write to align to sector start */
2498     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2499     if (len > count)
2500         len = count;
2501     sector_num = offset >> BDRV_SECTOR_BITS;
2502     if (len > 0) {
2503         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2504             return ret;
2505         qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
2506                           len);
2507         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2508             return ret;
2509         count -= len;
2510         if (count == 0)
2511             return qiov->size;
2512         sector_num++;
2513     }
2514 
2515     /* write the sectors "in place" */
2516     nb_sectors = count >> BDRV_SECTOR_BITS;
2517     if (nb_sectors > 0) {
2518         QEMUIOVector qiov_inplace;
2519 
2520         qemu_iovec_init(&qiov_inplace, qiov->niov);
2521         qemu_iovec_concat(&qiov_inplace, qiov, len,
2522                           nb_sectors << BDRV_SECTOR_BITS);
2523         ret = bdrv_writev(bs, sector_num, &qiov_inplace);
2524         qemu_iovec_destroy(&qiov_inplace);
2525         if (ret < 0) {
2526             return ret;
2527         }
2528 
2529         sector_num += nb_sectors;
2530         len = nb_sectors << BDRV_SECTOR_BITS;
2531         count -= len;
2532     }
2533 
2534     /* add data from the last sector */
2535     if (count > 0) {
2536         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2537             return ret;
2538         qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
2539         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2540             return ret;
2541     }
2542     return qiov->size;
2543 }
2544 
2545 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2546                 const void *buf, int count1)
2547 {
2548     QEMUIOVector qiov;
2549     struct iovec iov = {
2550         .iov_base   = (void *) buf,
2551         .iov_len    = count1,
2552     };
2553 
2554     qemu_iovec_init_external(&qiov, &iov, 1);
2555     return bdrv_pwritev(bs, offset, &qiov);
2556 }
2557 
2558 /*
2559  * Writes to the file and ensures that no writes are reordered across this
2560  * request (acts as a barrier)
2561  *
2562  * Returns 0 on success, -errno in error cases.
2563  */
2564 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2565     const void *buf, int count)
2566 {
2567     int ret;
2568 
2569     ret = bdrv_pwrite(bs, offset, buf, count);
2570     if (ret < 0) {
2571         return ret;
2572     }
2573 
2574     /* No flush needed for cache modes that already do it */
2575     if (bs->enable_write_cache) {
2576         bdrv_flush(bs);
2577     }
2578 
2579     return 0;
2580 }
2581 
2582 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2583         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2584 {
2585     /* Perform I/O through a temporary buffer so that users who scribble over
2586      * their read buffer while the operation is in progress do not end up
2587      * modifying the image file.  This is critical for zero-copy guest I/O
2588      * where anything might happen inside guest memory.
2589      */
2590     void *bounce_buffer;
2591 
2592     BlockDriver *drv = bs->drv;
2593     struct iovec iov;
2594     QEMUIOVector bounce_qiov;
2595     int64_t cluster_sector_num;
2596     int cluster_nb_sectors;
2597     size_t skip_bytes;
2598     int ret;
2599 
2600     /* Cover entire cluster so no additional backing file I/O is required when
2601      * allocating cluster in the image file.
2602      */
2603     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2604                            &cluster_sector_num, &cluster_nb_sectors);
2605 
2606     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2607                                    cluster_sector_num, cluster_nb_sectors);
2608 
2609     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2610     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2611     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2612 
2613     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2614                              &bounce_qiov);
2615     if (ret < 0) {
2616         goto err;
2617     }
2618 
2619     if (drv->bdrv_co_write_zeroes &&
2620         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2621         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2622                                       cluster_nb_sectors, 0);
2623     } else {
2624         /* This does not change the data on the disk, it is not necessary
2625          * to flush even in cache=writethrough mode.
2626          */
2627         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2628                                   &bounce_qiov);
2629     }
2630 
2631     if (ret < 0) {
2632         /* It might be okay to ignore write errors for guest requests.  If this
2633          * is a deliberate copy-on-read then we don't want to ignore the error.
2634          * Simply report it in all cases.
2635          */
2636         goto err;
2637     }
2638 
2639     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2640     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2641                         nb_sectors * BDRV_SECTOR_SIZE);
2642 
2643 err:
2644     qemu_vfree(bounce_buffer);
2645     return ret;
2646 }
2647 
2648 /*
2649  * Handle a read request in coroutine context
2650  */
2651 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2652     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2653     BdrvRequestFlags flags)
2654 {
2655     BlockDriver *drv = bs->drv;
2656     BdrvTrackedRequest req;
2657     int ret;
2658 
2659     if (!drv) {
2660         return -ENOMEDIUM;
2661     }
2662     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2663         return -EIO;
2664     }
2665 
2666     if (bs->copy_on_read) {
2667         flags |= BDRV_REQ_COPY_ON_READ;
2668     }
2669     if (flags & BDRV_REQ_COPY_ON_READ) {
2670         bs->copy_on_read_in_flight++;
2671     }
2672 
2673     if (bs->copy_on_read_in_flight) {
2674         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2675     }
2676 
2677     /* throttling disk I/O */
2678     if (bs->io_limits_enabled) {
2679         bdrv_io_limits_intercept(bs, nb_sectors, false);
2680     }
2681 
2682     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2683 
2684     if (flags & BDRV_REQ_COPY_ON_READ) {
2685         int pnum;
2686 
2687         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2688         if (ret < 0) {
2689             goto out;
2690         }
2691 
2692         if (!ret || pnum != nb_sectors) {
2693             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2694             goto out;
2695         }
2696     }
2697 
2698     if (!(bs->zero_beyond_eof && bs->growable)) {
2699         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2700     } else {
2701         /* Read zeros after EOF of growable BDSes */
2702         int64_t len, total_sectors, max_nb_sectors;
2703 
2704         len = bdrv_getlength(bs);
2705         if (len < 0) {
2706             ret = len;
2707             goto out;
2708         }
2709 
2710         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2711         max_nb_sectors = MAX(0, total_sectors - sector_num);
2712         if (max_nb_sectors > 0) {
2713             ret = drv->bdrv_co_readv(bs, sector_num,
2714                                      MIN(nb_sectors, max_nb_sectors), qiov);
2715         } else {
2716             ret = 0;
2717         }
2718 
2719         /* Reading beyond end of file is supposed to produce zeroes */
2720         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2721             uint64_t offset = MAX(0, total_sectors - sector_num);
2722             uint64_t bytes = (sector_num + nb_sectors - offset) *
2723                               BDRV_SECTOR_SIZE;
2724             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2725         }
2726     }
2727 
2728 out:
2729     tracked_request_end(&req);
2730 
2731     if (flags & BDRV_REQ_COPY_ON_READ) {
2732         bs->copy_on_read_in_flight--;
2733     }
2734 
2735     return ret;
2736 }
2737 
2738 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
2739     int nb_sectors, QEMUIOVector *qiov)
2740 {
2741     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
2742 
2743     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
2744 }
2745 
2746 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
2747     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2748 {
2749     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
2750 
2751     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
2752                             BDRV_REQ_COPY_ON_READ);
2753 }
2754 
2755 /* if no limit is specified in the BlockLimits use a default
2756  * of 32768 512-byte sectors (16 MiB) per request.
2757  */
2758 #define MAX_WRITE_ZEROES_DEFAULT 32768
2759 
2760 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
2761     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
2762 {
2763     BlockDriver *drv = bs->drv;
2764     QEMUIOVector qiov;
2765     struct iovec iov = {0};
2766     int ret = 0;
2767 
2768     int max_write_zeroes = bs->bl.max_write_zeroes ?
2769                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
2770 
2771     while (nb_sectors > 0 && !ret) {
2772         int num = nb_sectors;
2773 
2774         /* Align request.  Block drivers can expect the "bulk" of the request
2775          * to be aligned.
2776          */
2777         if (bs->bl.write_zeroes_alignment
2778             && num > bs->bl.write_zeroes_alignment) {
2779             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
2780                 /* Make a small request up to the first aligned sector.  */
2781                 num = bs->bl.write_zeroes_alignment;
2782                 num -= sector_num % bs->bl.write_zeroes_alignment;
2783             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
2784                 /* Shorten the request to the last aligned sector.  num cannot
2785                  * underflow because num > bs->bl.write_zeroes_alignment.
2786                  */
2787                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
2788             }
2789         }
2790 
2791         /* limit request size */
2792         if (num > max_write_zeroes) {
2793             num = max_write_zeroes;
2794         }
2795 
2796         ret = -ENOTSUP;
2797         /* First try the efficient write zeroes operation */
2798         if (drv->bdrv_co_write_zeroes) {
2799             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
2800         }
2801 
2802         if (ret == -ENOTSUP) {
2803             /* Fall back to bounce buffer if write zeroes is unsupported */
2804             iov.iov_len = num * BDRV_SECTOR_SIZE;
2805             if (iov.iov_base == NULL) {
2806                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
2807                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
2808             }
2809             qemu_iovec_init_external(&qiov, &iov, 1);
2810 
2811             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
2812 
2813             /* Keep bounce buffer around if it is big enough for all
2814              * all future requests.
2815              */
2816             if (num < max_write_zeroes) {
2817                 qemu_vfree(iov.iov_base);
2818                 iov.iov_base = NULL;
2819             }
2820         }
2821 
2822         sector_num += num;
2823         nb_sectors -= num;
2824     }
2825 
2826     qemu_vfree(iov.iov_base);
2827     return ret;
2828 }
2829 
2830 /*
2831  * Handle a write request in coroutine context
2832  */
2833 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2834     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2835     BdrvRequestFlags flags)
2836 {
2837     BlockDriver *drv = bs->drv;
2838     BdrvTrackedRequest req;
2839     int ret;
2840 
2841     if (!bs->drv) {
2842         return -ENOMEDIUM;
2843     }
2844     if (bs->read_only) {
2845         return -EACCES;
2846     }
2847     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2848         return -EIO;
2849     }
2850 
2851     if (bs->copy_on_read_in_flight) {
2852         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2853     }
2854 
2855     /* throttling disk I/O */
2856     if (bs->io_limits_enabled) {
2857         bdrv_io_limits_intercept(bs, nb_sectors, true);
2858     }
2859 
2860     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2861 
2862     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2863 
2864     if (ret < 0) {
2865         /* Do nothing, write notifier decided to fail this request */
2866     } else if (flags & BDRV_REQ_ZERO_WRITE) {
2867         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
2868     } else {
2869         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2870     }
2871 
2872     if (ret == 0 && !bs->enable_write_cache) {
2873         ret = bdrv_co_flush(bs);
2874     }
2875 
2876     bdrv_set_dirty(bs, sector_num, nb_sectors);
2877 
2878     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2879         bs->wr_highest_sector = sector_num + nb_sectors - 1;
2880     }
2881     if (bs->growable && ret >= 0) {
2882         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
2883     }
2884 
2885     tracked_request_end(&req);
2886 
2887     return ret;
2888 }
2889 
2890 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2891     int nb_sectors, QEMUIOVector *qiov)
2892 {
2893     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2894 
2895     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2896 }
2897 
2898 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2899                                       int64_t sector_num, int nb_sectors,
2900                                       BdrvRequestFlags flags)
2901 {
2902     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
2903 
2904     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2905         flags &= ~BDRV_REQ_MAY_UNMAP;
2906     }
2907 
2908     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2909                              BDRV_REQ_ZERO_WRITE | flags);
2910 }
2911 
2912 /**
2913  * Truncate file to 'offset' bytes (needed only for file protocols)
2914  */
2915 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2916 {
2917     BlockDriver *drv = bs->drv;
2918     int ret;
2919     if (!drv)
2920         return -ENOMEDIUM;
2921     if (!drv->bdrv_truncate)
2922         return -ENOTSUP;
2923     if (bs->read_only)
2924         return -EACCES;
2925     if (bdrv_in_use(bs))
2926         return -EBUSY;
2927     ret = drv->bdrv_truncate(bs, offset);
2928     if (ret == 0) {
2929         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2930         bdrv_dev_resize_cb(bs);
2931     }
2932     return ret;
2933 }
2934 
2935 /**
2936  * Length of a allocated file in bytes. Sparse files are counted by actual
2937  * allocated space. Return < 0 if error or unknown.
2938  */
2939 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2940 {
2941     BlockDriver *drv = bs->drv;
2942     if (!drv) {
2943         return -ENOMEDIUM;
2944     }
2945     if (drv->bdrv_get_allocated_file_size) {
2946         return drv->bdrv_get_allocated_file_size(bs);
2947     }
2948     if (bs->file) {
2949         return bdrv_get_allocated_file_size(bs->file);
2950     }
2951     return -ENOTSUP;
2952 }
2953 
2954 /**
2955  * Length of a file in bytes. Return < 0 if error or unknown.
2956  */
2957 int64_t bdrv_getlength(BlockDriverState *bs)
2958 {
2959     BlockDriver *drv = bs->drv;
2960     if (!drv)
2961         return -ENOMEDIUM;
2962 
2963     if (drv->has_variable_length) {
2964         int ret = refresh_total_sectors(bs, bs->total_sectors);
2965         if (ret < 0) {
2966             return ret;
2967         }
2968     }
2969     return bs->total_sectors * BDRV_SECTOR_SIZE;
2970 }
2971 
2972 /* return 0 as number of sectors if no device present or error */
2973 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2974 {
2975     int64_t length;
2976     length = bdrv_getlength(bs);
2977     if (length < 0)
2978         length = 0;
2979     else
2980         length = length >> BDRV_SECTOR_BITS;
2981     *nb_sectors_ptr = length;
2982 }
2983 
2984 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
2985                        BlockdevOnError on_write_error)
2986 {
2987     bs->on_read_error = on_read_error;
2988     bs->on_write_error = on_write_error;
2989 }
2990 
2991 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
2992 {
2993     return is_read ? bs->on_read_error : bs->on_write_error;
2994 }
2995 
2996 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
2997 {
2998     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
2999 
3000     switch (on_err) {
3001     case BLOCKDEV_ON_ERROR_ENOSPC:
3002         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3003     case BLOCKDEV_ON_ERROR_STOP:
3004         return BDRV_ACTION_STOP;
3005     case BLOCKDEV_ON_ERROR_REPORT:
3006         return BDRV_ACTION_REPORT;
3007     case BLOCKDEV_ON_ERROR_IGNORE:
3008         return BDRV_ACTION_IGNORE;
3009     default:
3010         abort();
3011     }
3012 }
3013 
3014 /* This is done by device models because, while the block layer knows
3015  * about the error, it does not know whether an operation comes from
3016  * the device or the block layer (from a job, for example).
3017  */
3018 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3019                        bool is_read, int error)
3020 {
3021     assert(error >= 0);
3022     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3023     if (action == BDRV_ACTION_STOP) {
3024         vm_stop(RUN_STATE_IO_ERROR);
3025         bdrv_iostatus_set_err(bs, error);
3026     }
3027 }
3028 
3029 int bdrv_is_read_only(BlockDriverState *bs)
3030 {
3031     return bs->read_only;
3032 }
3033 
3034 int bdrv_is_sg(BlockDriverState *bs)
3035 {
3036     return bs->sg;
3037 }
3038 
3039 int bdrv_enable_write_cache(BlockDriverState *bs)
3040 {
3041     return bs->enable_write_cache;
3042 }
3043 
3044 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3045 {
3046     bs->enable_write_cache = wce;
3047 
3048     /* so a reopen() will preserve wce */
3049     if (wce) {
3050         bs->open_flags |= BDRV_O_CACHE_WB;
3051     } else {
3052         bs->open_flags &= ~BDRV_O_CACHE_WB;
3053     }
3054 }
3055 
3056 int bdrv_is_encrypted(BlockDriverState *bs)
3057 {
3058     if (bs->backing_hd && bs->backing_hd->encrypted)
3059         return 1;
3060     return bs->encrypted;
3061 }
3062 
3063 int bdrv_key_required(BlockDriverState *bs)
3064 {
3065     BlockDriverState *backing_hd = bs->backing_hd;
3066 
3067     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3068         return 1;
3069     return (bs->encrypted && !bs->valid_key);
3070 }
3071 
3072 int bdrv_set_key(BlockDriverState *bs, const char *key)
3073 {
3074     int ret;
3075     if (bs->backing_hd && bs->backing_hd->encrypted) {
3076         ret = bdrv_set_key(bs->backing_hd, key);
3077         if (ret < 0)
3078             return ret;
3079         if (!bs->encrypted)
3080             return 0;
3081     }
3082     if (!bs->encrypted) {
3083         return -EINVAL;
3084     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3085         return -ENOMEDIUM;
3086     }
3087     ret = bs->drv->bdrv_set_key(bs, key);
3088     if (ret < 0) {
3089         bs->valid_key = 0;
3090     } else if (!bs->valid_key) {
3091         bs->valid_key = 1;
3092         /* call the change callback now, we skipped it on open */
3093         bdrv_dev_change_media_cb(bs, true);
3094     }
3095     return ret;
3096 }
3097 
3098 const char *bdrv_get_format_name(BlockDriverState *bs)
3099 {
3100     return bs->drv ? bs->drv->format_name : NULL;
3101 }
3102 
3103 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3104                          void *opaque)
3105 {
3106     BlockDriver *drv;
3107 
3108     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3109         it(opaque, drv->format_name);
3110     }
3111 }
3112 
3113 BlockDriverState *bdrv_find(const char *name)
3114 {
3115     BlockDriverState *bs;
3116 
3117     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3118         if (!strcmp(name, bs->device_name)) {
3119             return bs;
3120         }
3121     }
3122     return NULL;
3123 }
3124 
3125 BlockDriverState *bdrv_next(BlockDriverState *bs)
3126 {
3127     if (!bs) {
3128         return QTAILQ_FIRST(&bdrv_states);
3129     }
3130     return QTAILQ_NEXT(bs, list);
3131 }
3132 
3133 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3134 {
3135     BlockDriverState *bs;
3136 
3137     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3138         it(opaque, bs);
3139     }
3140 }
3141 
3142 const char *bdrv_get_device_name(BlockDriverState *bs)
3143 {
3144     return bs->device_name;
3145 }
3146 
3147 int bdrv_get_flags(BlockDriverState *bs)
3148 {
3149     return bs->open_flags;
3150 }
3151 
3152 int bdrv_flush_all(void)
3153 {
3154     BlockDriverState *bs;
3155     int result = 0;
3156 
3157     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3158         int ret = bdrv_flush(bs);
3159         if (ret < 0 && !result) {
3160             result = ret;
3161         }
3162     }
3163 
3164     return result;
3165 }
3166 
3167 int bdrv_has_zero_init_1(BlockDriverState *bs)
3168 {
3169     return 1;
3170 }
3171 
3172 int bdrv_has_zero_init(BlockDriverState *bs)
3173 {
3174     assert(bs->drv);
3175 
3176     /* If BS is a copy on write image, it is initialized to
3177        the contents of the base image, which may not be zeroes.  */
3178     if (bs->backing_hd) {
3179         return 0;
3180     }
3181     if (bs->drv->bdrv_has_zero_init) {
3182         return bs->drv->bdrv_has_zero_init(bs);
3183     }
3184 
3185     /* safe default */
3186     return 0;
3187 }
3188 
3189 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3190 {
3191     BlockDriverInfo bdi;
3192 
3193     if (bs->backing_hd) {
3194         return false;
3195     }
3196 
3197     if (bdrv_get_info(bs, &bdi) == 0) {
3198         return bdi.unallocated_blocks_are_zero;
3199     }
3200 
3201     return false;
3202 }
3203 
3204 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3205 {
3206     BlockDriverInfo bdi;
3207 
3208     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3209         return false;
3210     }
3211 
3212     if (bdrv_get_info(bs, &bdi) == 0) {
3213         return bdi.can_write_zeroes_with_unmap;
3214     }
3215 
3216     return false;
3217 }
3218 
3219 typedef struct BdrvCoGetBlockStatusData {
3220     BlockDriverState *bs;
3221     BlockDriverState *base;
3222     int64_t sector_num;
3223     int nb_sectors;
3224     int *pnum;
3225     int64_t ret;
3226     bool done;
3227 } BdrvCoGetBlockStatusData;
3228 
3229 /*
3230  * Returns true iff the specified sector is present in the disk image. Drivers
3231  * not implementing the functionality are assumed to not support backing files,
3232  * hence all their sectors are reported as allocated.
3233  *
3234  * If 'sector_num' is beyond the end of the disk image the return value is 0
3235  * and 'pnum' is set to 0.
3236  *
3237  * 'pnum' is set to the number of sectors (including and immediately following
3238  * the specified sector) that are known to be in the same
3239  * allocated/unallocated state.
3240  *
3241  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3242  * beyond the end of the disk image it will be clamped.
3243  */
3244 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3245                                                      int64_t sector_num,
3246                                                      int nb_sectors, int *pnum)
3247 {
3248     int64_t length;
3249     int64_t n;
3250     int64_t ret, ret2;
3251 
3252     length = bdrv_getlength(bs);
3253     if (length < 0) {
3254         return length;
3255     }
3256 
3257     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3258         *pnum = 0;
3259         return 0;
3260     }
3261 
3262     n = bs->total_sectors - sector_num;
3263     if (n < nb_sectors) {
3264         nb_sectors = n;
3265     }
3266 
3267     if (!bs->drv->bdrv_co_get_block_status) {
3268         *pnum = nb_sectors;
3269         ret = BDRV_BLOCK_DATA;
3270         if (bs->drv->protocol_name) {
3271             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3272         }
3273         return ret;
3274     }
3275 
3276     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3277     if (ret < 0) {
3278         *pnum = 0;
3279         return ret;
3280     }
3281 
3282     if (ret & BDRV_BLOCK_RAW) {
3283         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3284         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3285                                      *pnum, pnum);
3286     }
3287 
3288     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3289         if (bdrv_unallocated_blocks_are_zero(bs)) {
3290             ret |= BDRV_BLOCK_ZERO;
3291         } else if (bs->backing_hd) {
3292             BlockDriverState *bs2 = bs->backing_hd;
3293             int64_t length2 = bdrv_getlength(bs2);
3294             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3295                 ret |= BDRV_BLOCK_ZERO;
3296             }
3297         }
3298     }
3299 
3300     if (bs->file &&
3301         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3302         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3303         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3304                                         *pnum, pnum);
3305         if (ret2 >= 0) {
3306             /* Ignore errors.  This is just providing extra information, it
3307              * is useful but not necessary.
3308              */
3309             ret |= (ret2 & BDRV_BLOCK_ZERO);
3310         }
3311     }
3312 
3313     return ret;
3314 }
3315 
3316 /* Coroutine wrapper for bdrv_get_block_status() */
3317 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3318 {
3319     BdrvCoGetBlockStatusData *data = opaque;
3320     BlockDriverState *bs = data->bs;
3321 
3322     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3323                                          data->pnum);
3324     data->done = true;
3325 }
3326 
3327 /*
3328  * Synchronous wrapper around bdrv_co_get_block_status().
3329  *
3330  * See bdrv_co_get_block_status() for details.
3331  */
3332 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3333                               int nb_sectors, int *pnum)
3334 {
3335     Coroutine *co;
3336     BdrvCoGetBlockStatusData data = {
3337         .bs = bs,
3338         .sector_num = sector_num,
3339         .nb_sectors = nb_sectors,
3340         .pnum = pnum,
3341         .done = false,
3342     };
3343 
3344     if (qemu_in_coroutine()) {
3345         /* Fast-path if already in coroutine context */
3346         bdrv_get_block_status_co_entry(&data);
3347     } else {
3348         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3349         qemu_coroutine_enter(co, &data);
3350         while (!data.done) {
3351             qemu_aio_wait();
3352         }
3353     }
3354     return data.ret;
3355 }
3356 
3357 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3358                                    int nb_sectors, int *pnum)
3359 {
3360     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3361     if (ret < 0) {
3362         return ret;
3363     }
3364     return
3365         (ret & BDRV_BLOCK_DATA) ||
3366         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3367 }
3368 
3369 /*
3370  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3371  *
3372  * Return true if the given sector is allocated in any image between
3373  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3374  * sector is allocated in any image of the chain.  Return false otherwise.
3375  *
3376  * 'pnum' is set to the number of sectors (including and immediately following
3377  *  the specified sector) that are known to be in the same
3378  *  allocated/unallocated state.
3379  *
3380  */
3381 int bdrv_is_allocated_above(BlockDriverState *top,
3382                             BlockDriverState *base,
3383                             int64_t sector_num,
3384                             int nb_sectors, int *pnum)
3385 {
3386     BlockDriverState *intermediate;
3387     int ret, n = nb_sectors;
3388 
3389     intermediate = top;
3390     while (intermediate && intermediate != base) {
3391         int pnum_inter;
3392         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3393                                 &pnum_inter);
3394         if (ret < 0) {
3395             return ret;
3396         } else if (ret) {
3397             *pnum = pnum_inter;
3398             return 1;
3399         }
3400 
3401         /*
3402          * [sector_num, nb_sectors] is unallocated on top but intermediate
3403          * might have
3404          *
3405          * [sector_num+x, nr_sectors] allocated.
3406          */
3407         if (n > pnum_inter &&
3408             (intermediate == top ||
3409              sector_num + pnum_inter < intermediate->total_sectors)) {
3410             n = pnum_inter;
3411         }
3412 
3413         intermediate = intermediate->backing_hd;
3414     }
3415 
3416     *pnum = n;
3417     return 0;
3418 }
3419 
3420 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3421 {
3422     if (bs->backing_hd && bs->backing_hd->encrypted)
3423         return bs->backing_file;
3424     else if (bs->encrypted)
3425         return bs->filename;
3426     else
3427         return NULL;
3428 }
3429 
3430 void bdrv_get_backing_filename(BlockDriverState *bs,
3431                                char *filename, int filename_size)
3432 {
3433     pstrcpy(filename, filename_size, bs->backing_file);
3434 }
3435 
3436 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3437                           const uint8_t *buf, int nb_sectors)
3438 {
3439     BlockDriver *drv = bs->drv;
3440     if (!drv)
3441         return -ENOMEDIUM;
3442     if (!drv->bdrv_write_compressed)
3443         return -ENOTSUP;
3444     if (bdrv_check_request(bs, sector_num, nb_sectors))
3445         return -EIO;
3446 
3447     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3448 
3449     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3450 }
3451 
3452 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3453 {
3454     BlockDriver *drv = bs->drv;
3455     if (!drv)
3456         return -ENOMEDIUM;
3457     if (!drv->bdrv_get_info)
3458         return -ENOTSUP;
3459     memset(bdi, 0, sizeof(*bdi));
3460     return drv->bdrv_get_info(bs, bdi);
3461 }
3462 
3463 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3464 {
3465     BlockDriver *drv = bs->drv;
3466     if (drv && drv->bdrv_get_specific_info) {
3467         return drv->bdrv_get_specific_info(bs);
3468     }
3469     return NULL;
3470 }
3471 
3472 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3473                       int64_t pos, int size)
3474 {
3475     QEMUIOVector qiov;
3476     struct iovec iov = {
3477         .iov_base   = (void *) buf,
3478         .iov_len    = size,
3479     };
3480 
3481     qemu_iovec_init_external(&qiov, &iov, 1);
3482     return bdrv_writev_vmstate(bs, &qiov, pos);
3483 }
3484 
3485 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3486 {
3487     BlockDriver *drv = bs->drv;
3488 
3489     if (!drv) {
3490         return -ENOMEDIUM;
3491     } else if (drv->bdrv_save_vmstate) {
3492         return drv->bdrv_save_vmstate(bs, qiov, pos);
3493     } else if (bs->file) {
3494         return bdrv_writev_vmstate(bs->file, qiov, pos);
3495     }
3496 
3497     return -ENOTSUP;
3498 }
3499 
3500 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3501                       int64_t pos, int size)
3502 {
3503     BlockDriver *drv = bs->drv;
3504     if (!drv)
3505         return -ENOMEDIUM;
3506     if (drv->bdrv_load_vmstate)
3507         return drv->bdrv_load_vmstate(bs, buf, pos, size);
3508     if (bs->file)
3509         return bdrv_load_vmstate(bs->file, buf, pos, size);
3510     return -ENOTSUP;
3511 }
3512 
3513 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
3514 {
3515     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
3516         return;
3517     }
3518 
3519     bs->drv->bdrv_debug_event(bs, event);
3520 }
3521 
3522 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
3523                           const char *tag)
3524 {
3525     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
3526         bs = bs->file;
3527     }
3528 
3529     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
3530         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
3531     }
3532 
3533     return -ENOTSUP;
3534 }
3535 
3536 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
3537 {
3538     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
3539         bs = bs->file;
3540     }
3541 
3542     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
3543         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
3544     }
3545 
3546     return -ENOTSUP;
3547 }
3548 
3549 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
3550 {
3551     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
3552         bs = bs->file;
3553     }
3554 
3555     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
3556         return bs->drv->bdrv_debug_resume(bs, tag);
3557     }
3558 
3559     return -ENOTSUP;
3560 }
3561 
3562 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
3563 {
3564     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
3565         bs = bs->file;
3566     }
3567 
3568     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
3569         return bs->drv->bdrv_debug_is_suspended(bs, tag);
3570     }
3571 
3572     return false;
3573 }
3574 
3575 int bdrv_is_snapshot(BlockDriverState *bs)
3576 {
3577     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
3578 }
3579 
3580 /* backing_file can either be relative, or absolute, or a protocol.  If it is
3581  * relative, it must be relative to the chain.  So, passing in bs->filename
3582  * from a BDS as backing_file should not be done, as that may be relative to
3583  * the CWD rather than the chain. */
3584 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3585         const char *backing_file)
3586 {
3587     char *filename_full = NULL;
3588     char *backing_file_full = NULL;
3589     char *filename_tmp = NULL;
3590     int is_protocol = 0;
3591     BlockDriverState *curr_bs = NULL;
3592     BlockDriverState *retval = NULL;
3593 
3594     if (!bs || !bs->drv || !backing_file) {
3595         return NULL;
3596     }
3597 
3598     filename_full     = g_malloc(PATH_MAX);
3599     backing_file_full = g_malloc(PATH_MAX);
3600     filename_tmp      = g_malloc(PATH_MAX);
3601 
3602     is_protocol = path_has_protocol(backing_file);
3603 
3604     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3605 
3606         /* If either of the filename paths is actually a protocol, then
3607          * compare unmodified paths; otherwise make paths relative */
3608         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3609             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3610                 retval = curr_bs->backing_hd;
3611                 break;
3612             }
3613         } else {
3614             /* If not an absolute filename path, make it relative to the current
3615              * image's filename path */
3616             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3617                          backing_file);
3618 
3619             /* We are going to compare absolute pathnames */
3620             if (!realpath(filename_tmp, filename_full)) {
3621                 continue;
3622             }
3623 
3624             /* We need to make sure the backing filename we are comparing against
3625              * is relative to the current image filename (or absolute) */
3626             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3627                          curr_bs->backing_file);
3628 
3629             if (!realpath(filename_tmp, backing_file_full)) {
3630                 continue;
3631             }
3632 
3633             if (strcmp(backing_file_full, filename_full) == 0) {
3634                 retval = curr_bs->backing_hd;
3635                 break;
3636             }
3637         }
3638     }
3639 
3640     g_free(filename_full);
3641     g_free(backing_file_full);
3642     g_free(filename_tmp);
3643     return retval;
3644 }
3645 
3646 int bdrv_get_backing_file_depth(BlockDriverState *bs)
3647 {
3648     if (!bs->drv) {
3649         return 0;
3650     }
3651 
3652     if (!bs->backing_hd) {
3653         return 0;
3654     }
3655 
3656     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
3657 }
3658 
3659 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
3660 {
3661     BlockDriverState *curr_bs = NULL;
3662 
3663     if (!bs) {
3664         return NULL;
3665     }
3666 
3667     curr_bs = bs;
3668 
3669     while (curr_bs->backing_hd) {
3670         curr_bs = curr_bs->backing_hd;
3671     }
3672     return curr_bs;
3673 }
3674 
3675 /**************************************************************/
3676 /* async I/Os */
3677 
3678 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3679                                  QEMUIOVector *qiov, int nb_sectors,
3680                                  BlockDriverCompletionFunc *cb, void *opaque)
3681 {
3682     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3683 
3684     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3685                                  cb, opaque, false);
3686 }
3687 
3688 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3689                                   QEMUIOVector *qiov, int nb_sectors,
3690                                   BlockDriverCompletionFunc *cb, void *opaque)
3691 {
3692     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3693 
3694     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
3695                                  cb, opaque, true);
3696 }
3697 
3698 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
3699         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
3700         BlockDriverCompletionFunc *cb, void *opaque)
3701 {
3702     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
3703 
3704     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
3705                                  BDRV_REQ_ZERO_WRITE | flags,
3706                                  cb, opaque, true);
3707 }
3708 
3709 
3710 typedef struct MultiwriteCB {
3711     int error;
3712     int num_requests;
3713     int num_callbacks;
3714     struct {
3715         BlockDriverCompletionFunc *cb;
3716         void *opaque;
3717         QEMUIOVector *free_qiov;
3718     } callbacks[];
3719 } MultiwriteCB;
3720 
3721 static void multiwrite_user_cb(MultiwriteCB *mcb)
3722 {
3723     int i;
3724 
3725     for (i = 0; i < mcb->num_callbacks; i++) {
3726         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3727         if (mcb->callbacks[i].free_qiov) {
3728             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3729         }
3730         g_free(mcb->callbacks[i].free_qiov);
3731     }
3732 }
3733 
3734 static void multiwrite_cb(void *opaque, int ret)
3735 {
3736     MultiwriteCB *mcb = opaque;
3737 
3738     trace_multiwrite_cb(mcb, ret);
3739 
3740     if (ret < 0 && !mcb->error) {
3741         mcb->error = ret;
3742     }
3743 
3744     mcb->num_requests--;
3745     if (mcb->num_requests == 0) {
3746         multiwrite_user_cb(mcb);
3747         g_free(mcb);
3748     }
3749 }
3750 
3751 static int multiwrite_req_compare(const void *a, const void *b)
3752 {
3753     const BlockRequest *req1 = a, *req2 = b;
3754 
3755     /*
3756      * Note that we can't simply subtract req2->sector from req1->sector
3757      * here as that could overflow the return value.
3758      */
3759     if (req1->sector > req2->sector) {
3760         return 1;
3761     } else if (req1->sector < req2->sector) {
3762         return -1;
3763     } else {
3764         return 0;
3765     }
3766 }
3767 
3768 /*
3769  * Takes a bunch of requests and tries to merge them. Returns the number of
3770  * requests that remain after merging.
3771  */
3772 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3773     int num_reqs, MultiwriteCB *mcb)
3774 {
3775     int i, outidx;
3776 
3777     // Sort requests by start sector
3778     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3779 
3780     // Check if adjacent requests touch the same clusters. If so, combine them,
3781     // filling up gaps with zero sectors.
3782     outidx = 0;
3783     for (i = 1; i < num_reqs; i++) {
3784         int merge = 0;
3785         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3786 
3787         // Handle exactly sequential writes and overlapping writes.
3788         if (reqs[i].sector <= oldreq_last) {
3789             merge = 1;
3790         }
3791 
3792         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3793             merge = 0;
3794         }
3795 
3796         if (merge) {
3797             size_t size;
3798             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3799             qemu_iovec_init(qiov,
3800                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3801 
3802             // Add the first request to the merged one. If the requests are
3803             // overlapping, drop the last sectors of the first request.
3804             size = (reqs[i].sector - reqs[outidx].sector) << 9;
3805             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3806 
3807             // We should need to add any zeros between the two requests
3808             assert (reqs[i].sector <= oldreq_last);
3809 
3810             // Add the second request
3811             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3812 
3813             reqs[outidx].nb_sectors = qiov->size >> 9;
3814             reqs[outidx].qiov = qiov;
3815 
3816             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3817         } else {
3818             outidx++;
3819             reqs[outidx].sector     = reqs[i].sector;
3820             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3821             reqs[outidx].qiov       = reqs[i].qiov;
3822         }
3823     }
3824 
3825     return outidx + 1;
3826 }
3827 
3828 /*
3829  * Submit multiple AIO write requests at once.
3830  *
3831  * On success, the function returns 0 and all requests in the reqs array have
3832  * been submitted. In error case this function returns -1, and any of the
3833  * requests may or may not be submitted yet. In particular, this means that the
3834  * callback will be called for some of the requests, for others it won't. The
3835  * caller must check the error field of the BlockRequest to wait for the right
3836  * callbacks (if error != 0, no callback will be called).
3837  *
3838  * The implementation may modify the contents of the reqs array, e.g. to merge
3839  * requests. However, the fields opaque and error are left unmodified as they
3840  * are used to signal failure for a single request to the caller.
3841  */
3842 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3843 {
3844     MultiwriteCB *mcb;
3845     int i;
3846 
3847     /* don't submit writes if we don't have a medium */
3848     if (bs->drv == NULL) {
3849         for (i = 0; i < num_reqs; i++) {
3850             reqs[i].error = -ENOMEDIUM;
3851         }
3852         return -1;
3853     }
3854 
3855     if (num_reqs == 0) {
3856         return 0;
3857     }
3858 
3859     // Create MultiwriteCB structure
3860     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3861     mcb->num_requests = 0;
3862     mcb->num_callbacks = num_reqs;
3863 
3864     for (i = 0; i < num_reqs; i++) {
3865         mcb->callbacks[i].cb = reqs[i].cb;
3866         mcb->callbacks[i].opaque = reqs[i].opaque;
3867     }
3868 
3869     // Check for mergable requests
3870     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3871 
3872     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3873 
3874     /* Run the aio requests. */
3875     mcb->num_requests = num_reqs;
3876     for (i = 0; i < num_reqs; i++) {
3877         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
3878                               reqs[i].nb_sectors, reqs[i].flags,
3879                               multiwrite_cb, mcb,
3880                               true);
3881     }
3882 
3883     return 0;
3884 }
3885 
3886 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3887 {
3888     acb->aiocb_info->cancel(acb);
3889 }
3890 
3891 /**************************************************************/
3892 /* async block device emulation */
3893 
3894 typedef struct BlockDriverAIOCBSync {
3895     BlockDriverAIOCB common;
3896     QEMUBH *bh;
3897     int ret;
3898     /* vector translation state */
3899     QEMUIOVector *qiov;
3900     uint8_t *bounce;
3901     int is_write;
3902 } BlockDriverAIOCBSync;
3903 
3904 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3905 {
3906     BlockDriverAIOCBSync *acb =
3907         container_of(blockacb, BlockDriverAIOCBSync, common);
3908     qemu_bh_delete(acb->bh);
3909     acb->bh = NULL;
3910     qemu_aio_release(acb);
3911 }
3912 
3913 static const AIOCBInfo bdrv_em_aiocb_info = {
3914     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3915     .cancel             = bdrv_aio_cancel_em,
3916 };
3917 
3918 static void bdrv_aio_bh_cb(void *opaque)
3919 {
3920     BlockDriverAIOCBSync *acb = opaque;
3921 
3922     if (!acb->is_write)
3923         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3924     qemu_vfree(acb->bounce);
3925     acb->common.cb(acb->common.opaque, acb->ret);
3926     qemu_bh_delete(acb->bh);
3927     acb->bh = NULL;
3928     qemu_aio_release(acb);
3929 }
3930 
3931 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3932                                             int64_t sector_num,
3933                                             QEMUIOVector *qiov,
3934                                             int nb_sectors,
3935                                             BlockDriverCompletionFunc *cb,
3936                                             void *opaque,
3937                                             int is_write)
3938 
3939 {
3940     BlockDriverAIOCBSync *acb;
3941 
3942     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
3943     acb->is_write = is_write;
3944     acb->qiov = qiov;
3945     acb->bounce = qemu_blockalign(bs, qiov->size);
3946     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3947 
3948     if (is_write) {
3949         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3950         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3951     } else {
3952         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3953     }
3954 
3955     qemu_bh_schedule(acb->bh);
3956 
3957     return &acb->common;
3958 }
3959 
3960 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3961         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3962         BlockDriverCompletionFunc *cb, void *opaque)
3963 {
3964     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3965 }
3966 
3967 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3968         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3969         BlockDriverCompletionFunc *cb, void *opaque)
3970 {
3971     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3972 }
3973 
3974 
3975 typedef struct BlockDriverAIOCBCoroutine {
3976     BlockDriverAIOCB common;
3977     BlockRequest req;
3978     bool is_write;
3979     bool *done;
3980     QEMUBH* bh;
3981 } BlockDriverAIOCBCoroutine;
3982 
3983 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3984 {
3985     BlockDriverAIOCBCoroutine *acb =
3986         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
3987     bool done = false;
3988 
3989     acb->done = &done;
3990     while (!done) {
3991         qemu_aio_wait();
3992     }
3993 }
3994 
3995 static const AIOCBInfo bdrv_em_co_aiocb_info = {
3996     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3997     .cancel             = bdrv_aio_co_cancel_em,
3998 };
3999 
4000 static void bdrv_co_em_bh(void *opaque)
4001 {
4002     BlockDriverAIOCBCoroutine *acb = opaque;
4003 
4004     acb->common.cb(acb->common.opaque, acb->req.error);
4005 
4006     if (acb->done) {
4007         *acb->done = true;
4008     }
4009 
4010     qemu_bh_delete(acb->bh);
4011     qemu_aio_release(acb);
4012 }
4013 
4014 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4015 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4016 {
4017     BlockDriverAIOCBCoroutine *acb = opaque;
4018     BlockDriverState *bs = acb->common.bs;
4019 
4020     if (!acb->is_write) {
4021         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4022             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4023     } else {
4024         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4025             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4026     }
4027 
4028     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4029     qemu_bh_schedule(acb->bh);
4030 }
4031 
4032 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4033                                                int64_t sector_num,
4034                                                QEMUIOVector *qiov,
4035                                                int nb_sectors,
4036                                                BdrvRequestFlags flags,
4037                                                BlockDriverCompletionFunc *cb,
4038                                                void *opaque,
4039                                                bool is_write)
4040 {
4041     Coroutine *co;
4042     BlockDriverAIOCBCoroutine *acb;
4043 
4044     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4045     acb->req.sector = sector_num;
4046     acb->req.nb_sectors = nb_sectors;
4047     acb->req.qiov = qiov;
4048     acb->req.flags = flags;
4049     acb->is_write = is_write;
4050     acb->done = NULL;
4051 
4052     co = qemu_coroutine_create(bdrv_co_do_rw);
4053     qemu_coroutine_enter(co, acb);
4054 
4055     return &acb->common;
4056 }
4057 
4058 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4059 {
4060     BlockDriverAIOCBCoroutine *acb = opaque;
4061     BlockDriverState *bs = acb->common.bs;
4062 
4063     acb->req.error = bdrv_co_flush(bs);
4064     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4065     qemu_bh_schedule(acb->bh);
4066 }
4067 
4068 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4069         BlockDriverCompletionFunc *cb, void *opaque)
4070 {
4071     trace_bdrv_aio_flush(bs, opaque);
4072 
4073     Coroutine *co;
4074     BlockDriverAIOCBCoroutine *acb;
4075 
4076     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4077     acb->done = NULL;
4078 
4079     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4080     qemu_coroutine_enter(co, acb);
4081 
4082     return &acb->common;
4083 }
4084 
4085 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4086 {
4087     BlockDriverAIOCBCoroutine *acb = opaque;
4088     BlockDriverState *bs = acb->common.bs;
4089 
4090     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4091     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4092     qemu_bh_schedule(acb->bh);
4093 }
4094 
4095 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4096         int64_t sector_num, int nb_sectors,
4097         BlockDriverCompletionFunc *cb, void *opaque)
4098 {
4099     Coroutine *co;
4100     BlockDriverAIOCBCoroutine *acb;
4101 
4102     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4103 
4104     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4105     acb->req.sector = sector_num;
4106     acb->req.nb_sectors = nb_sectors;
4107     acb->done = NULL;
4108     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4109     qemu_coroutine_enter(co, acb);
4110 
4111     return &acb->common;
4112 }
4113 
4114 void bdrv_init(void)
4115 {
4116     module_call_init(MODULE_INIT_BLOCK);
4117 }
4118 
4119 void bdrv_init_with_whitelist(void)
4120 {
4121     use_bdrv_whitelist = 1;
4122     bdrv_init();
4123 }
4124 
4125 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4126                    BlockDriverCompletionFunc *cb, void *opaque)
4127 {
4128     BlockDriverAIOCB *acb;
4129 
4130     acb = g_slice_alloc(aiocb_info->aiocb_size);
4131     acb->aiocb_info = aiocb_info;
4132     acb->bs = bs;
4133     acb->cb = cb;
4134     acb->opaque = opaque;
4135     return acb;
4136 }
4137 
4138 void qemu_aio_release(void *p)
4139 {
4140     BlockDriverAIOCB *acb = p;
4141     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4142 }
4143 
4144 /**************************************************************/
4145 /* Coroutine block device emulation */
4146 
4147 typedef struct CoroutineIOCompletion {
4148     Coroutine *coroutine;
4149     int ret;
4150 } CoroutineIOCompletion;
4151 
4152 static void bdrv_co_io_em_complete(void *opaque, int ret)
4153 {
4154     CoroutineIOCompletion *co = opaque;
4155 
4156     co->ret = ret;
4157     qemu_coroutine_enter(co->coroutine, NULL);
4158 }
4159 
4160 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4161                                       int nb_sectors, QEMUIOVector *iov,
4162                                       bool is_write)
4163 {
4164     CoroutineIOCompletion co = {
4165         .coroutine = qemu_coroutine_self(),
4166     };
4167     BlockDriverAIOCB *acb;
4168 
4169     if (is_write) {
4170         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4171                                        bdrv_co_io_em_complete, &co);
4172     } else {
4173         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4174                                       bdrv_co_io_em_complete, &co);
4175     }
4176 
4177     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4178     if (!acb) {
4179         return -EIO;
4180     }
4181     qemu_coroutine_yield();
4182 
4183     return co.ret;
4184 }
4185 
4186 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4187                                          int64_t sector_num, int nb_sectors,
4188                                          QEMUIOVector *iov)
4189 {
4190     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4191 }
4192 
4193 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4194                                          int64_t sector_num, int nb_sectors,
4195                                          QEMUIOVector *iov)
4196 {
4197     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4198 }
4199 
4200 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4201 {
4202     RwCo *rwco = opaque;
4203 
4204     rwco->ret = bdrv_co_flush(rwco->bs);
4205 }
4206 
4207 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4208 {
4209     int ret;
4210 
4211     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4212         return 0;
4213     }
4214 
4215     /* Write back cached data to the OS even with cache=unsafe */
4216     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4217     if (bs->drv->bdrv_co_flush_to_os) {
4218         ret = bs->drv->bdrv_co_flush_to_os(bs);
4219         if (ret < 0) {
4220             return ret;
4221         }
4222     }
4223 
4224     /* But don't actually force it to the disk with cache=unsafe */
4225     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4226         goto flush_parent;
4227     }
4228 
4229     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4230     if (bs->drv->bdrv_co_flush_to_disk) {
4231         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4232     } else if (bs->drv->bdrv_aio_flush) {
4233         BlockDriverAIOCB *acb;
4234         CoroutineIOCompletion co = {
4235             .coroutine = qemu_coroutine_self(),
4236         };
4237 
4238         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4239         if (acb == NULL) {
4240             ret = -EIO;
4241         } else {
4242             qemu_coroutine_yield();
4243             ret = co.ret;
4244         }
4245     } else {
4246         /*
4247          * Some block drivers always operate in either writethrough or unsafe
4248          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4249          * know how the server works (because the behaviour is hardcoded or
4250          * depends on server-side configuration), so we can't ensure that
4251          * everything is safe on disk. Returning an error doesn't work because
4252          * that would break guests even if the server operates in writethrough
4253          * mode.
4254          *
4255          * Let's hope the user knows what he's doing.
4256          */
4257         ret = 0;
4258     }
4259     if (ret < 0) {
4260         return ret;
4261     }
4262 
4263     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4264      * in the case of cache=unsafe, so there are no useless flushes.
4265      */
4266 flush_parent:
4267     return bdrv_co_flush(bs->file);
4268 }
4269 
4270 void bdrv_invalidate_cache(BlockDriverState *bs)
4271 {
4272     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4273         bs->drv->bdrv_invalidate_cache(bs);
4274     }
4275 }
4276 
4277 void bdrv_invalidate_cache_all(void)
4278 {
4279     BlockDriverState *bs;
4280 
4281     QTAILQ_FOREACH(bs, &bdrv_states, list) {
4282         bdrv_invalidate_cache(bs);
4283     }
4284 }
4285 
4286 void bdrv_clear_incoming_migration_all(void)
4287 {
4288     BlockDriverState *bs;
4289 
4290     QTAILQ_FOREACH(bs, &bdrv_states, list) {
4291         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4292     }
4293 }
4294 
4295 int bdrv_flush(BlockDriverState *bs)
4296 {
4297     Coroutine *co;
4298     RwCo rwco = {
4299         .bs = bs,
4300         .ret = NOT_DONE,
4301     };
4302 
4303     if (qemu_in_coroutine()) {
4304         /* Fast-path if already in coroutine context */
4305         bdrv_flush_co_entry(&rwco);
4306     } else {
4307         co = qemu_coroutine_create(bdrv_flush_co_entry);
4308         qemu_coroutine_enter(co, &rwco);
4309         while (rwco.ret == NOT_DONE) {
4310             qemu_aio_wait();
4311         }
4312     }
4313 
4314     return rwco.ret;
4315 }
4316 
4317 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4318 {
4319     RwCo *rwco = opaque;
4320 
4321     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4322 }
4323 
4324 /* if no limit is specified in the BlockLimits use a default
4325  * of 32768 512-byte sectors (16 MiB) per request.
4326  */
4327 #define MAX_DISCARD_DEFAULT 32768
4328 
4329 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4330                                  int nb_sectors)
4331 {
4332     int max_discard;
4333 
4334     if (!bs->drv) {
4335         return -ENOMEDIUM;
4336     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4337         return -EIO;
4338     } else if (bs->read_only) {
4339         return -EROFS;
4340     }
4341 
4342     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4343 
4344     /* Do nothing if disabled.  */
4345     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4346         return 0;
4347     }
4348 
4349     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4350         return 0;
4351     }
4352 
4353     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4354     while (nb_sectors > 0) {
4355         int ret;
4356         int num = nb_sectors;
4357 
4358         /* align request */
4359         if (bs->bl.discard_alignment &&
4360             num >= bs->bl.discard_alignment &&
4361             sector_num % bs->bl.discard_alignment) {
4362             if (num > bs->bl.discard_alignment) {
4363                 num = bs->bl.discard_alignment;
4364             }
4365             num -= sector_num % bs->bl.discard_alignment;
4366         }
4367 
4368         /* limit request size */
4369         if (num > max_discard) {
4370             num = max_discard;
4371         }
4372 
4373         if (bs->drv->bdrv_co_discard) {
4374             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4375         } else {
4376             BlockDriverAIOCB *acb;
4377             CoroutineIOCompletion co = {
4378                 .coroutine = qemu_coroutine_self(),
4379             };
4380 
4381             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
4382                                             bdrv_co_io_em_complete, &co);
4383             if (acb == NULL) {
4384                 return -EIO;
4385             } else {
4386                 qemu_coroutine_yield();
4387                 ret = co.ret;
4388             }
4389         }
4390         if (ret && ret != -ENOTSUP) {
4391             return ret;
4392         }
4393 
4394         sector_num += num;
4395         nb_sectors -= num;
4396     }
4397     return 0;
4398 }
4399 
4400 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4401 {
4402     Coroutine *co;
4403     RwCo rwco = {
4404         .bs = bs,
4405         .sector_num = sector_num,
4406         .nb_sectors = nb_sectors,
4407         .ret = NOT_DONE,
4408     };
4409 
4410     if (qemu_in_coroutine()) {
4411         /* Fast-path if already in coroutine context */
4412         bdrv_discard_co_entry(&rwco);
4413     } else {
4414         co = qemu_coroutine_create(bdrv_discard_co_entry);
4415         qemu_coroutine_enter(co, &rwco);
4416         while (rwco.ret == NOT_DONE) {
4417             qemu_aio_wait();
4418         }
4419     }
4420 
4421     return rwco.ret;
4422 }
4423 
4424 /**************************************************************/
4425 /* removable device support */
4426 
4427 /**
4428  * Return TRUE if the media is present
4429  */
4430 int bdrv_is_inserted(BlockDriverState *bs)
4431 {
4432     BlockDriver *drv = bs->drv;
4433 
4434     if (!drv)
4435         return 0;
4436     if (!drv->bdrv_is_inserted)
4437         return 1;
4438     return drv->bdrv_is_inserted(bs);
4439 }
4440 
4441 /**
4442  * Return whether the media changed since the last call to this
4443  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4444  */
4445 int bdrv_media_changed(BlockDriverState *bs)
4446 {
4447     BlockDriver *drv = bs->drv;
4448 
4449     if (drv && drv->bdrv_media_changed) {
4450         return drv->bdrv_media_changed(bs);
4451     }
4452     return -ENOTSUP;
4453 }
4454 
4455 /**
4456  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4457  */
4458 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4459 {
4460     BlockDriver *drv = bs->drv;
4461 
4462     if (drv && drv->bdrv_eject) {
4463         drv->bdrv_eject(bs, eject_flag);
4464     }
4465 
4466     if (bs->device_name[0] != '\0') {
4467         bdrv_emit_qmp_eject_event(bs, eject_flag);
4468     }
4469 }
4470 
4471 /**
4472  * Lock or unlock the media (if it is locked, the user won't be able
4473  * to eject it manually).
4474  */
4475 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4476 {
4477     BlockDriver *drv = bs->drv;
4478 
4479     trace_bdrv_lock_medium(bs, locked);
4480 
4481     if (drv && drv->bdrv_lock_medium) {
4482         drv->bdrv_lock_medium(bs, locked);
4483     }
4484 }
4485 
4486 /* needed for generic scsi interface */
4487 
4488 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4489 {
4490     BlockDriver *drv = bs->drv;
4491 
4492     if (drv && drv->bdrv_ioctl)
4493         return drv->bdrv_ioctl(bs, req, buf);
4494     return -ENOTSUP;
4495 }
4496 
4497 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4498         unsigned long int req, void *buf,
4499         BlockDriverCompletionFunc *cb, void *opaque)
4500 {
4501     BlockDriver *drv = bs->drv;
4502 
4503     if (drv && drv->bdrv_aio_ioctl)
4504         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4505     return NULL;
4506 }
4507 
4508 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
4509 {
4510     bs->buffer_alignment = align;
4511 }
4512 
4513 void *qemu_blockalign(BlockDriverState *bs, size_t size)
4514 {
4515     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
4516 }
4517 
4518 /*
4519  * Check if all memory in this vector is sector aligned.
4520  */
4521 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4522 {
4523     int i;
4524 
4525     for (i = 0; i < qiov->niov; i++) {
4526         if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4527             return false;
4528         }
4529     }
4530 
4531     return true;
4532 }
4533 
4534 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
4535 {
4536     int64_t bitmap_size;
4537     BdrvDirtyBitmap *bitmap;
4538 
4539     assert((granularity & (granularity - 1)) == 0);
4540 
4541     granularity >>= BDRV_SECTOR_BITS;
4542     assert(granularity);
4543     bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
4544     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
4545     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
4546     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
4547     return bitmap;
4548 }
4549 
4550 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4551 {
4552     BdrvDirtyBitmap *bm, *next;
4553     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
4554         if (bm == bitmap) {
4555             QLIST_REMOVE(bitmap, list);
4556             hbitmap_free(bitmap->bitmap);
4557             g_free(bitmap);
4558             return;
4559         }
4560     }
4561 }
4562 
4563 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
4564 {
4565     BdrvDirtyBitmap *bm;
4566     BlockDirtyInfoList *list = NULL;
4567     BlockDirtyInfoList **plist = &list;
4568 
4569     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
4570         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
4571         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
4572         info->count = bdrv_get_dirty_count(bs, bm);
4573         info->granularity =
4574             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
4575         entry->value = info;
4576         *plist = entry;
4577         plist = &entry->next;
4578     }
4579 
4580     return list;
4581 }
4582 
4583 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
4584 {
4585     if (bitmap) {
4586         return hbitmap_get(bitmap->bitmap, sector);
4587     } else {
4588         return 0;
4589     }
4590 }
4591 
4592 void bdrv_dirty_iter_init(BlockDriverState *bs,
4593                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
4594 {
4595     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
4596 }
4597 
4598 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
4599                     int nr_sectors)
4600 {
4601     BdrvDirtyBitmap *bitmap;
4602     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4603         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
4604     }
4605 }
4606 
4607 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
4608 {
4609     BdrvDirtyBitmap *bitmap;
4610     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
4611         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
4612     }
4613 }
4614 
4615 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
4616 {
4617     return hbitmap_count(bitmap->bitmap);
4618 }
4619 
4620 /* Get a reference to bs */
4621 void bdrv_ref(BlockDriverState *bs)
4622 {
4623     bs->refcnt++;
4624 }
4625 
4626 /* Release a previously grabbed reference to bs.
4627  * If after releasing, reference count is zero, the BlockDriverState is
4628  * deleted. */
4629 void bdrv_unref(BlockDriverState *bs)
4630 {
4631     assert(bs->refcnt > 0);
4632     if (--bs->refcnt == 0) {
4633         bdrv_delete(bs);
4634     }
4635 }
4636 
4637 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4638 {
4639     assert(bs->in_use != in_use);
4640     bs->in_use = in_use;
4641 }
4642 
4643 int bdrv_in_use(BlockDriverState *bs)
4644 {
4645     return bs->in_use;
4646 }
4647 
4648 void bdrv_iostatus_enable(BlockDriverState *bs)
4649 {
4650     bs->iostatus_enabled = true;
4651     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4652 }
4653 
4654 /* The I/O status is only enabled if the drive explicitly
4655  * enables it _and_ the VM is configured to stop on errors */
4656 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4657 {
4658     return (bs->iostatus_enabled &&
4659            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4660             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4661             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4662 }
4663 
4664 void bdrv_iostatus_disable(BlockDriverState *bs)
4665 {
4666     bs->iostatus_enabled = false;
4667 }
4668 
4669 void bdrv_iostatus_reset(BlockDriverState *bs)
4670 {
4671     if (bdrv_iostatus_is_enabled(bs)) {
4672         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4673         if (bs->job) {
4674             block_job_iostatus_reset(bs->job);
4675         }
4676     }
4677 }
4678 
4679 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4680 {
4681     assert(bdrv_iostatus_is_enabled(bs));
4682     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4683         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4684                                          BLOCK_DEVICE_IO_STATUS_FAILED;
4685     }
4686 }
4687 
4688 void
4689 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4690         enum BlockAcctType type)
4691 {
4692     assert(type < BDRV_MAX_IOTYPE);
4693 
4694     cookie->bytes = bytes;
4695     cookie->start_time_ns = get_clock();
4696     cookie->type = type;
4697 }
4698 
4699 void
4700 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4701 {
4702     assert(cookie->type < BDRV_MAX_IOTYPE);
4703 
4704     bs->nr_bytes[cookie->type] += cookie->bytes;
4705     bs->nr_ops[cookie->type]++;
4706     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4707 }
4708 
4709 void bdrv_img_create(const char *filename, const char *fmt,
4710                      const char *base_filename, const char *base_fmt,
4711                      char *options, uint64_t img_size, int flags,
4712                      Error **errp, bool quiet)
4713 {
4714     QEMUOptionParameter *param = NULL, *create_options = NULL;
4715     QEMUOptionParameter *backing_fmt, *backing_file, *size;
4716     BlockDriver *drv, *proto_drv;
4717     BlockDriver *backing_drv = NULL;
4718     Error *local_err = NULL;
4719     int ret = 0;
4720 
4721     /* Find driver and parse its options */
4722     drv = bdrv_find_format(fmt);
4723     if (!drv) {
4724         error_setg(errp, "Unknown file format '%s'", fmt);
4725         return;
4726     }
4727 
4728     proto_drv = bdrv_find_protocol(filename, true);
4729     if (!proto_drv) {
4730         error_setg(errp, "Unknown protocol '%s'", filename);
4731         return;
4732     }
4733 
4734     create_options = append_option_parameters(create_options,
4735                                               drv->create_options);
4736     create_options = append_option_parameters(create_options,
4737                                               proto_drv->create_options);
4738 
4739     /* Create parameter list with default values */
4740     param = parse_option_parameters("", create_options, param);
4741 
4742     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4743 
4744     /* Parse -o options */
4745     if (options) {
4746         param = parse_option_parameters(options, create_options, param);
4747         if (param == NULL) {
4748             error_setg(errp, "Invalid options for file format '%s'.", fmt);
4749             goto out;
4750         }
4751     }
4752 
4753     if (base_filename) {
4754         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4755                                  base_filename)) {
4756             error_setg(errp, "Backing file not supported for file format '%s'",
4757                        fmt);
4758             goto out;
4759         }
4760     }
4761 
4762     if (base_fmt) {
4763         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4764             error_setg(errp, "Backing file format not supported for file "
4765                              "format '%s'", fmt);
4766             goto out;
4767         }
4768     }
4769 
4770     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4771     if (backing_file && backing_file->value.s) {
4772         if (!strcmp(filename, backing_file->value.s)) {
4773             error_setg(errp, "Error: Trying to create an image with the "
4774                              "same filename as the backing file");
4775             goto out;
4776         }
4777     }
4778 
4779     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4780     if (backing_fmt && backing_fmt->value.s) {
4781         backing_drv = bdrv_find_format(backing_fmt->value.s);
4782         if (!backing_drv) {
4783             error_setg(errp, "Unknown backing file format '%s'",
4784                        backing_fmt->value.s);
4785             goto out;
4786         }
4787     }
4788 
4789     // The size for the image must always be specified, with one exception:
4790     // If we are using a backing file, we can obtain the size from there
4791     size = get_option_parameter(param, BLOCK_OPT_SIZE);
4792     if (size && size->value.n == -1) {
4793         if (backing_file && backing_file->value.s) {
4794             BlockDriverState *bs;
4795             uint64_t size;
4796             char buf[32];
4797             int back_flags;
4798 
4799             /* backing files always opened read-only */
4800             back_flags =
4801                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4802 
4803             bs = bdrv_new("");
4804 
4805             ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
4806                             backing_drv, &local_err);
4807             if (ret < 0) {
4808                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
4809                                  backing_file->value.s,
4810                                  error_get_pretty(local_err));
4811                 error_free(local_err);
4812                 local_err = NULL;
4813                 bdrv_unref(bs);
4814                 goto out;
4815             }
4816             bdrv_get_geometry(bs, &size);
4817             size *= 512;
4818 
4819             snprintf(buf, sizeof(buf), "%" PRId64, size);
4820             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4821 
4822             bdrv_unref(bs);
4823         } else {
4824             error_setg(errp, "Image creation needs a size parameter");
4825             goto out;
4826         }
4827     }
4828 
4829     if (!quiet) {
4830         printf("Formatting '%s', fmt=%s ", filename, fmt);
4831         print_option_parameters(param);
4832         puts("");
4833     }
4834     ret = bdrv_create(drv, filename, param, &local_err);
4835     if (ret == -EFBIG) {
4836         /* This is generally a better message than whatever the driver would
4837          * deliver (especially because of the cluster_size_hint), since that
4838          * is most probably not much different from "image too large". */
4839         const char *cluster_size_hint = "";
4840         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
4841             cluster_size_hint = " (try using a larger cluster size)";
4842         }
4843         error_setg(errp, "The image size is too large for file format '%s'"
4844                    "%s", fmt, cluster_size_hint);
4845         error_free(local_err);
4846         local_err = NULL;
4847     }
4848 
4849 out:
4850     free_option_parameters(create_options);
4851     free_option_parameters(param);
4852 
4853     if (error_is_set(&local_err)) {
4854         error_propagate(errp, local_err);
4855     }
4856 }
4857 
4858 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
4859 {
4860     /* Currently BlockDriverState always uses the main loop AioContext */
4861     return qemu_get_aio_context();
4862 }
4863 
4864 void bdrv_add_before_write_notifier(BlockDriverState *bs,
4865                                     NotifierWithReturn *notifier)
4866 {
4867     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
4868 }
4869 
4870 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
4871 {
4872     if (bs->drv->bdrv_amend_options == NULL) {
4873         return -ENOTSUP;
4874     }
4875     return bs->drv->bdrv_amend_options(bs, options);
4876 }
4877 
4878 ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs)
4879 {
4880     if (bs->drv->bdrv_check_ext_snapshot) {
4881         return bs->drv->bdrv_check_ext_snapshot(bs);
4882     }
4883 
4884     if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) {
4885         return bs->file->drv->bdrv_check_ext_snapshot(bs);
4886     }
4887 
4888     /* external snapshots are allowed by default */
4889     return EXT_SNAPSHOT_ALLOWED;
4890 }
4891 
4892 ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs)
4893 {
4894     return EXT_SNAPSHOT_FORBIDDEN;
4895 }
4896