xref: /openbmc/qemu/block.c (revision a719a27c)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 
39 #ifdef CONFIG_BSD
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/queue.h>
44 #ifndef __DragonFly__
45 #include <sys/disk.h>
46 #endif
47 #endif
48 
49 #ifdef _WIN32
50 #include <windows.h>
51 #endif
52 
53 struct BdrvDirtyBitmap {
54     HBitmap *bitmap;
55     QLIST_ENTRY(BdrvDirtyBitmap) list;
56 };
57 
58 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
59 
60 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63         BlockDriverCompletionFunc *cb, void *opaque);
64 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66         BlockDriverCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68                                          int64_t sector_num, int nb_sectors,
69                                          QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71                                          int64_t sector_num, int nb_sectors,
72                                          QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75     BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78     BdrvRequestFlags flags);
79 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80                                                int64_t sector_num,
81                                                QEMUIOVector *qiov,
82                                                int nb_sectors,
83                                                BdrvRequestFlags flags,
84                                                BlockDriverCompletionFunc *cb,
85                                                void *opaque,
86                                                bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90 
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92     QTAILQ_HEAD_INITIALIZER(bdrv_states);
93 
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95     QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96 
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98     QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
102 
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
105 {
106     return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107              (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108             filename[1] == ':');
109 }
110 
111 int is_windows_drive(const char *filename)
112 {
113     if (is_windows_drive_prefix(filename) &&
114         filename[2] == '\0')
115         return 1;
116     if (strstart(filename, "\\\\.\\", NULL) ||
117         strstart(filename, "//./", NULL))
118         return 1;
119     return 0;
120 }
121 #endif
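
/*
 * A quick illustrative sketch (hypothetical inputs) of how these helpers
 * classify Windows paths:
 *
 *     is_windows_drive("c:")                    -> 1  (bare drive)
 *     is_windows_drive("\\\\.\\PhysicalDrive0") -> 1  (device namespace)
 *     is_windows_drive("c:\\disk.img")          -> 0  (prefix only; see
 *                                                     is_windows_drive_prefix())
 */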
122 
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125                         ThrottleConfig *cfg)
126 {
127     int i;
128 
129     throttle_config(&bs->throttle_state, cfg);
130 
131     for (i = 0; i < 2; i++) {
132         qemu_co_enter_next(&bs->throttled_reqs[i]);
133     }
134 }
135 
136 /* this function drains all the throttled I/Os */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138 {
139     bool drained = false;
140     bool enabled = bs->io_limits_enabled;
141     int i;
142 
143     bs->io_limits_enabled = false;
144 
145     for (i = 0; i < 2; i++) {
146         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147             drained = true;
148         }
149     }
150 
151     bs->io_limits_enabled = enabled;
152 
153     return drained;
154 }
155 
156 void bdrv_io_limits_disable(BlockDriverState *bs)
157 {
158     bs->io_limits_enabled = false;
159 
160     bdrv_start_throttled_reqs(bs);
161 
162     throttle_destroy(&bs->throttle_state);
163 }
164 
165 static void bdrv_throttle_read_timer_cb(void *opaque)
166 {
167     BlockDriverState *bs = opaque;
168     qemu_co_enter_next(&bs->throttled_reqs[0]);
169 }
170 
171 static void bdrv_throttle_write_timer_cb(void *opaque)
172 {
173     BlockDriverState *bs = opaque;
174     qemu_co_enter_next(&bs->throttled_reqs[1]);
175 }
176 
177 /* must be called before bdrv_set_io_limits() if a limit is to be set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
179 {
180     assert(!bs->io_limits_enabled);
181     throttle_init(&bs->throttle_state,
182                   QEMU_CLOCK_VIRTUAL,
183                   bdrv_throttle_read_timer_cb,
184                   bdrv_throttle_write_timer_cb,
185                   bs);
186     bs->io_limits_enabled = true;
187 }
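
/*
 * A minimal sketch of turning throttling on for a BlockDriverState.
 * The ThrottleConfig fields and limit values below are illustrative;
 * see util/throttle.c for the full set of buckets:
 *
 *     ThrottleConfig cfg;
 *
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; // 10 MB/s
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 1000;             // 1000 IOPS
 *
 *     bdrv_io_limits_enable(bs);    // init throttle state first (see above)
 *     bdrv_set_io_limits(bs, &cfg);
 */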
188 
189 /* This function makes an I/O request wait if needed
190  *
191  * @bytes:    the number of bytes of the I/O
192  * @is_write: whether the I/O is a write
193  */
194 static void bdrv_io_limits_intercept(BlockDriverState *bs,
195                                      unsigned int bytes,
196                                      bool is_write)
197 {
198     /* does this I/O have to wait? */
199     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
200 
201     /* if it must wait, or any request of this type is already queued, queue the I/O */
202     if (must_wait ||
203         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
205     }
206 
207     /* the IO will be executed, do the accounting */
208     throttle_account(&bs->throttle_state, is_write, bytes);
209 
210 
211     /* if the next request must wait -> do nothing */
212     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213         return;
214     }
215 
216     /* else queue next request for execution */
217     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
218 }
219 
220 size_t bdrv_opt_mem_align(BlockDriverState *bs)
221 {
222     if (!bs || !bs->drv) {
223         /* 4k should be on the safe side */
224         return 4096;
225     }
226 
227     return bs->bl.opt_mem_alignment;
228 }
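
/*
 * Usage sketch: allocate an I/O buffer that honors the reported
 * alignment (qemu_memalign()/qemu_vfree() are the usual helpers):
 *
 *     size_t align = bdrv_opt_mem_align(bs);
 *     void *buf = qemu_memalign(align, len);
 *     ...
 *     qemu_vfree(buf);
 */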
229 
230 /* check if the path starts with "<protocol>:" */
231 static int path_has_protocol(const char *path)
232 {
233     const char *p;
234 
235 #ifdef _WIN32
236     if (is_windows_drive(path) ||
237         is_windows_drive_prefix(path)) {
238         return 0;
239     }
240     p = path + strcspn(path, ":/\\");
241 #else
242     p = path + strcspn(path, ":/");
243 #endif
244 
245     return *p == ':';
246 }
247 
248 int path_is_absolute(const char *path)
249 {
250 #ifdef _WIN32
251     /* specific case for names like: "\\.\d:" */
252     if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
253         return 1;
254     }
255     return (*path == '/' || *path == '\\');
256 #else
257     return (*path == '/');
258 #endif
259 }
260 
261 /* If filename is absolute, just copy it to dest. Otherwise, build a
262    path to it by treating it as relative to base_path. URLs are
263    supported. */
264 void path_combine(char *dest, int dest_size,
265                   const char *base_path,
266                   const char *filename)
267 {
268     const char *p, *p1;
269     int len;
270 
271     if (dest_size <= 0)
272         return;
273     if (path_is_absolute(filename)) {
274         pstrcpy(dest, dest_size, filename);
275     } else {
276         p = strchr(base_path, ':');
277         if (p)
278             p++;
279         else
280             p = base_path;
281         p1 = strrchr(base_path, '/');
282 #ifdef _WIN32
283         {
284             const char *p2;
285             p2 = strrchr(base_path, '\\');
286             if (!p1 || p2 > p1)
287                 p1 = p2;
288         }
289 #endif
290         if (p1)
291             p1++;
292         else
293             p1 = base_path;
294         if (p1 > p)
295             p = p1;
296         len = p - base_path;
297         if (len > dest_size - 1)
298             len = dest_size - 1;
299         memcpy(dest, base_path, len);
300         dest[len] = '\0';
301         pstrcat(dest, dest_size, filename);
302     }
303 }
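
/*
 * A small sketch of path_combine() (paths are illustrative):
 *
 *     char dest[PATH_MAX];
 *
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "back.qcow2");
 *     -> dest == "/images/back.qcow2"
 *
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs.img");
 *     -> dest == "/abs.img"  (absolute filenames are copied verbatim)
 */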
304 
305 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
306 {
307     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
308         pstrcpy(dest, sz, bs->backing_file);
309     } else {
310         path_combine(dest, sz, bs->filename, bs->backing_file);
311     }
312 }
313 
314 void bdrv_register(BlockDriver *bdrv)
315 {
316     /* Block drivers without coroutine functions need emulation */
317     if (!bdrv->bdrv_co_readv) {
318         bdrv->bdrv_co_readv = bdrv_co_readv_em;
319         bdrv->bdrv_co_writev = bdrv_co_writev_em;
320 
321         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
322          * the block driver lacks aio we need to emulate that too.
323          */
324         if (!bdrv->bdrv_aio_readv) {
325             /* add AIO emulation layer */
326             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
327             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
328         }
329     }
330 
331     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
332 }
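
/*
 * Block drivers register themselves at startup. A hedged sketch of the
 * usual pattern; the driver name, state type and callbacks here are
 * hypothetical:
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",
 *         .instance_size = sizeof(BDRVMydrvState),
 *         .bdrv_open     = mydrv_open,
 *         .bdrv_close    = mydrv_close,
 *         .bdrv_co_readv = mydrv_co_readv,
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *
 *     block_init(bdrv_mydrv_init);
 */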
333 
334 /* create a new block device (by default it is empty) */
335 BlockDriverState *bdrv_new(const char *device_name, Error **errp)
336 {
337     BlockDriverState *bs;
338 
339     if (bdrv_find(device_name)) {
340         error_setg(errp, "Device with id '%s' already exists",
341                    device_name);
342         return NULL;
343     }
344     if (bdrv_find_node(device_name)) {
345         error_setg(errp, "Device with node-name '%s' already exists",
346                    device_name);
347         return NULL;
348     }
349 
350     bs = g_malloc0(sizeof(BlockDriverState));
351     QLIST_INIT(&bs->dirty_bitmaps);
352     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
353     if (device_name[0] != '\0') {
354         QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
355     }
356     bdrv_iostatus_disable(bs);
357     notifier_list_init(&bs->close_notifiers);
358     notifier_with_return_list_init(&bs->before_write_notifiers);
359     qemu_co_queue_init(&bs->throttled_reqs[0]);
360     qemu_co_queue_init(&bs->throttled_reqs[1]);
361     bs->refcnt = 1;
362 
363     return bs;
364 }
365 
366 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
367 {
368     notifier_list_add(&bs->close_notifiers, notify);
369 }
370 
371 BlockDriver *bdrv_find_format(const char *format_name)
372 {
373     BlockDriver *drv1;
374     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
375         if (!strcmp(drv1->format_name, format_name)) {
376             return drv1;
377         }
378     }
379     return NULL;
380 }
381 
382 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
383 {
384     static const char *whitelist_rw[] = {
385         CONFIG_BDRV_RW_WHITELIST
386     };
387     static const char *whitelist_ro[] = {
388         CONFIG_BDRV_RO_WHITELIST
389     };
390     const char **p;
391 
392     if (!whitelist_rw[0] && !whitelist_ro[0]) {
393         return 1;               /* no whitelist, anything goes */
394     }
395 
396     for (p = whitelist_rw; *p; p++) {
397         if (!strcmp(drv->format_name, *p)) {
398             return 1;
399         }
400     }
401     if (read_only) {
402         for (p = whitelist_ro; *p; p++) {
403             if (!strcmp(drv->format_name, *p)) {
404                 return 1;
405             }
406         }
407     }
408     return 0;
409 }
410 
411 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
412                                           bool read_only)
413 {
414     BlockDriver *drv = bdrv_find_format(format_name);
415     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
416 }
417 
418 typedef struct CreateCo {
419     BlockDriver *drv;
420     char *filename;
421     QEMUOptionParameter *options;
422     int ret;
423     Error *err;
424 } CreateCo;
425 
426 static void coroutine_fn bdrv_create_co_entry(void *opaque)
427 {
428     Error *local_err = NULL;
429     int ret;
430 
431     CreateCo *cco = opaque;
432     assert(cco->drv);
433 
434     ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
435     if (local_err) {
436         error_propagate(&cco->err, local_err);
437     }
438     cco->ret = ret;
439 }
440 
441 int bdrv_create(BlockDriver *drv, const char* filename,
442     QEMUOptionParameter *options, Error **errp)
443 {
444     int ret;
445 
446     Coroutine *co;
447     CreateCo cco = {
448         .drv = drv,
449         .filename = g_strdup(filename),
450         .options = options,
451         .ret = NOT_DONE,
452         .err = NULL,
453     };
454 
455     if (!drv->bdrv_create) {
456         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
457         ret = -ENOTSUP;
458         goto out;
459     }
460 
461     if (qemu_in_coroutine()) {
462         /* Fast-path if already in coroutine context */
463         bdrv_create_co_entry(&cco);
464     } else {
465         co = qemu_coroutine_create(bdrv_create_co_entry);
466         qemu_coroutine_enter(co, &cco);
467         while (cco.ret == NOT_DONE) {
468             qemu_aio_wait();
469         }
470     }
471 
472     ret = cco.ret;
473     if (ret < 0) {
474         if (cco.err) {
475             error_propagate(errp, cco.err);
476         } else {
477             error_setg_errno(errp, -ret, "Could not create image");
478         }
479     }
480 
481 out:
482     g_free(cco.filename);
483     return ret;
484 }
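
/*
 * A minimal sketch of image creation; the filename and size are
 * illustrative (compare the qcow2 overlay creation in
 * bdrv_append_temp_snapshot() further down):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     Error *err = NULL;
 *     int ret;
 *
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     ret = bdrv_create(drv, "/tmp/test.qcow2", opts, &err);
 *     free_option_parameters(opts);
 */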
485 
486 int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
487                      Error **errp)
488 {
489     BlockDriver *drv;
490     Error *local_err = NULL;
491     int ret;
492 
493     drv = bdrv_find_protocol(filename, true);
494     if (drv == NULL) {
495         error_setg(errp, "Could not find protocol for file '%s'", filename);
496         return -ENOENT;
497     }
498 
499     ret = bdrv_create(drv, filename, options, &local_err);
500     if (local_err) {
501         error_propagate(errp, local_err);
502     }
503     return ret;
504 }
505 
506 int bdrv_refresh_limits(BlockDriverState *bs)
507 {
508     BlockDriver *drv = bs->drv;
509 
510     memset(&bs->bl, 0, sizeof(bs->bl));
511 
512     if (!drv) {
513         return 0;
514     }
515 
516     /* Take some limits from the children as a default */
517     if (bs->file) {
518         bdrv_refresh_limits(bs->file);
519         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
520         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
521     } else {
522         bs->bl.opt_mem_alignment = 512;
523     }
524 
525     if (bs->backing_hd) {
526         bdrv_refresh_limits(bs->backing_hd);
527         bs->bl.opt_transfer_length =
528             MAX(bs->bl.opt_transfer_length,
529                 bs->backing_hd->bl.opt_transfer_length);
530         bs->bl.opt_mem_alignment =
531             MAX(bs->bl.opt_mem_alignment,
532                 bs->backing_hd->bl.opt_mem_alignment);
533     }
534 
535     /* Then let the driver override it */
536     if (drv->bdrv_refresh_limits) {
537         return drv->bdrv_refresh_limits(bs);
538     }
539 
540     return 0;
541 }
542 
543 /*
544  * Create a uniquely-named empty temporary file.
545  * Return 0 upon success, otherwise a negative errno value.
546  */
547 int get_tmp_filename(char *filename, int size)
548 {
549 #ifdef _WIN32
550     char temp_dir[MAX_PATH];
551     /* GetTempFileName requires that its output buffer (4th param)
552        have length MAX_PATH or greater.  */
553     assert(size >= MAX_PATH);
554     return (GetTempPath(MAX_PATH, temp_dir)
555             && GetTempFileName(temp_dir, "qem", 0, filename)
556             ? 0 : -GetLastError());
557 #else
558     int fd;
559     const char *tmpdir;
560     tmpdir = getenv("TMPDIR");
561     if (!tmpdir) {
562         tmpdir = "/var/tmp";
563     }
564     if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
565         return -EOVERFLOW;
566     }
567     fd = mkstemp(filename);
568     if (fd < 0) {
569         return -errno;
570     }
571     if (close(fd) != 0) {
572         unlink(filename);
573         return -errno;
574     }
575     return 0;
576 #endif
577 }
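
/*
 * Usage sketch:
 *
 *     char tmp[PATH_MAX];
 *     int ret = get_tmp_filename(tmp, sizeof(tmp));
 *
 *     if (ret < 0) {
 *         ... negative errno, tmp contents are undefined ...
 *     }
 */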
578 
579 /*
580  * Detect host devices. By convention, /dev/cdrom[N] is always
581  * recognized as a host CDROM.
582  */
583 static BlockDriver *find_hdev_driver(const char *filename)
584 {
585     int score_max = 0, score;
586     BlockDriver *drv = NULL, *d;
587 
588     QLIST_FOREACH(d, &bdrv_drivers, list) {
589         if (d->bdrv_probe_device) {
590             score = d->bdrv_probe_device(filename);
591             if (score > score_max) {
592                 score_max = score;
593                 drv = d;
594             }
595         }
596     }
597 
598     return drv;
599 }
600 
601 BlockDriver *bdrv_find_protocol(const char *filename,
602                                 bool allow_protocol_prefix)
603 {
604     BlockDriver *drv1;
605     char protocol[128];
606     int len;
607     const char *p;
608 
609     /* TODO Drivers without bdrv_file_open must be specified explicitly */
610 
611     /*
612      * XXX(hch): we really should not let host device detection
613      * override an explicit protocol specification, but moving this
614      * later breaks access to device names with colons in them.
615      * Thanks to the brain-dead persistent naming schemes on udev-
616      * based Linux systems those actually are quite common.
617      */
618     drv1 = find_hdev_driver(filename);
619     if (drv1) {
620         return drv1;
621     }
622 
623     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
624         return bdrv_find_format("file");
625     }
626 
627     p = strchr(filename, ':');
628     assert(p != NULL);
629     len = p - filename;
630     if (len > sizeof(protocol) - 1)
631         len = sizeof(protocol) - 1;
632     memcpy(protocol, filename, len);
633     protocol[len] = '\0';
634     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
635         if (drv1->protocol_name &&
636             !strcmp(drv1->protocol_name, protocol)) {
637             return drv1;
638         }
639     }
640     return NULL;
641 }
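
/*
 * Illustrative resolution (a sketch; which protocol drivers exist
 * depends on the build):
 *
 *     bdrv_find_protocol("/tmp/a.img", true)        -> "file" driver
 *     bdrv_find_protocol("nbd://host:10809", true)  -> "nbd" driver
 *     bdrv_find_protocol("nbd://host:10809", false) -> "file" driver,
 *                                     since prefix parsing is disallowed
 */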
642 
643 static int find_image_format(BlockDriverState *bs, const char *filename,
644                              BlockDriver **pdrv, Error **errp)
645 {
646     int score, score_max;
647     BlockDriver *drv1, *drv;
648     uint8_t buf[2048];
649     int ret = 0;
650 
651     /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
652     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
653         drv = bdrv_find_format("raw");
654         if (!drv) {
655             error_setg(errp, "Could not find raw image format");
656             ret = -ENOENT;
657         }
658         *pdrv = drv;
659         return ret;
660     }
661 
662     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
663     if (ret < 0) {
664         error_setg_errno(errp, -ret, "Could not read image for determining its "
665                          "format");
666         *pdrv = NULL;
667         return ret;
668     }
669 
670     score_max = 0;
671     drv = NULL;
672     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
673         if (drv1->bdrv_probe) {
674             score = drv1->bdrv_probe(buf, ret, filename);
675             if (score > score_max) {
676                 score_max = score;
677                 drv = drv1;
678             }
679         }
680     }
681     if (!drv) {
682         error_setg(errp, "Could not determine image format: No compatible "
683                    "driver found");
684         ret = -ENOENT;
685     }
686     *pdrv = drv;
687     return ret;
688 }
689 
690 /**
691  * Set the current 'total_sectors' value
692  */
693 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
694 {
695     BlockDriver *drv = bs->drv;
696 
697     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
698     if (bs->sg)
699         return 0;
700 
701     /* query actual device if possible, otherwise just trust the hint */
702     if (drv->bdrv_getlength) {
703         int64_t length = drv->bdrv_getlength(bs);
704         if (length < 0) {
705             return length;
706         }
707         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
708     }
709 
710     bs->total_sectors = hint;
711     return 0;
712 }
713 
714 /**
715  * Set open flags for a given discard mode
716  *
717  * Return 0 on success, -1 if the discard mode was invalid.
718  */
719 int bdrv_parse_discard_flags(const char *mode, int *flags)
720 {
721     *flags &= ~BDRV_O_UNMAP;
722 
723     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
724         /* do nothing */
725     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
726         *flags |= BDRV_O_UNMAP;
727     } else {
728         return -1;
729     }
730 
731     return 0;
732 }
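
/*
 * Usage sketch:
 *
 *     int flags = 0;
 *
 *     if (bdrv_parse_discard_flags("unmap", &flags) == 0) {
 *         ... flags now has BDRV_O_UNMAP set ...
 *     }
 */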
733 
734 /**
735  * Set open flags for a given cache mode
736  *
737  * Return 0 on success, -1 if the cache mode was invalid.
738  */
739 int bdrv_parse_cache_flags(const char *mode, int *flags)
740 {
741     *flags &= ~BDRV_O_CACHE_MASK;
742 
743     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
744         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
745     } else if (!strcmp(mode, "directsync")) {
746         *flags |= BDRV_O_NOCACHE;
747     } else if (!strcmp(mode, "writeback")) {
748         *flags |= BDRV_O_CACHE_WB;
749     } else if (!strcmp(mode, "unsafe")) {
750         *flags |= BDRV_O_CACHE_WB;
751         *flags |= BDRV_O_NO_FLUSH;
752     } else if (!strcmp(mode, "writethrough")) {
753         /* this is the default */
754     } else {
755         return -1;
756     }
757 
758     return 0;
759 }
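
/*
 * The resulting mapping, plus a usage sketch:
 *
 *     mode          flags set
 *     ------------  ----------------------------------
 *     none/off      BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     directsync    BDRV_O_NOCACHE
 *     writeback     BDRV_O_CACHE_WB
 *     unsafe        BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *     writethrough  (none; this is the default)
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 */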
760 
761 /**
762  * The copy-on-read flag is actually a reference count so multiple users may
763  * use the feature without worrying about clobbering its previous state.
764  * Copy-on-read stays enabled until all users have disabled it again.
765  */
766 void bdrv_enable_copy_on_read(BlockDriverState *bs)
767 {
768     bs->copy_on_read++;
769 }
770 
771 void bdrv_disable_copy_on_read(BlockDriverState *bs)
772 {
773     assert(bs->copy_on_read > 0);
774     bs->copy_on_read--;
775 }
776 
777 /*
778  * Returns the flags that bs->file should get, based on the given flags for
779  * the parent BDS
780  */
781 static int bdrv_inherited_flags(int flags)
782 {
783     /* Enable protocol handling, disable format probing for bs->file */
784     flags |= BDRV_O_PROTOCOL;
785 
786     /* Our block drivers take care to send flushes and respect unmap policy,
787      * so we can enable both unconditionally on lower layers. */
788     flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
789 
790     /* The backing file of a temporary snapshot is read-only */
791     if (flags & BDRV_O_SNAPSHOT) {
792         flags &= ~BDRV_O_RDWR;
793     }
794 
795     /* Clear flags that only apply to the top layer */
796     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
797 
798     return flags;
799 }
800 
801 /*
802  * Returns the flags that bs->backing_hd should get, based on the given flags
803  * for the parent BDS
804  */
805 static int bdrv_backing_flags(int flags)
806 {
807     /* backing files are always opened read-only */
808     flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
809 
810     /* snapshot=on is handled on the top layer */
811     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
812 
813     return flags;
814 }
815 
816 static int bdrv_open_flags(BlockDriverState *bs, int flags)
817 {
818     int open_flags = flags | BDRV_O_CACHE_WB;
819 
820     /* The backing file of a temporary snapshot is read-only */
821     if (flags & BDRV_O_SNAPSHOT) {
822         open_flags &= ~BDRV_O_RDWR;
823     }
824 
825     /*
826      * Clear flags that are internal to the block layer before opening the
827      * image.
828      */
829     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
830 
831     /*
832      * Snapshots should be writable.
833      */
834     if (flags & BDRV_O_TEMPORARY) {
835         open_flags |= BDRV_O_RDWR;
836     }
837 
838     return open_flags;
839 }
840 
841 static void bdrv_assign_node_name(BlockDriverState *bs,
842                                   const char *node_name,
843                                   Error **errp)
844 {
845     if (!node_name) {
846         return;
847     }
848 
849     /* an empty string is not a valid node name */
850     if (node_name[0] == '\0') {
851         error_setg(errp, "Empty node name");
852         return;
853     }
854 
855     /* avoid a collision with the device-name namespace */
856     if (bdrv_find(node_name)) {
857         error_setg(errp, "node-name=%s conflicts with a device id",
858                    node_name);
859         return;
860     }
861 
862     /* avoid duplicate node names */
863     if (bdrv_find_node(node_name)) {
864         error_setg(errp, "Duplicate node name");
865         return;
866     }
867 
868     /* copy node name into the bs and insert it into the graph list */
869     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
870     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
871 }
872 
873 /*
874  * Common part for opening disk images and files
875  *
876  * Removes all processed options from *options.
877  */
878 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
879     QDict *options, int flags, BlockDriver *drv, Error **errp)
880 {
881     int ret, open_flags;
882     const char *filename;
883     const char *node_name = NULL;
884     Error *local_err = NULL;
885 
886     assert(drv != NULL);
887     assert(bs->file == NULL);
888     assert(options != NULL && bs->options != options);
889 
890     if (file != NULL) {
891         filename = file->filename;
892     } else {
893         filename = qdict_get_try_str(options, "filename");
894     }
895 
896     if (drv->bdrv_needs_filename && !filename) {
897         error_setg(errp, "The '%s' block driver requires a file name",
898                    drv->format_name);
899         return -EINVAL;
900     }
901 
902     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
903 
904     node_name = qdict_get_try_str(options, "node-name");
905     bdrv_assign_node_name(bs, node_name, &local_err);
906     if (local_err) {
907         error_propagate(errp, local_err);
908         return -EINVAL;
909     }
910     qdict_del(options, "node-name");
911 
912     /* bdrv_open() was called directly with a protocol driver as drv. That
913      * layer is already opened, so swap it into bs (file becomes a closed
914      * BlockDriverState) and return immediately. */
915     if (file != NULL && drv->bdrv_file_open) {
916         bdrv_swap(file, bs);
917         return 0;
918     }
919 
920     bs->open_flags = flags;
921     bs->guest_block_size = 512;
922     bs->request_alignment = 512;
923     bs->zero_beyond_eof = true;
924     open_flags = bdrv_open_flags(bs, flags);
925     bs->read_only = !(open_flags & BDRV_O_RDWR);
926 
927     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
928         error_setg(errp,
929                    !bs->read_only && bdrv_is_whitelisted(drv, true)
930                         ? "Driver '%s' can only be used for read-only devices"
931                         : "Driver '%s' is not whitelisted",
932                    drv->format_name);
933         return -ENOTSUP;
934     }
935 
936     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
937     if (flags & BDRV_O_COPY_ON_READ) {
938         if (!bs->read_only) {
939             bdrv_enable_copy_on_read(bs);
940         } else {
941             error_setg(errp, "Can't use copy-on-read on read-only device");
942             return -EINVAL;
943         }
944     }
945 
946     if (filename != NULL) {
947         pstrcpy(bs->filename, sizeof(bs->filename), filename);
948     } else {
949         bs->filename[0] = '\0';
950     }
951 
952     bs->drv = drv;
953     bs->opaque = g_malloc0(drv->instance_size);
954 
955     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
956 
957     /* Open the image, either directly or using a protocol */
958     if (drv->bdrv_file_open) {
959         assert(file == NULL);
960         assert(!drv->bdrv_needs_filename || filename != NULL);
961         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
962     } else {
963         if (file == NULL) {
964             error_setg(errp, "Can't use '%s' as a block driver for the "
965                        "protocol level", drv->format_name);
966             ret = -EINVAL;
967             goto free_and_fail;
968         }
969         bs->file = file;
970         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
971     }
972 
973     if (ret < 0) {
974         if (local_err) {
975             error_propagate(errp, local_err);
976         } else if (bs->filename[0]) {
977             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
978         } else {
979             error_setg_errno(errp, -ret, "Could not open image");
980         }
981         goto free_and_fail;
982     }
983 
984     ret = refresh_total_sectors(bs, bs->total_sectors);
985     if (ret < 0) {
986         error_setg_errno(errp, -ret, "Could not refresh total sector count");
987         goto free_and_fail;
988     }
989 
990     bdrv_refresh_limits(bs);
991     assert(bdrv_opt_mem_align(bs) != 0);
992     assert((bs->request_alignment != 0) || bs->sg);
993     return 0;
994 
995 free_and_fail:
996     bs->file = NULL;
997     g_free(bs->opaque);
998     bs->opaque = NULL;
999     bs->drv = NULL;
1000     return ret;
1001 }
1002 
1003 /*
1004  * Opens a file using a protocol (file, host_device, nbd, ...)
1005  *
1006  * options is an indirect pointer to a QDict of options to pass to the block
1007  * drivers, or pointer to NULL for an empty set of options. If this function
1008  * takes ownership of the QDict reference, it will set *options to NULL;
1009  * otherwise, it will contain unused/unrecognized options after this function
1010  * returns. Then, the caller is responsible for freeing it. If it intends to
1011  * reuse the QDict, QINCREF() should be called beforehand.
1012  */
1013 static int bdrv_file_open(BlockDriverState *bs, const char *filename,
1014                           QDict **options, int flags, Error **errp)
1015 {
1016     BlockDriver *drv;
1017     const char *drvname;
1018     bool parse_filename = false;
1019     Error *local_err = NULL;
1020     int ret;
1021 
1022     /* Fetch the file name from the options QDict if necessary */
1023     if (!filename) {
1024         filename = qdict_get_try_str(*options, "filename");
1025     } else if (filename && !qdict_haskey(*options, "filename")) {
1026         qdict_put(*options, "filename", qstring_from_str(filename));
1027         parse_filename = true;
1028     } else {
1029         error_setg(errp, "Can't specify 'file' and 'filename' options at the "
1030                    "same time");
1031         ret = -EINVAL;
1032         goto fail;
1033     }
1034 
1035     /* Find the right block driver */
1036     drvname = qdict_get_try_str(*options, "driver");
1037     if (drvname) {
1038         drv = bdrv_find_format(drvname);
1039         if (!drv) {
1040             error_setg(errp, "Unknown driver '%s'", drvname);
1041         }
1042         qdict_del(*options, "driver");
1043     } else if (filename) {
1044         drv = bdrv_find_protocol(filename, parse_filename);
1045         if (!drv) {
1046             error_setg(errp, "Unknown protocol");
1047         }
1048     } else {
1049         error_setg(errp, "Must specify either driver or file");
1050         drv = NULL;
1051     }
1052 
1053     if (!drv) {
1054         /* errp has been set already */
1055         ret = -ENOENT;
1056         goto fail;
1057     }
1058 
1059     /* Parse the filename and open it */
1060     if (drv->bdrv_parse_filename && parse_filename) {
1061         drv->bdrv_parse_filename(filename, *options, &local_err);
1062         if (local_err) {
1063             error_propagate(errp, local_err);
1064             ret = -EINVAL;
1065             goto fail;
1066         }
1067 
1068         if (!drv->bdrv_needs_filename) {
1069             qdict_del(*options, "filename");
1070         } else {
1071             filename = qdict_get_str(*options, "filename");
1072         }
1073     }
1074 
1075     if (!drv->bdrv_file_open) {
1076         ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
1077         *options = NULL;
1078     } else {
1079         ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
1080     }
1081     if (ret < 0) {
1082         error_propagate(errp, local_err);
1083         goto fail;
1084     }
1085 
1086     bs->growable = 1;
1087     return 0;
1088 
1089 fail:
1090     return ret;
1091 }
1092 
1093 /*
1094  * Opens the backing file for a BlockDriverState if not yet open
1095  *
1096  * options is a QDict of options to pass to the block drivers, or NULL for an
1097  * empty set of options. The reference to the QDict is transferred to this
1098  * function (even on failure), so if the caller intends to reuse the dictionary,
1099  * it needs to use QINCREF() before calling bdrv_open_backing_file().
1100  */
1101 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1102 {
1103     char *backing_filename = g_malloc0(PATH_MAX);
1104     int ret = 0;
1105     BlockDriver *back_drv = NULL;
1106     Error *local_err = NULL;
1107 
1108     if (bs->backing_hd != NULL) {
1109         QDECREF(options);
1110         goto free_exit;
1111     }
1112 
1113     /* NULL means an empty set of options */
1114     if (options == NULL) {
1115         options = qdict_new();
1116     }
1117 
1118     bs->open_flags &= ~BDRV_O_NO_BACKING;
1119     if (qdict_haskey(options, "file.filename")) {
1120         backing_filename[0] = '\0';
1121     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1122         QDECREF(options);
1123         goto free_exit;
1124     } else {
1125         bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1126     }
1127 
1128     if (bs->backing_format[0] != '\0') {
1129         back_drv = bdrv_find_format(bs->backing_format);
1130     }
1131 
1132     assert(bs->backing_hd == NULL);
1133     ret = bdrv_open(&bs->backing_hd,
1134                     *backing_filename ? backing_filename : NULL, NULL, options,
1135                     bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
1136     if (ret < 0) {
1137         bs->backing_hd = NULL;
1138         bs->open_flags |= BDRV_O_NO_BACKING;
1139         error_setg(errp, "Could not open backing file: %s",
1140                    error_get_pretty(local_err));
1141         error_free(local_err);
1142         goto free_exit;
1143     }
1144 
1145     if (bs->backing_hd->file) {
1146         pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1147                 bs->backing_hd->file->filename);
1148     }
1149 
1150     /* Recalculate the BlockLimits with the backing file */
1151     bdrv_refresh_limits(bs);
1152 
1153 free_exit:
1154     g_free(backing_filename);
1155     return ret;
1156 }
1157 
1158 /*
1159  * Opens a disk image whose options are given as BlockdevRef in another block
1160  * device's options.
1161  *
1162  * If allow_none is true, no image will be opened if filename is NULL and no
1163  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1164  *
1165  * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
1166  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1167  * itself, all options starting with "${bdref_key}." are considered part of the
1168  * BlockdevRef.
1169  *
1170  * The BlockdevRef will be removed from the options QDict.
1171  *
1172  * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1173  */
1174 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1175                     QDict *options, const char *bdref_key, int flags,
1176                     bool allow_none, Error **errp)
1177 {
1178     QDict *image_options;
1179     int ret;
1180     char *bdref_key_dot;
1181     const char *reference;
1182 
1183     assert(pbs);
1184     assert(*pbs == NULL);
1185 
1186     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1187     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1188     g_free(bdref_key_dot);
1189 
1190     reference = qdict_get_try_str(options, bdref_key);
1191     if (!filename && !reference && !qdict_size(image_options)) {
1192         if (allow_none) {
1193             ret = 0;
1194         } else {
1195             error_setg(errp, "A block device must be specified for \"%s\"",
1196                        bdref_key);
1197             ret = -EINVAL;
1198         }
1199         goto done;
1200     }
1201 
1202     ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1203 
1204 done:
1205     qdict_del(options, bdref_key);
1206     return ret;
1207 }
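
/*
 * A hedged sketch of opening a child image through a flattened
 * BlockdevRef; keys and values are illustrative:
 *
 *     QDict *opts = qdict_new();
 *     BlockDriverState *file = NULL;
 *     Error *err = NULL;
 *     int ret;
 *
 *     qdict_put(opts, "file.driver", qstring_from_str("file"));
 *     qdict_put(opts, "file.filename", qstring_from_str("/tmp/a.img"));
 *     ret = bdrv_open_image(&file, NULL, opts, "file",
 *                           BDRV_O_RDWR, false, &err);
 *     (all "file." options are consumed as the BlockdevRef)
 */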
1208 
1209 void bdrv_append_temp_snapshot(BlockDriverState *bs, Error **errp)
1210 {
1211     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1212     char *tmp_filename = g_malloc0(PATH_MAX + 1);
1213     int64_t total_size;
1214     BlockDriver *bdrv_qcow2;
1215     QEMUOptionParameter *create_options;
1216     QDict *snapshot_options;
1217     BlockDriverState *bs_snapshot;
1218     Error *local_err = NULL;
1219     int ret;
1220 
1221     /* if snapshot, we create a temporary qcow2 overlay and open it
1222        instead of opening 'filename' directly */
1223 
1224     /* Get the required size from the image */
1225     total_size = bdrv_getlength(bs);
1226     if (total_size < 0) {
1227         error_setg_errno(errp, -total_size, "Could not get image size");
1228         goto out;
1229     }
1230     total_size &= BDRV_SECTOR_MASK;
1231 
1232     /* Create the temporary image */
1233     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1234     if (ret < 0) {
1235         error_setg_errno(errp, -ret, "Could not get temporary filename");
1236         goto out;
1237     }
1238 
1239     bdrv_qcow2 = bdrv_find_format("qcow2");
1240     create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1241                                              NULL);
1242 
1243     set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1244 
1245     ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1246     free_option_parameters(create_options);
1247     if (ret < 0) {
1248         error_setg_errno(errp, -ret, "Could not create temporary overlay "
1249                          "'%s': %s", tmp_filename,
1250                          error_get_pretty(local_err));
1251         error_free(local_err);
1252         goto out;
1253     }
1254 
1255     /* Prepare a new options QDict for the temporary file */
1256     snapshot_options = qdict_new();
1257     qdict_put(snapshot_options, "file.driver",
1258               qstring_from_str("file"));
1259     qdict_put(snapshot_options, "file.filename",
1260               qstring_from_str(tmp_filename));
1261 
1262     bs_snapshot = bdrv_new("", &error_abort);
1263 
1264     ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1265                     (bs->open_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY,
1266                     bdrv_qcow2, &local_err);
1267     if (ret < 0) {
1268         error_propagate(errp, local_err);
1269         goto out;
1270     }
1271 
1272     bdrv_append(bs_snapshot, bs);
1273 
1274 out:
1275     g_free(tmp_filename);
1276 }
1277 
1278 /*
1279  * Opens a disk image (raw, qcow2, vmdk, ...)
1280  *
1281  * options is a QDict of options to pass to the block drivers, or NULL for an
1282  * empty set of options. The reference to the QDict belongs to the block layer
1283  * after the call (even on failure), so if the caller intends to reuse the
1284  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1285  *
1286  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1287  * If it is not NULL, the referenced BDS will be reused.
1288  *
1289  * The reference parameter may be used to specify an existing block device which
1290  * should be opened. If specified, neither options nor a filename may be given,
1291  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1292  */
1293 int bdrv_open(BlockDriverState **pbs, const char *filename,
1294               const char *reference, QDict *options, int flags,
1295               BlockDriver *drv, Error **errp)
1296 {
1297     int ret;
1298     BlockDriverState *file = NULL, *bs;
1299     const char *drvname;
1300     Error *local_err = NULL;
1301 
1302     assert(pbs);
1303 
1304     if (reference) {
1305         bool options_non_empty = options ? qdict_size(options) != 0 : false;
1306         QDECREF(options);
1307 
1308         if (*pbs) {
1309             error_setg(errp, "Cannot reuse an existing BDS when referencing "
1310                        "another block device");
1311             return -EINVAL;
1312         }
1313 
1314         if (filename || options_non_empty) {
1315             error_setg(errp, "Cannot reference an existing block device with "
1316                        "additional options or a new filename");
1317             return -EINVAL;
1318         }
1319 
1320         bs = bdrv_lookup_bs(reference, reference, errp);
1321         if (!bs) {
1322             return -ENODEV;
1323         }
1324         bdrv_ref(bs);
1325         *pbs = bs;
1326         return 0;
1327     }
1328 
1329     if (*pbs) {
1330         bs = *pbs;
1331     } else {
1332         bs = bdrv_new("", &error_abort);
1333     }
1334 
1335     /* NULL means an empty set of options */
1336     if (options == NULL) {
1337         options = qdict_new();
1338     }
1339 
1340     bs->options = options;
1341     options = qdict_clone_shallow(options);
1342 
1343     if (flags & BDRV_O_PROTOCOL) {
1344         assert(!drv);
1345         ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
1346                              &local_err);
1347         if (!ret) {
1348             drv = bs->drv;
1349             goto done;
1350         } else if (bs->drv) {
1351             goto close_and_fail;
1352         } else {
1353             goto fail;
1354         }
1355     }
1356 
1357     /* Open image file without format layer */
1358     if (flags & BDRV_O_RDWR) {
1359         flags |= BDRV_O_ALLOW_RDWR;
1360     }
1361 
1362     assert(file == NULL);
1363     ret = bdrv_open_image(&file, filename, options, "file",
1364                           bdrv_inherited_flags(flags),
1365                           true, &local_err);
1366     if (ret < 0) {
1367         goto fail;
1368     }
1369 
1370     /* Find the right image format driver */
1371     drvname = qdict_get_try_str(options, "driver");
1372     if (drvname) {
1373         drv = bdrv_find_format(drvname);
1374         qdict_del(options, "driver");
1375         if (!drv) {
1376             error_setg(errp, "Invalid driver: '%s'", drvname);
1377             ret = -EINVAL;
1378             goto fail;
1379         }
1380     }
1381 
1382     if (!drv) {
1383         if (file) {
1384             ret = find_image_format(file, filename, &drv, &local_err);
1385         } else {
1386             error_setg(errp, "Must specify either driver or file");
1387             ret = -EINVAL;
1388             goto fail;
1389         }
1390     }
1391 
1392     if (!drv) {
1393         goto fail;
1394     }
1395 
1396     /* Open the image */
1397     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1398     if (ret < 0) {
1399         goto fail;
1400     }
1401 
1402     if (file && (bs->file != file)) {
1403         bdrv_unref(file);
1404         file = NULL;
1405     }
1406 
1407     /* If there is a backing file, use it */
1408     if ((flags & BDRV_O_NO_BACKING) == 0) {
1409         QDict *backing_options;
1410 
1411         qdict_extract_subqdict(options, &backing_options, "backing.");
1412         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1413         if (ret < 0) {
1414             goto close_and_fail;
1415         }
1416     }
1417 
1418     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1419      * temporary snapshot afterwards. */
1420     if (flags & BDRV_O_SNAPSHOT) {
1421         bdrv_append_temp_snapshot(bs, &local_err);
1422         if (local_err) {
1423             error_propagate(errp, local_err);
1424             goto close_and_fail;
1425         }
1426     }
1427 
1428 
1429 done:
1430     /* Check if any unknown options were used */
1431     if (options && (qdict_size(options) != 0)) {
1432         const QDictEntry *entry = qdict_first(options);
1433         if (flags & BDRV_O_PROTOCOL) {
1434             error_setg(errp, "Block protocol '%s' doesn't support the option "
1435                        "'%s'", drv->format_name, entry->key);
1436         } else {
1437             error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1438                        "support the option '%s'", drv->format_name,
1439                        bs->device_name, entry->key);
1440         }
1441 
1442         ret = -EINVAL;
1443         goto close_and_fail;
1444     }
1445 
1446     if (!bdrv_key_required(bs)) {
1447         bdrv_dev_change_media_cb(bs, true);
1448     } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1449                && !runstate_check(RUN_STATE_INMIGRATE)
1450                && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1451         error_setg(errp,
1452                    "Guest must be stopped for opening of encrypted image");
1453         ret = -EBUSY;
1454         goto close_and_fail;
1455     }
1456 
1457     QDECREF(options);
1458     *pbs = bs;
1459     return 0;
1460 
1461 fail:
1462     if (file != NULL) {
1463         bdrv_unref(file);
1464     }
1465     QDECREF(bs->options);
1466     QDECREF(options);
1467     bs->options = NULL;
1468     if (!*pbs) {
1469         /* If *pbs is NULL, a new BDS has been created in this function and
1470            needs to be freed now. Otherwise, it does not need to be closed,
1471            since it has not really been opened yet. */
1472         bdrv_unref(bs);
1473     }
1474     if (local_err) {
1475         error_propagate(errp, local_err);
1476     }
1477     return ret;
1478 
1479 close_and_fail:
1480     /* See fail path, but now the BDS has to be always closed */
1481     if (*pbs) {
1482         bdrv_close(bs);
1483     } else {
1484         bdrv_unref(bs);
1485     }
1486     QDECREF(options);
1487     if (local_err) {
1488         error_propagate(errp, local_err);
1489     }
1490     return ret;
1491 }
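
/*
 * A minimal open/close sketch; filename and flags are illustrative.
 * With *pbs == NULL a fresh BDS is created and returned:
 *
 *     BlockDriverState *bs = NULL;
 *     Error *err = NULL;
 *     int ret = bdrv_open(&bs, "/tmp/test.qcow2", NULL, NULL,
 *                         BDRV_O_RDWR, NULL, &err);
 *
 *     if (ret == 0) {
 *         ...
 *         bdrv_unref(bs);
 *     }
 */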
1492 
1493 typedef struct BlockReopenQueueEntry {
1494      bool prepared;
1495      BDRVReopenState state;
1496      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1497 } BlockReopenQueueEntry;
1498 
1499 /*
1500  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1501  * reopen of multiple devices.
1502  *
1503  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLEQ_INIT
1504  * already performed, or it may be NULL, in which case a new BlockReopenQueue
1505  * will be created and initialized. This newly created BlockReopenQueue should be
1506  * passed back in for subsequent calls that are intended to be of the same
1507  * atomic 'set'.
1508  *
1509  * bs is the BlockDriverState to add to the reopen queue.
1510  *
1511  * flags contains the open flags for the associated bs
1512  *
1513  * returns a pointer to bs_queue, which is either the newly allocated
1514  * bs_queue, or the existing bs_queue being used.
1515  *
1516  */
1517 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1518                                     BlockDriverState *bs, int flags)
1519 {
1520     assert(bs != NULL);
1521 
1522     BlockReopenQueueEntry *bs_entry;
1523     if (bs_queue == NULL) {
1524         bs_queue = g_new0(BlockReopenQueue, 1);
1525         QSIMPLEQ_INIT(bs_queue);
1526     }
1527 
1528     /* bdrv_open() masks this flag out */
1529     flags &= ~BDRV_O_PROTOCOL;
1530 
1531     if (bs->file) {
1532         bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1533     }
1534 
1535     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1536     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1537 
1538     bs_entry->state.bs = bs;
1539     bs_entry->state.flags = flags;
1540 
1541     return bs_queue;
1542 }
1543 
1544 /*
1545  * Reopen multiple BlockDriverStates atomically & transactionally.
1546  *
1547  * The queue passed in (bs_queue) must have been built up previously
1548  * via bdrv_reopen_queue().
1549  *
1550  * Reopens all BDS specified in the queue, with the appropriate
1551  * flags.  All devices are prepared for reopen, and failure of any
1552  * device will cause all device changes to be abandoned, and intermediate
1553  * data cleaned up.
1554  *
1555  * If all devices prepare successfully, then the changes are committed
1556  * to all devices.
1557  *
1558  */
1559 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1560 {
1561     int ret = -1;
1562     BlockReopenQueueEntry *bs_entry, *next;
1563     Error *local_err = NULL;
1564 
1565     assert(bs_queue != NULL);
1566 
1567     bdrv_drain_all();
1568 
1569     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1570         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1571             error_propagate(errp, local_err);
1572             goto cleanup;
1573         }
1574         bs_entry->prepared = true;
1575     }
1576 
1577     /* If we reach this point, we have success and just need to apply the
1578      * changes
1579      */
1580     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1581         bdrv_reopen_commit(&bs_entry->state);
1582     }
1583 
1584     ret = 0;
1585 
1586 cleanup:
1587     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1588         if (ret && bs_entry->prepared) {
1589             bdrv_reopen_abort(&bs_entry->state);
1590         }
1591         g_free(bs_entry);
1592     }
1593     g_free(bs_queue);
1594     return ret;
1595 }
1596 
1597 
1598 /* Reopen a single BlockDriverState with the specified flags. */
1599 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1600 {
1601     int ret = -1;
1602     Error *local_err = NULL;
1603     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1604 
1605     ret = bdrv_reopen_multiple(queue, &local_err);
1606     if (local_err != NULL) {
1607         error_propagate(errp, local_err);
1608     }
1609     return ret;
1610 }
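
/*
 * Usage sketch: drop write access on a single BDS by reopening it with
 * BDRV_O_RDWR cleared (the flag manipulation is illustrative):
 *
 *     Error *err = NULL;
 *     int ret = bdrv_reopen(bs, bs->open_flags & ~BDRV_O_RDWR, &err);
 */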
1611 
1612 
1613 /*
1614  * Prepares a BlockDriverState for reopen. All changes are staged in the
1615  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1616  * the block driver's .bdrv_reopen_prepare() callback
1617  *
1618  * bs is the BlockDriverState to reopen
1619  * flags are the new open flags
1620  * queue is the reopen queue
1621  *
1622  * Returns 0 on success, non-zero on error.  On error errp will be set
1623  * as well.
1624  *
1625  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1626  * It is the responsibility of the caller to then call the abort() or
1627  * commit() for any other BDS that have been left in a prepare() state
1628  *
1629  */
1630 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1631                         Error **errp)
1632 {
1633     int ret = -1;
1634     Error *local_err = NULL;
1635     BlockDriver *drv;
1636 
1637     assert(reopen_state != NULL);
1638     assert(reopen_state->bs->drv != NULL);
1639     drv = reopen_state->bs->drv;
1640 
1641     /* if we are to stay read-only, do not allow permission change
1642      * to r/w */
1643     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1644         reopen_state->flags & BDRV_O_RDWR) {
1645         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1646                   reopen_state->bs->device_name);
1647         goto error;
1648     }
1649 
1650 
1651     ret = bdrv_flush(reopen_state->bs);
1652     if (ret) {
1653         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1654                   strerror(-ret));
1655         goto error;
1656     }
1657 
1658     if (drv->bdrv_reopen_prepare) {
1659         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1660         if (ret) {
1661             if (local_err != NULL) {
1662                 error_propagate(errp, local_err);
1663             } else {
1664                 error_setg(errp, "failed while preparing to reopen image '%s'",
1665                            reopen_state->bs->filename);
1666             }
1667             goto error;
1668         }
1669     } else {
1670         /* It is currently mandatory to have a bdrv_reopen_prepare()
1671          * handler for each supported drv. */
1672         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1673                   drv->format_name, reopen_state->bs->device_name,
1674                  "reopening of file");
1675         ret = -1;
1676         goto error;
1677     }
1678 
1679     ret = 0;
1680 
1681 error:
1682     return ret;
1683 }
1684 
1685 /*
1686  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1687  * makes them final by swapping the staging BlockDriverState contents into
1688  * the active BlockDriverState contents.
1689  */
1690 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1691 {
1692     BlockDriver *drv;
1693 
1694     assert(reopen_state != NULL);
1695     drv = reopen_state->bs->drv;
1696     assert(drv != NULL);
1697 
1698     /* If there are any driver level actions to take */
1699     if (drv->bdrv_reopen_commit) {
1700         drv->bdrv_reopen_commit(reopen_state);
1701     }
1702 
1703     /* set BDS specific flags now */
1704     reopen_state->bs->open_flags         = reopen_state->flags;
1705     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1706                                               BDRV_O_CACHE_WB);
1707     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1708 
1709     bdrv_refresh_limits(reopen_state->bs);
1710 }
1711 
1712 /*
1713  * Abort the reopen, and delete and free the staged changes in
1714  * reopen_state
1715  */
1716 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1717 {
1718     BlockDriver *drv;
1719 
1720     assert(reopen_state != NULL);
1721     drv = reopen_state->bs->drv;
1722     assert(drv != NULL);
1723 
1724     if (drv->bdrv_reopen_abort) {
1725         drv->bdrv_reopen_abort(reopen_state);
1726     }
1727 }
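
/*
 * A minimal sketch of the prepare/commit/abort transaction for two
 * hypothetical reopen states rs_a and rs_b (illustration only; in practice
 * bdrv_reopen_multiple() drives this loop over a whole BlockReopenQueue):
 *
 *     Error *local_err = NULL;
 *
 *     if (bdrv_reopen_prepare(&rs_a, queue, &local_err) == 0) {
 *         if (bdrv_reopen_prepare(&rs_b, queue, &local_err) == 0) {
 *             bdrv_reopen_commit(&rs_a);
 *             bdrv_reopen_commit(&rs_b);
 *         } else {
 *             bdrv_reopen_abort(&rs_a);  // rs_b has already cleaned up
 *         }
 *     }
 */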
1728 
1729 
1730 void bdrv_close(BlockDriverState *bs)
1731 {
1732     if (bs->job) {
1733         block_job_cancel_sync(bs->job);
1734     }
1735     bdrv_drain_all(); /* complete I/O */
1736     bdrv_flush(bs);
1737     bdrv_drain_all(); /* in case flush left pending I/O */
1738     notifier_list_notify(&bs->close_notifiers, bs);
1739 
1740     if (bs->drv) {
1741         if (bs->backing_hd) {
1742             bdrv_unref(bs->backing_hd);
1743             bs->backing_hd = NULL;
1744         }
1745         bs->drv->bdrv_close(bs);
1746         g_free(bs->opaque);
1747         bs->opaque = NULL;
1748         bs->drv = NULL;
1749         bs->copy_on_read = 0;
1750         bs->backing_file[0] = '\0';
1751         bs->backing_format[0] = '\0';
1752         bs->total_sectors = 0;
1753         bs->encrypted = 0;
1754         bs->valid_key = 0;
1755         bs->sg = 0;
1756         bs->growable = 0;
1757         bs->zero_beyond_eof = false;
1758         QDECREF(bs->options);
1759         bs->options = NULL;
1760 
1761         if (bs->file != NULL) {
1762             bdrv_unref(bs->file);
1763             bs->file = NULL;
1764         }
1765     }
1766 
1767     bdrv_dev_change_media_cb(bs, false);
1768 
1769     /* throttling disk I/O limits */
1770     if (bs->io_limits_enabled) {
1771         bdrv_io_limits_disable(bs);
1772     }
1773 }
1774 
1775 void bdrv_close_all(void)
1776 {
1777     BlockDriverState *bs;
1778 
1779     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1780         bdrv_close(bs);
1781     }
1782 }
1783 
1784 /* Check if any requests are in-flight (including throttled requests) */
1785 static bool bdrv_requests_pending(BlockDriverState *bs)
1786 {
1787     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1788         return true;
1789     }
1790     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1791         return true;
1792     }
1793     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1794         return true;
1795     }
1796     if (bs->file && bdrv_requests_pending(bs->file)) {
1797         return true;
1798     }
1799     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1800         return true;
1801     }
1802     return false;
1803 }
1804 
1805 static bool bdrv_requests_pending_all(void)
1806 {
1807     BlockDriverState *bs;
1808     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1809         if (bdrv_requests_pending(bs)) {
1810             return true;
1811         }
1812     }
1813     return false;
1814 }
1815 
1816 /*
1817  * Wait for pending requests to complete across all BlockDriverStates
1818  *
1819  * This function does not flush data to disk, use bdrv_flush_all() for that
1820  * after calling this function.
1821  *
1822  * Note that the completion of an asynchronous I/O operation can trigger any
1823  * number of other I/O operations on other devices---for example, a coroutine
1824  * can be arbitrarily complex, and a constant flow of I/O may be generated
1825  * until the coroutine completes.  Because of this, it is not possible to
1826  * have a function that drains a single device's I/O queue.
1827  */
1828 void bdrv_drain_all(void)
1829 {
1830     /* Always run first iteration so any pending completion BHs run */
1831     bool busy = true;
1832     BlockDriverState *bs;
1833 
1834     while (busy) {
1835         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1836             bdrv_start_throttled_reqs(bs);
1837         }
1838 
1839         busy = bdrv_requests_pending_all();
1840         busy |= aio_poll(qemu_get_aio_context(), busy);
1841     }
1842 }
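
/*
 * A typical quiesce sequence (sketch): drain first so that no requests are
 * in flight, then flush so that completed writes reach stable storage.
 *
 *     bdrv_drain_all();
 *     bdrv_flush_all();
 */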
1843 
1844 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
1845  * graph_bdrv_states lists.
1846  * Also, NUL-terminate the device_name to prevent a double remove. */
1847 void bdrv_make_anon(BlockDriverState *bs)
1848 {
1849     if (bs->device_name[0] != '\0') {
1850         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1851     }
1852     bs->device_name[0] = '\0';
1853     if (bs->node_name[0] != '\0') {
1854         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1855     }
1856     bs->node_name[0] = '\0';
1857 }
1858 
1859 static void bdrv_rebind(BlockDriverState *bs)
1860 {
1861     if (bs->drv && bs->drv->bdrv_rebind) {
1862         bs->drv->bdrv_rebind(bs);
1863     }
1864 }
1865 
1866 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1867                                      BlockDriverState *bs_src)
1868 {
1869     /* move some fields that need to stay attached to the device */
1870 
1871     /* dev info */
1872     bs_dest->dev_ops            = bs_src->dev_ops;
1873     bs_dest->dev_opaque         = bs_src->dev_opaque;
1874     bs_dest->dev                = bs_src->dev;
1875     bs_dest->guest_block_size   = bs_src->guest_block_size;
1876     bs_dest->copy_on_read       = bs_src->copy_on_read;
1877 
1878     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1879 
1880     /* i/o throttled req */
1881     memcpy(&bs_dest->throttle_state,
1882            &bs_src->throttle_state,
1883            sizeof(ThrottleState));
1884     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1885     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1886     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1887 
1888     /* r/w error */
1889     bs_dest->on_read_error      = bs_src->on_read_error;
1890     bs_dest->on_write_error     = bs_src->on_write_error;
1891 
1892     /* i/o status */
1893     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1894     bs_dest->iostatus           = bs_src->iostatus;
1895 
1896     /* dirty bitmap */
1897     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1898 
1899     /* reference count */
1900     bs_dest->refcnt             = bs_src->refcnt;
1901 
1902     /* job */
1903     bs_dest->in_use             = bs_src->in_use;
1904     bs_dest->job                = bs_src->job;
1905 
1906     /* keep the same entry in bdrv_states */
1907     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1908             bs_src->device_name);
1909     bs_dest->device_list = bs_src->device_list;
1910 }
1911 
1912 /*
1913  * Swap bs contents for two image chains while they are live,
1914  * while keeping required fields on the BlockDriverState that is
1915  * actually attached to a device.
1916  *
1917  * This will modify the BlockDriverState fields, and swap contents
1918  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1919  *
1920  * bs_new is required to be anonymous.
1921  *
1922  * This function does not create any image files.
1923  */
1924 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1925 {
1926     BlockDriverState tmp;
1927 
1928     /* The code needs to swap the node_name, but simply swapping node_list
1929      * won't work, so first remove the nodes from the graph list, do the
1930      * swap, and then insert them back if needed.
1931      */
1932     if (bs_new->node_name[0] != '\0') {
1933         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1934     }
1935     if (bs_old->node_name[0] != '\0') {
1936         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1937     }
1938 
1939     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1940     assert(bs_new->device_name[0] == '\0');
1941     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1942     assert(bs_new->job == NULL);
1943     assert(bs_new->dev == NULL);
1944     assert(bs_new->in_use == 0);
1945     assert(bs_new->io_limits_enabled == false);
1946     assert(!throttle_have_timer(&bs_new->throttle_state));
1947 
1948     tmp = *bs_new;
1949     *bs_new = *bs_old;
1950     *bs_old = tmp;
1951 
1952     /* there are some fields that should not be swapped, move them back */
1953     bdrv_move_feature_fields(&tmp, bs_old);
1954     bdrv_move_feature_fields(bs_old, bs_new);
1955     bdrv_move_feature_fields(bs_new, &tmp);
1956 
1957     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1958     assert(bs_new->device_name[0] == '\0');
1959 
1960     /* Check a few fields that should remain attached to the device */
1961     assert(bs_new->dev == NULL);
1962     assert(bs_new->job == NULL);
1963     assert(bs_new->in_use == 0);
1964     assert(bs_new->io_limits_enabled == false);
1965     assert(!throttle_have_timer(&bs_new->throttle_state));
1966 
1967     /* insert the nodes back into the graph node list if needed */
1968     if (bs_new->node_name[0] != '\0') {
1969         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
1970     }
1971     if (bs_old->node_name[0] != '\0') {
1972         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
1973     }
1974 
1975     bdrv_rebind(bs_new);
1976     bdrv_rebind(bs_old);
1977 }
1978 
1979 /*
1980  * Add new bs contents at the top of an image chain while the chain is
1981  * live, while keeping required fields on the top layer.
1982  *
1983  * This will modify the BlockDriverState fields, and swap contents
1984  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1985  *
1986  * bs_new is required to be anonymous.
1987  *
1988  * This function does not create any image files.
1989  */
1990 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1991 {
1992     bdrv_swap(bs_new, bs_top);
1993 
1994     /* After the swap, bs_new holds the old contents of bs_top (the image
1995      * that used to be on top), so it becomes bs_top's backing file. */
1996     bs_top->backing_hd = bs_new;
1997     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1998     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1999             bs_new->filename);
2000     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
2001             bs_new->drv ? bs_new->drv->format_name : "");
2002 }
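
/*
 * Sketch of how a live snapshot could use bdrv_append(), assuming
 * 'overlay' is a freshly opened, anonymous BDS whose backing file on disk
 * is the current image of 'bs' (names are illustrative):
 *
 *     bdrv_append(overlay, bs);
 *     // 'bs' now presents the overlay's contents to the device; the old
 *     // top image lives on as bs->backing_hd.
 */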
2003 
2004 static void bdrv_delete(BlockDriverState *bs)
2005 {
2006     assert(!bs->dev);
2007     assert(!bs->job);
2008     assert(!bs->in_use);
2009     assert(!bs->refcnt);
2010     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2011 
2012     bdrv_close(bs);
2013 
2014     /* remove from list, if necessary */
2015     bdrv_make_anon(bs);
2016 
2017     g_free(bs);
2018 }
2019 
2020 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2021 /* TODO change to DeviceState *dev when all users are qdevified */
2022 {
2023     if (bs->dev) {
2024         return -EBUSY;
2025     }
2026     bs->dev = dev;
2027     bdrv_iostatus_reset(bs);
2028     return 0;
2029 }
2030 
2031 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2032 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2033 {
2034     if (bdrv_attach_dev(bs, dev) < 0) {
2035         abort();
2036     }
2037 }
2038 
2039 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2040 /* TODO change to DeviceState *dev when all users are qdevified */
2041 {
2042     assert(bs->dev == dev);
2043     bs->dev = NULL;
2044     bs->dev_ops = NULL;
2045     bs->dev_opaque = NULL;
2046     bs->guest_block_size = 512;
2047 }
2048 
2049 /* TODO change to return DeviceState * when all users are qdevified */
2050 void *bdrv_get_attached_dev(BlockDriverState *bs)
2051 {
2052     return bs->dev;
2053 }
2054 
2055 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2056                       void *opaque)
2057 {
2058     bs->dev_ops = ops;
2059     bs->dev_opaque = opaque;
2060 }
2061 
2062 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2063                                enum MonitorEvent ev,
2064                                BlockErrorAction action, bool is_read)
2065 {
2066     QObject *data;
2067     const char *action_str;
2068 
2069     switch (action) {
2070     case BDRV_ACTION_REPORT:
2071         action_str = "report";
2072         break;
2073     case BDRV_ACTION_IGNORE:
2074         action_str = "ignore";
2075         break;
2076     case BDRV_ACTION_STOP:
2077         action_str = "stop";
2078         break;
2079     default:
2080         abort();
2081     }
2082 
2083     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2084                               bdrv->device_name,
2085                               action_str,
2086                               is_read ? "read" : "write");
2087     monitor_protocol_event(ev, data);
2088 
2089     qobject_decref(data);
2090 }
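
/*
 * For example, a write error on a hypothetical drive "ide0-hd0" with a
 * 'stop' policy produces the payload:
 *
 *     { "device": "ide0-hd0", "action": "stop", "operation": "write" }
 */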
2091 
2092 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2093 {
2094     QObject *data;
2095 
2096     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2097                               bdrv_get_device_name(bs), ejected);
2098     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2099 
2100     qobject_decref(data);
2101 }
2102 
2103 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2104 {
2105     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2106         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2107         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2108         if (tray_was_closed) {
2109             /* tray open */
2110             bdrv_emit_qmp_eject_event(bs, true);
2111         }
2112         if (load) {
2113             /* tray close */
2114             bdrv_emit_qmp_eject_event(bs, false);
2115         }
2116     }
2117 }
2118 
2119 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2120 {
2121     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2122 }
2123 
2124 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2125 {
2126     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2127         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2128     }
2129 }
2130 
2131 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2132 {
2133     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2134         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2135     }
2136     return false;
2137 }
2138 
2139 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2140 {
2141     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2142         bs->dev_ops->resize_cb(bs->dev_opaque);
2143     }
2144 }
2145 
2146 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2147 {
2148     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2149         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2150     }
2151     return false;
2152 }
2153 
2154 /*
2155  * Run consistency checks on an image
2156  *
2157  * Returns 0 if the check could be completed (it doesn't mean that the image is
2158  * free of errors) or -errno when an internal error occurred. The results of the
2159  * check are stored in res.
2160  */
2161 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2162 {
2163     if (bs->drv->bdrv_check == NULL) {
2164         return -ENOTSUP;
2165     }
2166 
2167     memset(res, 0, sizeof(*res));
2168     return bs->drv->bdrv_check(bs, res, fix);
2169 }
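
/*
 * Minimal usage sketch ('bs' is assumed to be an open image; the result
 * fields come from BdrvCheckResult):
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, 0);
 *     if (ret == 0 && (res.corruptions || res.leaks)) {
 *         // image is damaged; rerun with a BDRV_FIX_* mode to repair
 *     }
 */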
2170 
2171 #define COMMIT_BUF_SECTORS 2048
2172 
2173 /* commit COW file into the raw image */
2174 int bdrv_commit(BlockDriverState *bs)
2175 {
2176     BlockDriver *drv = bs->drv;
2177     int64_t sector, total_sectors, length, backing_length;
2178     int n, ro, open_flags;
2179     int ret = 0;
2180     uint8_t *buf = NULL;
2181     char filename[PATH_MAX];
2182 
2183     if (!drv)
2184         return -ENOMEDIUM;
2185 
2186     if (!bs->backing_hd) {
2187         return -ENOTSUP;
2188     }
2189 
2190     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2191         return -EBUSY;
2192     }
2193 
2194     ro = bs->backing_hd->read_only;
2195     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2196     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2197     open_flags =  bs->backing_hd->open_flags;
2198 
2199     if (ro) {
2200         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2201             return -EACCES;
2202         }
2203     }
2204 
2205     length = bdrv_getlength(bs);
2206     if (length < 0) {
2207         ret = length;
2208         goto ro_cleanup;
2209     }
2210 
2211     backing_length = bdrv_getlength(bs->backing_hd);
2212     if (backing_length < 0) {
2213         ret = backing_length;
2214         goto ro_cleanup;
2215     }
2216 
2217     /* If our top snapshot is larger than the backing file image,
2218      * grow the backing file image if possible.  If not possible,
2219      * we must return an error */
2220     if (length > backing_length) {
2221         ret = bdrv_truncate(bs->backing_hd, length);
2222         if (ret < 0) {
2223             goto ro_cleanup;
2224         }
2225     }
2226 
2227     total_sectors = length >> BDRV_SECTOR_BITS;
2228     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2229 
2230     for (sector = 0; sector < total_sectors; sector += n) {
2231         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2232         if (ret < 0) {
2233             goto ro_cleanup;
2234         }
2235         if (ret) {
2236             ret = bdrv_read(bs, sector, buf, n);
2237             if (ret < 0) {
2238                 goto ro_cleanup;
2239             }
2240 
2241             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2242             if (ret < 0) {
2243                 goto ro_cleanup;
2244             }
2245         }
2246     }
2247 
2248     if (drv->bdrv_make_empty) {
2249         ret = drv->bdrv_make_empty(bs);
2250         if (ret < 0) {
2251             goto ro_cleanup;
2252         }
2253         bdrv_flush(bs);
2254     }
2255 
2256     /*
2257      * Make sure all data we wrote to the backing device is actually
2258      * stable on disk.
2259      */
2260     if (bs->backing_hd) {
2261         bdrv_flush(bs->backing_hd);
2262     }
2263 
2264     ret = 0;
2265 ro_cleanup:
2266     g_free(buf);
2267 
2268     if (ro) {
2269         /* ignoring error return here */
2270         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2271     }
2272 
2273     return ret;
2274 }
2275 
2276 int bdrv_commit_all(void)
2277 {
2278     BlockDriverState *bs;
2279 
2280     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2281         if (bs->drv && bs->backing_hd) {
2282             int ret = bdrv_commit(bs);
2283             if (ret < 0) {
2284                 return ret;
2285             }
2286         }
2287     }
2288     return 0;
2289 }
2290 
2291 /**
2292  * Remove an active request from the tracked requests list
2293  *
2294  * This function should be called when a tracked request is completing.
2295  */
2296 static void tracked_request_end(BdrvTrackedRequest *req)
2297 {
2298     if (req->serialising) {
2299         req->bs->serialising_in_flight--;
2300     }
2301 
2302     QLIST_REMOVE(req, list);
2303     qemu_co_queue_restart_all(&req->wait_queue);
2304 }
2305 
2306 /**
2307  * Add an active request to the tracked requests list
2308  */
2309 static void tracked_request_begin(BdrvTrackedRequest *req,
2310                                   BlockDriverState *bs,
2311                                   int64_t offset,
2312                                   unsigned int bytes, bool is_write)
2313 {
2314     *req = (BdrvTrackedRequest){
2315         .bs = bs,
2316         .offset         = offset,
2317         .bytes          = bytes,
2318         .is_write       = is_write,
2319         .co             = qemu_coroutine_self(),
2320         .serialising    = false,
2321         .overlap_offset = offset,
2322         .overlap_bytes  = bytes,
2323     };
2324 
2325     qemu_co_queue_init(&req->wait_queue);
2326 
2327     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2328 }
2329 
2330 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2331 {
2332     int64_t overlap_offset = req->offset & ~(align - 1);
2333     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2334                                - overlap_offset;
2335 
2336     if (!req->serialising) {
2337         req->bs->serialising_in_flight++;
2338         req->serialising = true;
2339     }
2340 
2341     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2342     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2343 }
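
/*
 * In outline, the tracked-request lifecycle used by the request code below
 * is:
 *
 *     BdrvTrackedRequest req;
 *
 *     tracked_request_begin(&req, bs, offset, bytes, is_write);
 *     mark_request_serialising(&req, align);  // only if ordering matters
 *     wait_serialising_requests(&req);
 *     ... perform the actual I/O ...
 *     tracked_request_end(&req);
 */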
2344 
2345 /**
2346  * Round a region to cluster boundaries
2347  */
2348 void bdrv_round_to_clusters(BlockDriverState *bs,
2349                             int64_t sector_num, int nb_sectors,
2350                             int64_t *cluster_sector_num,
2351                             int *cluster_nb_sectors)
2352 {
2353     BlockDriverInfo bdi;
2354 
2355     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2356         *cluster_sector_num = sector_num;
2357         *cluster_nb_sectors = nb_sectors;
2358     } else {
2359         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2360         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2361         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2362                                             nb_sectors, c);
2363     }
2364 }
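
/*
 * For example, with a 64 KiB cluster size (128 sectors), a request for
 * sectors [130, 133) rounds out to the containing cluster:
 *
 *     bdrv_round_to_clusters(bs, 130, 3, &cluster_num, &cluster_count);
 *     // cluster_num == 128, cluster_count == 128
 */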
2365 
2366 static int bdrv_get_cluster_size(BlockDriverState *bs)
2367 {
2368     BlockDriverInfo bdi;
2369     int ret;
2370 
2371     ret = bdrv_get_info(bs, &bdi);
2372     if (ret < 0 || bdi.cluster_size == 0) {
2373         return bs->request_alignment;
2374     } else {
2375         return bdi.cluster_size;
2376     }
2377 }
2378 
2379 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2380                                      int64_t offset, unsigned int bytes)
2381 {
2382     /*        aaaa   bbbb */
2383     if (offset >= req->overlap_offset + req->overlap_bytes) {
2384         return false;
2385     }
2386     /* bbbb   aaaa        */
2387     if (req->overlap_offset >= offset + bytes) {
2388         return false;
2389     }
2390     return true;
2391 }
2392 
2393 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2394 {
2395     BlockDriverState *bs = self->bs;
2396     BdrvTrackedRequest *req;
2397     bool retry;
2398     bool waited = false;
2399 
2400     if (!bs->serialising_in_flight) {
2401         return false;
2402     }
2403 
2404     do {
2405         retry = false;
2406         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2407             if (req == self || (!req->serialising && !self->serialising)) {
2408                 continue;
2409             }
2410             if (tracked_request_overlaps(req, self->overlap_offset,
2411                                          self->overlap_bytes))
2412             {
2413                 /* Hitting this means there was a reentrant request, for
2414                  * example, a block driver issuing nested requests.  This must
2415                  * never happen since it means deadlock.
2416                  */
2417                 assert(qemu_coroutine_self() != req->co);
2418 
2419                 /* If the request is already (indirectly) waiting for us, or
2420                  * will wait for us as soon as it wakes up, then just go on
2421                  * (instead of producing a deadlock in the former case). */
2422                 if (!req->waiting_for) {
2423                     self->waiting_for = req;
2424                     qemu_co_queue_wait(&req->wait_queue);
2425                     self->waiting_for = NULL;
2426                     retry = true;
2427                     waited = true;
2428                     break;
2429                 }
2430             }
2431         }
2432     } while (retry);
2433 
2434     return waited;
2435 }
2436 
2437 /*
2438  * Return values:
2439  * 0        - success
2440  * -EINVAL  - backing format specified, but no file
2441  * -ENOSPC  - can't update the backing file because no space is left in the
2442  *            image file header
2443  * -ENOTSUP - format driver doesn't support changing the backing file
2444  */
2445 int bdrv_change_backing_file(BlockDriverState *bs,
2446     const char *backing_file, const char *backing_fmt)
2447 {
2448     BlockDriver *drv = bs->drv;
2449     int ret;
2450 
2451     /* Backing file format doesn't make sense without a backing file */
2452     if (backing_fmt && !backing_file) {
2453         return -EINVAL;
2454     }
2455 
2456     if (drv->bdrv_change_backing_file != NULL) {
2457         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2458     } else {
2459         ret = -ENOTSUP;
2460     }
2461 
2462     if (ret == 0) {
2463         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2464         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2465     }
2466     return ret;
2467 }
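
/*
 * Sketch: after committing or rebasing, point the image at a new backing
 * file (the file name and format here are illustrative):
 *
 *     ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");
 *     if (ret == -ENOTSUP) {
 *         // the format driver cannot rewrite its header
 *     }
 */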
2468 
2469 /*
2470  * Finds the image layer in the chain that has 'bs' as its backing file.
2471  *
2472  * active is the current topmost image.
2473  *
2474  * Returns NULL if bs is not found in active's image chain,
2475  * or if active == bs.
2476  */
2477 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2478                                     BlockDriverState *bs)
2479 {
2480     BlockDriverState *overlay = NULL;
2481     BlockDriverState *intermediate;
2482 
2483     assert(active != NULL);
2484     assert(bs != NULL);
2485 
2486     /* if bs is the same as active, then by definition it has no overlay
2487      */
2488     if (active == bs) {
2489         return NULL;
2490     }
2491 
2492     intermediate = active;
2493     while (intermediate->backing_hd) {
2494         if (intermediate->backing_hd == bs) {
2495             overlay = intermediate;
2496             break;
2497         }
2498         intermediate = intermediate->backing_hd;
2499     }
2500 
2501     return overlay;
2502 }
2503 
2504 typedef struct BlkIntermediateStates {
2505     BlockDriverState *bs;
2506     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2507 } BlkIntermediateStates;
2508 
2509 
2510 /*
2511  * Drops images above 'base' up to and including 'top', and sets the image
2512  * above 'top' to have base as its backing file.
2513  *
2514  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2515  * information in 'bs' can be properly updated.
2516  *
2517  * E.g., this will convert the following chain:
2518  * bottom <- base <- intermediate <- top <- active
2519  *
2520  * to
2521  *
2522  * bottom <- base <- active
2523  *
2524  * It is allowed for bottom==base, in which case it converts:
2525  *
2526  * base <- intermediate <- top <- active
2527  *
2528  * to
2529  *
2530  * base <- active
2531  *
2532  * Error conditions:
2533  *  if active == top, that is considered an error
2534  *
2535  */
2536 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2537                            BlockDriverState *base)
2538 {
2539     BlockDriverState *intermediate;
2540     BlockDriverState *base_bs = NULL;
2541     BlockDriverState *new_top_bs = NULL;
2542     BlkIntermediateStates *intermediate_state, *next;
2543     int ret = -EIO;
2544 
2545     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2546     QSIMPLEQ_INIT(&states_to_delete);
2547 
2548     if (!top->drv || !base->drv) {
2549         goto exit;
2550     }
2551 
2552     new_top_bs = bdrv_find_overlay(active, top);
2553 
2554     if (new_top_bs == NULL) {
2555         /* we could not find the image above 'top'; this is an error */
2556         goto exit;
2557     }
2558 
2559     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2560      * to do, no intermediate images */
2561     if (new_top_bs->backing_hd == base) {
2562         ret = 0;
2563         goto exit;
2564     }
2565 
2566     intermediate = top;
2567 
2568     /* now we will go down through the list, and add each BDS we find
2569      * into our deletion queue, until we hit the 'base'
2570      */
2571     while (intermediate) {
2572         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2573         intermediate_state->bs = intermediate;
2574         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2575 
2576         if (intermediate->backing_hd == base) {
2577             base_bs = intermediate->backing_hd;
2578             break;
2579         }
2580         intermediate = intermediate->backing_hd;
2581     }
2582     if (base_bs == NULL) {
2583         /* something went wrong: we did not end at the base.  Safely
2584          * unravel everything and exit with an error */
2585         goto exit;
2586     }
2587 
2588     /* success - we can delete the intermediate states, and link top->base */
2589     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2590                                    base_bs->drv ? base_bs->drv->format_name : "");
2591     if (ret) {
2592         goto exit;
2593     }
2594     new_top_bs->backing_hd = base_bs;
2595 
2596     bdrv_refresh_limits(new_top_bs);
2597 
2598     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2599         /* so that bdrv_close() does not recursively close the chain */
2600         intermediate_state->bs->backing_hd = NULL;
2601         bdrv_unref(intermediate_state->bs);
2602     }
2603     ret = 0;
2604 
2605 exit:
2606     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2607         g_free(intermediate_state);
2608     }
2609     return ret;
2610 }
2611 
2612 
2613 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2614                                    size_t size)
2615 {
2616     int64_t len;
2617 
2618     if (size > INT_MAX) {
2619         return -EIO;
2620     }
2621 
2622     if (!bdrv_is_inserted(bs))
2623         return -ENOMEDIUM;
2624 
2625     if (bs->growable)
2626         return 0;
2627 
2628     len = bdrv_getlength(bs);
2629 
2630     if (offset < 0)
2631         return -EIO;
2632 
2633     if ((offset > len) || (len - offset < size))
2634         return -EIO;
2635 
2636     return 0;
2637 }
2638 
2639 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2640                               int nb_sectors)
2641 {
2642     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2643         return -EIO;
2644     }
2645 
2646     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2647                                    nb_sectors * BDRV_SECTOR_SIZE);
2648 }
2649 
2650 typedef struct RwCo {
2651     BlockDriverState *bs;
2652     int64_t offset;
2653     QEMUIOVector *qiov;
2654     bool is_write;
2655     int ret;
2656     BdrvRequestFlags flags;
2657 } RwCo;
2658 
2659 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2660 {
2661     RwCo *rwco = opaque;
2662 
2663     if (!rwco->is_write) {
2664         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2665                                       rwco->qiov->size, rwco->qiov,
2666                                       rwco->flags);
2667     } else {
2668         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2669                                        rwco->qiov->size, rwco->qiov,
2670                                        rwco->flags);
2671     }
2672 }
2673 
2674 /*
2675  * Process a vectored synchronous request using coroutines
2676  */
2677 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2678                         QEMUIOVector *qiov, bool is_write,
2679                         BdrvRequestFlags flags)
2680 {
2681     Coroutine *co;
2682     RwCo rwco = {
2683         .bs = bs,
2684         .offset = offset,
2685         .qiov = qiov,
2686         .is_write = is_write,
2687         .ret = NOT_DONE,
2688         .flags = flags,
2689     };
2690 
2691     /**
2692      * In a synchronous call context, while the vcpu is blocked, the
2693      * throttling timer will not fire; therefore I/O throttling has to be
2694      * disabled here if it has been enabled.
2695      */
2696     if (bs->io_limits_enabled) {
2697         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2698                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2699         bdrv_io_limits_disable(bs);
2700     }
2701 
2702     if (qemu_in_coroutine()) {
2703         /* Fast-path if already in coroutine context */
2704         bdrv_rw_co_entry(&rwco);
2705     } else {
2706         co = qemu_coroutine_create(bdrv_rw_co_entry);
2707         qemu_coroutine_enter(co, &rwco);
2708         while (rwco.ret == NOT_DONE) {
2709             qemu_aio_wait();
2710         }
2711     }
2712     return rwco.ret;
2713 }
2714 
2715 /*
2716  * Process a synchronous request using coroutines
2717  */
2718 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2719                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2720 {
2721     QEMUIOVector qiov;
2722     struct iovec iov = {
2723         .iov_base = (void *)buf,
2724         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2725     };
2726 
2727     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2728         return -EINVAL;
2729     }
2730 
2731     qemu_iovec_init_external(&qiov, &iov, 1);
2732     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2733                         &qiov, is_write, flags);
2734 }
2735 
2736 /* return < 0 if error. See bdrv_write() for the return codes */
2737 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2738               uint8_t *buf, int nb_sectors)
2739 {
2740     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2741 }
2742 
2743 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2744 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2745                           uint8_t *buf, int nb_sectors)
2746 {
2747     bool enabled;
2748     int ret;
2749 
2750     enabled = bs->io_limits_enabled;
2751     bs->io_limits_enabled = false;
2752     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2753     bs->io_limits_enabled = enabled;
2754     return ret;
2755 }
2756 
2757 /* Return < 0 on error. Important errors are:
2758   -EIO         generic I/O error (may happen for all errors)
2759   -ENOMEDIUM   no media inserted
2760   -EINVAL      invalid sector number or nb_sectors
2761   -EACCES      trying to write to a read-only device
2762 */
2763 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2764                const uint8_t *buf, int nb_sectors)
2765 {
2766     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2767 }
2768 
2769 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2770                       int nb_sectors, BdrvRequestFlags flags)
2771 {
2772     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2773                       BDRV_REQ_ZERO_WRITE | flags);
2774 }
2775 
2776 /*
2777  * Completely zero out a block device with the help of bdrv_write_zeroes.
2778  * The operation is sped up by checking the block status and only writing
2779  * zeroes to the device if they currently do not return zeroes. Optional
2780  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2781  *
2782  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2783  */
2784 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2785 {
2786     int64_t target_size;
2787     int64_t ret, nb_sectors, sector_num = 0;
2788     int n;
2789 
2790     target_size = bdrv_getlength(bs);
2791     if (target_size < 0) {
2792         return target_size;
2793     }
2794     target_size /= BDRV_SECTOR_SIZE;
2795 
2796     for (;;) {
2797         nb_sectors = target_size - sector_num;
2798         if (nb_sectors <= 0) {
2799             return 0;
2800         }
2801         if (nb_sectors > INT_MAX) {
2802             nb_sectors = INT_MAX;
2803         }
2804         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2805         if (ret < 0) {
2806             error_report("error getting block status at sector %" PRId64 ": %s",
2807                          sector_num, strerror(-ret));
2808             return ret;
2809         }
2810         if (ret & BDRV_BLOCK_ZERO) {
2811             sector_num += n;
2812             continue;
2813         }
2814         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2815         if (ret < 0) {
2816             error_report("error writing zeroes at sector %" PRId64 ": %s",
2817                          sector_num, strerror(-ret));
2818             return ret;
2819         }
2820         sector_num += n;
2821     }
2822 }
2823 
2824 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2825 {
2826     QEMUIOVector qiov;
2827     struct iovec iov = {
2828         .iov_base = (void *)buf,
2829         .iov_len = bytes,
2830     };
2831     int ret;
2832 
2833     if (bytes < 0) {
2834         return -EINVAL;
2835     }
2836 
2837     qemu_iovec_init_external(&qiov, &iov, 1);
2838     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2839     if (ret < 0) {
2840         return ret;
2841     }
2842 
2843     return bytes;
2844 }
2845 
2846 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2847 {
2848     int ret;
2849 
2850     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2851     if (ret < 0) {
2852         return ret;
2853     }
2854 
2855     return qiov->size;
2856 }
2857 
2858 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2859                 const void *buf, int bytes)
2860 {
2861     QEMUIOVector qiov;
2862     struct iovec iov = {
2863         .iov_base   = (void *) buf,
2864         .iov_len    = bytes,
2865     };
2866 
2867     if (bytes < 0) {
2868         return -EINVAL;
2869     }
2870 
2871     qemu_iovec_init_external(&qiov, &iov, 1);
2872     return bdrv_pwritev(bs, offset, &qiov);
2873 }
2874 
2875 /*
2876  * Writes to the file and ensures that no writes are reordered across this
2877  * request (it acts as a barrier).
2878  *
2879  * Returns 0 on success, -errno in error cases.
2880  */
2881 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2882     const void *buf, int count)
2883 {
2884     int ret;
2885 
2886     ret = bdrv_pwrite(bs, offset, buf, count);
2887     if (ret < 0) {
2888         return ret;
2889     }
2890 
2891     /* No flush needed for cache modes that already do it */
2892     if (bs->enable_write_cache) {
2893         bdrv_flush(bs);
2894     }
2895 
2896     return 0;
2897 }
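
/*
 * Sketch of a typical use: updating a small piece of image metadata whose
 * ordering relative to later writes matters (MyHeader is hypothetical):
 *
 *     struct MyHeader hdr = ...;
 *     ret = bdrv_pwrite_sync(bs, 0, &hdr, sizeof(hdr));
 *     // later writes cannot be reordered before the header update
 */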
2898 
2899 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2900         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2901 {
2902     /* Perform I/O through a temporary buffer so that users who scribble over
2903      * their read buffer while the operation is in progress do not end up
2904      * modifying the image file.  This is critical for zero-copy guest I/O
2905      * where anything might happen inside guest memory.
2906      */
2907     void *bounce_buffer;
2908 
2909     BlockDriver *drv = bs->drv;
2910     struct iovec iov;
2911     QEMUIOVector bounce_qiov;
2912     int64_t cluster_sector_num;
2913     int cluster_nb_sectors;
2914     size_t skip_bytes;
2915     int ret;
2916 
2917     /* Cover entire cluster so no additional backing file I/O is required when
2918      * allocating the cluster in the image file.
2919      */
2920     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2921                            &cluster_sector_num, &cluster_nb_sectors);
2922 
2923     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2924                                    cluster_sector_num, cluster_nb_sectors);
2925 
2926     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2927     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2928     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2929 
2930     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2931                              &bounce_qiov);
2932     if (ret < 0) {
2933         goto err;
2934     }
2935 
2936     if (drv->bdrv_co_write_zeroes &&
2937         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2938         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2939                                       cluster_nb_sectors, 0);
2940     } else {
2941         /* This does not change the data on the disk, it is not necessary
2942          * to flush even in cache=writethrough mode.
2943          */
2944         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2945                                   &bounce_qiov);
2946     }
2947 
2948     if (ret < 0) {
2949         /* It might be okay to ignore write errors for guest requests.  If this
2950          * is a deliberate copy-on-read then we don't want to ignore the error.
2951          * Simply report it in all cases.
2952          */
2953         goto err;
2954     }
2955 
2956     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2957     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2958                         nb_sectors * BDRV_SECTOR_SIZE);
2959 
2960 err:
2961     qemu_vfree(bounce_buffer);
2962     return ret;
2963 }
2964 
2965 /*
2966  * Forwards an already correctly aligned request to the BlockDriver. This
2967  * handles copy on read and zeroing after EOF; any other features must be
2968  * implemented by the caller.
2969  */
2970 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2971     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2972     int64_t align, QEMUIOVector *qiov, int flags)
2973 {
2974     BlockDriver *drv = bs->drv;
2975     int ret;
2976 
2977     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2978     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2979 
2980     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2981     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2982 
2983     /* Handle Copy on Read and associated serialisation */
2984     if (flags & BDRV_REQ_COPY_ON_READ) {
2985         /* If we touch the same cluster it counts as an overlap.  This
2986          * guarantees that allocating writes will be serialized and not race
2987          * with each other for the same cluster.  For example, in copy-on-read
2988          * it ensures that the CoR read and write operations are atomic and
2989          * guest writes cannot interleave between them. */
2990         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2991     }
2992 
2993     wait_serialising_requests(req);
2994 
2995     if (flags & BDRV_REQ_COPY_ON_READ) {
2996         int pnum;
2997 
2998         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2999         if (ret < 0) {
3000             goto out;
3001         }
3002 
3003         if (!ret || pnum != nb_sectors) {
3004             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3005             goto out;
3006         }
3007     }
3008 
3009     /* Forward the request to the BlockDriver */
3010     if (!(bs->zero_beyond_eof && bs->growable)) {
3011         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3012     } else {
3013         /* Read zeroes after EOF of growable BDSes */
3014         int64_t len, total_sectors, max_nb_sectors;
3015 
3016         len = bdrv_getlength(bs);
3017         if (len < 0) {
3018             ret = len;
3019             goto out;
3020         }
3021 
3022         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
3023         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3024                                   align >> BDRV_SECTOR_BITS);
3025         if (max_nb_sectors > 0) {
3026             ret = drv->bdrv_co_readv(bs, sector_num,
3027                                      MIN(nb_sectors, max_nb_sectors), qiov);
3028         } else {
3029             ret = 0;
3030         }
3031 
3032         /* Reading beyond end of file is supposed to produce zeroes */
3033         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3034             uint64_t offset = MAX(0, total_sectors - sector_num);
3035             uint64_t bytes = (sector_num + nb_sectors - offset) *
3036                               BDRV_SECTOR_SIZE;
3037             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3038         }
3039     }
3040 
3041 out:
3042     return ret;
3043 }
3044 
3045 /*
3046  * Handle a read request in coroutine context
3047  */
3048 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3049     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3050     BdrvRequestFlags flags)
3051 {
3052     BlockDriver *drv = bs->drv;
3053     BdrvTrackedRequest req;
3054 
3055     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3056     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3057     uint8_t *head_buf = NULL;
3058     uint8_t *tail_buf = NULL;
3059     QEMUIOVector local_qiov;
3060     bool use_local_qiov = false;
3061     int ret;
3062 
3063     if (!drv) {
3064         return -ENOMEDIUM;
3065     }
3066     if (bdrv_check_byte_request(bs, offset, bytes)) {
3067         return -EIO;
3068     }
3069 
3070     if (bs->copy_on_read) {
3071         flags |= BDRV_REQ_COPY_ON_READ;
3072     }
3073 
3074     /* throttling disk I/O */
3075     if (bs->io_limits_enabled) {
3076         bdrv_io_limits_intercept(bs, bytes, false);
3077     }
3078 
3079     /* Align read if necessary by padding qiov */
3080     if (offset & (align - 1)) {
3081         head_buf = qemu_blockalign(bs, align);
3082         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3083         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3084         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3085         use_local_qiov = true;
3086 
3087         bytes += offset & (align - 1);
3088         offset = offset & ~(align - 1);
3089     }
3090 
3091     if ((offset + bytes) & (align - 1)) {
3092         if (!use_local_qiov) {
3093             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3094             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3095             use_local_qiov = true;
3096         }
3097         tail_buf = qemu_blockalign(bs, align);
3098         qemu_iovec_add(&local_qiov, tail_buf,
3099                        align - ((offset + bytes) & (align - 1)));
3100 
3101         bytes = ROUND_UP(bytes, align);
3102     }
3103 
3104     tracked_request_begin(&req, bs, offset, bytes, false);
3105     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3106                               use_local_qiov ? &local_qiov : qiov,
3107                               flags);
3108     tracked_request_end(&req);
3109 
3110     if (use_local_qiov) {
3111         qemu_iovec_destroy(&local_qiov);
3112         qemu_vfree(head_buf);
3113         qemu_vfree(tail_buf);
3114     }
3115 
3116     return ret;
3117 }
3118 
3119 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3120     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3121     BdrvRequestFlags flags)
3122 {
3123     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3124         return -EINVAL;
3125     }
3126 
3127     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3128                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3129 }
3130 
3131 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3132     int nb_sectors, QEMUIOVector *qiov)
3133 {
3134     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3135 
3136     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3137 }
3138 
3139 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3140     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3141 {
3142     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3143 
3144     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3145                             BDRV_REQ_COPY_ON_READ);
3146 }
3147 
3148 /* If no limit is specified in the BlockLimits, use a default
3149  * of 32768 512-byte sectors (16 MiB) per request.
3150  */
3151 #define MAX_WRITE_ZEROES_DEFAULT 32768
3152 
3153 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3154     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3155 {
3156     BlockDriver *drv = bs->drv;
3157     QEMUIOVector qiov;
3158     struct iovec iov = {0};
3159     int ret = 0;
3160 
3161     int max_write_zeroes = bs->bl.max_write_zeroes ?
3162                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3163 
3164     while (nb_sectors > 0 && !ret) {
3165         int num = nb_sectors;
3166 
3167         /* Align request.  Block drivers can expect the "bulk" of the request
3168          * to be aligned.
3169          */
3170         if (bs->bl.write_zeroes_alignment
3171             && num > bs->bl.write_zeroes_alignment) {
3172             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3173                 /* Make a small request up to the first aligned sector.  */
3174                 num = bs->bl.write_zeroes_alignment;
3175                 num -= sector_num % bs->bl.write_zeroes_alignment;
3176             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3177                 /* Shorten the request to the last aligned sector.  num cannot
3178                  * underflow because num > bs->bl.write_zeroes_alignment.
3179                  */
3180                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3181             }
3182         }
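        /* Worked example: with write_zeroes_alignment == 8, a request for
         * sectors [5, 25) is issued as 3 sectors up to the first aligned
         * sector, then the aligned run [8, 24), then the 1-sector tail. */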
3183 
3184         /* limit request size */
3185         if (num > max_write_zeroes) {
3186             num = max_write_zeroes;
3187         }
3188 
3189         ret = -ENOTSUP;
3190         /* First try the efficient write zeroes operation */
3191         if (drv->bdrv_co_write_zeroes) {
3192             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3193         }
3194 
3195         if (ret == -ENOTSUP) {
3196             /* Fall back to bounce buffer if write zeroes is unsupported */
3197             iov.iov_len = num * BDRV_SECTOR_SIZE;
3198             if (iov.iov_base == NULL) {
3199                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3200                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3201             }
3202             qemu_iovec_init_external(&qiov, &iov, 1);
3203 
3204             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3205 
3206             /* Keep the bounce buffer around if it is big enough for
3207              * all future requests.
3208              */
3209             if (num < max_write_zeroes) {
3210                 qemu_vfree(iov.iov_base);
3211                 iov.iov_base = NULL;
3212             }
3213         }
3214 
3215         sector_num += num;
3216         nb_sectors -= num;
3217     }
3218 
3219     qemu_vfree(iov.iov_base);
3220     return ret;
3221 }
3222 
3223 /*
3224  * Forwards an already correctly aligned write request to the BlockDriver.
3225  */
3226 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3227     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3228     QEMUIOVector *qiov, int flags)
3229 {
3230     BlockDriver *drv = bs->drv;
3231     bool waited;
3232     int ret;
3233 
3234     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3235     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3236 
3237     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3238     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3239 
3240     waited = wait_serialising_requests(req);
3241     assert(!waited || !req->serialising);
3242     assert(req->overlap_offset <= offset);
3243     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3244 
3245     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3246 
3247     if (ret < 0) {
3248         /* Do nothing, write notifier decided to fail this request */
3249     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3250         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3251         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3252     } else {
3253         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3254         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3255     }
3256     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3257 
3258     if (ret == 0 && !bs->enable_write_cache) {
3259         ret = bdrv_co_flush(bs);
3260     }
3261 
3262     bdrv_set_dirty(bs, sector_num, nb_sectors);
3263 
3264     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3265         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3266     }
3267     if (bs->growable && ret >= 0) {
3268         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3269     }
3270 
3271     return ret;
3272 }
3273 
3274 /*
3275  * Handle a write request in coroutine context
3276  */
3277 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3278     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3279     BdrvRequestFlags flags)
3280 {
3281     BdrvTrackedRequest req;
3282     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3283     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3284     uint8_t *head_buf = NULL;
3285     uint8_t *tail_buf = NULL;
3286     QEMUIOVector local_qiov;
3287     bool use_local_qiov = false;
3288     int ret;
3289 
3290     if (!bs->drv) {
3291         return -ENOMEDIUM;
3292     }
3293     if (bs->read_only) {
3294         return -EACCES;
3295     }
3296     if (bdrv_check_byte_request(bs, offset, bytes)) {
3297         return -EIO;
3298     }
3299 
3300     /* throttling disk I/O */
3301     if (bs->io_limits_enabled) {
3302         bdrv_io_limits_intercept(bs, bytes, true);
3303     }
3304 
3305     /*
3306      * Align write if necessary by performing a read-modify-write cycle.
3307      * Pad qiov with the read parts and be sure to have a tracked request not
3308      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3309      */
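    /* For example, with align == 4096, a 4096-byte write at offset 5120
     * reads back the 4096-byte blocks at offsets 4096 and 8192, pads the
     * request with 1024 head bytes and 3072 tail bytes, and issues a single
     * aligned 8192-byte write at offset 4096. */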
3310     tracked_request_begin(&req, bs, offset, bytes, true);
3311 
3312     if (offset & (align - 1)) {
3313         QEMUIOVector head_qiov;
3314         struct iovec head_iov;
3315 
3316         mark_request_serialising(&req, align);
3317         wait_serialising_requests(&req);
3318 
3319         head_buf = qemu_blockalign(bs, align);
3320         head_iov = (struct iovec) {
3321             .iov_base   = head_buf,
3322             .iov_len    = align,
3323         };
3324         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3325 
3326         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3327         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3328                                   align, &head_qiov, 0);
3329         if (ret < 0) {
3330             goto fail;
3331         }
3332         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3333 
3334         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3335         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3336         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3337         use_local_qiov = true;
3338 
3339         bytes += offset & (align - 1);
3340         offset = offset & ~(align - 1);
3341     }
3342 
3343     if ((offset + bytes) & (align - 1)) {
3344         QEMUIOVector tail_qiov;
3345         struct iovec tail_iov;
3346         size_t tail_bytes;
3347         bool waited;
3348 
3349         mark_request_serialising(&req, align);
3350         waited = wait_serialising_requests(&req);
3351         assert(!waited || !use_local_qiov);
3352 
3353         tail_buf = qemu_blockalign(bs, align);
3354         tail_iov = (struct iovec) {
3355             .iov_base   = tail_buf,
3356             .iov_len    = align,
3357         };
3358         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3359 
3360         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3361         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3362                                   align, &tail_qiov, 0);
3363         if (ret < 0) {
3364             goto fail;
3365         }
3366         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3367 
3368         if (!use_local_qiov) {
3369             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3370             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3371             use_local_qiov = true;
3372         }
3373 
3374         tail_bytes = (offset + bytes) & (align - 1);
3375         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3376 
3377         bytes = ROUND_UP(bytes, align);
3378     }
3379 
3380     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3381                                use_local_qiov ? &local_qiov : qiov,
3382                                flags);
3383 
3384 fail:
3385     tracked_request_end(&req);
3386 
3387     if (use_local_qiov) {
3388         qemu_iovec_destroy(&local_qiov);
3389     }
3390     qemu_vfree(head_buf);
3391     qemu_vfree(tail_buf);
3392 
3393     return ret;
3394 }
3395 
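/*
 * Worked example for the read-modify-write path above (illustrative
 * numbers only): with align = 4096, a write of bytes = 8192 at
 * offset = 512 proceeds as follows.  The head read covers [0, 4096)
 * and the first 512 bytes of head_buf are prepended to the caller's
 * qiov, giving offset = 0 and bytes = 8704.  The tail read covers
 * [8192, 12288); since tail_bytes = 8704 % 4096 = 512, the last 3584
 * bytes of tail_buf are appended and bytes is rounded up to 12288.
 * The final aligned write therefore covers [0, 12288).
 */
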
3396 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3397     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3398     BdrvRequestFlags flags)
3399 {
3400     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3401         return -EINVAL;
3402     }
3403 
3404     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3405                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3406 }
3407 
3408 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3409     int nb_sectors, QEMUIOVector *qiov)
3410 {
3411     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3412 
3413     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3414 }
3415 
3416 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3417                                       int64_t sector_num, int nb_sectors,
3418                                       BdrvRequestFlags flags)
3419 {
3420     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3421 
3422     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3423         flags &= ~BDRV_REQ_MAY_UNMAP;
3424     }
3425 
3426     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3427                              BDRV_REQ_ZERO_WRITE | flags);
3428 }
3429 
3430 /**
3431  * Truncate file to 'offset' bytes (needed only for file protocols)
3432  */
3433 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3434 {
3435     BlockDriver *drv = bs->drv;
3436     int ret;
3437     if (!drv)
3438         return -ENOMEDIUM;
3439     if (!drv->bdrv_truncate)
3440         return -ENOTSUP;
3441     if (bs->read_only)
3442         return -EACCES;
3443     if (bdrv_in_use(bs))
3444         return -EBUSY;
3445     ret = drv->bdrv_truncate(bs, offset);
3446     if (ret == 0) {
3447         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3448         bdrv_dev_resize_cb(bs);
3449     }
3450     return ret;
3451 }
3452 
3453 /**
3454  * Length of an allocated file in bytes. Sparse files are counted by actual
3455  * allocated space. Return < 0 if error or unknown.
3456  */
3457 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3458 {
3459     BlockDriver *drv = bs->drv;
3460     if (!drv) {
3461         return -ENOMEDIUM;
3462     }
3463     if (drv->bdrv_get_allocated_file_size) {
3464         return drv->bdrv_get_allocated_file_size(bs);
3465     }
3466     if (bs->file) {
3467         return bdrv_get_allocated_file_size(bs->file);
3468     }
3469     return -ENOTSUP;
3470 }
3471 
3472 /**
3473  * Length of a file in bytes. Return < 0 if error or unknown.
3474  */
3475 int64_t bdrv_getlength(BlockDriverState *bs)
3476 {
3477     BlockDriver *drv = bs->drv;
3478     if (!drv)
3479         return -ENOMEDIUM;
3480 
3481     if (drv->has_variable_length) {
3482         int ret = refresh_total_sectors(bs, bs->total_sectors);
3483         if (ret < 0) {
3484             return ret;
3485         }
3486     }
3487     return bs->total_sectors * BDRV_SECTOR_SIZE;
3488 }
3489 
3490 /* Return 0 as the number of sectors if no device is present or on error */
3491 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3492 {
3493     int64_t length;
3494     length = bdrv_getlength(bs);
3495     if (length < 0)
3496         length = 0;
3497     else
3498         length = length >> BDRV_SECTOR_BITS;
3499     *nb_sectors_ptr = length;
3500 }
3501 
3502 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3503                        BlockdevOnError on_write_error)
3504 {
3505     bs->on_read_error = on_read_error;
3506     bs->on_write_error = on_write_error;
3507 }
3508 
3509 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3510 {
3511     return is_read ? bs->on_read_error : bs->on_write_error;
3512 }
3513 
3514 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3515 {
3516     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3517 
3518     switch (on_err) {
3519     case BLOCKDEV_ON_ERROR_ENOSPC:
3520         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3521     case BLOCKDEV_ON_ERROR_STOP:
3522         return BDRV_ACTION_STOP;
3523     case BLOCKDEV_ON_ERROR_REPORT:
3524         return BDRV_ACTION_REPORT;
3525     case BLOCKDEV_ON_ERROR_IGNORE:
3526         return BDRV_ACTION_IGNORE;
3527     default:
3528         abort();
3529     }
3530 }
3531 
3532 /* This is done by device models because, while the block layer knows
3533  * about the error, it does not know whether an operation comes from
3534  * the device or the block layer (from a job, for example).
3535  */
3536 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3537                        bool is_read, int error)
3538 {
3539     assert(error >= 0);
3540     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3541     if (action == BDRV_ACTION_STOP) {
3542         vm_stop(RUN_STATE_IO_ERROR);
3543         bdrv_iostatus_set_err(bs, error);
3544     }
3545 }
3546 
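/*
 * Example (sketch): how a device model typically combines the two helpers
 * above when a request fails.  "MyDevice" and "my_dev_complete" are
 * hypothetical names; "ret" is the negative errno from the block layer.
 *
 *     static void my_dev_handle_rw_error(MyDevice *s, bool is_read, int ret)
 *     {
 *         BlockErrorAction action = bdrv_get_error_action(s->bs, is_read, -ret);
 *
 *         bdrv_error_action(s->bs, action, is_read, -ret);
 *         if (action == BDRV_ACTION_REPORT) {
 *             my_dev_complete(s, ret);    // surface the error to the guest
 *         }
 *     }
 */
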
3547 int bdrv_is_read_only(BlockDriverState *bs)
3548 {
3549     return bs->read_only;
3550 }
3551 
3552 int bdrv_is_sg(BlockDriverState *bs)
3553 {
3554     return bs->sg;
3555 }
3556 
3557 int bdrv_enable_write_cache(BlockDriverState *bs)
3558 {
3559     return bs->enable_write_cache;
3560 }
3561 
3562 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3563 {
3564     bs->enable_write_cache = wce;
3565 
3566     /* so a reopen() will preserve wce */
3567     if (wce) {
3568         bs->open_flags |= BDRV_O_CACHE_WB;
3569     } else {
3570         bs->open_flags &= ~BDRV_O_CACHE_WB;
3571     }
3572 }
3573 
3574 int bdrv_is_encrypted(BlockDriverState *bs)
3575 {
3576     if (bs->backing_hd && bs->backing_hd->encrypted)
3577         return 1;
3578     return bs->encrypted;
3579 }
3580 
3581 int bdrv_key_required(BlockDriverState *bs)
3582 {
3583     BlockDriverState *backing_hd = bs->backing_hd;
3584 
3585     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3586         return 1;
3587     return (bs->encrypted && !bs->valid_key);
3588 }
3589 
3590 int bdrv_set_key(BlockDriverState *bs, const char *key)
3591 {
3592     int ret;
3593     if (bs->backing_hd && bs->backing_hd->encrypted) {
3594         ret = bdrv_set_key(bs->backing_hd, key);
3595         if (ret < 0)
3596             return ret;
3597         if (!bs->encrypted)
3598             return 0;
3599     }
3600     if (!bs->encrypted) {
3601         return -EINVAL;
3602     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3603         return -ENOMEDIUM;
3604     }
3605     ret = bs->drv->bdrv_set_key(bs, key);
3606     if (ret < 0) {
3607         bs->valid_key = 0;
3608     } else if (!bs->valid_key) {
3609         bs->valid_key = 1;
3610         /* call the change callback now, we skipped it on open */
3611         bdrv_dev_change_media_cb(bs, true);
3612     }
3613     return ret;
3614 }
3615 
3616 const char *bdrv_get_format_name(BlockDriverState *bs)
3617 {
3618     return bs->drv ? bs->drv->format_name : NULL;
3619 }
3620 
3621 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3622                          void *opaque)
3623 {
3624     BlockDriver *drv;
3625     int count = 0;
3626     const char **formats = NULL;
3627 
3628     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3629         if (drv->format_name) {
3630             bool found = false;
3631             int i = count;
3632             while (formats && i && !found) {
3633                 found = !strcmp(formats[--i], drv->format_name);
3634             }
3635 
3636             if (!found) {
3637                 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3638                 formats[count++] = drv->format_name;
3639                 it(opaque, drv->format_name);
3640             }
3641         }
3642     }
3643     g_free(formats);
3644 }
3645 
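/*
 * Example (sketch): listing every registered format name, e.g. for help
 * output.  "print_format" is a hypothetical callback.
 *
 *     static void print_format(void *opaque, const char *name)
 *     {
 *         printf(" %s", name);
 *     }
 *
 *     bdrv_iterate_format(print_format, NULL);
 */
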
3646 /* Find a block backend (BlockDriverState) by its device name */
3647 BlockDriverState *bdrv_find(const char *name)
3648 {
3649     BlockDriverState *bs;
3650 
3651     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3652         if (!strcmp(name, bs->device_name)) {
3653             return bs;
3654         }
3655     }
3656     return NULL;
3657 }
3658 
3659 /* Find a node in the graph of named BDS nodes by its node name */
3660 BlockDriverState *bdrv_find_node(const char *node_name)
3661 {
3662     BlockDriverState *bs;
3663 
3664     assert(node_name);
3665 
3666     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3667         if (!strcmp(node_name, bs->node_name)) {
3668             return bs;
3669         }
3670     }
3671     return NULL;
3672 }
3673 
3674 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3675 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3676 {
3677     BlockDeviceInfoList *list, *entry;
3678     BlockDriverState *bs;
3679 
3680     list = NULL;
3681     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3682         entry = g_malloc0(sizeof(*entry));
3683         entry->value = bdrv_block_device_info(bs);
3684         entry->next = list;
3685         list = entry;
3686     }
3687 
3688     return list;
3689 }
3690 
3691 BlockDriverState *bdrv_lookup_bs(const char *device,
3692                                  const char *node_name,
3693                                  Error **errp)
3694 {
3695     BlockDriverState *bs = NULL;
3696 
3697     if (device) {
3698         bs = bdrv_find(device);
3699 
3700         if (bs) {
3701             return bs;
3702         }
3703     }
3704 
3705     if (node_name) {
3706         bs = bdrv_find_node(node_name);
3707 
3708         if (bs) {
3709             return bs;
3710         }
3711     }
3712 
3713         error_setg(errp, "Cannot find device=%s or node_name=%s",
3714                      device ? device : "",
3715                      node_name ? node_name : "");
3716     return NULL;
3717 }
3718 
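/*
 * Example (sketch): resolving a reference that may name either a backend
 * or a node; "ide0-hd0" is a hypothetical device name.
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs("ide0-hd0", NULL, &local_err);
 *
 *     if (!bs) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */
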
3719 BlockDriverState *bdrv_next(BlockDriverState *bs)
3720 {
3721     if (!bs) {
3722         return QTAILQ_FIRST(&bdrv_states);
3723     }
3724     return QTAILQ_NEXT(bs, device_list);
3725 }
3726 
3727 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3728 {
3729     BlockDriverState *bs;
3730 
3731     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3732         it(opaque, bs);
3733     }
3734 }
3735 
3736 const char *bdrv_get_device_name(BlockDriverState *bs)
3737 {
3738     return bs->device_name;
3739 }
3740 
3741 int bdrv_get_flags(BlockDriverState *bs)
3742 {
3743     return bs->open_flags;
3744 }
3745 
3746 int bdrv_flush_all(void)
3747 {
3748     BlockDriverState *bs;
3749     int result = 0;
3750 
3751     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3752         int ret = bdrv_flush(bs);
3753         if (ret < 0 && !result) {
3754             result = ret;
3755         }
3756     }
3757 
3758     return result;
3759 }
3760 
3761 int bdrv_has_zero_init_1(BlockDriverState *bs)
3762 {
3763     return 1;
3764 }
3765 
3766 int bdrv_has_zero_init(BlockDriverState *bs)
3767 {
3768     assert(bs->drv);
3769 
3770     /* If BS is a copy-on-write image, it is initialized to
3771        the contents of the base image, which may not be zeroes.  */
3772     if (bs->backing_hd) {
3773         return 0;
3774     }
3775     if (bs->drv->bdrv_has_zero_init) {
3776         return bs->drv->bdrv_has_zero_init(bs);
3777     }
3778 
3779     /* safe default */
3780     return 0;
3781 }
3782 
3783 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3784 {
3785     BlockDriverInfo bdi;
3786 
3787     if (bs->backing_hd) {
3788         return false;
3789     }
3790 
3791     if (bdrv_get_info(bs, &bdi) == 0) {
3792         return bdi.unallocated_blocks_are_zero;
3793     }
3794 
3795     return false;
3796 }
3797 
3798 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3799 {
3800     BlockDriverInfo bdi;
3801 
3802     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3803         return false;
3804     }
3805 
3806     if (bdrv_get_info(bs, &bdi) == 0) {
3807         return bdi.can_write_zeroes_with_unmap;
3808     }
3809 
3810     return false;
3811 }
3812 
3813 typedef struct BdrvCoGetBlockStatusData {
3814     BlockDriverState *bs;
3815     BlockDriverState *base;
3816     int64_t sector_num;
3817     int nb_sectors;
3818     int *pnum;
3819     int64_t ret;
3820     bool done;
3821 } BdrvCoGetBlockStatusData;
3822 
3823 /*
3824  * Returns the allocation status of the specified sectors as BDRV_BLOCK_*
3825  * flags. Drivers not implementing the functionality are assumed to not
3826  * support backing files, hence all their sectors are reported as allocated.
3827  *
3828  * If 'sector_num' is beyond the end of the disk image the return value is 0
3829  * and 'pnum' is set to 0.
3830  *
3831  * 'pnum' is set to the number of sectors (including and immediately following
3832  * the specified sector) that are known to be in the same
3833  * allocated/unallocated state.
3834  *
3835  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3836  * beyond the end of the disk image it will be clamped.
3837  */
3838 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3839                                                      int64_t sector_num,
3840                                                      int nb_sectors, int *pnum)
3841 {
3842     int64_t length;
3843     int64_t n;
3844     int64_t ret, ret2;
3845 
3846     length = bdrv_getlength(bs);
3847     if (length < 0) {
3848         return length;
3849     }
3850 
3851     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3852         *pnum = 0;
3853         return 0;
3854     }
3855 
3856     n = bs->total_sectors - sector_num;
3857     if (n < nb_sectors) {
3858         nb_sectors = n;
3859     }
3860 
3861     if (!bs->drv->bdrv_co_get_block_status) {
3862         *pnum = nb_sectors;
3863         ret = BDRV_BLOCK_DATA;
3864         if (bs->drv->protocol_name) {
3865             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3866         }
3867         return ret;
3868     }
3869 
3870     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3871     if (ret < 0) {
3872         *pnum = 0;
3873         return ret;
3874     }
3875 
3876     if (ret & BDRV_BLOCK_RAW) {
3877         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3878         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3879                                      *pnum, pnum);
3880     }
3881 
3882     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3883         if (bdrv_unallocated_blocks_are_zero(bs)) {
3884             ret |= BDRV_BLOCK_ZERO;
3885         } else if (bs->backing_hd) {
3886             BlockDriverState *bs2 = bs->backing_hd;
3887             int64_t length2 = bdrv_getlength(bs2);
3888             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3889                 ret |= BDRV_BLOCK_ZERO;
3890             }
3891         }
3892     }
3893 
3894     if (bs->file &&
3895         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3896         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3897         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3898                                         *pnum, pnum);
3899         if (ret2 >= 0) {
3900             /* Ignore errors.  This is just providing extra information, it
3901              * is useful but not necessary.
3902              */
3903             ret |= (ret2 & BDRV_BLOCK_ZERO);
3904         }
3905     }
3906 
3907     return ret;
3908 }
3909 
3910 /* Coroutine wrapper for bdrv_get_block_status() */
3911 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3912 {
3913     BdrvCoGetBlockStatusData *data = opaque;
3914     BlockDriverState *bs = data->bs;
3915 
3916     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3917                                          data->pnum);
3918     data->done = true;
3919 }
3920 
3921 /*
3922  * Synchronous wrapper around bdrv_co_get_block_status().
3923  *
3924  * See bdrv_co_get_block_status() for details.
3925  */
3926 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3927                               int nb_sectors, int *pnum)
3928 {
3929     Coroutine *co;
3930     BdrvCoGetBlockStatusData data = {
3931         .bs = bs,
3932         .sector_num = sector_num,
3933         .nb_sectors = nb_sectors,
3934         .pnum = pnum,
3935         .done = false,
3936     };
3937 
3938     if (qemu_in_coroutine()) {
3939         /* Fast-path if already in coroutine context */
3940         bdrv_get_block_status_co_entry(&data);
3941     } else {
3942         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3943         qemu_coroutine_enter(co, &data);
3944         while (!data.done) {
3945             qemu_aio_wait();
3946         }
3947     }
3948     return data.ret;
3949 }
3950 
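/*
 * Example (sketch): walking a whole image and classifying each extent,
 * in the style of "qemu-img map".  The *pnum output lets the loop skip
 * over runs of sectors that share the same state.
 *
 *     int64_t sector = 0;
 *     int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 *
 *     while (sector < total) {
 *         int num;
 *         int64_t ret = bdrv_get_block_status(bs, sector,
 *                                             MIN(total - sector, INT_MAX),
 *                                             &num);
 *         if (ret < 0) {
 *             break;                           // I/O error
 *         }
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             // [sector, sector + num) reads as zeroes
 *         } else if (ret & BDRV_BLOCK_DATA) {
 *             // [sector, sector + num) is allocated in this layer
 *         }
 *         sector += num;
 *     }
 */
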
3951 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3952                                    int nb_sectors, int *pnum)
3953 {
3954     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3955     if (ret < 0) {
3956         return ret;
3957     }
3958     return
3959         (ret & BDRV_BLOCK_DATA) ||
3960         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3961 }
3962 
3963 /*
3964  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3965  *
3966  * Return true if the given sector is allocated in any image between
3967  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3968  * sector is allocated in any image of the chain.  Return false otherwise.
3969  *
3970  * 'pnum' is set to the number of sectors (including and immediately following
3971  *  the specified sector) that are known to be in the same
3972  *  allocated/unallocated state.
3973  *
3974  */
3975 int bdrv_is_allocated_above(BlockDriverState *top,
3976                             BlockDriverState *base,
3977                             int64_t sector_num,
3978                             int nb_sectors, int *pnum)
3979 {
3980     BlockDriverState *intermediate;
3981     int ret, n = nb_sectors;
3982 
3983     intermediate = top;
3984     while (intermediate && intermediate != base) {
3985         int pnum_inter;
3986         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3987                                 &pnum_inter);
3988         if (ret < 0) {
3989             return ret;
3990         } else if (ret) {
3991             *pnum = pnum_inter;
3992             return 1;
3993         }
3994 
3995         /*
3996          * [sector_num, nb_sectors] is unallocated on top but intermediate
3997          * might have
3998          *
3999          * [sector_num+x, nb_sectors] allocated.
4000          */
4001         if (n > pnum_inter &&
4002             (intermediate == top ||
4003              sector_num + pnum_inter < intermediate->total_sectors)) {
4004             n = pnum_inter;
4005         }
4006 
4007         intermediate = intermediate->backing_hd;
4008     }
4009 
4010     *pnum = n;
4011     return 0;
4012 }
4013 
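/*
 * Example (sketch): a streaming job deciding what to copy.  Only sectors
 * allocated somewhere between 'top' and 'base' need to be copied; the
 * rest are already visible through 'base'.
 *
 *     int n;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors, &n);
 *
 *     if (ret < 0) {
 *         // error
 *     } else if (ret) {
 *         // copy [sector_num, sector_num + n)
 *     } else {
 *         // skip n sectors; they are backed by 'base'
 *     }
 */
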
4014 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4015 {
4016     if (bs->backing_hd && bs->backing_hd->encrypted)
4017         return bs->backing_file;
4018     else if (bs->encrypted)
4019         return bs->filename;
4020     else
4021         return NULL;
4022 }
4023 
4024 void bdrv_get_backing_filename(BlockDriverState *bs,
4025                                char *filename, int filename_size)
4026 {
4027     pstrcpy(filename, filename_size, bs->backing_file);
4028 }
4029 
4030 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4031                           const uint8_t *buf, int nb_sectors)
4032 {
4033     BlockDriver *drv = bs->drv;
4034     if (!drv)
4035         return -ENOMEDIUM;
4036     if (!drv->bdrv_write_compressed)
4037         return -ENOTSUP;
4038     if (bdrv_check_request(bs, sector_num, nb_sectors))
4039         return -EIO;
4040 
4041     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4042 
4043     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4044 }
4045 
4046 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4047 {
4048     BlockDriver *drv = bs->drv;
4049     if (!drv)
4050         return -ENOMEDIUM;
4051     if (!drv->bdrv_get_info)
4052         return -ENOTSUP;
4053     memset(bdi, 0, sizeof(*bdi));
4054     return drv->bdrv_get_info(bs, bdi);
4055 }
4056 
4057 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4058 {
4059     BlockDriver *drv = bs->drv;
4060     if (drv && drv->bdrv_get_specific_info) {
4061         return drv->bdrv_get_specific_info(bs);
4062     }
4063     return NULL;
4064 }
4065 
4066 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4067                       int64_t pos, int size)
4068 {
4069     QEMUIOVector qiov;
4070     struct iovec iov = {
4071         .iov_base   = (void *) buf,
4072         .iov_len    = size,
4073     };
4074 
4075     qemu_iovec_init_external(&qiov, &iov, 1);
4076     return bdrv_writev_vmstate(bs, &qiov, pos);
4077 }
4078 
4079 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4080 {
4081     BlockDriver *drv = bs->drv;
4082 
4083     if (!drv) {
4084         return -ENOMEDIUM;
4085     } else if (drv->bdrv_save_vmstate) {
4086         return drv->bdrv_save_vmstate(bs, qiov, pos);
4087     } else if (bs->file) {
4088         return bdrv_writev_vmstate(bs->file, qiov, pos);
4089     }
4090 
4091     return -ENOTSUP;
4092 }
4093 
4094 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4095                       int64_t pos, int size)
4096 {
4097     BlockDriver *drv = bs->drv;
4098     if (!drv)
4099         return -ENOMEDIUM;
4100     if (drv->bdrv_load_vmstate)
4101         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4102     if (bs->file)
4103         return bdrv_load_vmstate(bs->file, buf, pos, size);
4104     return -ENOTSUP;
4105 }
4106 
4107 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4108 {
4109     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4110         return;
4111     }
4112 
4113     bs->drv->bdrv_debug_event(bs, event);
4114 }
4115 
4116 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4117                           const char *tag)
4118 {
4119     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4120         bs = bs->file;
4121     }
4122 
4123     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4124         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4125     }
4126 
4127     return -ENOTSUP;
4128 }
4129 
4130 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4131 {
4132     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4133         bs = bs->file;
4134     }
4135 
4136     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4137         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4138     }
4139 
4140     return -ENOTSUP;
4141 }
4142 
4143 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4144 {
4145     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4146         bs = bs->file;
4147     }
4148 
4149     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4150         return bs->drv->bdrv_debug_resume(bs, tag);
4151     }
4152 
4153     return -ENOTSUP;
4154 }
4155 
4156 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4157 {
4158     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4159         bs = bs->file;
4160     }
4161 
4162     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4163         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4164     }
4165 
4166     return false;
4167 }
4168 
4169 int bdrv_is_snapshot(BlockDriverState *bs)
4170 {
4171     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4172 }
4173 
4174 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4175  * relative, it must be relative to the chain.  So, passing in bs->filename
4176  * from a BDS as backing_file should not be done, as that may be relative to
4177  * the CWD rather than the chain. */
4178 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4179         const char *backing_file)
4180 {
4181     char *filename_full = NULL;
4182     char *backing_file_full = NULL;
4183     char *filename_tmp = NULL;
4184     int is_protocol = 0;
4185     BlockDriverState *curr_bs = NULL;
4186     BlockDriverState *retval = NULL;
4187 
4188     if (!bs || !bs->drv || !backing_file) {
4189         return NULL;
4190     }
4191 
4192     filename_full     = g_malloc(PATH_MAX);
4193     backing_file_full = g_malloc(PATH_MAX);
4194     filename_tmp      = g_malloc(PATH_MAX);
4195 
4196     is_protocol = path_has_protocol(backing_file);
4197 
4198     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4199 
4200         /* If either of the filename paths is actually a protocol, then
4201          * compare unmodified paths; otherwise make paths relative */
4202         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4203             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4204                 retval = curr_bs->backing_hd;
4205                 break;
4206             }
4207         } else {
4208             /* If not an absolute filename path, make it relative to the current
4209              * image's filename path */
4210             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4211                          backing_file);
4212 
4213             /* We are going to compare absolute pathnames */
4214             if (!realpath(filename_tmp, filename_full)) {
4215                 continue;
4216             }
4217 
4218             /* We need to make sure the backing filename we are comparing against
4219              * is relative to the current image filename (or absolute) */
4220             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4221                          curr_bs->backing_file);
4222 
4223             if (!realpath(filename_tmp, backing_file_full)) {
4224                 continue;
4225             }
4226 
4227             if (strcmp(backing_file_full, filename_full) == 0) {
4228                 retval = curr_bs->backing_hd;
4229                 break;
4230             }
4231         }
4232     }
4233 
4234     g_free(filename_full);
4235     g_free(backing_file_full);
4236     g_free(filename_tmp);
4237     return retval;
4238 }
4239 
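/*
 * Example: given the chain /vm/base.qcow2 <- /vm/mid.qcow2 <- /vm/top.qcow2,
 * where top's recorded backing file is the relative name "mid.qcow2",
 * bdrv_find_backing_image(top, "mid.qcow2") combines both names with top's
 * directory, canonicalizes them with realpath(), and returns the BDS for
 * /vm/mid.qcow2.  An absolute backing_file argument is resolved the same
 * way, so "/vm/mid.qcow2" matches as well.
 */
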
4240 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4241 {
4242     if (!bs->drv) {
4243         return 0;
4244     }
4245 
4246     if (!bs->backing_hd) {
4247         return 0;
4248     }
4249 
4250     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4251 }
4252 
4253 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4254 {
4255     BlockDriverState *curr_bs = NULL;
4256 
4257     if (!bs) {
4258         return NULL;
4259     }
4260 
4261     curr_bs = bs;
4262 
4263     while (curr_bs->backing_hd) {
4264         curr_bs = curr_bs->backing_hd;
4265     }
4266     return curr_bs;
4267 }
4268 
4269 /**************************************************************/
4270 /* async I/Os */
4271 
4272 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4273                                  QEMUIOVector *qiov, int nb_sectors,
4274                                  BlockDriverCompletionFunc *cb, void *opaque)
4275 {
4276     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4277 
4278     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4279                                  cb, opaque, false);
4280 }
4281 
4282 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4283                                   QEMUIOVector *qiov, int nb_sectors,
4284                                   BlockDriverCompletionFunc *cb, void *opaque)
4285 {
4286     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4287 
4288     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4289                                  cb, opaque, true);
4290 }
4291 
4292 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4293         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4294         BlockDriverCompletionFunc *cb, void *opaque)
4295 {
4296     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4297 
4298     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4299                                  BDRV_REQ_ZERO_WRITE | flags,
4300                                  cb, opaque, true);
4301 }
4302 
4303 
4304 typedef struct MultiwriteCB {
4305     int error;
4306     int num_requests;
4307     int num_callbacks;
4308     struct {
4309         BlockDriverCompletionFunc *cb;
4310         void *opaque;
4311         QEMUIOVector *free_qiov;
4312     } callbacks[];
4313 } MultiwriteCB;
4314 
4315 static void multiwrite_user_cb(MultiwriteCB *mcb)
4316 {
4317     int i;
4318 
4319     for (i = 0; i < mcb->num_callbacks; i++) {
4320         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4321         if (mcb->callbacks[i].free_qiov) {
4322             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4323         }
4324         g_free(mcb->callbacks[i].free_qiov);
4325     }
4326 }
4327 
4328 static void multiwrite_cb(void *opaque, int ret)
4329 {
4330     MultiwriteCB *mcb = opaque;
4331 
4332     trace_multiwrite_cb(mcb, ret);
4333 
4334     if (ret < 0 && !mcb->error) {
4335         mcb->error = ret;
4336     }
4337 
4338     mcb->num_requests--;
4339     if (mcb->num_requests == 0) {
4340         multiwrite_user_cb(mcb);
4341         g_free(mcb);
4342     }
4343 }
4344 
4345 static int multiwrite_req_compare(const void *a, const void *b)
4346 {
4347     const BlockRequest *req1 = a, *req2 = b;
4348 
4349     /*
4350      * Note that we can't simply subtract req2->sector from req1->sector
4351      * here as that could overflow the return value.
4352      */
4353     if (req1->sector > req2->sector) {
4354         return 1;
4355     } else if (req1->sector < req2->sector) {
4356         return -1;
4357     } else {
4358         return 0;
4359     }
4360 }
4361 
4362 /*
4363  * Takes a bunch of requests and tries to merge them. Returns the number of
4364  * requests that remain after merging.
4365  */
4366 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4367     int num_reqs, MultiwriteCB *mcb)
4368 {
4369     int i, outidx;
4370 
4371     // Sort requests by start sector
4372     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4373 
4374     // Check if adjacent requests touch the same clusters. Only exactly
4375     // sequential or overlapping writes are combined; gaps are never merged.
4376     outidx = 0;
4377     for (i = 1; i < num_reqs; i++) {
4378         int merge = 0;
4379         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4380 
4381         // Handle exactly sequential writes and overlapping writes.
4382         if (reqs[i].sector <= oldreq_last) {
4383             merge = 1;
4384         }
4385 
4386         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4387             merge = 0;
4388         }
4389 
4390         if (merge) {
4391             size_t size;
4392             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4393             qemu_iovec_init(qiov,
4394                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4395 
4396             // Add the first request to the merged one. If the requests are
4397             // overlapping, drop the last sectors of the first request.
4398             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4399             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4400 
4401             // We should need to add any zeros between the two requests
4402             // We shouldn't need to add any zeros between the two requests
4403             assert(reqs[i].sector <= oldreq_last);
4404             // Add the second request
4405             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4406 
4407             reqs[outidx].nb_sectors = qiov->size >> 9;
4408             reqs[outidx].qiov = qiov;
4409 
4410             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4411         } else {
4412             outidx++;
4413             reqs[outidx].sector     = reqs[i].sector;
4414             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4415             reqs[outidx].qiov       = reqs[i].qiov;
4416         }
4417     }
4418 
4419     return outidx + 1;
4420 }
4421 
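/*
 * Example: with 512-byte sectors, the requests
 *     { .sector = 0, .nb_sectors = 8 }  and  { .sector = 8, .nb_sectors = 8 }
 * are exactly sequential (8 <= 0 + 8), so they merge into a single request
 * { .sector = 0, .nb_sectors = 16 } whose qiov is the concatenation of the
 * two original vectors.
 */
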
4422 /*
4423  * Submit multiple AIO write requests at once.
4424  *
4425  * On success, the function returns 0 and all requests in the reqs array have
4426  * been submitted. In the error case this function returns -1, and any of the
4427  * requests may or may not have been submitted yet; the callback will be
4428  * called for some of the requests but not for others. The caller must check
4429  * the error field of each BlockRequest to know which callbacks to wait for
4430  * (if error != 0, no callback will be called for that request).
4431  *
4432  * The implementation may modify the contents of the reqs array, e.g. to merge
4433  * requests. However, the fields opaque and error are left unmodified as they
4434  * are used to signal failure for a single request to the caller.
4435  */
4436 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4437 {
4438     MultiwriteCB *mcb;
4439     int i;
4440 
4441     /* don't submit writes if we don't have a medium */
4442     if (bs->drv == NULL) {
4443         for (i = 0; i < num_reqs; i++) {
4444             reqs[i].error = -ENOMEDIUM;
4445         }
4446         return -1;
4447     }
4448 
4449     if (num_reqs == 0) {
4450         return 0;
4451     }
4452 
4453     // Create MultiwriteCB structure
4454     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4455     mcb->num_requests = 0;
4456     mcb->num_callbacks = num_reqs;
4457 
4458     for (i = 0; i < num_reqs; i++) {
4459         mcb->callbacks[i].cb = reqs[i].cb;
4460         mcb->callbacks[i].opaque = reqs[i].opaque;
4461     }
4462 
4463     // Check for mergeable requests
4464     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4465 
4466     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4467 
4468     /* Run the aio requests. */
4469     mcb->num_requests = num_reqs;
4470     for (i = 0; i < num_reqs; i++) {
4471         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4472                               reqs[i].nb_sectors, reqs[i].flags,
4473                               multiwrite_cb, mcb,
4474                               true);
4475     }
4476 
4477     return 0;
4478 }
4479 
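/*
 * Example (sketch): submitting two writes in one call, similar to how
 * virtio-blk batches guest requests.  "write_done", "req0" and "req1"
 * are hypothetical; each request carries its own cb/opaque pair.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = write_done, .opaque = req0 },
 *         { .sector = 16, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = write_done, .opaque = req1 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // inspect reqs[i].error to see which callbacks will still run
 *     }
 */
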
4480 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4481 {
4482     acb->aiocb_info->cancel(acb);
4483 }
4484 
4485 /**************************************************************/
4486 /* async block device emulation */
4487 
4488 typedef struct BlockDriverAIOCBSync {
4489     BlockDriverAIOCB common;
4490     QEMUBH *bh;
4491     int ret;
4492     /* vector translation state */
4493     QEMUIOVector *qiov;
4494     uint8_t *bounce;
4495     int is_write;
4496 } BlockDriverAIOCBSync;
4497 
4498 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4499 {
4500     BlockDriverAIOCBSync *acb =
4501         container_of(blockacb, BlockDriverAIOCBSync, common);
4502     qemu_bh_delete(acb->bh);
4503     acb->bh = NULL;
4504     qemu_aio_release(acb);
4505 }
4506 
4507 static const AIOCBInfo bdrv_em_aiocb_info = {
4508     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4509     .cancel             = bdrv_aio_cancel_em,
4510 };
4511 
4512 static void bdrv_aio_bh_cb(void *opaque)
4513 {
4514     BlockDriverAIOCBSync *acb = opaque;
4515 
4516     if (!acb->is_write)
4517         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4518     qemu_vfree(acb->bounce);
4519     acb->common.cb(acb->common.opaque, acb->ret);
4520     qemu_bh_delete(acb->bh);
4521     acb->bh = NULL;
4522     qemu_aio_release(acb);
4523 }
4524 
4525 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4526                                             int64_t sector_num,
4527                                             QEMUIOVector *qiov,
4528                                             int nb_sectors,
4529                                             BlockDriverCompletionFunc *cb,
4530                                             void *opaque,
4531                                             int is_write)
4532 
4533 {
4534     BlockDriverAIOCBSync *acb;
4535 
4536     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4537     acb->is_write = is_write;
4538     acb->qiov = qiov;
4539     acb->bounce = qemu_blockalign(bs, qiov->size);
4540     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4541 
4542     if (is_write) {
4543         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4544         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4545     } else {
4546         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4547     }
4548 
4549     qemu_bh_schedule(acb->bh);
4550 
4551     return &acb->common;
4552 }
4553 
4554 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4555         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4556         BlockDriverCompletionFunc *cb, void *opaque)
4557 {
4558     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4559 }
4560 
4561 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4562         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4563         BlockDriverCompletionFunc *cb, void *opaque)
4564 {
4565     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4566 }
4567 
4568 
4569 typedef struct BlockDriverAIOCBCoroutine {
4570     BlockDriverAIOCB common;
4571     BlockRequest req;
4572     bool is_write;
4573     bool *done;
4574     QEMUBH* bh;
4575 } BlockDriverAIOCBCoroutine;
4576 
4577 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4578 {
4579     BlockDriverAIOCBCoroutine *acb =
4580         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4581     bool done = false;
4582 
4583     acb->done = &done;
4584     while (!done) {
4585         qemu_aio_wait();
4586     }
4587 }
4588 
4589 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4590     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4591     .cancel             = bdrv_aio_co_cancel_em,
4592 };
4593 
4594 static void bdrv_co_em_bh(void *opaque)
4595 {
4596     BlockDriverAIOCBCoroutine *acb = opaque;
4597 
4598     acb->common.cb(acb->common.opaque, acb->req.error);
4599 
4600     if (acb->done) {
4601         *acb->done = true;
4602     }
4603 
4604     qemu_bh_delete(acb->bh);
4605     qemu_aio_release(acb);
4606 }
4607 
4608 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4609 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4610 {
4611     BlockDriverAIOCBCoroutine *acb = opaque;
4612     BlockDriverState *bs = acb->common.bs;
4613 
4614     if (!acb->is_write) {
4615         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4616             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4617     } else {
4618         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4619             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4620     }
4621 
4622     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4623     qemu_bh_schedule(acb->bh);
4624 }
4625 
4626 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4627                                                int64_t sector_num,
4628                                                QEMUIOVector *qiov,
4629                                                int nb_sectors,
4630                                                BdrvRequestFlags flags,
4631                                                BlockDriverCompletionFunc *cb,
4632                                                void *opaque,
4633                                                bool is_write)
4634 {
4635     Coroutine *co;
4636     BlockDriverAIOCBCoroutine *acb;
4637 
4638     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4639     acb->req.sector = sector_num;
4640     acb->req.nb_sectors = nb_sectors;
4641     acb->req.qiov = qiov;
4642     acb->req.flags = flags;
4643     acb->is_write = is_write;
4644     acb->done = NULL;
4645 
4646     co = qemu_coroutine_create(bdrv_co_do_rw);
4647     qemu_coroutine_enter(co, acb);
4648 
4649     return &acb->common;
4650 }
4651 
4652 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4653 {
4654     BlockDriverAIOCBCoroutine *acb = opaque;
4655     BlockDriverState *bs = acb->common.bs;
4656 
4657     acb->req.error = bdrv_co_flush(bs);
4658     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4659     qemu_bh_schedule(acb->bh);
4660 }
4661 
4662 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4663         BlockDriverCompletionFunc *cb, void *opaque)
4664 {
4665     trace_bdrv_aio_flush(bs, opaque);
4666 
4667     Coroutine *co;
4668     BlockDriverAIOCBCoroutine *acb;
4669 
4670     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4671     acb->done = NULL;
4672 
4673     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4674     qemu_coroutine_enter(co, acb);
4675 
4676     return &acb->common;
4677 }
4678 
4679 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4680 {
4681     BlockDriverAIOCBCoroutine *acb = opaque;
4682     BlockDriverState *bs = acb->common.bs;
4683 
4684     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4685     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4686     qemu_bh_schedule(acb->bh);
4687 }
4688 
4689 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4690         int64_t sector_num, int nb_sectors,
4691         BlockDriverCompletionFunc *cb, void *opaque)
4692 {
4693     Coroutine *co;
4694     BlockDriverAIOCBCoroutine *acb;
4695 
4696     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4697 
4698     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4699     acb->req.sector = sector_num;
4700     acb->req.nb_sectors = nb_sectors;
4701     acb->done = NULL;
4702     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4703     qemu_coroutine_enter(co, acb);
4704 
4705     return &acb->common;
4706 }
4707 
4708 void bdrv_init(void)
4709 {
4710     module_call_init(MODULE_INIT_BLOCK);
4711 }
4712 
4713 void bdrv_init_with_whitelist(void)
4714 {
4715     use_bdrv_whitelist = 1;
4716     bdrv_init();
4717 }
4718 
4719 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4720                    BlockDriverCompletionFunc *cb, void *opaque)
4721 {
4722     BlockDriverAIOCB *acb;
4723 
4724     acb = g_slice_alloc(aiocb_info->aiocb_size);
4725     acb->aiocb_info = aiocb_info;
4726     acb->bs = bs;
4727     acb->cb = cb;
4728     acb->opaque = opaque;
4729     return acb;
4730 }
4731 
4732 void qemu_aio_release(void *p)
4733 {
4734     BlockDriverAIOCB *acb = p;
4735     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4736 }
4737 
4738 /**************************************************************/
4739 /* Coroutine block device emulation */
4740 
4741 typedef struct CoroutineIOCompletion {
4742     Coroutine *coroutine;
4743     int ret;
4744 } CoroutineIOCompletion;
4745 
4746 static void bdrv_co_io_em_complete(void *opaque, int ret)
4747 {
4748     CoroutineIOCompletion *co = opaque;
4749 
4750     co->ret = ret;
4751     qemu_coroutine_enter(co->coroutine, NULL);
4752 }
4753 
4754 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4755                                       int nb_sectors, QEMUIOVector *iov,
4756                                       bool is_write)
4757 {
4758     CoroutineIOCompletion co = {
4759         .coroutine = qemu_coroutine_self(),
4760     };
4761     BlockDriverAIOCB *acb;
4762 
4763     if (is_write) {
4764         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4765                                        bdrv_co_io_em_complete, &co);
4766     } else {
4767         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4768                                       bdrv_co_io_em_complete, &co);
4769     }
4770 
4771     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4772     if (!acb) {
4773         return -EIO;
4774     }
4775     qemu_coroutine_yield();
4776 
4777     return co.ret;
4778 }
4779 
4780 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4781                                          int64_t sector_num, int nb_sectors,
4782                                          QEMUIOVector *iov)
4783 {
4784     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4785 }
4786 
4787 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4788                                          int64_t sector_num, int nb_sectors,
4789                                          QEMUIOVector *iov)
4790 {
4791     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4792 }
4793 
4794 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4795 {
4796     RwCo *rwco = opaque;
4797 
4798     rwco->ret = bdrv_co_flush(rwco->bs);
4799 }
4800 
4801 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4802 {
4803     int ret;
4804 
4805     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4806         return 0;
4807     }
4808 
4809     /* Write back cached data to the OS even with cache=unsafe */
4810     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4811     if (bs->drv->bdrv_co_flush_to_os) {
4812         ret = bs->drv->bdrv_co_flush_to_os(bs);
4813         if (ret < 0) {
4814             return ret;
4815         }
4816     }
4817 
4818     /* But don't actually force it to the disk with cache=unsafe */
4819     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4820         goto flush_parent;
4821     }
4822 
4823     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4824     if (bs->drv->bdrv_co_flush_to_disk) {
4825         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4826     } else if (bs->drv->bdrv_aio_flush) {
4827         BlockDriverAIOCB *acb;
4828         CoroutineIOCompletion co = {
4829             .coroutine = qemu_coroutine_self(),
4830         };
4831 
4832         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4833         if (acb == NULL) {
4834             ret = -EIO;
4835         } else {
4836             qemu_coroutine_yield();
4837             ret = co.ret;
4838         }
4839     } else {
4840         /*
4841          * Some block drivers always operate in either writethrough or unsafe
4842          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4843          * know how the server works (because the behaviour is hardcoded or
4844          * depends on server-side configuration), so we can't ensure that
4845          * everything is safe on disk. Returning an error doesn't work because
4846          * that would break guests even if the server operates in writethrough
4847          * mode.
4848          *
4849          * Let's hope the user knows what he's doing.
4850          */
4851         ret = 0;
4852     }
4853     if (ret < 0) {
4854         return ret;
4855     }
4856 
4857     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4858      * in the case of cache=unsafe, so there are no useless flushes.
4859      */
4860 flush_parent:
4861     return bdrv_co_flush(bs->file);
4862 }
4863 
4864 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4865 {
4866     Error *local_err = NULL;
4867     int ret;
4868 
4869     if (!bs->drv)  {
4870         return;
4871     }
4872 
4873     if (bs->drv->bdrv_invalidate_cache) {
4874         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4875     } else if (bs->file) {
4876         bdrv_invalidate_cache(bs->file, &local_err);
4877     }
4878     if (local_err) {
4879         error_propagate(errp, local_err);
4880         return;
4881     }
4882 
4883     ret = refresh_total_sectors(bs, bs->total_sectors);
4884     if (ret < 0) {
4885         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4886         return;
4887     }
4888 }
4889 
4890 void bdrv_invalidate_cache_all(Error **errp)
4891 {
4892     BlockDriverState *bs;
4893     Error *local_err = NULL;
4894 
4895     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4896         bdrv_invalidate_cache(bs, &local_err);
4897         if (local_err) {
4898             error_propagate(errp, local_err);
4899             return;
4900         }
4901     }
4902 }
4903 
4904 void bdrv_clear_incoming_migration_all(void)
4905 {
4906     BlockDriverState *bs;
4907 
4908     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4909         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4910     }
4911 }
4912 
4913 int bdrv_flush(BlockDriverState *bs)
4914 {
4915     Coroutine *co;
4916     RwCo rwco = {
4917         .bs = bs,
4918         .ret = NOT_DONE,
4919     };
4920 
4921     if (qemu_in_coroutine()) {
4922         /* Fast-path if already in coroutine context */
4923         bdrv_flush_co_entry(&rwco);
4924     } else {
4925         co = qemu_coroutine_create(bdrv_flush_co_entry);
4926         qemu_coroutine_enter(co, &rwco);
4927         while (rwco.ret == NOT_DONE) {
4928             qemu_aio_wait();
4929         }
4930     }
4931 
4932     return rwco.ret;
4933 }
4934 
4935 typedef struct DiscardCo {
4936     BlockDriverState *bs;
4937     int64_t sector_num;
4938     int nb_sectors;
4939     int ret;
4940 } DiscardCo;
4941 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4942 {
4943     DiscardCo *rwco = opaque;
4944 
4945     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4946 }
4947 
4948 /* If no limit is specified in the BlockLimits, use a default
4949  * of 32768 512-byte sectors (16 MiB) per request.
4950  */
4951 #define MAX_DISCARD_DEFAULT 32768
4952 
4953 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4954                                  int nb_sectors)
4955 {
4956     int max_discard;
4957 
4958     if (!bs->drv) {
4959         return -ENOMEDIUM;
4960     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4961         return -EIO;
4962     } else if (bs->read_only) {
4963         return -EROFS;
4964     }
4965 
4966     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4967 
4968     /* Do nothing if disabled.  */
4969     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4970         return 0;
4971     }
4972 
4973     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4974         return 0;
4975     }
4976 
4977     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4978     while (nb_sectors > 0) {
4979         int ret;
4980         int num = nb_sectors;
4981 
4982         /* align request */
4983         if (bs->bl.discard_alignment &&
4984             num >= bs->bl.discard_alignment &&
4985             sector_num % bs->bl.discard_alignment) {
4986             if (num > bs->bl.discard_alignment) {
4987                 num = bs->bl.discard_alignment;
4988             }
4989             num -= sector_num % bs->bl.discard_alignment;
4990         }
4991 
4992         /* limit request size */
4993         if (num > max_discard) {
4994             num = max_discard;
4995         }
4996 
4997         if (bs->drv->bdrv_co_discard) {
4998             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4999         } else {
5000             BlockDriverAIOCB *acb;
5001             CoroutineIOCompletion co = {
5002                 .coroutine = qemu_coroutine_self(),
5003             };
5004 
5005             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5006                                             bdrv_co_io_em_complete, &co);
5007             if (acb == NULL) {
5008                 return -EIO;
5009             } else {
5010                 qemu_coroutine_yield();
5011                 ret = co.ret;
5012             }
5013         }
5014         if (ret && ret != -ENOTSUP) {
5015             return ret;
5016         }
5017 
5018         sector_num += num;
5019         nb_sectors -= num;
5020     }
5021     return 0;
5022 }
5023 
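/*
 * Worked example for the alignment step above: with
 * bs->bl.discard_alignment = 8, a request for sector_num = 5,
 * nb_sectors = 32 is first trimmed to num = 8 - (5 % 8) = 3 sectors, so
 * the next iteration starts at the aligned sector 8; the remainder is
 * then issued in aligned chunks of at most max_discard sectors.
 */
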
5024 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5025 {
5026     Coroutine *co;
5027     DiscardCo rwco = {
5028         .bs = bs,
5029         .sector_num = sector_num,
5030         .nb_sectors = nb_sectors,
5031         .ret = NOT_DONE,
5032     };
5033 
5034     if (qemu_in_coroutine()) {
5035         /* Fast-path if already in coroutine context */
5036         bdrv_discard_co_entry(&rwco);
5037     } else {
5038         co = qemu_coroutine_create(bdrv_discard_co_entry);
5039         qemu_coroutine_enter(co, &rwco);
5040         while (rwco.ret == NOT_DONE) {
5041             qemu_aio_wait();
5042         }
5043     }
5044 
5045     return rwco.ret;
5046 }
5047 
5048 /**************************************************************/
5049 /* removable device support */
5050 
5051 /**
5052  * Return TRUE if the media is present
5053  */
5054 int bdrv_is_inserted(BlockDriverState *bs)
5055 {
5056     BlockDriver *drv = bs->drv;
5057 
5058     if (!drv)
5059         return 0;
5060     if (!drv->bdrv_is_inserted)
5061         return 1;
5062     return drv->bdrv_is_inserted(bs);
5063 }
5064 
5065 /**
5066  * Return whether the media changed since the last call to this
5067  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5068  */
5069 int bdrv_media_changed(BlockDriverState *bs)
5070 {
5071     BlockDriver *drv = bs->drv;
5072 
5073     if (drv && drv->bdrv_media_changed) {
5074         return drv->bdrv_media_changed(bs);
5075     }
5076     return -ENOTSUP;
5077 }
5078 
5079 /**
5080  * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5081  */
5082 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5083 {
5084     BlockDriver *drv = bs->drv;
5085 
5086     if (drv && drv->bdrv_eject) {
5087         drv->bdrv_eject(bs, eject_flag);
5088     }
5089 
5090     if (bs->device_name[0] != '\0') {
5091         bdrv_emit_qmp_eject_event(bs, eject_flag);
5092     }
5093 }
5094 
5095 /**
5096  * Lock or unlock the media (if it is locked, the user won't be able
5097  * to eject it manually).
5098  */
5099 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5100 {
5101     BlockDriver *drv = bs->drv;
5102 
5103     trace_bdrv_lock_medium(bs, locked);
5104 
5105     if (drv && drv->bdrv_lock_medium) {
5106         drv->bdrv_lock_medium(bs, locked);
5107     }
5108 }
5109 
5110 /* needed for generic scsi interface */
5111 
5112 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5113 {
5114     BlockDriver *drv = bs->drv;
5115 
5116     if (drv && drv->bdrv_ioctl)
5117         return drv->bdrv_ioctl(bs, req, buf);
5118     return -ENOTSUP;
5119 }
5120 
5121 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5122         unsigned long int req, void *buf,
5123         BlockDriverCompletionFunc *cb, void *opaque)
5124 {
5125     BlockDriver *drv = bs->drv;
5126 
5127     if (drv && drv->bdrv_aio_ioctl)
5128         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5129     return NULL;
5130 }
5131 
5132 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5133 {
5134     bs->guest_block_size = align;
5135 }
5136 
5137 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5138 {
5139     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5140 }
5141 
5142 /*
5143  * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5144  */
5145 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5146 {
5147     int i;
5148     size_t alignment = bdrv_opt_mem_align(bs);
5149 
5150     for (i = 0; i < qiov->niov; i++) {
5151         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5152             return false;
5153         }
5154         if (qiov->iov[i].iov_len % alignment) {
5155             return false;
5156         }
5157     }
5158 
5159     return true;
5160 }
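/* Editorial sketch (not part of the original source): building a
 * QEMUIOVector from an aligned buffer and verifying it with the helper
 * above.  The function name and buffer size are illustrative assumptions.
 */
static void example_check_qiov(BlockDriverState *bs)
{
    QEMUIOVector qiov;
    void *buf = qemu_blockalign(bs, 8192);

    qemu_iovec_init(&qiov, 1);
    qemu_iovec_add(&qiov, buf, 8192);
    assert(bdrv_qiov_is_aligned(bs, &qiov));    /* base and length aligned */
    qemu_iovec_destroy(&qiov);
    qemu_vfree(buf);
}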
5161 
5162 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5163                                           Error **errp)
5164 {
5165     int64_t bitmap_size;
5166     BdrvDirtyBitmap *bitmap;
5167 
5168     assert((granularity & (granularity - 1)) == 0);
5169 
5170     granularity >>= BDRV_SECTOR_BITS;
5171     assert(granularity);
5172     bitmap_size = bdrv_getlength(bs);
5173     if (bitmap_size < 0) {
5174         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5175         errno = -bitmap_size;
5176         return NULL;
5177     }
5178     bitmap_size >>= BDRV_SECTOR_BITS;
5179     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5180     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5181     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5182     return bitmap;
5183 }
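/* Editorial note (not part of the original source): worked example of the
 * granularity math above.  Passing granularity = 65536 bytes gives
 * 65536 >> BDRV_SECTOR_BITS = 128 sectors per bit, and ffs(128) - 1 = 7,
 * i.e. an HBitmap where each bit covers 2^7 sectors.  The wrapper below
 * is an illustrative assumption.
 */
static BdrvDirtyBitmap *example_create_64k_bitmap(BlockDriverState *bs,
                                                  Error **errp)
{
    return bdrv_create_dirty_bitmap(bs, 65536, errp);
}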
5184 
5185 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5186 {
5187     BdrvDirtyBitmap *bm, *next;
5188     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5189         if (bm == bitmap) {
5190             QLIST_REMOVE(bitmap, list);
5191             hbitmap_free(bitmap->bitmap);
5192             g_free(bitmap);
5193             return;
5194         }
5195     }
5196 }
5197 
5198 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5199 {
5200     BdrvDirtyBitmap *bm;
5201     BlockDirtyInfoList *list = NULL;
5202     BlockDirtyInfoList **plist = &list;
5203 
5204     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5205         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5206         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5207         info->count = bdrv_get_dirty_count(bs, bm);
5208         info->granularity =
5209             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5210         entry->value = info;
5211         *plist = entry;
5212         plist = &entry->next;
5213     }
5214 
5215     return list;
5216 }
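/* Editorial sketch (not part of the original source): consuming the QAPI
 * list built above.  The generated qapi_free_BlockDirtyInfoList() frees
 * every entry together with its BlockDirtyInfo payload.  The function
 * name is an illustrative assumption.
 */
static void example_print_dirty_info(BlockDriverState *bs)
{
    BlockDirtyInfoList *list = bdrv_query_dirty_bitmaps(bs);
    BlockDirtyInfoList *entry;

    for (entry = list; entry; entry = entry->next) {
        printf("%" PRId64 " dirty sectors, granularity %" PRId64 " bytes\n",
               entry->value->count, entry->value->granularity);
    }
    qapi_free_BlockDirtyInfoList(list);
}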
5217 
5218 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5219 {
5220     if (bitmap) {
5221         return hbitmap_get(bitmap->bitmap, sector);
5222     } else {
5223         return 0;
5224     }
5225 }
5226 
5227 void bdrv_dirty_iter_init(BlockDriverState *bs,
5228                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5229 {
5230     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5231 }
5232 
5233 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5234                     int nr_sectors)
5235 {
5236     BdrvDirtyBitmap *bitmap;
5237     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5238         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5239     }
5240 }
5241 
5242 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5243 {
5244     BdrvDirtyBitmap *bitmap;
5245     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5246         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5247     }
5248 }
5249 
5250 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5251 {
5252     return hbitmap_count(bitmap->bitmap);
5253 }
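/* Editorial sketch (not part of the original source): walking a dirty
 * bitmap with the iterator helpers, much as a mirroring job would.
 * hbitmap_iter_next() yields the next dirty sector number, or -1 once the
 * bitmap is exhausted.  The function name is an illustrative assumption.
 */
static void example_walk_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    HBitmapIter hbi;
    int64_t sector;

    bdrv_dirty_iter_init(bs, bitmap, &hbi);
    while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
        printf("sector %" PRId64 " is dirty\n", sector);
    }
}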
5254 
5255 /* Get a reference to bs */
5256 void bdrv_ref(BlockDriverState *bs)
5257 {
5258     bs->refcnt++;
5259 }
5260 
5261 /* Release a previously grabbed reference to bs.
5262  * If the reference count drops to zero after the release, the
5263  * BlockDriverState is deleted. */
5264 void bdrv_unref(BlockDriverState *bs)
5265 {
5266     assert(bs->refcnt > 0);
5267     if (--bs->refcnt == 0) {
5268         bdrv_delete(bs);
5269     }
5270 }
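/* Editorial sketch (not part of the original source): the usual ref/unref
 * discipline around long-running work, so the BlockDriverState cannot be
 * deleted underneath it.  The function name is an illustrative assumption.
 */
static void example_pin_bs(BlockDriverState *bs)
{
    bdrv_ref(bs);
    /* ... work that may yield or re-enter the main loop ... */
    bdrv_unref(bs);     /* may delete bs if this was the last reference */
}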
5271 
5272 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5273 {
5274     assert(bs->in_use != in_use);
5275     bs->in_use = in_use;
5276 }
5277 
5278 int bdrv_in_use(BlockDriverState *bs)
5279 {
5280     return bs->in_use;
5281 }
5282 
5283 void bdrv_iostatus_enable(BlockDriverState *bs)
5284 {
5285     bs->iostatus_enabled = true;
5286     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5287 }
5288 
5289 /* The I/O status is only enabled if the drive explicitly
5290  * enables it _and_ the VM is configured to stop on errors */
5291 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5292 {
5293     return (bs->iostatus_enabled &&
5294            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5295             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5296             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5297 }
5298 
5299 void bdrv_iostatus_disable(BlockDriverState *bs)
5300 {
5301     bs->iostatus_enabled = false;
5302 }
5303 
5304 void bdrv_iostatus_reset(BlockDriverState *bs)
5305 {
5306     if (bdrv_iostatus_is_enabled(bs)) {
5307         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5308         if (bs->job) {
5309             block_job_iostatus_reset(bs->job);
5310         }
5311     }
5312 }
5313 
5314 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5315 {
5316     assert(bdrv_iostatus_is_enabled(bs));
5317     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5318         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5319                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5320     }
5321 }
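/* Editorial sketch (not part of the original source): a typical iostatus
 * life cycle.  Errors latch the status to NOSPACE or FAILED until a later
 * reset returns it to OK.  The function name is an illustrative
 * assumption.
 */
static void example_iostatus_flow(BlockDriverState *bs)
{
    bdrv_iostatus_enable(bs);
    if (bdrv_iostatus_is_enabled(bs)) {
        bdrv_iostatus_set_err(bs, ENOSPC);  /* latches NOSPACE */
    }
    bdrv_iostatus_reset(bs);                /* back to OK */
}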
5322 
5323 void
5324 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5325         enum BlockAcctType type)
5326 {
5327     assert(type < BDRV_MAX_IOTYPE);
5328 
5329     cookie->bytes = bytes;
5330     cookie->start_time_ns = get_clock();
5331     cookie->type = type;
5332 }
5333 
5334 void
5335 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5336 {
5337     assert(cookie->type < BDRV_MAX_IOTYPE);
5338 
5339     bs->nr_bytes[cookie->type] += cookie->bytes;
5340     bs->nr_ops[cookie->type]++;
5341     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5342 }
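/* Editorial sketch (not part of the original source): how a device model
 * brackets a request with the accounting helpers above, charging bytes,
 * ops and latency to the matching BlockAcctType.  The function name is an
 * illustrative assumption.
 */
static int example_accounted_read(BlockDriverState *bs, uint8_t *buf)
{
    BlockAcctCookie cookie;
    int ret;

    bdrv_acct_start(bs, &cookie, 8 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
    ret = bdrv_read(bs, 0, buf, 8);
    if (ret == 0) {
        bdrv_acct_done(bs, &cookie);    /* account completed requests */
    }
    return ret;
}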
5343 
5344 void bdrv_img_create(const char *filename, const char *fmt,
5345                      const char *base_filename, const char *base_fmt,
5346                      char *options, uint64_t img_size, int flags,
5347                      Error **errp, bool quiet)
5348 {
5349     QEMUOptionParameter *param = NULL, *create_options = NULL;
5350     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5351     BlockDriver *drv, *proto_drv;
5352     BlockDriver *backing_drv = NULL;
5353     Error *local_err = NULL;
5354     int ret = 0;
5355 
5356     /* Find driver and parse its options */
5357     drv = bdrv_find_format(fmt);
5358     if (!drv) {
5359         error_setg(errp, "Unknown file format '%s'", fmt);
5360         return;
5361     }
5362 
5363     proto_drv = bdrv_find_protocol(filename, true);
5364     if (!proto_drv) {
5365         error_setg(errp, "Unknown protocol '%s'", filename);
5366         return;
5367     }
5368 
5369     create_options = append_option_parameters(create_options,
5370                                               drv->create_options);
5371     create_options = append_option_parameters(create_options,
5372                                               proto_drv->create_options);
5373 
5374     /* Create parameter list with default values */
5375     param = parse_option_parameters("", create_options, param);
5376 
5377     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5378 
5379     /* Parse -o options */
5380     if (options) {
5381         param = parse_option_parameters(options, create_options, param);
5382         if (param == NULL) {
5383             error_setg(errp, "Invalid options for file format '%s'", fmt);
5384             goto out;
5385         }
5386     }
5387 
5388     if (base_filename) {
5389         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5390                                  base_filename)) {
5391             error_setg(errp, "Backing file not supported for file format '%s'",
5392                        fmt);
5393             goto out;
5394         }
5395     }
5396 
5397     if (base_fmt) {
5398         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5399             error_setg(errp, "Backing file format not supported for file "
5400                              "format '%s'", fmt);
5401             goto out;
5402         }
5403     }
5404 
5405     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5406     if (backing_file && backing_file->value.s) {
5407         if (!strcmp(filename, backing_file->value.s)) {
5408             error_setg(errp, "Trying to create an image with the same "
5409                              "filename as the backing file");
5410             goto out;
5411         }
5412     }
5413 
5414     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5415     if (backing_fmt && backing_fmt->value.s) {
5416         backing_drv = bdrv_find_format(backing_fmt->value.s);
5417         if (!backing_drv) {
5418             error_setg(errp, "Unknown backing file format '%s'",
5419                        backing_fmt->value.s);
5420             goto out;
5421         }
5422     }
5423 
5424     /* The size for the image must always be specified, with one exception:
5425      * if we are using a backing file, we can obtain the size from there. */
5426     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5427     if (size && size->value.n == -1) {
5428         if (backing_file && backing_file->value.s) {
5429             BlockDriverState *bs;
5430             uint64_t size;
5431             char buf[32];
5432             int back_flags;
5433 
5434             /* backing files always opened read-only */
5435             back_flags =
5436                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5437 
5438             bs = NULL;
5439             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5440                             backing_drv, &local_err);
5441             if (ret < 0) {
5442                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5443                                  backing_file->value.s,
5444                                  error_get_pretty(local_err));
5445                 error_free(local_err);
5446                 local_err = NULL;
5447                 goto out;
5448             }
5449             bdrv_get_geometry(bs, &size);
5450             size *= BDRV_SECTOR_SIZE;
5451 
5452             snprintf(buf, sizeof(buf), "%" PRId64, size);
5453             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5454 
5455             bdrv_unref(bs);
5456         } else {
5457             error_setg(errp, "Image creation needs a size parameter");
5458             goto out;
5459         }
5460     }
5461 
5462     if (!quiet) {
5463         printf("Formatting '%s', fmt=%s ", filename, fmt);
5464         print_option_parameters(param);
5465         puts("");
5466     }
5467     ret = bdrv_create(drv, filename, param, &local_err);
5468     if (ret == -EFBIG) {
5469         /* This is generally a better message than whatever the driver would
5470          * deliver (especially because of the cluster_size_hint), since that
5471          * is most probably not much different from "image too large". */
5472         const char *cluster_size_hint = "";
5473         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5474             cluster_size_hint = " (try using a larger cluster size)";
5475         }
5476         error_setg(errp, "The image size is too large for file format '%s'"
5477                    "%s", fmt, cluster_size_hint);
5478         error_free(local_err);
5479         local_err = NULL;
5480     }
5481 
5482 out:
5483     free_option_parameters(create_options);
5484     free_option_parameters(param);
5485 
5486     if (local_err) {
5487         error_propagate(errp, local_err);
5488     }
5489 }
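/* Editorial sketch (not part of the original source): a minimal caller of
 * bdrv_img_create(), creating a 1 GiB qcow2 overlay on top of a raw base
 * image.  The file names and the quiet/flags choices are illustrative
 * assumptions.
 */
static void example_create_overlay(Error **errp)
{
    bdrv_img_create("overlay.qcow2", "qcow2",
                    "base.raw", "raw",
                    NULL,                           /* no extra -o options */
                    (uint64_t)1024 * 1024 * 1024,   /* 1 GiB */
                    0,                              /* default open flags */
                    errp, true);                    /* quiet */
}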
5490 
5491 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5492 {
5493     /* Currently BlockDriverState always uses the main loop AioContext */
5494     return qemu_get_aio_context();
5495 }
5496 
5497 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5498                                     NotifierWithReturn *notifier)
5499 {
5500     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5501 }
5502 
5503 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5504 {
5505     if (bs->drv->bdrv_amend_options == NULL) {
5506         return -ENOTSUP;
5507     }
5508     return bs->drv->bdrv_amend_options(bs, options);
5509 }
5510 
5511 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5512  * of block filters and by bdrv_is_first_non_filter.
5513  * It is used to test if the given bs is the candidate or to recurse further
5514  * down the node graph.
5515  */
5516 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5517                                       BlockDriverState *candidate)
5518 {
5519     /* return false if basic checks fail */
5520     if (!bs || !bs->drv) {
5521         return false;
5522     }
5523 
5524     /* the code reached a non-filter driver -> check if bs is the same as
5525      * the candidate. This is the recursion's termination condition.
5526      */
5527     if (!bs->drv->is_filter) {
5528         return bs == candidate;
5529     }
5530     /* Down this path the driver is a block filter driver */
5531 
5532     /* If the block filter recursion method is defined use it to recurse down
5533      * the node graph.
5534      */
5535     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5536         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5537     }
5538 
5539     /* the driver is a block filter but does not allow recursion ->
5540      * return false */
5541     return false;
5542 }
5543 
5544 /* This function checks if the candidate is the first non-filter bs down its
5545  * bs chain. Since we don't have pointers to parents it explores all bs chains
5546  * from the top. Some filters can choose not to pass down the recursion.
5547  */
5548 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5549 {
5550     BlockDriverState *bs;
5551 
5552     /* walk down the bs forest recursively */
5553     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5554         bool perm;
5555 
5556         /* try to recurse in this top level bs */
5557         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5558 
5559         /* candidate is the first non filter */
5560         if (perm) {
5561             return true;
5562         }
5563     }
5564 
5565     return false;
5566 }
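/* Editorial sketch (not part of the original source): a typical caller of
 * bdrv_is_first_non_filter() is a command that must refuse to operate on
 * a node hidden behind a block filter.  The function name and error text
 * are illustrative assumptions.
 */
static bool example_check_target(BlockDriverState *bs, Error **errp)
{
    if (!bdrv_is_first_non_filter(bs)) {
        error_setg(errp, "Node is hidden behind a block filter");
        return false;
    }
    return true;
}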
5567