xref: /openbmc/qemu/block.c (revision 317fc44ef2bfa87e96adecf035ed136ed9d78c8f)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 
39 #ifdef CONFIG_BSD
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/queue.h>
44 #ifndef __DragonFly__
45 #include <sys/disk.h>
46 #endif
47 #endif
48 
49 #ifdef _WIN32
50 #include <windows.h>
51 #endif
52 
53 struct BdrvDirtyBitmap {
54     HBitmap *bitmap;
55     QLIST_ENTRY(BdrvDirtyBitmap) list;
56 };
57 
58 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59 
60 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63         BlockDriverCompletionFunc *cb, void *opaque);
64 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66         BlockDriverCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68                                          int64_t sector_num, int nb_sectors,
69                                          QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71                                          int64_t sector_num, int nb_sectors,
72                                          QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75     BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78     BdrvRequestFlags flags);
79 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80                                                int64_t sector_num,
81                                                QEMUIOVector *qiov,
82                                                int nb_sectors,
83                                                BdrvRequestFlags flags,
84                                                BlockDriverCompletionFunc *cb,
85                                                void *opaque,
86                                                bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90 
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92     QTAILQ_HEAD_INITIALIZER(bdrv_states);
93 
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95     QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96 
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98     QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
102 
103 #ifdef _WIN32
/*
 * Return 1 when filename starts with an ASCII letter immediately followed by
 * ':' (for example "c:"), otherwise 0.  The second character is only read
 * when the first one is a letter.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char drive = filename[0];
    const int is_letter = (drive >= 'a' && drive <= 'z') ||
                          (drive >= 'A' && drive <= 'Z');

    return is_letter && filename[1] == ':';
}
110 
/*
 * Return 1 when filename is either a bare drive specification (a drive
 * prefix with nothing after the colon) or starts with one of the two device
 * prefixes tested below; return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* exactly "<letter>:" and nothing else */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    /* backslash or forward-slash spelling of the device prefix */
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }

    return 0;
}
121 #endif
122 
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125                         ThrottleConfig *cfg)
126 {
127     int i;
128 
129     throttle_config(&bs->throttle_state, cfg);
130 
131     for (i = 0; i < 2; i++) {
132         qemu_co_enter_next(&bs->throttled_reqs[i]);
133     }
134 }
135 
136 /* this function drain all the throttled IOs */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138 {
139     bool drained = false;
140     bool enabled = bs->io_limits_enabled;
141     int i;
142 
143     bs->io_limits_enabled = false;
144 
145     for (i = 0; i < 2; i++) {
146         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147             drained = true;
148         }
149     }
150 
151     bs->io_limits_enabled = enabled;
152 
153     return drained;
154 }
155 
156 void bdrv_io_limits_disable(BlockDriverState *bs)
157 {
158     bs->io_limits_enabled = false;
159 
160     bdrv_start_throttled_reqs(bs);
161 
162     throttle_destroy(&bs->throttle_state);
163 }
164 
165 static void bdrv_throttle_read_timer_cb(void *opaque)
166 {
167     BlockDriverState *bs = opaque;
168     qemu_co_enter_next(&bs->throttled_reqs[0]);
169 }
170 
171 static void bdrv_throttle_write_timer_cb(void *opaque)
172 {
173     BlockDriverState *bs = opaque;
174     qemu_co_enter_next(&bs->throttled_reqs[1]);
175 }
176 
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
179 {
180     assert(!bs->io_limits_enabled);
181     throttle_init(&bs->throttle_state,
182                   QEMU_CLOCK_VIRTUAL,
183                   bdrv_throttle_read_timer_cb,
184                   bdrv_throttle_write_timer_cb,
185                   bs);
186     bs->io_limits_enabled = true;
187 }
188 
189 /* This function makes an IO wait if needed
190  *
191  * @nb_sectors: the number of sectors of the IO
192  * @is_write:   is the IO a write
193  */
194 static void bdrv_io_limits_intercept(BlockDriverState *bs,
195                                      unsigned int bytes,
196                                      bool is_write)
197 {
198     /* does this io must wait */
199     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
200 
201     /* if must wait or any request of this type throttled queue the IO */
202     if (must_wait ||
203         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
205     }
206 
207     /* the IO will be executed, do the accounting */
208     throttle_account(&bs->throttle_state, is_write, bytes);
209 
210 
211     /* if the next request must wait -> do nothing */
212     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213         return;
214     }
215 
216     /* else queue next request for execution */
217     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
218 }
219 
220 size_t bdrv_opt_mem_align(BlockDriverState *bs)
221 {
222     if (!bs || !bs->drv) {
223         /* 4k should be on the safe side */
224         return 4096;
225     }
226 
227     return bs->bl.opt_mem_alignment;
228 }
229 
/*
 * Check whether path starts with "<protocol>:", i.e. whether a ':' occurs
 * before any path separator.  On Windows, drive specifications are never
 * treated as protocols and '\' counts as a separator too.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    const char *stop_chars = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    const char *stop_chars = ":/";
#endif

    /* index of the first ':' or separator, or of the terminating NUL */
    return path[strcspn(path, stop_chars)] == ':';
}
247 
/*
 * Return 1 if path is absolute, 0 otherwise.  A path is absolute when it
 * starts with '/'; on Windows a leading '\' or any drive specification
 * (including names like "\\.\d:") also qualifies.
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (*path == '\\') {
        return 1;
    }
#endif
    return *path == '/';
}
260 
/*
 * Build in dest (capacity dest_size bytes) the path of filename.
 *
 * An absolute filename is copied as is.  Otherwise it is taken to be
 * relative to base_path: everything in base_path up to and including the
 * later of its first ':' and its last path separator is kept as the prefix
 * and filename is appended to it.  URLs are supported.  The result is
 * truncated to fit dest.  Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' (protocol part), if there is one */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* position just past the last path separator, if there is one */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* the prefix ends at whichever of the two positions comes later */
    prefix_end = (after_sep > after_proto) ? after_sep : after_proto;

    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';

    pstrcat(dest, dest_size, filename);
}
304 
305 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
306 {
307     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
308         pstrcpy(dest, sz, bs->backing_file);
309     } else {
310         path_combine(dest, sz, bs->filename, bs->backing_file);
311     }
312 }
313 
314 void bdrv_register(BlockDriver *bdrv)
315 {
316     /* Block drivers without coroutine functions need emulation */
317     if (!bdrv->bdrv_co_readv) {
318         bdrv->bdrv_co_readv = bdrv_co_readv_em;
319         bdrv->bdrv_co_writev = bdrv_co_writev_em;
320 
321         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
322          * the block driver lacks aio we need to emulate that too.
323          */
324         if (!bdrv->bdrv_aio_readv) {
325             /* add AIO emulation layer */
326             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
327             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
328         }
329     }
330 
331     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
332 }
333 
334 /* create a new block device (by default it is empty) */
335 BlockDriverState *bdrv_new(const char *device_name, Error **errp)
336 {
337     BlockDriverState *bs;
338 
339     if (bdrv_find(device_name)) {
340         error_setg(errp, "Device with id '%s' already exists",
341                    device_name);
342         return NULL;
343     }
344     if (bdrv_find_node(device_name)) {
345         error_setg(errp, "Device with node-name '%s' already exists",
346                    device_name);
347         return NULL;
348     }
349 
350     bs = g_malloc0(sizeof(BlockDriverState));
351     QLIST_INIT(&bs->dirty_bitmaps);
352     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
353     if (device_name[0] != '\0') {
354         QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
355     }
356     bdrv_iostatus_disable(bs);
357     notifier_list_init(&bs->close_notifiers);
358     notifier_with_return_list_init(&bs->before_write_notifiers);
359     qemu_co_queue_init(&bs->throttled_reqs[0]);
360     qemu_co_queue_init(&bs->throttled_reqs[1]);
361     bs->refcnt = 1;
362 
363     return bs;
364 }
365 
366 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
367 {
368     notifier_list_add(&bs->close_notifiers, notify);
369 }
370 
371 BlockDriver *bdrv_find_format(const char *format_name)
372 {
373     BlockDriver *drv1;
374     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
375         if (!strcmp(drv1->format_name, format_name)) {
376             return drv1;
377         }
378     }
379     return NULL;
380 }
381 
382 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
383 {
384     static const char *whitelist_rw[] = {
385         CONFIG_BDRV_RW_WHITELIST
386     };
387     static const char *whitelist_ro[] = {
388         CONFIG_BDRV_RO_WHITELIST
389     };
390     const char **p;
391 
392     if (!whitelist_rw[0] && !whitelist_ro[0]) {
393         return 1;               /* no whitelist, anything goes */
394     }
395 
396     for (p = whitelist_rw; *p; p++) {
397         if (!strcmp(drv->format_name, *p)) {
398             return 1;
399         }
400     }
401     if (read_only) {
402         for (p = whitelist_ro; *p; p++) {
403             if (!strcmp(drv->format_name, *p)) {
404                 return 1;
405             }
406         }
407     }
408     return 0;
409 }
410 
411 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
412                                           bool read_only)
413 {
414     BlockDriver *drv = bdrv_find_format(format_name);
415     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
416 }
417 
418 typedef struct CreateCo {
419     BlockDriver *drv;
420     char *filename;
421     QEMUOptionParameter *options;
422     int ret;
423     Error *err;
424 } CreateCo;
425 
426 static void coroutine_fn bdrv_create_co_entry(void *opaque)
427 {
428     Error *local_err = NULL;
429     int ret;
430 
431     CreateCo *cco = opaque;
432     assert(cco->drv);
433 
434     ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
435     if (local_err) {
436         error_propagate(&cco->err, local_err);
437     }
438     cco->ret = ret;
439 }
440 
441 int bdrv_create(BlockDriver *drv, const char* filename,
442     QEMUOptionParameter *options, Error **errp)
443 {
444     int ret;
445 
446     Coroutine *co;
447     CreateCo cco = {
448         .drv = drv,
449         .filename = g_strdup(filename),
450         .options = options,
451         .ret = NOT_DONE,
452         .err = NULL,
453     };
454 
455     if (!drv->bdrv_create) {
456         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
457         ret = -ENOTSUP;
458         goto out;
459     }
460 
461     if (qemu_in_coroutine()) {
462         /* Fast-path if already in coroutine context */
463         bdrv_create_co_entry(&cco);
464     } else {
465         co = qemu_coroutine_create(bdrv_create_co_entry);
466         qemu_coroutine_enter(co, &cco);
467         while (cco.ret == NOT_DONE) {
468             qemu_aio_wait();
469         }
470     }
471 
472     ret = cco.ret;
473     if (ret < 0) {
474         if (cco.err) {
475             error_propagate(errp, cco.err);
476         } else {
477             error_setg_errno(errp, -ret, "Could not create image");
478         }
479     }
480 
481 out:
482     g_free(cco.filename);
483     return ret;
484 }
485 
486 int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
487                      Error **errp)
488 {
489     BlockDriver *drv;
490     Error *local_err = NULL;
491     int ret;
492 
493     drv = bdrv_find_protocol(filename, true);
494     if (drv == NULL) {
495         error_setg(errp, "Could not find protocol for file '%s'", filename);
496         return -ENOENT;
497     }
498 
499     ret = bdrv_create(drv, filename, options, &local_err);
500     if (local_err) {
501         error_propagate(errp, local_err);
502     }
503     return ret;
504 }
505 
506 int bdrv_refresh_limits(BlockDriverState *bs)
507 {
508     BlockDriver *drv = bs->drv;
509 
510     memset(&bs->bl, 0, sizeof(bs->bl));
511 
512     if (!drv) {
513         return 0;
514     }
515 
516     /* Take some limits from the children as a default */
517     if (bs->file) {
518         bdrv_refresh_limits(bs->file);
519         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
520         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
521     } else {
522         bs->bl.opt_mem_alignment = 512;
523     }
524 
525     if (bs->backing_hd) {
526         bdrv_refresh_limits(bs->backing_hd);
527         bs->bl.opt_transfer_length =
528             MAX(bs->bl.opt_transfer_length,
529                 bs->backing_hd->bl.opt_transfer_length);
530         bs->bl.opt_mem_alignment =
531             MAX(bs->bl.opt_mem_alignment,
532                 bs->backing_hd->bl.opt_mem_alignment);
533     }
534 
535     /* Then let the driver override it */
536     if (drv->bdrv_refresh_limits) {
537         return drv->bdrv_refresh_limits(bs);
538     }
539 
540     return 0;
541 }
542 
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the name of the created file
 * @size:     capacity of @filename in bytes
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /var/tmp when that
 * variable is unset.  On Windows the system temporary directory is used and
 * @size must be at least MAX_PATH (asserted).
 *
 * Return 0 upon success, otherwise a negative errno value (on Windows the
 * negated GetLastError() code).  -EOVERFLOW is returned when @filename is
 * too small to hold the generated name.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int len;
    int saved_errno;
    const char *tmpdir;

    /* a non-positive size would turn into a huge size_t in snprintf() */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }

    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }

    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so remember why close() failed
         * before cleaning up the file */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }

    return 0;
#endif
}
578 
579 /*
580  * Detect host devices. By convention, /dev/cdrom[N] is always
581  * recognized as a host CDROM.
582  */
583 static BlockDriver *find_hdev_driver(const char *filename)
584 {
585     int score_max = 0, score;
586     BlockDriver *drv = NULL, *d;
587 
588     QLIST_FOREACH(d, &bdrv_drivers, list) {
589         if (d->bdrv_probe_device) {
590             score = d->bdrv_probe_device(filename);
591             if (score > score_max) {
592                 score_max = score;
593                 drv = d;
594             }
595         }
596     }
597 
598     return drv;
599 }
600 
601 BlockDriver *bdrv_find_protocol(const char *filename,
602                                 bool allow_protocol_prefix)
603 {
604     BlockDriver *drv1;
605     char protocol[128];
606     int len;
607     const char *p;
608 
609     /* TODO Drivers without bdrv_file_open must be specified explicitly */
610 
611     /*
612      * XXX(hch): we really should not let host device detection
613      * override an explicit protocol specification, but moving this
614      * later breaks access to device names with colons in them.
615      * Thanks to the brain-dead persistent naming schemes on udev-
616      * based Linux systems those actually are quite common.
617      */
618     drv1 = find_hdev_driver(filename);
619     if (drv1) {
620         return drv1;
621     }
622 
623     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
624         return bdrv_find_format("file");
625     }
626 
627     p = strchr(filename, ':');
628     assert(p != NULL);
629     len = p - filename;
630     if (len > sizeof(protocol) - 1)
631         len = sizeof(protocol) - 1;
632     memcpy(protocol, filename, len);
633     protocol[len] = '\0';
634     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
635         if (drv1->protocol_name &&
636             !strcmp(drv1->protocol_name, protocol)) {
637             return drv1;
638         }
639     }
640     return NULL;
641 }
642 
643 static int find_image_format(BlockDriverState *bs, const char *filename,
644                              BlockDriver **pdrv, Error **errp)
645 {
646     int score, score_max;
647     BlockDriver *drv1, *drv;
648     uint8_t buf[2048];
649     int ret = 0;
650 
651     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
652     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
653         drv = bdrv_find_format("raw");
654         if (!drv) {
655             error_setg(errp, "Could not find raw image format");
656             ret = -ENOENT;
657         }
658         *pdrv = drv;
659         return ret;
660     }
661 
662     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
663     if (ret < 0) {
664         error_setg_errno(errp, -ret, "Could not read image for determining its "
665                          "format");
666         *pdrv = NULL;
667         return ret;
668     }
669 
670     score_max = 0;
671     drv = NULL;
672     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
673         if (drv1->bdrv_probe) {
674             score = drv1->bdrv_probe(buf, ret, filename);
675             if (score > score_max) {
676                 score_max = score;
677                 drv = drv1;
678             }
679         }
680     }
681     if (!drv) {
682         error_setg(errp, "Could not determine image format: No compatible "
683                    "driver found");
684         ret = -ENOENT;
685     }
686     *pdrv = drv;
687     return ret;
688 }
689 
690 /**
691  * Set the current 'total_sectors' value
692  */
693 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
694 {
695     BlockDriver *drv = bs->drv;
696 
697     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
698     if (bs->sg)
699         return 0;
700 
701     /* query actual device if possible, otherwise just trust the hint */
702     if (drv->bdrv_getlength) {
703         int64_t length = drv->bdrv_getlength(bs);
704         if (length < 0) {
705             return length;
706         }
707         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
708     }
709 
710     bs->total_sectors = hint;
711     return 0;
712 }
713 
714 /**
715  * Set open flags for a given discard mode
716  *
717  * Return 0 on success, -1 if the discard mode was invalid.
718  */
719 int bdrv_parse_discard_flags(const char *mode, int *flags)
720 {
721     *flags &= ~BDRV_O_UNMAP;
722 
723     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
724         /* do nothing */
725     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
726         *flags |= BDRV_O_UNMAP;
727     } else {
728         return -1;
729     }
730 
731     return 0;
732 }
733 
734 /**
735  * Set open flags for a given cache mode
736  *
737  * Return 0 on success, -1 if the cache mode was invalid.
738  */
739 int bdrv_parse_cache_flags(const char *mode, int *flags)
740 {
741     *flags &= ~BDRV_O_CACHE_MASK;
742 
743     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
744         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
745     } else if (!strcmp(mode, "directsync")) {
746         *flags |= BDRV_O_NOCACHE;
747     } else if (!strcmp(mode, "writeback")) {
748         *flags |= BDRV_O_CACHE_WB;
749     } else if (!strcmp(mode, "unsafe")) {
750         *flags |= BDRV_O_CACHE_WB;
751         *flags |= BDRV_O_NO_FLUSH;
752     } else if (!strcmp(mode, "writethrough")) {
753         /* this is the default */
754     } else {
755         return -1;
756     }
757 
758     return 0;
759 }
760 
761 /**
762  * The copy-on-read flag is actually a reference count so multiple users may
763  * use the feature without worrying about clobbering its previous state.
764  * Copy-on-read stays enabled until all users have called to disable it.
765  */
766 void bdrv_enable_copy_on_read(BlockDriverState *bs)
767 {
768     bs->copy_on_read++;
769 }
770 
771 void bdrv_disable_copy_on_read(BlockDriverState *bs)
772 {
773     assert(bs->copy_on_read > 0);
774     bs->copy_on_read--;
775 }
776 
777 /*
778  * Returns the flags that bs->file should get, based on the given flags for
779  * the parent BDS
780  */
781 static int bdrv_inherited_flags(int flags)
782 {
783     /* Enable protocol handling, disable format probing for bs->file */
784     flags |= BDRV_O_PROTOCOL;
785 
786     /* Our block drivers take care to send flushes and respect unmap policy,
787      * so we can enable both unconditionally on lower layers. */
788     flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
789 
790     /* The backing file of a temporary snapshot is read-only */
791     if (flags & BDRV_O_SNAPSHOT) {
792         flags &= ~BDRV_O_RDWR;
793     }
794 
795     /* Clear flags that only apply to the top layer */
796     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
797 
798     return flags;
799 }
800 
801 /*
802  * Returns the flags that bs->backing_hd should get, based on the given flags
803  * for the parent BDS
804  */
805 static int bdrv_backing_flags(int flags)
806 {
807     /* backing files always opened read-only */
808     flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
809 
810     /* snapshot=on is handled on the top layer */
811     flags &= ~BDRV_O_SNAPSHOT;
812 
813     return flags;
814 }
815 
816 static int bdrv_open_flags(BlockDriverState *bs, int flags)
817 {
818     int open_flags = flags | BDRV_O_CACHE_WB;
819 
820     /* The backing file of a temporary snapshot is read-only */
821     if (flags & BDRV_O_SNAPSHOT) {
822         open_flags &= ~BDRV_O_RDWR;
823     }
824 
825     /*
826      * Clear flags that are internal to the block layer before opening the
827      * image.
828      */
829     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
830 
831     /*
832      * Snapshots should be writable.
833      */
834     if (bs->is_temporary) {
835         open_flags |= BDRV_O_RDWR;
836     }
837 
838     return open_flags;
839 }
840 
841 static void bdrv_assign_node_name(BlockDriverState *bs,
842                                   const char *node_name,
843                                   Error **errp)
844 {
845     if (!node_name) {
846         return;
847     }
848 
849     /* empty string node name is invalid */
850     if (node_name[0] == '\0') {
851         error_setg(errp, "Empty node name");
852         return;
853     }
854 
855     /* takes care of avoiding namespaces collisions */
856     if (bdrv_find(node_name)) {
857         error_setg(errp, "node-name=%s is conflicting with a device id",
858                    node_name);
859         return;
860     }
861 
862     /* takes care of avoiding duplicates node names */
863     if (bdrv_find_node(node_name)) {
864         error_setg(errp, "Duplicate node name");
865         return;
866     }
867 
868     /* copy node name into the bs and insert it into the graph list */
869     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
870     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
871 }
872 
873 /*
874  * Common part for opening disk images and files
875  *
876  * Removes all processed options from *options.
877  */
878 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
879     QDict *options, int flags, BlockDriver *drv, Error **errp)
880 {
881     int ret, open_flags;
882     const char *filename;
883     const char *node_name = NULL;
884     Error *local_err = NULL;
885 
886     assert(drv != NULL);
887     assert(bs->file == NULL);
888     assert(options != NULL && bs->options != options);
889 
890     if (file != NULL) {
891         filename = file->filename;
892     } else {
893         filename = qdict_get_try_str(options, "filename");
894     }
895 
896     if (drv->bdrv_needs_filename && !filename) {
897         error_setg(errp, "The '%s' block driver requires a file name",
898                    drv->format_name);
899         return -EINVAL;
900     }
901 
902     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
903 
904     node_name = qdict_get_try_str(options, "node-name");
905     bdrv_assign_node_name(bs, node_name, &local_err);
906     if (local_err) {
907         error_propagate(errp, local_err);
908         return -EINVAL;
909     }
910     qdict_del(options, "node-name");
911 
912     /* bdrv_open() with directly using a protocol as drv. This layer is already
913      * opened, so assign it to bs (while file becomes a closed BlockDriverState)
914      * and return immediately. */
915     if (file != NULL && drv->bdrv_file_open) {
916         bdrv_swap(file, bs);
917         return 0;
918     }
919 
920     bs->open_flags = flags;
921     bs->guest_block_size = 512;
922     bs->request_alignment = 512;
923     bs->zero_beyond_eof = true;
924     open_flags = bdrv_open_flags(bs, flags);
925     bs->read_only = !(open_flags & BDRV_O_RDWR);
926 
927     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
928         error_setg(errp,
929                    !bs->read_only && bdrv_is_whitelisted(drv, true)
930                         ? "Driver '%s' can only be used for read-only devices"
931                         : "Driver '%s' is not whitelisted",
932                    drv->format_name);
933         return -ENOTSUP;
934     }
935 
936     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
937     if (flags & BDRV_O_COPY_ON_READ) {
938         if (!bs->read_only) {
939             bdrv_enable_copy_on_read(bs);
940         } else {
941             error_setg(errp, "Can't use copy-on-read on read-only device");
942             return -EINVAL;
943         }
944     }
945 
946     if (filename != NULL) {
947         pstrcpy(bs->filename, sizeof(bs->filename), filename);
948     } else {
949         bs->filename[0] = '\0';
950     }
951 
952     bs->drv = drv;
953     bs->opaque = g_malloc0(drv->instance_size);
954 
955     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
956 
957     /* Open the image, either directly or using a protocol */
958     if (drv->bdrv_file_open) {
959         assert(file == NULL);
960         assert(!drv->bdrv_needs_filename || filename != NULL);
961         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
962     } else {
963         if (file == NULL) {
964             error_setg(errp, "Can't use '%s' as a block driver for the "
965                        "protocol level", drv->format_name);
966             ret = -EINVAL;
967             goto free_and_fail;
968         }
969         bs->file = file;
970         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
971     }
972 
973     if (ret < 0) {
974         if (local_err) {
975             error_propagate(errp, local_err);
976         } else if (bs->filename[0]) {
977             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
978         } else {
979             error_setg_errno(errp, -ret, "Could not open image");
980         }
981         goto free_and_fail;
982     }
983 
984     ret = refresh_total_sectors(bs, bs->total_sectors);
985     if (ret < 0) {
986         error_setg_errno(errp, -ret, "Could not refresh total sector count");
987         goto free_and_fail;
988     }
989 
990     bdrv_refresh_limits(bs);
991     assert(bdrv_opt_mem_align(bs) != 0);
992     assert((bs->request_alignment != 0) || bs->sg);
993 
994 #ifndef _WIN32
995     if (bs->is_temporary) {
996         assert(bs->filename[0] != '\0');
997         unlink(bs->filename);
998     }
999 #endif
1000     return 0;
1001 
1002 free_and_fail:
1003     bs->file = NULL;
1004     g_free(bs->opaque);
1005     bs->opaque = NULL;
1006     bs->drv = NULL;
1007     return ret;
1008 }
1009 
1010 /*
1011  * Opens a file using a protocol (file, host_device, nbd, ...)
1012  *
1013  * options is an indirect pointer to a QDict of options to pass to the block
1014  * drivers, or pointer to NULL for an empty set of options. If this function
1015  * takes ownership of the QDict reference, it will set *options to NULL;
1016  * otherwise, it will contain unused/unrecognized options after this function
1017  * returns. Then, the caller is responsible for freeing it. If it intends to
1018  * reuse the QDict, QINCREF() should be called beforehand.
1019  */
1020 static int bdrv_file_open(BlockDriverState *bs, const char *filename,
1021                           QDict **options, int flags, Error **errp)
1022 {
1023     BlockDriver *drv;
1024     const char *drvname;
1025     bool parse_filename = false;
1026     Error *local_err = NULL;
1027     int ret;
1028 
1029     /* Fetch the file name from the options QDict if necessary */
1030     if (!filename) {
1031         filename = qdict_get_try_str(*options, "filename");
1032     } else if (filename && !qdict_haskey(*options, "filename")) {
1033         qdict_put(*options, "filename", qstring_from_str(filename));
1034         parse_filename = true;
1035     } else {
1036         error_setg(errp, "Can't specify 'file' and 'filename' options at the "
1037                    "same time");
1038         ret = -EINVAL;
1039         goto fail;
1040     }
1041 
1042     /* Find the right block driver */
1043     drvname = qdict_get_try_str(*options, "driver");
1044     if (drvname) {
1045         drv = bdrv_find_format(drvname);
1046         if (!drv) {
1047             error_setg(errp, "Unknown driver '%s'", drvname);
1048         }
1049         qdict_del(*options, "driver");
1050     } else if (filename) {
1051         drv = bdrv_find_protocol(filename, parse_filename);
1052         if (!drv) {
1053             error_setg(errp, "Unknown protocol");
1054         }
1055     } else {
1056         error_setg(errp, "Must specify either driver or file");
1057         drv = NULL;
1058     }
1059 
1060     if (!drv) {
1061         /* errp has been set already */
1062         ret = -ENOENT;
1063         goto fail;
1064     }
1065 
1066     /* Parse the filename and open it */
1067     if (drv->bdrv_parse_filename && parse_filename) {
1068         drv->bdrv_parse_filename(filename, *options, &local_err);
1069         if (local_err) {
1070             error_propagate(errp, local_err);
1071             ret = -EINVAL;
1072             goto fail;
1073         }
1074 
1075         if (!drv->bdrv_needs_filename) {
1076             qdict_del(*options, "filename");
1077         } else {
1078             filename = qdict_get_str(*options, "filename");
1079         }
1080     }
1081 
1082     if (!drv->bdrv_file_open) {
1083         ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
1084         *options = NULL;
1085     } else {
1086         ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
1087     }
1088     if (ret < 0) {
1089         error_propagate(errp, local_err);
1090         goto fail;
1091     }
1092 
1093     bs->growable = 1;
1094     return 0;
1095 
1096 fail:
1097     return ret;
1098 }
1099 
1100 /*
1101  * Opens the backing file for a BlockDriverState if not yet open
1102  *
1103  * options is a QDict of options to pass to the block drivers, or NULL for an
1104  * empty set of options. The reference to the QDict is transferred to this
1105  * function (even on failure), so if the caller intends to reuse the dictionary,
1106  * it needs to use QINCREF() before calling bdrv_file_open.
1107  */
1108 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1109 {
1110     char *backing_filename = g_malloc0(PATH_MAX);
1111     int ret = 0;
1112     BlockDriver *back_drv = NULL;
1113     Error *local_err = NULL;
1114 
1115     if (bs->backing_hd != NULL) {
1116         QDECREF(options);
1117         goto free_exit;
1118     }
1119 
1120     /* NULL means an empty set of options */
1121     if (options == NULL) {
1122         options = qdict_new();
1123     }
1124 
1125     bs->open_flags &= ~BDRV_O_NO_BACKING;
1126     if (qdict_haskey(options, "file.filename")) {
1127         backing_filename[0] = '\0';
1128     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1129         QDECREF(options);
1130         goto free_exit;
1131     } else {
1132         bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
1133     }
1134 
1135     if (bs->backing_format[0] != '\0') {
1136         back_drv = bdrv_find_format(bs->backing_format);
1137     }
1138 
1139     assert(bs->backing_hd == NULL);
1140     ret = bdrv_open(&bs->backing_hd,
1141                     *backing_filename ? backing_filename : NULL, NULL, options,
1142                     bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
1143     if (ret < 0) {
1144         bs->backing_hd = NULL;
1145         bs->open_flags |= BDRV_O_NO_BACKING;
1146         error_setg(errp, "Could not open backing file: %s",
1147                    error_get_pretty(local_err));
1148         error_free(local_err);
1149         goto free_exit;
1150     }
1151 
1152     if (bs->backing_hd->file) {
1153         pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1154                 bs->backing_hd->file->filename);
1155     }
1156 
1157     /* Recalculate the BlockLimits with the backing file */
1158     bdrv_refresh_limits(bs);
1159 
1160 free_exit:
1161     g_free(backing_filename);
1162     return ret;
1163 }
1164 
1165 /*
1166  * Opens a disk image whose options are given as BlockdevRef in another block
1167  * device's options.
1168  *
1169  * If allow_none is true, no image will be opened if filename is false and no
1170  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1171  *
1172  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1173  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1174  * itself, all options starting with "${bdref_key}." are considered part of the
1175  * BlockdevRef.
1176  *
1177  * The BlockdevRef will be removed from the options QDict.
1178  *
1179  * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1180  */
1181 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1182                     QDict *options, const char *bdref_key, int flags,
1183                     bool allow_none, Error **errp)
1184 {
1185     QDict *image_options;
1186     int ret;
1187     char *bdref_key_dot;
1188     const char *reference;
1189 
1190     assert(pbs);
1191     assert(*pbs == NULL);
1192 
1193     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1194     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1195     g_free(bdref_key_dot);
1196 
1197     reference = qdict_get_try_str(options, bdref_key);
1198     if (!filename && !reference && !qdict_size(image_options)) {
1199         if (allow_none) {
1200             ret = 0;
1201         } else {
1202             error_setg(errp, "A block device must be specified for \"%s\"",
1203                        bdref_key);
1204             ret = -EINVAL;
1205         }
1206         goto done;
1207     }
1208 
1209     ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1210 
1211 done:
1212     qdict_del(options, bdref_key);
1213     return ret;
1214 }
1215 
1216 void bdrv_append_temp_snapshot(BlockDriverState *bs, Error **errp)
1217 {
1218     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1219     char *tmp_filename = g_malloc0(PATH_MAX + 1);
1220     int64_t total_size;
1221     BlockDriver *bdrv_qcow2;
1222     QEMUOptionParameter *create_options;
1223     QDict *snapshot_options;
1224     BlockDriverState *bs_snapshot;
1225     Error *local_err;
1226     int ret;
1227 
1228     /* if snapshot, we create a temporary backing file and open it
1229        instead of opening 'filename' directly */
1230 
1231     /* Get the required size from the image */
1232     total_size = bdrv_getlength(bs);
1233     if (total_size < 0) {
1234         error_setg_errno(errp, -total_size, "Could not get image size");
1235         goto out;
1236     }
1237     total_size &= BDRV_SECTOR_MASK;
1238 
1239     /* Create the temporary image */
1240     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1241     if (ret < 0) {
1242         error_setg_errno(errp, -ret, "Could not get temporary filename");
1243         goto out;
1244     }
1245 
1246     bdrv_qcow2 = bdrv_find_format("qcow2");
1247     create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1248                                              NULL);
1249 
1250     set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1251 
1252     ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1253     free_option_parameters(create_options);
1254     if (ret < 0) {
1255         error_setg_errno(errp, -ret, "Could not create temporary overlay "
1256                          "'%s': %s", tmp_filename,
1257                          error_get_pretty(local_err));
1258         error_free(local_err);
1259         goto out;
1260     }
1261 
1262     /* Prepare a new options QDict for the temporary file */
1263     snapshot_options = qdict_new();
1264     qdict_put(snapshot_options, "file.driver",
1265               qstring_from_str("file"));
1266     qdict_put(snapshot_options, "file.filename",
1267               qstring_from_str(tmp_filename));
1268 
1269     bs_snapshot = bdrv_new("", &error_abort);
1270     bs_snapshot->is_temporary = 1;
1271 
1272     ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1273                     bs->open_flags & ~BDRV_O_SNAPSHOT, bdrv_qcow2, &local_err);
1274     if (ret < 0) {
1275         error_propagate(errp, local_err);
1276         goto out;
1277     }
1278 
1279     bdrv_append(bs_snapshot, bs);
1280 
1281 out:
1282     g_free(tmp_filename);
1283 }
1284 
1285 /*
1286  * Opens a disk image (raw, qcow2, vmdk, ...)
1287  *
1288  * options is a QDict of options to pass to the block drivers, or NULL for an
1289  * empty set of options. The reference to the QDict belongs to the block layer
1290  * after the call (even on failure), so if the caller intends to reuse the
1291  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1292  *
1293  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1294  * If it is not NULL, the referenced BDS will be reused.
1295  *
1296  * The reference parameter may be used to specify an existing block device which
1297  * should be opened. If specified, neither options nor a filename may be given,
1298  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1299  */
1300 int bdrv_open(BlockDriverState **pbs, const char *filename,
1301               const char *reference, QDict *options, int flags,
1302               BlockDriver *drv, Error **errp)
1303 {
1304     int ret;
1305     BlockDriverState *file = NULL, *bs;
1306     const char *drvname;
1307     Error *local_err = NULL;
1308 
1309     assert(pbs);
1310 
1311     if (reference) {
1312         bool options_non_empty = options ? qdict_size(options) : false;
1313         QDECREF(options);
1314 
1315         if (*pbs) {
1316             error_setg(errp, "Cannot reuse an existing BDS when referencing "
1317                        "another block device");
1318             return -EINVAL;
1319         }
1320 
1321         if (filename || options_non_empty) {
1322             error_setg(errp, "Cannot reference an existing block device with "
1323                        "additional options or a new filename");
1324             return -EINVAL;
1325         }
1326 
1327         bs = bdrv_lookup_bs(reference, reference, errp);
1328         if (!bs) {
1329             return -ENODEV;
1330         }
1331         bdrv_ref(bs);
1332         *pbs = bs;
1333         return 0;
1334     }
1335 
1336     if (*pbs) {
1337         bs = *pbs;
1338     } else {
1339         bs = bdrv_new("", &error_abort);
1340     }
1341 
1342     /* NULL means an empty set of options */
1343     if (options == NULL) {
1344         options = qdict_new();
1345     }
1346 
1347     bs->options = options;
1348     options = qdict_clone_shallow(options);
1349 
1350     if (flags & BDRV_O_PROTOCOL) {
1351         assert(!drv);
1352         ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
1353                              &local_err);
1354         if (!ret) {
1355             drv = bs->drv;
1356             goto done;
1357         } else if (bs->drv) {
1358             goto close_and_fail;
1359         } else {
1360             goto fail;
1361         }
1362     }
1363 
1364     /* Open image file without format layer */
1365     if (flags & BDRV_O_RDWR) {
1366         flags |= BDRV_O_ALLOW_RDWR;
1367     }
1368 
1369     assert(file == NULL);
1370     ret = bdrv_open_image(&file, filename, options, "file",
1371                           bdrv_inherited_flags(flags),
1372                           true, &local_err);
1373     if (ret < 0) {
1374         goto unlink_and_fail;
1375     }
1376 
1377     /* Find the right image format driver */
1378     drvname = qdict_get_try_str(options, "driver");
1379     if (drvname) {
1380         drv = bdrv_find_format(drvname);
1381         qdict_del(options, "driver");
1382         if (!drv) {
1383             error_setg(errp, "Invalid driver: '%s'", drvname);
1384             ret = -EINVAL;
1385             goto unlink_and_fail;
1386         }
1387     }
1388 
1389     if (!drv) {
1390         if (file) {
1391             ret = find_image_format(file, filename, &drv, &local_err);
1392         } else {
1393             error_setg(errp, "Must specify either driver or file");
1394             ret = -EINVAL;
1395             goto unlink_and_fail;
1396         }
1397     }
1398 
1399     if (!drv) {
1400         goto unlink_and_fail;
1401     }
1402 
1403     /* Open the image */
1404     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1405     if (ret < 0) {
1406         goto unlink_and_fail;
1407     }
1408 
1409     if (file && (bs->file != file)) {
1410         bdrv_unref(file);
1411         file = NULL;
1412     }
1413 
1414     /* If there is a backing file, use it */
1415     if ((flags & BDRV_O_NO_BACKING) == 0) {
1416         QDict *backing_options;
1417 
1418         qdict_extract_subqdict(options, &backing_options, "backing.");
1419         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1420         if (ret < 0) {
1421             goto close_and_fail;
1422         }
1423     }
1424 
1425     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1426      * temporary snapshot afterwards. */
1427     if (flags & BDRV_O_SNAPSHOT) {
1428         bdrv_append_temp_snapshot(bs, &local_err);
1429         if (local_err) {
1430             error_propagate(errp, local_err);
1431             goto close_and_fail;
1432         }
1433     }
1434 
1435 
1436 done:
1437     /* Check if any unknown options were used */
1438     if (options && (qdict_size(options) != 0)) {
1439         const QDictEntry *entry = qdict_first(options);
1440         if (flags & BDRV_O_PROTOCOL) {
1441             error_setg(errp, "Block protocol '%s' doesn't support the option "
1442                        "'%s'", drv->format_name, entry->key);
1443         } else {
1444             error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1445                        "support the option '%s'", drv->format_name,
1446                        bs->device_name, entry->key);
1447         }
1448 
1449         ret = -EINVAL;
1450         goto close_and_fail;
1451     }
1452 
1453     if (!bdrv_key_required(bs)) {
1454         bdrv_dev_change_media_cb(bs, true);
1455     } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1456                && !runstate_check(RUN_STATE_INMIGRATE)
1457                && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1458         error_setg(errp,
1459                    "Guest must be stopped for opening of encrypted image");
1460         ret = -EBUSY;
1461         goto close_and_fail;
1462     }
1463 
1464     QDECREF(options);
1465     *pbs = bs;
1466     return 0;
1467 
1468 unlink_and_fail:
1469     if (file != NULL) {
1470         bdrv_unref(file);
1471     }
1472     if (bs->is_temporary) {
1473         unlink(filename);
1474     }
1475 fail:
1476     QDECREF(bs->options);
1477     QDECREF(options);
1478     bs->options = NULL;
1479     if (!*pbs) {
1480         /* If *pbs is NULL, a new BDS has been created in this function and
1481            needs to be freed now. Otherwise, it does not need to be closed,
1482            since it has not really been opened yet. */
1483         bdrv_unref(bs);
1484     }
1485     if (local_err) {
1486         error_propagate(errp, local_err);
1487     }
1488     return ret;
1489 
1490 close_and_fail:
1491     /* See fail path, but now the BDS has to be always closed */
1492     if (*pbs) {
1493         bdrv_close(bs);
1494     } else {
1495         bdrv_unref(bs);
1496     }
1497     QDECREF(options);
1498     if (local_err) {
1499         error_propagate(errp, local_err);
1500     }
1501     return ret;
1502 }
1503 
/* One element of a BlockReopenQueue: the pending reopen of a single BDS. */
typedef struct BlockReopenQueueEntry {
     /* set by bdrv_reopen_multiple() once bdrv_reopen_prepare() succeeded
      * for this entry; decides whether an abort is needed on failure */
     bool prepared;
     /* the BDS to reopen and its new flags, filled in by bdrv_reopen_queue() */
     BDRVReopenState state;
     /* linkage within the queue */
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1509 
1510 /*
1511  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1512  * reopen of multiple devices.
1513  *
1514  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1515  * already performed, or alternatively may be NULL a new BlockReopenQueue will
1516  * be created and initialized. This newly created BlockReopenQueue should be
1517  * passed back in for subsequent calls that are intended to be of the same
1518  * atomic 'set'.
1519  *
1520  * bs is the BlockDriverState to add to the reopen queue.
1521  *
1522  * flags contains the open flags for the associated bs
1523  *
1524  * returns a pointer to bs_queue, which is either the newly allocated
1525  * bs_queue, or the existing bs_queue being used.
1526  *
1527  */
1528 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1529                                     BlockDriverState *bs, int flags)
1530 {
1531     assert(bs != NULL);
1532 
1533     BlockReopenQueueEntry *bs_entry;
1534     if (bs_queue == NULL) {
1535         bs_queue = g_new0(BlockReopenQueue, 1);
1536         QSIMPLEQ_INIT(bs_queue);
1537     }
1538 
1539     if (bs->file) {
1540         bdrv_reopen_queue(bs_queue, bs->file, flags);
1541     }
1542 
1543     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1544     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1545 
1546     bs_entry->state.bs = bs;
1547     bs_entry->state.flags = flags;
1548 
1549     return bs_queue;
1550 }
1551 
1552 /*
1553  * Reopen multiple BlockDriverStates atomically & transactionally.
1554  *
1555  * The queue passed in (bs_queue) must have been built up previous
1556  * via bdrv_reopen_queue().
1557  *
1558  * Reopens all BDS specified in the queue, with the appropriate
1559  * flags.  All devices are prepared for reopen, and failure of any
1560  * device will cause all device changes to be abandonded, and intermediate
1561  * data cleaned up.
1562  *
1563  * If all devices prepare successfully, then the changes are committed
1564  * to all devices.
1565  *
1566  */
1567 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1568 {
1569     int ret = -1;
1570     BlockReopenQueueEntry *bs_entry, *next;
1571     Error *local_err = NULL;
1572 
1573     assert(bs_queue != NULL);
1574 
1575     bdrv_drain_all();
1576 
1577     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1578         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1579             error_propagate(errp, local_err);
1580             goto cleanup;
1581         }
1582         bs_entry->prepared = true;
1583     }
1584 
1585     /* If we reach this point, we have success and just need to apply the
1586      * changes
1587      */
1588     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1589         bdrv_reopen_commit(&bs_entry->state);
1590     }
1591 
1592     ret = 0;
1593 
1594 cleanup:
1595     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1596         if (ret && bs_entry->prepared) {
1597             bdrv_reopen_abort(&bs_entry->state);
1598         }
1599         g_free(bs_entry);
1600     }
1601     g_free(bs_queue);
1602     return ret;
1603 }
1604 
1605 
1606 /* Reopen a single BlockDriverState with the specified flags. */
1607 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1608 {
1609     int ret = -1;
1610     Error *local_err = NULL;
1611     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1612 
1613     ret = bdrv_reopen_multiple(queue, &local_err);
1614     if (local_err != NULL) {
1615         error_propagate(errp, local_err);
1616     }
1617     return ret;
1618 }
1619 
1620 
1621 /*
1622  * Prepares a BlockDriverState for reopen. All changes are staged in the
1623  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1624  * the block driver layer .bdrv_reopen_prepare()
1625  *
1626  * bs is the BlockDriverState to reopen
1627  * flags are the new open flags
1628  * queue is the reopen queue
1629  *
1630  * Returns 0 on success, non-zero on error.  On error errp will be set
1631  * as well.
1632  *
1633  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1634  * It is the responsibility of the caller to then call the abort() or
1635  * commit() for any other BDS that have been left in a prepare() state
1636  *
1637  */
1638 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1639                         Error **errp)
1640 {
1641     int ret = -1;
1642     Error *local_err = NULL;
1643     BlockDriver *drv;
1644 
1645     assert(reopen_state != NULL);
1646     assert(reopen_state->bs->drv != NULL);
1647     drv = reopen_state->bs->drv;
1648 
1649     /* if we are to stay read-only, do not allow permission change
1650      * to r/w */
1651     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1652         reopen_state->flags & BDRV_O_RDWR) {
1653         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1654                   reopen_state->bs->device_name);
1655         goto error;
1656     }
1657 
1658 
1659     ret = bdrv_flush(reopen_state->bs);
1660     if (ret) {
1661         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1662                   strerror(-ret));
1663         goto error;
1664     }
1665 
1666     if (drv->bdrv_reopen_prepare) {
1667         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1668         if (ret) {
1669             if (local_err != NULL) {
1670                 error_propagate(errp, local_err);
1671             } else {
1672                 error_setg(errp, "failed while preparing to reopen image '%s'",
1673                            reopen_state->bs->filename);
1674             }
1675             goto error;
1676         }
1677     } else {
1678         /* It is currently mandatory to have a bdrv_reopen_prepare()
1679          * handler for each supported drv. */
1680         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1681                   drv->format_name, reopen_state->bs->device_name,
1682                  "reopening of file");
1683         ret = -1;
1684         goto error;
1685     }
1686 
1687     ret = 0;
1688 
1689 error:
1690     return ret;
1691 }
1692 
1693 /*
1694  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1695  * makes them final by swapping the staging BlockDriverState contents into
1696  * the active BlockDriverState contents.
1697  */
1698 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1699 {
1700     BlockDriver *drv;
1701 
1702     assert(reopen_state != NULL);
1703     drv = reopen_state->bs->drv;
1704     assert(drv != NULL);
1705 
1706     /* If there are any driver level actions to take */
1707     if (drv->bdrv_reopen_commit) {
1708         drv->bdrv_reopen_commit(reopen_state);
1709     }
1710 
1711     /* set BDS specific flags now */
1712     reopen_state->bs->open_flags         = reopen_state->flags;
1713     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1714                                               BDRV_O_CACHE_WB);
1715     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1716 
1717     bdrv_refresh_limits(reopen_state->bs);
1718 }
1719 
1720 /*
1721  * Abort the reopen, and delete and free the staged changes in
1722  * reopen_state
1723  */
1724 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1725 {
1726     BlockDriver *drv;
1727 
1728     assert(reopen_state != NULL);
1729     drv = reopen_state->bs->drv;
1730     assert(drv != NULL);
1731 
1732     if (drv->bdrv_reopen_abort) {
1733         drv->bdrv_reopen_abort(reopen_state);
1734     }
1735 }
1736 
1737 
/*
 * Closes bs: quiesces I/O, notifies close notifiers, lets the driver close
 * the image and resets the per-image state of the BDS.  The BDS itself is
 * not freed.
 */
void bdrv_close(BlockDriverState *bs)
{
    /* A block job still attached to bs is cancelled before anything else */
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    /* Only a BDS with a driver attached has image state to tear down */
    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* On Windows the temporary file is removed here, after the driver
         * has closed it */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        /* Reset everything that described the image that was just closed */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        /* Drop our reference to the underlying file BDS, if there is one */
        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* Turn off I/O throttling if it was enabled for this BDS */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1787 
1788 void bdrv_close_all(void)
1789 {
1790     BlockDriverState *bs;
1791 
1792     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1793         bdrv_close(bs);
1794     }
1795 }
1796 
1797 /* Check if any requests are in-flight (including throttled requests) */
1798 static bool bdrv_requests_pending(BlockDriverState *bs)
1799 {
1800     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1801         return true;
1802     }
1803     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1804         return true;
1805     }
1806     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1807         return true;
1808     }
1809     if (bs->file && bdrv_requests_pending(bs->file)) {
1810         return true;
1811     }
1812     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1813         return true;
1814     }
1815     return false;
1816 }
1817 
1818 static bool bdrv_requests_pending_all(void)
1819 {
1820     BlockDriverState *bs;
1821     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1822         if (bdrv_requests_pending(bs)) {
1823             return true;
1824         }
1825     }
1826     return false;
1827 }
1828 
1829 /*
1830  * Wait for pending requests to complete across all BlockDriverStates
1831  *
1832  * This function does not flush data to disk, use bdrv_flush_all() for that
1833  * after calling this function.
1834  *
1835  * Note that completion of an asynchronous I/O operation can trigger any
1836  * number of other I/O operations on other devices---for example a coroutine
1837  * can be arbitrarily complex and a constant flow of I/O can come until the
1838  * coroutine is complete.  Because of this, it is not possible to have a
1839  * function to drain a single device's I/O queue.
1840  */
1841 void bdrv_drain_all(void)
1842 {
1843     /* Always run first iteration so any pending completion BHs run */
1844     bool busy = true;
1845     BlockDriverState *bs;
1846 
1847     while (busy) {
1848         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1849             bdrv_start_throttled_reqs(bs);
1850         }
1851 
1852         busy = bdrv_requests_pending_all();
1853         busy |= aio_poll(qemu_get_aio_context(), busy);
1854     }
1855 }
1856 
1857 /* make a BlockDriverState anonymous by removing from bdrv_state and
1858  * graph_bdrv_state list.
1859    Also, NULL terminate the device_name to prevent double remove */
1860 void bdrv_make_anon(BlockDriverState *bs)
1861 {
1862     if (bs->device_name[0] != '\0') {
1863         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1864     }
1865     bs->device_name[0] = '\0';
1866     if (bs->node_name[0] != '\0') {
1867         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1868     }
1869     bs->node_name[0] = '\0';
1870 }
1871 
1872 static void bdrv_rebind(BlockDriverState *bs)
1873 {
1874     if (bs->drv && bs->drv->bdrv_rebind) {
1875         bs->drv->bdrv_rebind(bs);
1876     }
1877 }
1878 
1879 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1880                                      BlockDriverState *bs_src)
1881 {
1882     /* move some fields that need to stay attached to the device */
1883     bs_dest->open_flags         = bs_src->open_flags;
1884 
1885     /* dev info */
1886     bs_dest->dev_ops            = bs_src->dev_ops;
1887     bs_dest->dev_opaque         = bs_src->dev_opaque;
1888     bs_dest->dev                = bs_src->dev;
1889     bs_dest->guest_block_size   = bs_src->guest_block_size;
1890     bs_dest->copy_on_read       = bs_src->copy_on_read;
1891 
1892     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1893 
1894     /* i/o throttled req */
1895     memcpy(&bs_dest->throttle_state,
1896            &bs_src->throttle_state,
1897            sizeof(ThrottleState));
1898     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1899     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1900     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1901 
1902     /* r/w error */
1903     bs_dest->on_read_error      = bs_src->on_read_error;
1904     bs_dest->on_write_error     = bs_src->on_write_error;
1905 
1906     /* i/o status */
1907     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1908     bs_dest->iostatus           = bs_src->iostatus;
1909 
1910     /* dirty bitmap */
1911     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1912 
1913     /* reference count */
1914     bs_dest->refcnt             = bs_src->refcnt;
1915 
1916     /* job */
1917     bs_dest->in_use             = bs_src->in_use;
1918     bs_dest->job                = bs_src->job;
1919 
1920     /* keep the same entry in bdrv_states */
1921     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1922             bs_src->device_name);
1923     bs_dest->device_list = bs_src->device_list;
1924 }
1925 
1926 /*
1927  * Swap bs contents for two image chains while they are live,
1928  * while keeping required fields on the BlockDriverState that is
1929  * actually attached to a device.
1930  *
1931  * This will modify the BlockDriverState fields, and swap contents
1932  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1933  *
1934  * bs_new is required to be anonymous.
1935  *
1936  * This function does not create any image files.
1937  */
1938 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1939 {
1940     BlockDriverState tmp;
1941 
1942     /* The code needs to swap the node_name but simply swapping node_list won't
1943      * work so first remove the nodes from the graph list, do the swap then
1944      * insert them back if needed.
1945      */
1946     if (bs_new->node_name[0] != '\0') {
1947         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1948     }
1949     if (bs_old->node_name[0] != '\0') {
1950         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1951     }
1952 
1953     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1954     assert(bs_new->device_name[0] == '\0');
1955     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1956     assert(bs_new->job == NULL);
1957     assert(bs_new->dev == NULL);
1958     assert(bs_new->in_use == 0);
1959     assert(bs_new->io_limits_enabled == false);
1960     assert(!throttle_have_timer(&bs_new->throttle_state));
1961 
1962     tmp = *bs_new;
1963     *bs_new = *bs_old;
1964     *bs_old = tmp;
1965 
1966     /* there are some fields that should not be swapped, move them back */
1967     bdrv_move_feature_fields(&tmp, bs_old);
1968     bdrv_move_feature_fields(bs_old, bs_new);
1969     bdrv_move_feature_fields(bs_new, &tmp);
1970 
1971     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1972     assert(bs_new->device_name[0] == '\0');
1973 
1974     /* Check a few fields that should remain attached to the device */
1975     assert(bs_new->dev == NULL);
1976     assert(bs_new->job == NULL);
1977     assert(bs_new->in_use == 0);
1978     assert(bs_new->io_limits_enabled == false);
1979     assert(!throttle_have_timer(&bs_new->throttle_state));
1980 
1981     /* insert the nodes back into the graph node list if needed */
1982     if (bs_new->node_name[0] != '\0') {
1983         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
1984     }
1985     if (bs_old->node_name[0] != '\0') {
1986         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
1987     }
1988 
1989     bdrv_rebind(bs_new);
1990     bdrv_rebind(bs_old);
1991 }
1992 
1993 /*
1994  * Add new bs contents at the top of an image chain while the chain is
1995  * live, while keeping required fields on the top layer.
1996  *
1997  * This will modify the BlockDriverState fields, and swap contents
1998  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1999  *
2000  * bs_new is required to be anonymous.
2001  *
2002  * This function does not create any image files.
2003  */
2004 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2005 {
2006     bdrv_swap(bs_new, bs_top);
2007 
2008     /* The contents of 'tmp' will become bs_top, as we are
2009      * swapping bs_new and bs_top contents. */
2010     bs_top->backing_hd = bs_new;
2011     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
2012     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
2013             bs_new->filename);
2014     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
2015             bs_new->drv ? bs_new->drv->format_name : "");
2016 }
2017 
2018 static void bdrv_delete(BlockDriverState *bs)
2019 {
2020     assert(!bs->dev);
2021     assert(!bs->job);
2022     assert(!bs->in_use);
2023     assert(!bs->refcnt);
2024     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2025 
2026     bdrv_close(bs);
2027 
2028     /* remove from list, if necessary */
2029     bdrv_make_anon(bs);
2030 
2031     g_free(bs);
2032 }
2033 
2034 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2035 /* TODO change to DeviceState *dev when all users are qdevified */
2036 {
2037     if (bs->dev) {
2038         return -EBUSY;
2039     }
2040     bs->dev = dev;
2041     bdrv_iostatus_reset(bs);
2042     return 0;
2043 }
2044 
2045 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2046 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2047 {
2048     if (bdrv_attach_dev(bs, dev) < 0) {
2049         abort();
2050     }
2051 }
2052 
2053 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2054 /* TODO change to DeviceState *dev when all users are qdevified */
2055 {
2056     assert(bs->dev == dev);
2057     bs->dev = NULL;
2058     bs->dev_ops = NULL;
2059     bs->dev_opaque = NULL;
2060     bs->guest_block_size = 512;
2061 }
2062 
2063 /* TODO change to return DeviceState * when all users are qdevified */
2064 void *bdrv_get_attached_dev(BlockDriverState *bs)
2065 {
2066     return bs->dev;
2067 }
2068 
2069 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2070                       void *opaque)
2071 {
2072     bs->dev_ops = ops;
2073     bs->dev_opaque = opaque;
2074 }
2075 
2076 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2077                                enum MonitorEvent ev,
2078                                BlockErrorAction action, bool is_read)
2079 {
2080     QObject *data;
2081     const char *action_str;
2082 
2083     switch (action) {
2084     case BDRV_ACTION_REPORT:
2085         action_str = "report";
2086         break;
2087     case BDRV_ACTION_IGNORE:
2088         action_str = "ignore";
2089         break;
2090     case BDRV_ACTION_STOP:
2091         action_str = "stop";
2092         break;
2093     default:
2094         abort();
2095     }
2096 
2097     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2098                               bdrv->device_name,
2099                               action_str,
2100                               is_read ? "read" : "write");
2101     monitor_protocol_event(ev, data);
2102 
2103     qobject_decref(data);
2104 }
2105 
2106 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2107 {
2108     QObject *data;
2109 
2110     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2111                               bdrv_get_device_name(bs), ejected);
2112     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2113 
2114     qobject_decref(data);
2115 }
2116 
2117 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2118 {
2119     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2120         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2121         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2122         if (tray_was_closed) {
2123             /* tray open */
2124             bdrv_emit_qmp_eject_event(bs, true);
2125         }
2126         if (load) {
2127             /* tray close */
2128             bdrv_emit_qmp_eject_event(bs, false);
2129         }
2130     }
2131 }
2132 
2133 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2134 {
2135     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2136 }
2137 
2138 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2139 {
2140     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2141         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2142     }
2143 }
2144 
2145 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2146 {
2147     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2148         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2149     }
2150     return false;
2151 }
2152 
2153 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2154 {
2155     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2156         bs->dev_ops->resize_cb(bs->dev_opaque);
2157     }
2158 }
2159 
2160 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2161 {
2162     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2163         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2164     }
2165     return false;
2166 }
2167 
2168 /*
2169  * Run consistency checks on an image
2170  *
2171  * Returns 0 if the check could be completed (it doesn't mean that the image is
2172  * free of errors) or -errno when an internal error occurred. The results of the
2173  * check are stored in res.
2174  */
2175 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2176 {
2177     if (bs->drv->bdrv_check == NULL) {
2178         return -ENOTSUP;
2179     }
2180 
2181     memset(res, 0, sizeof(*res));
2182     return bs->drv->bdrv_check(bs, res, fix);
2183 }
2184 
2185 #define COMMIT_BUF_SECTORS 2048
2186 
2187 /* commit COW file into the raw image */
2188 int bdrv_commit(BlockDriverState *bs)
2189 {
2190     BlockDriver *drv = bs->drv;
2191     int64_t sector, total_sectors, length, backing_length;
2192     int n, ro, open_flags;
2193     int ret = 0;
2194     uint8_t *buf = NULL;
2195     char filename[PATH_MAX];
2196 
2197     if (!drv)
2198         return -ENOMEDIUM;
2199 
2200     if (!bs->backing_hd) {
2201         return -ENOTSUP;
2202     }
2203 
2204     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2205         return -EBUSY;
2206     }
2207 
2208     ro = bs->backing_hd->read_only;
2209     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2210     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2211     open_flags =  bs->backing_hd->open_flags;
2212 
2213     if (ro) {
2214         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2215             return -EACCES;
2216         }
2217     }
2218 
2219     length = bdrv_getlength(bs);
2220     if (length < 0) {
2221         ret = length;
2222         goto ro_cleanup;
2223     }
2224 
2225     backing_length = bdrv_getlength(bs->backing_hd);
2226     if (backing_length < 0) {
2227         ret = backing_length;
2228         goto ro_cleanup;
2229     }
2230 
2231     /* If our top snapshot is larger than the backing file image,
2232      * grow the backing file image if possible.  If not possible,
2233      * we must return an error */
2234     if (length > backing_length) {
2235         ret = bdrv_truncate(bs->backing_hd, length);
2236         if (ret < 0) {
2237             goto ro_cleanup;
2238         }
2239     }
2240 
2241     total_sectors = length >> BDRV_SECTOR_BITS;
2242     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2243 
2244     for (sector = 0; sector < total_sectors; sector += n) {
2245         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2246         if (ret < 0) {
2247             goto ro_cleanup;
2248         }
2249         if (ret) {
2250             ret = bdrv_read(bs, sector, buf, n);
2251             if (ret < 0) {
2252                 goto ro_cleanup;
2253             }
2254 
2255             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2256             if (ret < 0) {
2257                 goto ro_cleanup;
2258             }
2259         }
2260     }
2261 
2262     if (drv->bdrv_make_empty) {
2263         ret = drv->bdrv_make_empty(bs);
2264         if (ret < 0) {
2265             goto ro_cleanup;
2266         }
2267         bdrv_flush(bs);
2268     }
2269 
2270     /*
2271      * Make sure all data we wrote to the backing device is actually
2272      * stable on disk.
2273      */
2274     if (bs->backing_hd) {
2275         bdrv_flush(bs->backing_hd);
2276     }
2277 
2278     ret = 0;
2279 ro_cleanup:
2280     g_free(buf);
2281 
2282     if (ro) {
2283         /* ignoring error return here */
2284         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2285     }
2286 
2287     return ret;
2288 }
2289 
2290 int bdrv_commit_all(void)
2291 {
2292     BlockDriverState *bs;
2293 
2294     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2295         if (bs->drv && bs->backing_hd) {
2296             int ret = bdrv_commit(bs);
2297             if (ret < 0) {
2298                 return ret;
2299             }
2300         }
2301     }
2302     return 0;
2303 }
2304 
2305 /**
2306  * Remove an active request from the tracked requests list
2307  *
2308  * This function should be called when a tracked request is completing.
2309  */
2310 static void tracked_request_end(BdrvTrackedRequest *req)
2311 {
2312     if (req->serialising) {
2313         req->bs->serialising_in_flight--;
2314     }
2315 
2316     QLIST_REMOVE(req, list);
2317     qemu_co_queue_restart_all(&req->wait_queue);
2318 }
2319 
2320 /**
2321  * Add an active request to the tracked requests list
2322  */
2323 static void tracked_request_begin(BdrvTrackedRequest *req,
2324                                   BlockDriverState *bs,
2325                                   int64_t offset,
2326                                   unsigned int bytes, bool is_write)
2327 {
2328     *req = (BdrvTrackedRequest){
2329         .bs = bs,
2330         .offset         = offset,
2331         .bytes          = bytes,
2332         .is_write       = is_write,
2333         .co             = qemu_coroutine_self(),
2334         .serialising    = false,
2335         .overlap_offset = offset,
2336         .overlap_bytes  = bytes,
2337     };
2338 
2339     qemu_co_queue_init(&req->wait_queue);
2340 
2341     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2342 }
2343 
2344 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2345 {
2346     int64_t overlap_offset = req->offset & ~(align - 1);
2347     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2348                                - overlap_offset;
2349 
2350     if (!req->serialising) {
2351         req->bs->serialising_in_flight++;
2352         req->serialising = true;
2353     }
2354 
2355     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2356     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2357 }
2358 
2359 /**
2360  * Round a region to cluster boundaries
2361  */
2362 void bdrv_round_to_clusters(BlockDriverState *bs,
2363                             int64_t sector_num, int nb_sectors,
2364                             int64_t *cluster_sector_num,
2365                             int *cluster_nb_sectors)
2366 {
2367     BlockDriverInfo bdi;
2368 
2369     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2370         *cluster_sector_num = sector_num;
2371         *cluster_nb_sectors = nb_sectors;
2372     } else {
2373         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2374         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2375         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2376                                             nb_sectors, c);
2377     }
2378 }
2379 
2380 static int bdrv_get_cluster_size(BlockDriverState *bs)
2381 {
2382     BlockDriverInfo bdi;
2383     int ret;
2384 
2385     ret = bdrv_get_info(bs, &bdi);
2386     if (ret < 0 || bdi.cluster_size == 0) {
2387         return bs->request_alignment;
2388     } else {
2389         return bdi.cluster_size;
2390     }
2391 }
2392 
2393 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2394                                      int64_t offset, unsigned int bytes)
2395 {
2396     /*        aaaa   bbbb */
2397     if (offset >= req->overlap_offset + req->overlap_bytes) {
2398         return false;
2399     }
2400     /* bbbb   aaaa        */
2401     if (req->overlap_offset >= offset + bytes) {
2402         return false;
2403     }
2404     return true;
2405 }
2406 
2407 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2408 {
2409     BlockDriverState *bs = self->bs;
2410     BdrvTrackedRequest *req;
2411     bool retry;
2412     bool waited = false;
2413 
2414     if (!bs->serialising_in_flight) {
2415         return false;
2416     }
2417 
2418     do {
2419         retry = false;
2420         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2421             if (req == self || (!req->serialising && !self->serialising)) {
2422                 continue;
2423             }
2424             if (tracked_request_overlaps(req, self->overlap_offset,
2425                                          self->overlap_bytes))
2426             {
2427                 /* Hitting this means there was a reentrant request, for
2428                  * example, a block driver issuing nested requests.  This must
2429                  * never happen since it means deadlock.
2430                  */
2431                 assert(qemu_coroutine_self() != req->co);
2432 
2433                 /* If the request is already (indirectly) waiting for us, or
2434                  * will wait for us as soon as it wakes up, then just go on
2435                  * (instead of producing a deadlock in the former case). */
2436                 if (!req->waiting_for) {
2437                     self->waiting_for = req;
2438                     qemu_co_queue_wait(&req->wait_queue);
2439                     self->waiting_for = NULL;
2440                     retry = true;
2441                     waited = true;
2442                     break;
2443                 }
2444             }
2445         }
2446     } while (retry);
2447 
2448     return waited;
2449 }
2450 
2451 /*
2452  * Return values:
2453  * 0        - success
2454  * -EINVAL  - backing format specified, but no file
2455  * -ENOSPC  - can't update the backing file because no space is left in the
2456  *            image file header
2457  * -ENOTSUP - format driver doesn't support changing the backing file
2458  */
2459 int bdrv_change_backing_file(BlockDriverState *bs,
2460     const char *backing_file, const char *backing_fmt)
2461 {
2462     BlockDriver *drv = bs->drv;
2463     int ret;
2464 
2465     /* Backing file format doesn't make sense without a backing file */
2466     if (backing_fmt && !backing_file) {
2467         return -EINVAL;
2468     }
2469 
2470     if (drv->bdrv_change_backing_file != NULL) {
2471         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2472     } else {
2473         ret = -ENOTSUP;
2474     }
2475 
2476     if (ret == 0) {
2477         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2478         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2479     }
2480     return ret;
2481 }
2482 
2483 /*
2484  * Finds the image layer in the chain that has 'bs' as its backing file.
2485  *
2486  * active is the current topmost image.
2487  *
2488  * Returns NULL if bs is not found in active's image chain,
2489  * or if active == bs.
2490  */
2491 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2492                                     BlockDriverState *bs)
2493 {
2494     BlockDriverState *overlay = NULL;
2495     BlockDriverState *intermediate;
2496 
2497     assert(active != NULL);
2498     assert(bs != NULL);
2499 
2500     /* if bs is the same as active, then by definition it has no overlay
2501      */
2502     if (active == bs) {
2503         return NULL;
2504     }
2505 
2506     intermediate = active;
2507     while (intermediate->backing_hd) {
2508         if (intermediate->backing_hd == bs) {
2509             overlay = intermediate;
2510             break;
2511         }
2512         intermediate = intermediate->backing_hd;
2513     }
2514 
2515     return overlay;
2516 }
2517 
2518 typedef struct BlkIntermediateStates {
2519     BlockDriverState *bs;
2520     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2521 } BlkIntermediateStates;
2522 
2523 
2524 /*
2525  * Drops images above 'base' up to and including 'top', and sets the image
2526  * above 'top' to have base as its backing file.
2527  *
2528  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2529  * information in 'bs' can be properly updated.
2530  *
2531  * E.g., this will convert the following chain:
2532  * bottom <- base <- intermediate <- top <- active
2533  *
2534  * to
2535  *
2536  * bottom <- base <- active
2537  *
2538  * It is allowed for bottom==base, in which case it converts:
2539  *
2540  * base <- intermediate <- top <- active
2541  *
2542  * to
2543  *
2544  * base <- active
2545  *
2546  * Error conditions:
2547  *  if active == top, that is considered an error
2548  *
2549  */
2550 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2551                            BlockDriverState *base)
2552 {
2553     BlockDriverState *intermediate;
2554     BlockDriverState *base_bs = NULL;
2555     BlockDriverState *new_top_bs = NULL;
2556     BlkIntermediateStates *intermediate_state, *next;
2557     int ret = -EIO;
2558 
2559     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2560     QSIMPLEQ_INIT(&states_to_delete);
2561 
2562     if (!top->drv || !base->drv) {
2563         goto exit;
2564     }
2565 
2566     new_top_bs = bdrv_find_overlay(active, top);
2567 
2568     if (new_top_bs == NULL) {
2569         /* we could not find the image above 'top', this is an error */
2570         goto exit;
2571     }
2572 
2573     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2574      * to do, no intermediate images */
2575     if (new_top_bs->backing_hd == base) {
2576         ret = 0;
2577         goto exit;
2578     }
2579 
2580     intermediate = top;
2581 
2582     /* now we will go down through the list, and add each BDS we find
2583      * into our deletion queue, until we hit the 'base'
2584      */
2585     while (intermediate) {
2586         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2587         intermediate_state->bs = intermediate;
2588         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2589 
2590         if (intermediate->backing_hd == base) {
2591             base_bs = intermediate->backing_hd;
2592             break;
2593         }
2594         intermediate = intermediate->backing_hd;
2595     }
2596     if (base_bs == NULL) {
2597         /* something went wrong, we did not end at the base. safely
2598          * unravel everything, and exit with error */
2599         goto exit;
2600     }
2601 
2602     /* success - we can delete the intermediate states, and link top->base */
2603     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2604                                    base_bs->drv ? base_bs->drv->format_name : "");
2605     if (ret) {
2606         goto exit;
2607     }
2608     new_top_bs->backing_hd = base_bs;
2609 
2610     bdrv_refresh_limits(new_top_bs);
2611 
2612     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2613         /* so that bdrv_close() does not recursively close the chain */
2614         intermediate_state->bs->backing_hd = NULL;
2615         bdrv_unref(intermediate_state->bs);
2616     }
2617     ret = 0;
2618 
2619 exit:
2620     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2621         g_free(intermediate_state);
2622     }
2623     return ret;
2624 }
2625 
2626 
2627 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2628                                    size_t size)
2629 {
2630     int64_t len;
2631 
2632     if (size > INT_MAX) {
2633         return -EIO;
2634     }
2635 
2636     if (!bdrv_is_inserted(bs))
2637         return -ENOMEDIUM;
2638 
2639     if (bs->growable)
2640         return 0;
2641 
2642     len = bdrv_getlength(bs);
2643 
2644     if (offset < 0)
2645         return -EIO;
2646 
2647     if ((offset > len) || (len - offset < size))
2648         return -EIO;
2649 
2650     return 0;
2651 }
2652 
2653 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2654                               int nb_sectors)
2655 {
2656     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2657         return -EIO;
2658     }
2659 
2660     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2661                                    nb_sectors * BDRV_SECTOR_SIZE);
2662 }
2663 
2664 typedef struct RwCo {
2665     BlockDriverState *bs;
2666     int64_t offset;
2667     QEMUIOVector *qiov;
2668     bool is_write;
2669     int ret;
2670     BdrvRequestFlags flags;
2671 } RwCo;
2672 
2673 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2674 {
2675     RwCo *rwco = opaque;
2676 
2677     if (!rwco->is_write) {
2678         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2679                                       rwco->qiov->size, rwco->qiov,
2680                                       rwco->flags);
2681     } else {
2682         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2683                                        rwco->qiov->size, rwco->qiov,
2684                                        rwco->flags);
2685     }
2686 }
2687 
2688 /*
2689  * Process a vectored synchronous request using coroutines
2690  */
2691 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2692                         QEMUIOVector *qiov, bool is_write,
2693                         BdrvRequestFlags flags)
2694 {
2695     Coroutine *co;
2696     RwCo rwco = {
2697         .bs = bs,
2698         .offset = offset,
2699         .qiov = qiov,
2700         .is_write = is_write,
2701         .ret = NOT_DONE,
2702         .flags = flags,
2703     };
2704 
2705     /**
2706      * In sync call context, when the vcpu is blocked, this throttling timer
2707      * will not fire; so the I/O throttling function has to be disabled here
2708      * if it has been enabled.
2709      */
2710     if (bs->io_limits_enabled) {
2711         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2712                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2713         bdrv_io_limits_disable(bs);
2714     }
2715 
2716     if (qemu_in_coroutine()) {
2717         /* Fast-path if already in coroutine context */
2718         bdrv_rw_co_entry(&rwco);
2719     } else {
2720         co = qemu_coroutine_create(bdrv_rw_co_entry);
2721         qemu_coroutine_enter(co, &rwco);
2722         while (rwco.ret == NOT_DONE) {
2723             qemu_aio_wait();
2724         }
2725     }
2726     return rwco.ret;
2727 }
2728 
2729 /*
2730  * Process a synchronous request using coroutines
2731  */
2732 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2733                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2734 {
2735     QEMUIOVector qiov;
2736     struct iovec iov = {
2737         .iov_base = (void *)buf,
2738         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2739     };
2740 
2741     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2742         return -EINVAL;
2743     }
2744 
2745     qemu_iovec_init_external(&qiov, &iov, 1);
2746     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2747                         &qiov, is_write, flags);
2748 }
2749 
2750 /* return < 0 if error. See bdrv_write() for the return codes */
2751 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2752               uint8_t *buf, int nb_sectors)
2753 {
2754     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2755 }
2756 
2757 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2758 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2759                           uint8_t *buf, int nb_sectors)
2760 {
2761     bool enabled;
2762     int ret;
2763 
2764     enabled = bs->io_limits_enabled;
2765     bs->io_limits_enabled = false;
2766     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2767     bs->io_limits_enabled = enabled;
2768     return ret;
2769 }
2770 
2771 /* Return < 0 if error. Important errors are:
2772   -EIO         generic I/O error (may happen for all errors)
2773   -ENOMEDIUM   No media inserted.
2774   -EINVAL      Invalid sector number or nb_sectors
2775   -EACCES      Trying to write a read-only device
2776 */
2777 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2778                const uint8_t *buf, int nb_sectors)
2779 {
2780     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2781 }
2782 
2783 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2784                       int nb_sectors, BdrvRequestFlags flags)
2785 {
2786     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2787                       BDRV_REQ_ZERO_WRITE | flags);
2788 }
2789 
2790 /*
2791  * Completely zero out a block device with the help of bdrv_write_zeroes.
2792  * The operation is sped up by checking the block status and only writing
2793  * zeroes to the device if they currently do not return zeroes. Optional
2794  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2795  *
2796  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2797  */
2798 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2799 {
2800     int64_t target_size;
2801     int64_t ret, nb_sectors, sector_num = 0;
2802     int n;
2803 
2804     target_size = bdrv_getlength(bs);
2805     if (target_size < 0) {
2806         return target_size;
2807     }
2808     target_size /= BDRV_SECTOR_SIZE;
2809 
2810     for (;;) {
2811         nb_sectors = target_size - sector_num;
2812         if (nb_sectors <= 0) {
2813             return 0;
2814         }
2815         if (nb_sectors > INT_MAX) {
2816             nb_sectors = INT_MAX;
2817         }
2818         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2819         if (ret < 0) {
2820             error_report("error getting block status at sector %" PRId64 ": %s",
2821                          sector_num, strerror(-ret));
2822             return ret;
2823         }
2824         if (ret & BDRV_BLOCK_ZERO) {
2825             sector_num += n;
2826             continue;
2827         }
2828         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2829         if (ret < 0) {
2830             error_report("error writing zeroes at sector %" PRId64 ": %s",
2831                          sector_num, strerror(-ret));
2832             return ret;
2833         }
2834         sector_num += n;
2835     }
2836 }
2837 
2838 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2839 {
2840     QEMUIOVector qiov;
2841     struct iovec iov = {
2842         .iov_base = (void *)buf,
2843         .iov_len = bytes,
2844     };
2845     int ret;
2846 
2847     if (bytes < 0) {
2848         return -EINVAL;
2849     }
2850 
2851     qemu_iovec_init_external(&qiov, &iov, 1);
2852     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2853     if (ret < 0) {
2854         return ret;
2855     }
2856 
2857     return bytes;
2858 }
2859 
2860 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2861 {
2862     int ret;
2863 
2864     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2865     if (ret < 0) {
2866         return ret;
2867     }
2868 
2869     return qiov->size;
2870 }
2871 
2872 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2873                 const void *buf, int bytes)
2874 {
2875     QEMUIOVector qiov;
2876     struct iovec iov = {
2877         .iov_base   = (void *) buf,
2878         .iov_len    = bytes,
2879     };
2880 
2881     if (bytes < 0) {
2882         return -EINVAL;
2883     }
2884 
2885     qemu_iovec_init_external(&qiov, &iov, 1);
2886     return bdrv_pwritev(bs, offset, &qiov);
2887 }
2888 
2889 /*
2890  * Writes to the file and ensures that no writes are reordered across this
2891  * request (acts as a barrier)
2892  *
2893  * Returns 0 on success, -errno in error cases.
2894  */
2895 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2896     const void *buf, int count)
2897 {
2898     int ret;
2899 
2900     ret = bdrv_pwrite(bs, offset, buf, count);
2901     if (ret < 0) {
2902         return ret;
2903     }
2904 
2905     /* No flush needed for cache modes that already do it */
2906     if (bs->enable_write_cache) {
2907         bdrv_flush(bs);
2908     }
2909 
2910     return 0;
2911 }
2912 
2913 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2914         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2915 {
2916     /* Perform I/O through a temporary buffer so that users who scribble over
2917      * their read buffer while the operation is in progress do not end up
2918      * modifying the image file.  This is critical for zero-copy guest I/O
2919      * where anything might happen inside guest memory.
2920      */
2921     void *bounce_buffer;
2922 
2923     BlockDriver *drv = bs->drv;
2924     struct iovec iov;
2925     QEMUIOVector bounce_qiov;
2926     int64_t cluster_sector_num;
2927     int cluster_nb_sectors;
2928     size_t skip_bytes;
2929     int ret;
2930 
2931     /* Cover entire cluster so no additional backing file I/O is required when
2932      * allocating cluster in the image file.
2933      */
2934     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2935                            &cluster_sector_num, &cluster_nb_sectors);
2936 
2937     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2938                                    cluster_sector_num, cluster_nb_sectors);
2939 
2940     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2941     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2942     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2943 
2944     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2945                              &bounce_qiov);
2946     if (ret < 0) {
2947         goto err;
2948     }
2949 
2950     if (drv->bdrv_co_write_zeroes &&
2951         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2952         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2953                                       cluster_nb_sectors, 0);
2954     } else {
2955         /* This does not change the data on the disk, it is not necessary
2956          * to flush even in cache=writethrough mode.
2957          */
2958         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2959                                   &bounce_qiov);
2960     }
2961 
2962     if (ret < 0) {
2963         /* It might be okay to ignore write errors for guest requests.  If this
2964          * is a deliberate copy-on-read then we don't want to ignore the error.
2965          * Simply report it in all cases.
2966          */
2967         goto err;
2968     }
2969 
2970     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2971     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2972                         nb_sectors * BDRV_SECTOR_SIZE);
2973 
2974 err:
2975     qemu_vfree(bounce_buffer);
2976     return ret;
2977 }
2978 
2979 /*
2980  * Forwards an already correctly aligned request to the BlockDriver. This
2981  * handles copy on read and zeroing after EOF; any other features must be
2982  * implemented by the caller.
2983  */
2984 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2985     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2986     int64_t align, QEMUIOVector *qiov, int flags)
2987 {
2988     BlockDriver *drv = bs->drv;
2989     int ret;
2990 
2991     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2992     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2993 
2994     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2995     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2996 
2997     /* Handle Copy on Read and associated serialisation */
2998     if (flags & BDRV_REQ_COPY_ON_READ) {
2999         /* If we touch the same cluster it counts as an overlap.  This
3000          * guarantees that allocating writes will be serialized and not race
3001          * with each other for the same cluster.  For example, in copy-on-read
3002          * it ensures that the CoR read and write operations are atomic and
3003          * guest writes cannot interleave between them. */
3004         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3005     }
3006 
3007     wait_serialising_requests(req);
3008 
3009     if (flags & BDRV_REQ_COPY_ON_READ) {
3010         int pnum;
3011 
3012         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3013         if (ret < 0) {
3014             goto out;
3015         }
3016 
3017         if (!ret || pnum != nb_sectors) {
3018             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3019             goto out;
3020         }
3021     }
3022 
3023     /* Forward the request to the BlockDriver */
3024     if (!(bs->zero_beyond_eof && bs->growable)) {
3025         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3026     } else {
3027         /* Read zeros after EOF of growable BDSes */
3028         int64_t len, total_sectors, max_nb_sectors;
3029 
3030         len = bdrv_getlength(bs);
3031         if (len < 0) {
3032             ret = len;
3033             goto out;
3034         }
3035 
3036         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
3037         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3038                                   align >> BDRV_SECTOR_BITS);
3039         if (max_nb_sectors > 0) {
3040             ret = drv->bdrv_co_readv(bs, sector_num,
3041                                      MIN(nb_sectors, max_nb_sectors), qiov);
3042         } else {
3043             ret = 0;
3044         }
3045 
3046         /* Reading beyond end of file is supposed to produce zeroes */
3047         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3048             uint64_t offset = MAX(0, total_sectors - sector_num);
3049             uint64_t bytes = (sector_num + nb_sectors - offset) *
3050                               BDRV_SECTOR_SIZE;
3051             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3052         }
3053     }
3054 
3055 out:
3056     return ret;
3057 }
3058 
3059 /*
3060  * Handle a read request in coroutine context
3061  */
3062 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3063     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3064     BdrvRequestFlags flags)
3065 {
3066     BlockDriver *drv = bs->drv;
3067     BdrvTrackedRequest req;
3068 
3069     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3070     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3071     uint8_t *head_buf = NULL;
3072     uint8_t *tail_buf = NULL;
3073     QEMUIOVector local_qiov;
3074     bool use_local_qiov = false;
3075     int ret;
3076 
3077     if (!drv) {
3078         return -ENOMEDIUM;
3079     }
3080     if (bdrv_check_byte_request(bs, offset, bytes)) {
3081         return -EIO;
3082     }
3083 
3084     if (bs->copy_on_read) {
3085         flags |= BDRV_REQ_COPY_ON_READ;
3086     }
3087 
3088     /* throttling disk I/O */
3089     if (bs->io_limits_enabled) {
3090         bdrv_io_limits_intercept(bs, bytes, false);
3091     }
3092 
3093     /* Align read if necessary by padding qiov */
3094     if (offset & (align - 1)) {
3095         head_buf = qemu_blockalign(bs, align);
3096         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3097         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3098         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3099         use_local_qiov = true;
3100 
3101         bytes += offset & (align - 1);
3102         offset = offset & ~(align - 1);
3103     }
3104 
3105     if ((offset + bytes) & (align - 1)) {
3106         if (!use_local_qiov) {
3107             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3108             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3109             use_local_qiov = true;
3110         }
3111         tail_buf = qemu_blockalign(bs, align);
3112         qemu_iovec_add(&local_qiov, tail_buf,
3113                        align - ((offset + bytes) & (align - 1)));
3114 
3115         bytes = ROUND_UP(bytes, align);
3116     }
3117 
3118     tracked_request_begin(&req, bs, offset, bytes, false);
3119     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3120                               use_local_qiov ? &local_qiov : qiov,
3121                               flags);
3122     tracked_request_end(&req);
3123 
3124     if (use_local_qiov) {
3125         qemu_iovec_destroy(&local_qiov);
3126         qemu_vfree(head_buf);
3127         qemu_vfree(tail_buf);
3128     }
3129 
3130     return ret;
3131 }
3132 
3133 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3134     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3135     BdrvRequestFlags flags)
3136 {
3137     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3138         return -EINVAL;
3139     }
3140 
3141     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3142                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3143 }
3144 
3145 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3146     int nb_sectors, QEMUIOVector *qiov)
3147 {
3148     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3149 
3150     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3151 }
3152 
3153 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3154     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3155 {
3156     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3157 
3158     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3159                             BDRV_REQ_COPY_ON_READ);
3160 }
3161 
/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_WRITE_ZEROES_DEFAULT 32768

/*
 * Writes zeroes to @nb_sectors sectors starting at @sector_num.
 *
 * The range is processed in pieces: each piece is cut at the boundaries
 * given by bs->bl.write_zeroes_alignment (if set) and limited to
 * bs->bl.max_write_zeroes sectors (or MAX_WRITE_ZEROES_DEFAULT).  For every
 * piece the driver's bdrv_co_write_zeroes callback is tried first; if it is
 * missing or returns -ENOTSUP, a zero-filled bounce buffer is written with
 * bdrv_co_writev instead.
 *
 * The loop stops at the first piece that does not return 0.  Returns 0 on
 * success (also for nb_sectors <= 0) or that piece's result.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = bs->bl.max_write_zeroes ?
                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;

    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        /* Preset so that a missing callback takes the fallback path below */
        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for
             * all future requests.  A buffer is only kept when it was sized
             * for max_write_zeroes sectors, which no later piece can exceed.
             */
            if (num < max_write_zeroes) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

    /* Releases a bounce buffer that was kept across iterations, if any */
    qemu_vfree(iov.iov_base);
    return ret;
}
3236 
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 *
 * @offset and @bytes must be multiples of BDRV_SECTOR_SIZE and must lie
 * within the overlap range recorded in @req (both asserted).
 *
 * The before-write notifiers are run first; if they return a negative value
 * the driver is not called.  Otherwise the request is either turned into a
 * write-zeroes operation (BDRV_REQ_ZERO_WRITE in @flags) or passed to the
 * driver's bdrv_co_writev.  A successful write is followed by a flush when
 * the write cache is disabled.
 *
 * Returns 0 or a negative value from the notifiers, the write or the flush.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* A request already marked serialising is expected not to wait here */
    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* Without a write cache every successful write is flushed right away */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    /* Note: dirty tracking and wr_highest_sector are updated regardless of
     * the value of ret; only the total_sectors update depends on success.
     */
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }
    if (bs->growable && ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3287 
/*
 * Handle a write request in coroutine context
 *
 * Requests that are not aligned to the larger of BDRV_SECTOR_SIZE and
 * bs->request_alignment are completed by a read-modify-write cycle: the
 * aligned block around the unaligned head and/or tail is read into a
 * scratch buffer, and the missing bytes from those buffers are placed
 * around the caller's @qiov in a temporary vector, which is then written
 * with bdrv_aligned_pwritev().
 *
 * Returns -ENOMEDIUM without a driver, -EACCES for a read-only device,
 * -EIO when bdrv_check_byte_request() rejects the request, a negative value
 * from one of the RMW reads, or the result of bdrv_aligned_pwritev().
 *
 * NOTE(review): bdrv_co_write_zeroes() in this file passes qiov == NULL.
 * Both padding branches dereference qiov (qiov->niov, qiov->size), so such
 * a request must never be unaligned with respect to 'align' -- confirm that
 * this holds when bs->request_alignment exceeds BDRV_SECTOR_SIZE.
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_byte_request(bs, offset, bytes)) {
        return -EIO;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        /* Read the whole aligned block that contains the start of the write */
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* Only the bytes in front of the original offset are taken from
         * head_buf; the caller's data follows them.
         */
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        /* If the head was already handled, no further wait is expected */
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        /* Read the whole aligned block that contains the end of the write */
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append the part of the tail block that lies behind the write */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    /* Both pointers are NULL unless the corresponding branch allocated them */
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3409 
3410 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3411     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3412     BdrvRequestFlags flags)
3413 {
3414     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3415         return -EINVAL;
3416     }
3417 
3418     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3419                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3420 }
3421 
3422 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3423     int nb_sectors, QEMUIOVector *qiov)
3424 {
3425     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3426 
3427     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3428 }
3429 
3430 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3431                                       int64_t sector_num, int nb_sectors,
3432                                       BdrvRequestFlags flags)
3433 {
3434     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3435 
3436     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3437         flags &= ~BDRV_REQ_MAY_UNMAP;
3438     }
3439 
3440     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3441                              BDRV_REQ_ZERO_WRITE | flags);
3442 }
3443 
3444 /**
3445  * Truncate file to 'offset' bytes (needed only for file protocols)
3446  */
3447 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3448 {
3449     BlockDriver *drv = bs->drv;
3450     int ret;
3451     if (!drv)
3452         return -ENOMEDIUM;
3453     if (!drv->bdrv_truncate)
3454         return -ENOTSUP;
3455     if (bs->read_only)
3456         return -EACCES;
3457     if (bdrv_in_use(bs))
3458         return -EBUSY;
3459     ret = drv->bdrv_truncate(bs, offset);
3460     if (ret == 0) {
3461         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3462         bdrv_dev_resize_cb(bs);
3463     }
3464     return ret;
3465 }
3466 
3467 /**
3468  * Length of a allocated file in bytes. Sparse files are counted by actual
3469  * allocated space. Return < 0 if error or unknown.
3470  */
3471 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3472 {
3473     BlockDriver *drv = bs->drv;
3474     if (!drv) {
3475         return -ENOMEDIUM;
3476     }
3477     if (drv->bdrv_get_allocated_file_size) {
3478         return drv->bdrv_get_allocated_file_size(bs);
3479     }
3480     if (bs->file) {
3481         return bdrv_get_allocated_file_size(bs->file);
3482     }
3483     return -ENOTSUP;
3484 }
3485 
3486 /**
3487  * Length of a file in bytes. Return < 0 if error or unknown.
3488  */
3489 int64_t bdrv_getlength(BlockDriverState *bs)
3490 {
3491     BlockDriver *drv = bs->drv;
3492     if (!drv)
3493         return -ENOMEDIUM;
3494 
3495     if (drv->has_variable_length) {
3496         int ret = refresh_total_sectors(bs, bs->total_sectors);
3497         if (ret < 0) {
3498             return ret;
3499         }
3500     }
3501     return bs->total_sectors * BDRV_SECTOR_SIZE;
3502 }
3503 
3504 /* return 0 as number of sectors if no device present or error */
3505 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3506 {
3507     int64_t length;
3508     length = bdrv_getlength(bs);
3509     if (length < 0)
3510         length = 0;
3511     else
3512         length = length >> BDRV_SECTOR_BITS;
3513     *nb_sectors_ptr = length;
3514 }
3515 
3516 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3517                        BlockdevOnError on_write_error)
3518 {
3519     bs->on_read_error = on_read_error;
3520     bs->on_write_error = on_write_error;
3521 }
3522 
3523 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3524 {
3525     return is_read ? bs->on_read_error : bs->on_write_error;
3526 }
3527 
3528 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3529 {
3530     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3531 
3532     switch (on_err) {
3533     case BLOCKDEV_ON_ERROR_ENOSPC:
3534         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3535     case BLOCKDEV_ON_ERROR_STOP:
3536         return BDRV_ACTION_STOP;
3537     case BLOCKDEV_ON_ERROR_REPORT:
3538         return BDRV_ACTION_REPORT;
3539     case BLOCKDEV_ON_ERROR_IGNORE:
3540         return BDRV_ACTION_IGNORE;
3541     default:
3542         abort();
3543     }
3544 }
3545 
3546 /* This is done by device models because, while the block layer knows
3547  * about the error, it does not know whether an operation comes from
3548  * the device or the block layer (from a job, for example).
3549  */
3550 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3551                        bool is_read, int error)
3552 {
3553     assert(error >= 0);
3554     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3555     if (action == BDRV_ACTION_STOP) {
3556         vm_stop(RUN_STATE_IO_ERROR);
3557         bdrv_iostatus_set_err(bs, error);
3558     }
3559 }
3560 
3561 int bdrv_is_read_only(BlockDriverState *bs)
3562 {
3563     return bs->read_only;
3564 }
3565 
3566 int bdrv_is_sg(BlockDriverState *bs)
3567 {
3568     return bs->sg;
3569 }
3570 
3571 int bdrv_enable_write_cache(BlockDriverState *bs)
3572 {
3573     return bs->enable_write_cache;
3574 }
3575 
3576 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3577 {
3578     bs->enable_write_cache = wce;
3579 
3580     /* so a reopen() will preserve wce */
3581     if (wce) {
3582         bs->open_flags |= BDRV_O_CACHE_WB;
3583     } else {
3584         bs->open_flags &= ~BDRV_O_CACHE_WB;
3585     }
3586 }
3587 
3588 int bdrv_is_encrypted(BlockDriverState *bs)
3589 {
3590     if (bs->backing_hd && bs->backing_hd->encrypted)
3591         return 1;
3592     return bs->encrypted;
3593 }
3594 
3595 int bdrv_key_required(BlockDriverState *bs)
3596 {
3597     BlockDriverState *backing_hd = bs->backing_hd;
3598 
3599     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3600         return 1;
3601     return (bs->encrypted && !bs->valid_key);
3602 }
3603 
3604 int bdrv_set_key(BlockDriverState *bs, const char *key)
3605 {
3606     int ret;
3607     if (bs->backing_hd && bs->backing_hd->encrypted) {
3608         ret = bdrv_set_key(bs->backing_hd, key);
3609         if (ret < 0)
3610             return ret;
3611         if (!bs->encrypted)
3612             return 0;
3613     }
3614     if (!bs->encrypted) {
3615         return -EINVAL;
3616     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3617         return -ENOMEDIUM;
3618     }
3619     ret = bs->drv->bdrv_set_key(bs, key);
3620     if (ret < 0) {
3621         bs->valid_key = 0;
3622     } else if (!bs->valid_key) {
3623         bs->valid_key = 1;
3624         /* call the change callback now, we skipped it on open */
3625         bdrv_dev_change_media_cb(bs, true);
3626     }
3627     return ret;
3628 }
3629 
3630 const char *bdrv_get_format_name(BlockDriverState *bs)
3631 {
3632     return bs->drv ? bs->drv->format_name : NULL;
3633 }
3634 
3635 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3636                          void *opaque)
3637 {
3638     BlockDriver *drv;
3639     int count = 0;
3640     const char **formats = NULL;
3641 
3642     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3643         if (drv->format_name) {
3644             bool found = false;
3645             int i = count;
3646             while (formats && i && !found) {
3647                 found = !strcmp(formats[--i], drv->format_name);
3648             }
3649 
3650             if (!found) {
3651                 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3652                 formats[count++] = drv->format_name;
3653                 it(opaque, drv->format_name);
3654             }
3655         }
3656     }
3657     g_free(formats);
3658 }
3659 
3660 /* This function is to find block backend bs */
3661 BlockDriverState *bdrv_find(const char *name)
3662 {
3663     BlockDriverState *bs;
3664 
3665     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3666         if (!strcmp(name, bs->device_name)) {
3667             return bs;
3668         }
3669     }
3670     return NULL;
3671 }
3672 
3673 /* This function is to find a node in the bs graph */
3674 BlockDriverState *bdrv_find_node(const char *node_name)
3675 {
3676     BlockDriverState *bs;
3677 
3678     assert(node_name);
3679 
3680     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3681         if (!strcmp(node_name, bs->node_name)) {
3682             return bs;
3683         }
3684     }
3685     return NULL;
3686 }
3687 
3688 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3689 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3690 {
3691     BlockDeviceInfoList *list, *entry;
3692     BlockDriverState *bs;
3693 
3694     list = NULL;
3695     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3696         entry = g_malloc0(sizeof(*entry));
3697         entry->value = bdrv_block_device_info(bs);
3698         entry->next = list;
3699         list = entry;
3700     }
3701 
3702     return list;
3703 }
3704 
3705 BlockDriverState *bdrv_lookup_bs(const char *device,
3706                                  const char *node_name,
3707                                  Error **errp)
3708 {
3709     BlockDriverState *bs = NULL;
3710 
3711     if (device) {
3712         bs = bdrv_find(device);
3713 
3714         if (bs) {
3715             return bs;
3716         }
3717     }
3718 
3719     if (node_name) {
3720         bs = bdrv_find_node(node_name);
3721 
3722         if (bs) {
3723             return bs;
3724         }
3725     }
3726 
3727     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3728                      device ? device : "",
3729                      node_name ? node_name : "");
3730     return NULL;
3731 }
3732 
3733 BlockDriverState *bdrv_next(BlockDriverState *bs)
3734 {
3735     if (!bs) {
3736         return QTAILQ_FIRST(&bdrv_states);
3737     }
3738     return QTAILQ_NEXT(bs, device_list);
3739 }
3740 
3741 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3742 {
3743     BlockDriverState *bs;
3744 
3745     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3746         it(opaque, bs);
3747     }
3748 }
3749 
3750 const char *bdrv_get_device_name(BlockDriverState *bs)
3751 {
3752     return bs->device_name;
3753 }
3754 
3755 int bdrv_get_flags(BlockDriverState *bs)
3756 {
3757     return bs->open_flags;
3758 }
3759 
3760 int bdrv_flush_all(void)
3761 {
3762     BlockDriverState *bs;
3763     int result = 0;
3764 
3765     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3766         int ret = bdrv_flush(bs);
3767         if (ret < 0 && !result) {
3768             result = ret;
3769         }
3770     }
3771 
3772     return result;
3773 }
3774 
3775 int bdrv_has_zero_init_1(BlockDriverState *bs)
3776 {
3777     return 1;
3778 }
3779 
3780 int bdrv_has_zero_init(BlockDriverState *bs)
3781 {
3782     assert(bs->drv);
3783 
3784     /* If BS is a copy on write image, it is initialized to
3785        the contents of the base image, which may not be zeroes.  */
3786     if (bs->backing_hd) {
3787         return 0;
3788     }
3789     if (bs->drv->bdrv_has_zero_init) {
3790         return bs->drv->bdrv_has_zero_init(bs);
3791     }
3792 
3793     /* safe default */
3794     return 0;
3795 }
3796 
3797 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3798 {
3799     BlockDriverInfo bdi;
3800 
3801     if (bs->backing_hd) {
3802         return false;
3803     }
3804 
3805     if (bdrv_get_info(bs, &bdi) == 0) {
3806         return bdi.unallocated_blocks_are_zero;
3807     }
3808 
3809     return false;
3810 }
3811 
3812 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3813 {
3814     BlockDriverInfo bdi;
3815 
3816     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3817         return false;
3818     }
3819 
3820     if (bdrv_get_info(bs, &bdi) == 0) {
3821         return bdi.can_write_zeroes_with_unmap;
3822     }
3823 
3824     return false;
3825 }
3826 
/*
 * Bundle of arguments and results for a block status query.
 * NOTE(review): the users of this struct are not visible in this part of
 * the file; judging by the name it is handed to a coroutine entry point that
 * wraps bdrv_co_get_block_status() -- confirm.
 */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;       /* device to query */
    BlockDriverState *base;     /* presumably a base image limit; TODO confirm */
    int64_t sector_num;         /* first sector of the query */
    int nb_sectors;             /* maximum number of sectors to look at */
    int *pnum;                  /* out: number of sectors sharing the status */
    int64_t ret;                /* out: status value or negative error */
    bool done;                  /* presumably set on completion; TODO confirm */
} BdrvCoGetBlockStatusData;
3836 
3837 /*
3838  * Returns true iff the specified sector is present in the disk image. Drivers
3839  * not implementing the functionality are assumed to not support backing files,
3840  * hence all their sectors are reported as allocated.
3841  *
3842  * If 'sector_num' is beyond the end of the disk image the return value is 0
3843  * and 'pnum' is set to 0.
3844  *
3845  * 'pnum' is set to the number of sectors (including and immediately following
3846  * the specified sector) that are known to be in the same
3847  * allocated/unallocated state.
3848  *
3849  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3850  * beyond the end of the disk image it will be clamped.
3851  */
3852 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3853                                                      int64_t sector_num,
3854                                                      int nb_sectors, int *pnum)
3855 {
3856     int64_t length;
3857     int64_t n;
3858     int64_t ret, ret2;
3859 
3860     length = bdrv_getlength(bs);
3861     if (length < 0) {
3862         return length;
3863     }
3864 
3865     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3866         *pnum = 0;
3867         return 0;
3868     }
3869 
3870     n = bs->total_sectors - sector_num;
3871     if (n < nb_sectors) {
3872         nb_sectors = n;
3873     }
3874 
3875     if (!bs->drv->bdrv_co_get_block_status) {
3876         *pnum = nb_sectors;
3877         ret = BDRV_BLOCK_DATA;
3878         if (bs->drv->protocol_name) {
3879             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3880         }
3881         return ret;
3882     }
3883 
3884     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3885     if (ret < 0) {
3886         *pnum = 0;
3887         return ret;
3888     }
3889 
3890     if (ret & BDRV_BLOCK_RAW) {
3891         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3892         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3893                                      *pnum, pnum);
3894     }
3895 
3896     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3897         if (bdrv_unallocated_blocks_are_zero(bs)) {
3898             ret |= BDRV_BLOCK_ZERO;
3899         } else if (bs->backing_hd) {
3900             BlockDriverState *bs2 = bs->backing_hd;
3901             int64_t length2 = bdrv_getlength(bs2);
3902             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3903                 ret |= BDRV_BLOCK_ZERO;
3904             }
3905         }
3906     }
3907 
3908     if (bs->file &&
3909         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3910         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3911         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3912                                         *pnum, pnum);
3913         if (ret2 >= 0) {
3914             /* Ignore errors.  This is just providing extra information, it
3915              * is useful but not necessary.
3916              */
3917             ret |= (ret2 & BDRV_BLOCK_ZERO);
3918         }
3919     }
3920 
3921     return ret;
3922 }
3923 
3924 /* Coroutine wrapper for bdrv_get_block_status() */
3925 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3926 {
3927     BdrvCoGetBlockStatusData *data = opaque;
3928     BlockDriverState *bs = data->bs;
3929 
3930     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3931                                          data->pnum);
3932     data->done = true;
3933 }
3934 
3935 /*
3936  * Synchronous wrapper around bdrv_co_get_block_status().
3937  *
3938  * See bdrv_co_get_block_status() for details.
3939  */
3940 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3941                               int nb_sectors, int *pnum)
3942 {
3943     Coroutine *co;
3944     BdrvCoGetBlockStatusData data = {
3945         .bs = bs,
3946         .sector_num = sector_num,
3947         .nb_sectors = nb_sectors,
3948         .pnum = pnum,
3949         .done = false,
3950     };
3951 
3952     if (qemu_in_coroutine()) {
3953         /* Fast-path if already in coroutine context */
3954         bdrv_get_block_status_co_entry(&data);
3955     } else {
3956         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3957         qemu_coroutine_enter(co, &data);
3958         while (!data.done) {
3959             qemu_aio_wait();
3960         }
3961     }
3962     return data.ret;
3963 }
3964 
3965 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3966                                    int nb_sectors, int *pnum)
3967 {
3968     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3969     if (ret < 0) {
3970         return ret;
3971     }
3972     return
3973         (ret & BDRV_BLOCK_DATA) ||
3974         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3975 }
3976 
3977 /*
3978  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3979  *
3980  * Return true if the given sector is allocated in any image between
3981  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3982  * sector is allocated in any image of the chain.  Return false otherwise.
3983  *
3984  * 'pnum' is set to the number of sectors (including and immediately following
3985  *  the specified sector) that are known to be in the same
3986  *  allocated/unallocated state.
3987  *
3988  */
3989 int bdrv_is_allocated_above(BlockDriverState *top,
3990                             BlockDriverState *base,
3991                             int64_t sector_num,
3992                             int nb_sectors, int *pnum)
3993 {
3994     BlockDriverState *intermediate;
3995     int ret, n = nb_sectors;
3996 
3997     intermediate = top;
3998     while (intermediate && intermediate != base) {
3999         int pnum_inter;
4000         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4001                                 &pnum_inter);
4002         if (ret < 0) {
4003             return ret;
4004         } else if (ret) {
4005             *pnum = pnum_inter;
4006             return 1;
4007         }
4008 
4009         /*
4010          * [sector_num, nb_sectors] is unallocated on top but intermediate
4011          * might have
4012          *
4013          * [sector_num+x, nr_sectors] allocated.
4014          */
4015         if (n > pnum_inter &&
4016             (intermediate == top ||
4017              sector_num + pnum_inter < intermediate->total_sectors)) {
4018             n = pnum_inter;
4019         }
4020 
4021         intermediate = intermediate->backing_hd;
4022     }
4023 
4024     *pnum = n;
4025     return 0;
4026 }
4027 
4028 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4029 {
4030     if (bs->backing_hd && bs->backing_hd->encrypted)
4031         return bs->backing_file;
4032     else if (bs->encrypted)
4033         return bs->filename;
4034     else
4035         return NULL;
4036 }
4037 
4038 void bdrv_get_backing_filename(BlockDriverState *bs,
4039                                char *filename, int filename_size)
4040 {
4041     pstrcpy(filename, filename_size, bs->backing_file);
4042 }
4043 
4044 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4045                           const uint8_t *buf, int nb_sectors)
4046 {
4047     BlockDriver *drv = bs->drv;
4048     if (!drv)
4049         return -ENOMEDIUM;
4050     if (!drv->bdrv_write_compressed)
4051         return -ENOTSUP;
4052     if (bdrv_check_request(bs, sector_num, nb_sectors))
4053         return -EIO;
4054 
4055     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4056 
4057     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4058 }
4059 
4060 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4061 {
4062     BlockDriver *drv = bs->drv;
4063     if (!drv)
4064         return -ENOMEDIUM;
4065     if (!drv->bdrv_get_info)
4066         return -ENOTSUP;
4067     memset(bdi, 0, sizeof(*bdi));
4068     return drv->bdrv_get_info(bs, bdi);
4069 }
4070 
4071 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4072 {
4073     BlockDriver *drv = bs->drv;
4074     if (drv && drv->bdrv_get_specific_info) {
4075         return drv->bdrv_get_specific_info(bs);
4076     }
4077     return NULL;
4078 }
4079 
4080 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4081                       int64_t pos, int size)
4082 {
4083     QEMUIOVector qiov;
4084     struct iovec iov = {
4085         .iov_base   = (void *) buf,
4086         .iov_len    = size,
4087     };
4088 
4089     qemu_iovec_init_external(&qiov, &iov, 1);
4090     return bdrv_writev_vmstate(bs, &qiov, pos);
4091 }
4092 
4093 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4094 {
4095     BlockDriver *drv = bs->drv;
4096 
4097     if (!drv) {
4098         return -ENOMEDIUM;
4099     } else if (drv->bdrv_save_vmstate) {
4100         return drv->bdrv_save_vmstate(bs, qiov, pos);
4101     } else if (bs->file) {
4102         return bdrv_writev_vmstate(bs->file, qiov, pos);
4103     }
4104 
4105     return -ENOTSUP;
4106 }
4107 
4108 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4109                       int64_t pos, int size)
4110 {
4111     BlockDriver *drv = bs->drv;
4112     if (!drv)
4113         return -ENOMEDIUM;
4114     if (drv->bdrv_load_vmstate)
4115         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4116     if (bs->file)
4117         return bdrv_load_vmstate(bs->file, buf, pos, size);
4118     return -ENOTSUP;
4119 }
4120 
4121 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4122 {
4123     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4124         return;
4125     }
4126 
4127     bs->drv->bdrv_debug_event(bs, event);
4128 }
4129 
4130 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4131                           const char *tag)
4132 {
4133     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4134         bs = bs->file;
4135     }
4136 
4137     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4138         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4139     }
4140 
4141     return -ENOTSUP;
4142 }
4143 
4144 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4145 {
4146     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4147         bs = bs->file;
4148     }
4149 
4150     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4151         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4152     }
4153 
4154     return -ENOTSUP;
4155 }
4156 
4157 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4158 {
4159     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4160         bs = bs->file;
4161     }
4162 
4163     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4164         return bs->drv->bdrv_debug_resume(bs, tag);
4165     }
4166 
4167     return -ENOTSUP;
4168 }
4169 
4170 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4171 {
4172     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4173         bs = bs->file;
4174     }
4175 
4176     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4177         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4178     }
4179 
4180     return false;
4181 }
4182 
4183 int bdrv_is_snapshot(BlockDriverState *bs)
4184 {
4185     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4186 }
4187 
4188 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4189  * relative, it must be relative to the chain.  So, passing in bs->filename
4190  * from a BDS as backing_file should not be done, as that may be relative to
4191  * the CWD rather than the chain. */
4192 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4193         const char *backing_file)
4194 {
4195     char *filename_full = NULL;
4196     char *backing_file_full = NULL;
4197     char *filename_tmp = NULL;
4198     int is_protocol = 0;
4199     BlockDriverState *curr_bs = NULL;
4200     BlockDriverState *retval = NULL;
4201 
4202     if (!bs || !bs->drv || !backing_file) {
4203         return NULL;
4204     }
4205 
4206     filename_full     = g_malloc(PATH_MAX);
4207     backing_file_full = g_malloc(PATH_MAX);
4208     filename_tmp      = g_malloc(PATH_MAX);
4209 
4210     is_protocol = path_has_protocol(backing_file);
4211 
4212     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4213 
4214         /* If either of the filename paths is actually a protocol, then
4215          * compare unmodified paths; otherwise make paths relative */
4216         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4217             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4218                 retval = curr_bs->backing_hd;
4219                 break;
4220             }
4221         } else {
4222             /* If not an absolute filename path, make it relative to the current
4223              * image's filename path */
4224             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4225                          backing_file);
4226 
4227             /* We are going to compare absolute pathnames */
4228             if (!realpath(filename_tmp, filename_full)) {
4229                 continue;
4230             }
4231 
4232             /* We need to make sure the backing filename we are comparing against
4233              * is relative to the current image filename (or absolute) */
4234             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4235                          curr_bs->backing_file);
4236 
4237             if (!realpath(filename_tmp, backing_file_full)) {
4238                 continue;
4239             }
4240 
4241             if (strcmp(backing_file_full, filename_full) == 0) {
4242                 retval = curr_bs->backing_hd;
4243                 break;
4244             }
4245         }
4246     }
4247 
4248     g_free(filename_full);
4249     g_free(backing_file_full);
4250     g_free(filename_tmp);
4251     return retval;
4252 }
4253 
4254 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4255 {
4256     if (!bs->drv) {
4257         return 0;
4258     }
4259 
4260     if (!bs->backing_hd) {
4261         return 0;
4262     }
4263 
4264     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4265 }
4266 
4267 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4268 {
4269     BlockDriverState *curr_bs = NULL;
4270 
4271     if (!bs) {
4272         return NULL;
4273     }
4274 
4275     curr_bs = bs;
4276 
4277     while (curr_bs->backing_hd) {
4278         curr_bs = curr_bs->backing_hd;
4279     }
4280     return curr_bs;
4281 }
4282 
4283 /**************************************************************/
4284 /* async I/Os */
4285 
4286 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4287                                  QEMUIOVector *qiov, int nb_sectors,
4288                                  BlockDriverCompletionFunc *cb, void *opaque)
4289 {
4290     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4291 
4292     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4293                                  cb, opaque, false);
4294 }
4295 
4296 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4297                                   QEMUIOVector *qiov, int nb_sectors,
4298                                   BlockDriverCompletionFunc *cb, void *opaque)
4299 {
4300     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4301 
4302     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4303                                  cb, opaque, true);
4304 }
4305 
4306 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4307         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4308         BlockDriverCompletionFunc *cb, void *opaque)
4309 {
4310     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4311 
4312     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4313                                  BDRV_REQ_ZERO_WRITE | flags,
4314                                  cb, opaque, true);
4315 }
4316 
4317 
4318 typedef struct MultiwriteCB {
4319     int error;
4320     int num_requests;
4321     int num_callbacks;
4322     struct {
4323         BlockDriverCompletionFunc *cb;
4324         void *opaque;
4325         QEMUIOVector *free_qiov;
4326     } callbacks[];
4327 } MultiwriteCB;
4328 
4329 static void multiwrite_user_cb(MultiwriteCB *mcb)
4330 {
4331     int i;
4332 
4333     for (i = 0; i < mcb->num_callbacks; i++) {
4334         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4335         if (mcb->callbacks[i].free_qiov) {
4336             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4337         }
4338         g_free(mcb->callbacks[i].free_qiov);
4339     }
4340 }
4341 
4342 static void multiwrite_cb(void *opaque, int ret)
4343 {
4344     MultiwriteCB *mcb = opaque;
4345 
4346     trace_multiwrite_cb(mcb, ret);
4347 
4348     if (ret < 0 && !mcb->error) {
4349         mcb->error = ret;
4350     }
4351 
4352     mcb->num_requests--;
4353     if (mcb->num_requests == 0) {
4354         multiwrite_user_cb(mcb);
4355         g_free(mcb);
4356     }
4357 }
4358 
4359 static int multiwrite_req_compare(const void *a, const void *b)
4360 {
4361     const BlockRequest *req1 = a, *req2 = b;
4362 
4363     /*
4364      * Note that we can't simply subtract req2->sector from req1->sector
4365      * here as that could overflow the return value.
4366      */
4367     if (req1->sector > req2->sector) {
4368         return 1;
4369     } else if (req1->sector < req2->sector) {
4370         return -1;
4371     } else {
4372         return 0;
4373     }
4374 }
4375 
4376 /*
4377  * Takes a bunch of requests and tries to merge them. Returns the number of
4378  * requests that remain after merging.
4379  */
4380 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4381     int num_reqs, MultiwriteCB *mcb)
4382 {
4383     int i, outidx;
4384 
4385     // Sort requests by start sector
4386     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4387 
4388     // Check if adjacent requests touch the same clusters. If so, combine them,
4389     // filling up gaps with zero sectors.
4390     outidx = 0;
4391     for (i = 1; i < num_reqs; i++) {
4392         int merge = 0;
4393         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4394 
4395         // Handle exactly sequential writes and overlapping writes.
4396         if (reqs[i].sector <= oldreq_last) {
4397             merge = 1;
4398         }
4399 
4400         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4401             merge = 0;
4402         }
4403 
4404         if (merge) {
4405             size_t size;
4406             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4407             qemu_iovec_init(qiov,
4408                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4409 
4410             // Add the first request to the merged one. If the requests are
4411             // overlapping, drop the last sectors of the first request.
4412             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4413             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4414 
4415             // We should need to add any zeros between the two requests
4416             assert (reqs[i].sector <= oldreq_last);
4417 
4418             // Add the second request
4419             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4420 
4421             reqs[outidx].nb_sectors = qiov->size >> 9;
4422             reqs[outidx].qiov = qiov;
4423 
4424             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4425         } else {
4426             outidx++;
4427             reqs[outidx].sector     = reqs[i].sector;
4428             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4429             reqs[outidx].qiov       = reqs[i].qiov;
4430         }
4431     }
4432 
4433     return outidx + 1;
4434 }
4435 
4436 /*
4437  * Submit multiple AIO write requests at once.
4438  *
4439  * On success, the function returns 0 and all requests in the reqs array have
4440  * been submitted. In error case this function returns -1, and any of the
4441  * requests may or may not be submitted yet. In particular, this means that the
4442  * callback will be called for some of the requests, for others it won't. The
4443  * caller must check the error field of the BlockRequest to wait for the right
4444  * callbacks (if error != 0, no callback will be called).
4445  *
4446  * The implementation may modify the contents of the reqs array, e.g. to merge
4447  * requests. However, the fields opaque and error are left unmodified as they
4448  * are used to signal failure for a single request to the caller.
4449  */
4450 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4451 {
4452     MultiwriteCB *mcb;
4453     int i;
4454 
4455     /* don't submit writes if we don't have a medium */
4456     if (bs->drv == NULL) {
4457         for (i = 0; i < num_reqs; i++) {
4458             reqs[i].error = -ENOMEDIUM;
4459         }
4460         return -1;
4461     }
4462 
4463     if (num_reqs == 0) {
4464         return 0;
4465     }
4466 
4467     // Create MultiwriteCB structure
4468     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4469     mcb->num_requests = 0;
4470     mcb->num_callbacks = num_reqs;
4471 
4472     for (i = 0; i < num_reqs; i++) {
4473         mcb->callbacks[i].cb = reqs[i].cb;
4474         mcb->callbacks[i].opaque = reqs[i].opaque;
4475     }
4476 
4477     // Check for mergable requests
4478     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4479 
4480     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4481 
4482     /* Run the aio requests. */
4483     mcb->num_requests = num_reqs;
4484     for (i = 0; i < num_reqs; i++) {
4485         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4486                               reqs[i].nb_sectors, reqs[i].flags,
4487                               multiwrite_cb, mcb,
4488                               true);
4489     }
4490 
4491     return 0;
4492 }
4493 
4494 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4495 {
4496     acb->aiocb_info->cancel(acb);
4497 }
4498 
4499 /**************************************************************/
4500 /* async block device emulation */
4501 
4502 typedef struct BlockDriverAIOCBSync {
4503     BlockDriverAIOCB common;
4504     QEMUBH *bh;
4505     int ret;
4506     /* vector translation state */
4507     QEMUIOVector *qiov;
4508     uint8_t *bounce;
4509     int is_write;
4510 } BlockDriverAIOCBSync;
4511 
4512 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4513 {
4514     BlockDriverAIOCBSync *acb =
4515         container_of(blockacb, BlockDriverAIOCBSync, common);
4516     qemu_bh_delete(acb->bh);
4517     acb->bh = NULL;
4518     qemu_aio_release(acb);
4519 }
4520 
4521 static const AIOCBInfo bdrv_em_aiocb_info = {
4522     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4523     .cancel             = bdrv_aio_cancel_em,
4524 };
4525 
4526 static void bdrv_aio_bh_cb(void *opaque)
4527 {
4528     BlockDriverAIOCBSync *acb = opaque;
4529 
4530     if (!acb->is_write)
4531         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4532     qemu_vfree(acb->bounce);
4533     acb->common.cb(acb->common.opaque, acb->ret);
4534     qemu_bh_delete(acb->bh);
4535     acb->bh = NULL;
4536     qemu_aio_release(acb);
4537 }
4538 
4539 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4540                                             int64_t sector_num,
4541                                             QEMUIOVector *qiov,
4542                                             int nb_sectors,
4543                                             BlockDriverCompletionFunc *cb,
4544                                             void *opaque,
4545                                             int is_write)
4546 
4547 {
4548     BlockDriverAIOCBSync *acb;
4549 
4550     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4551     acb->is_write = is_write;
4552     acb->qiov = qiov;
4553     acb->bounce = qemu_blockalign(bs, qiov->size);
4554     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4555 
4556     if (is_write) {
4557         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4558         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4559     } else {
4560         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4561     }
4562 
4563     qemu_bh_schedule(acb->bh);
4564 
4565     return &acb->common;
4566 }
4567 
4568 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4569         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4570         BlockDriverCompletionFunc *cb, void *opaque)
4571 {
4572     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4573 }
4574 
4575 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4576         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4577         BlockDriverCompletionFunc *cb, void *opaque)
4578 {
4579     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4580 }
4581 
4582 
4583 typedef struct BlockDriverAIOCBCoroutine {
4584     BlockDriverAIOCB common;
4585     BlockRequest req;
4586     bool is_write;
4587     bool *done;
4588     QEMUBH* bh;
4589 } BlockDriverAIOCBCoroutine;
4590 
4591 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4592 {
4593     BlockDriverAIOCBCoroutine *acb =
4594         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4595     bool done = false;
4596 
4597     acb->done = &done;
4598     while (!done) {
4599         qemu_aio_wait();
4600     }
4601 }
4602 
4603 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4604     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4605     .cancel             = bdrv_aio_co_cancel_em,
4606 };
4607 
4608 static void bdrv_co_em_bh(void *opaque)
4609 {
4610     BlockDriverAIOCBCoroutine *acb = opaque;
4611 
4612     acb->common.cb(acb->common.opaque, acb->req.error);
4613 
4614     if (acb->done) {
4615         *acb->done = true;
4616     }
4617 
4618     qemu_bh_delete(acb->bh);
4619     qemu_aio_release(acb);
4620 }
4621 
4622 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4623 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4624 {
4625     BlockDriverAIOCBCoroutine *acb = opaque;
4626     BlockDriverState *bs = acb->common.bs;
4627 
4628     if (!acb->is_write) {
4629         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4630             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4631     } else {
4632         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4633             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4634     }
4635 
4636     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4637     qemu_bh_schedule(acb->bh);
4638 }
4639 
4640 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4641                                                int64_t sector_num,
4642                                                QEMUIOVector *qiov,
4643                                                int nb_sectors,
4644                                                BdrvRequestFlags flags,
4645                                                BlockDriverCompletionFunc *cb,
4646                                                void *opaque,
4647                                                bool is_write)
4648 {
4649     Coroutine *co;
4650     BlockDriverAIOCBCoroutine *acb;
4651 
4652     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4653     acb->req.sector = sector_num;
4654     acb->req.nb_sectors = nb_sectors;
4655     acb->req.qiov = qiov;
4656     acb->req.flags = flags;
4657     acb->is_write = is_write;
4658     acb->done = NULL;
4659 
4660     co = qemu_coroutine_create(bdrv_co_do_rw);
4661     qemu_coroutine_enter(co, acb);
4662 
4663     return &acb->common;
4664 }
4665 
4666 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4667 {
4668     BlockDriverAIOCBCoroutine *acb = opaque;
4669     BlockDriverState *bs = acb->common.bs;
4670 
4671     acb->req.error = bdrv_co_flush(bs);
4672     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4673     qemu_bh_schedule(acb->bh);
4674 }
4675 
4676 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4677         BlockDriverCompletionFunc *cb, void *opaque)
4678 {
4679     trace_bdrv_aio_flush(bs, opaque);
4680 
4681     Coroutine *co;
4682     BlockDriverAIOCBCoroutine *acb;
4683 
4684     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4685     acb->done = NULL;
4686 
4687     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4688     qemu_coroutine_enter(co, acb);
4689 
4690     return &acb->common;
4691 }
4692 
4693 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4694 {
4695     BlockDriverAIOCBCoroutine *acb = opaque;
4696     BlockDriverState *bs = acb->common.bs;
4697 
4698     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4699     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4700     qemu_bh_schedule(acb->bh);
4701 }
4702 
4703 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4704         int64_t sector_num, int nb_sectors,
4705         BlockDriverCompletionFunc *cb, void *opaque)
4706 {
4707     Coroutine *co;
4708     BlockDriverAIOCBCoroutine *acb;
4709 
4710     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4711 
4712     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4713     acb->req.sector = sector_num;
4714     acb->req.nb_sectors = nb_sectors;
4715     acb->done = NULL;
4716     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4717     qemu_coroutine_enter(co, acb);
4718 
4719     return &acb->common;
4720 }
4721 
4722 void bdrv_init(void)
4723 {
4724     module_call_init(MODULE_INIT_BLOCK);
4725 }
4726 
4727 void bdrv_init_with_whitelist(void)
4728 {
4729     use_bdrv_whitelist = 1;
4730     bdrv_init();
4731 }
4732 
4733 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4734                    BlockDriverCompletionFunc *cb, void *opaque)
4735 {
4736     BlockDriverAIOCB *acb;
4737 
4738     acb = g_slice_alloc(aiocb_info->aiocb_size);
4739     acb->aiocb_info = aiocb_info;
4740     acb->bs = bs;
4741     acb->cb = cb;
4742     acb->opaque = opaque;
4743     return acb;
4744 }
4745 
4746 void qemu_aio_release(void *p)
4747 {
4748     BlockDriverAIOCB *acb = p;
4749     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4750 }
4751 
4752 /**************************************************************/
4753 /* Coroutine block device emulation */
4754 
4755 typedef struct CoroutineIOCompletion {
4756     Coroutine *coroutine;
4757     int ret;
4758 } CoroutineIOCompletion;
4759 
4760 static void bdrv_co_io_em_complete(void *opaque, int ret)
4761 {
4762     CoroutineIOCompletion *co = opaque;
4763 
4764     co->ret = ret;
4765     qemu_coroutine_enter(co->coroutine, NULL);
4766 }
4767 
4768 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4769                                       int nb_sectors, QEMUIOVector *iov,
4770                                       bool is_write)
4771 {
4772     CoroutineIOCompletion co = {
4773         .coroutine = qemu_coroutine_self(),
4774     };
4775     BlockDriverAIOCB *acb;
4776 
4777     if (is_write) {
4778         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4779                                        bdrv_co_io_em_complete, &co);
4780     } else {
4781         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4782                                       bdrv_co_io_em_complete, &co);
4783     }
4784 
4785     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4786     if (!acb) {
4787         return -EIO;
4788     }
4789     qemu_coroutine_yield();
4790 
4791     return co.ret;
4792 }
4793 
4794 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4795                                          int64_t sector_num, int nb_sectors,
4796                                          QEMUIOVector *iov)
4797 {
4798     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4799 }
4800 
4801 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4802                                          int64_t sector_num, int nb_sectors,
4803                                          QEMUIOVector *iov)
4804 {
4805     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4806 }
4807 
4808 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4809 {
4810     RwCo *rwco = opaque;
4811 
4812     rwco->ret = bdrv_co_flush(rwco->bs);
4813 }
4814 
4815 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4816 {
4817     int ret;
4818 
4819     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4820         return 0;
4821     }
4822 
4823     /* Write back cached data to the OS even with cache=unsafe */
4824     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4825     if (bs->drv->bdrv_co_flush_to_os) {
4826         ret = bs->drv->bdrv_co_flush_to_os(bs);
4827         if (ret < 0) {
4828             return ret;
4829         }
4830     }
4831 
4832     /* But don't actually force it to the disk with cache=unsafe */
4833     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4834         goto flush_parent;
4835     }
4836 
4837     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4838     if (bs->drv->bdrv_co_flush_to_disk) {
4839         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4840     } else if (bs->drv->bdrv_aio_flush) {
4841         BlockDriverAIOCB *acb;
4842         CoroutineIOCompletion co = {
4843             .coroutine = qemu_coroutine_self(),
4844         };
4845 
4846         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4847         if (acb == NULL) {
4848             ret = -EIO;
4849         } else {
4850             qemu_coroutine_yield();
4851             ret = co.ret;
4852         }
4853     } else {
4854         /*
4855          * Some block drivers always operate in either writethrough or unsafe
4856          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4857          * know how the server works (because the behaviour is hardcoded or
4858          * depends on server-side configuration), so we can't ensure that
4859          * everything is safe on disk. Returning an error doesn't work because
4860          * that would break guests even if the server operates in writethrough
4861          * mode.
4862          *
4863          * Let's hope the user knows what he's doing.
4864          */
4865         ret = 0;
4866     }
4867     if (ret < 0) {
4868         return ret;
4869     }
4870 
4871     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4872      * in the case of cache=unsafe, so there are no useless flushes.
4873      */
4874 flush_parent:
4875     return bdrv_co_flush(bs->file);
4876 }
4877 
4878 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4879 {
4880     Error *local_err = NULL;
4881     int ret;
4882 
4883     if (!bs->drv)  {
4884         return;
4885     }
4886 
4887     if (bs->drv->bdrv_invalidate_cache) {
4888         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4889     } else if (bs->file) {
4890         bdrv_invalidate_cache(bs->file, &local_err);
4891     }
4892     if (local_err) {
4893         error_propagate(errp, local_err);
4894         return;
4895     }
4896 
4897     ret = refresh_total_sectors(bs, bs->total_sectors);
4898     if (ret < 0) {
4899         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4900         return;
4901     }
4902 }
4903 
4904 void bdrv_invalidate_cache_all(Error **errp)
4905 {
4906     BlockDriverState *bs;
4907     Error *local_err = NULL;
4908 
4909     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4910         bdrv_invalidate_cache(bs, &local_err);
4911         if (local_err) {
4912             error_propagate(errp, local_err);
4913             return;
4914         }
4915     }
4916 }
4917 
4918 void bdrv_clear_incoming_migration_all(void)
4919 {
4920     BlockDriverState *bs;
4921 
4922     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4923         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4924     }
4925 }
4926 
4927 int bdrv_flush(BlockDriverState *bs)
4928 {
4929     Coroutine *co;
4930     RwCo rwco = {
4931         .bs = bs,
4932         .ret = NOT_DONE,
4933     };
4934 
4935     if (qemu_in_coroutine()) {
4936         /* Fast-path if already in coroutine context */
4937         bdrv_flush_co_entry(&rwco);
4938     } else {
4939         co = qemu_coroutine_create(bdrv_flush_co_entry);
4940         qemu_coroutine_enter(co, &rwco);
4941         while (rwco.ret == NOT_DONE) {
4942             qemu_aio_wait();
4943         }
4944     }
4945 
4946     return rwco.ret;
4947 }
4948 
4949 typedef struct DiscardCo {
4950     BlockDriverState *bs;
4951     int64_t sector_num;
4952     int nb_sectors;
4953     int ret;
4954 } DiscardCo;
4955 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4956 {
4957     DiscardCo *rwco = opaque;
4958 
4959     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4960 }
4961 
/* Default cap for a single discard request, used when the BlockLimits do
 * not specify one: 32768 sectors, i.e. 16 MiB with 512-byte sectors.
 */
#define MAX_DISCARD_DEFAULT (32 * 1024)
4966 
4967 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4968                                  int nb_sectors)
4969 {
4970     int max_discard;
4971 
4972     if (!bs->drv) {
4973         return -ENOMEDIUM;
4974     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4975         return -EIO;
4976     } else if (bs->read_only) {
4977         return -EROFS;
4978     }
4979 
4980     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4981 
4982     /* Do nothing if disabled.  */
4983     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4984         return 0;
4985     }
4986 
4987     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4988         return 0;
4989     }
4990 
4991     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4992     while (nb_sectors > 0) {
4993         int ret;
4994         int num = nb_sectors;
4995 
4996         /* align request */
4997         if (bs->bl.discard_alignment &&
4998             num >= bs->bl.discard_alignment &&
4999             sector_num % bs->bl.discard_alignment) {
5000             if (num > bs->bl.discard_alignment) {
5001                 num = bs->bl.discard_alignment;
5002             }
5003             num -= sector_num % bs->bl.discard_alignment;
5004         }
5005 
5006         /* limit request size */
5007         if (num > max_discard) {
5008             num = max_discard;
5009         }
5010 
5011         if (bs->drv->bdrv_co_discard) {
5012             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5013         } else {
5014             BlockDriverAIOCB *acb;
5015             CoroutineIOCompletion co = {
5016                 .coroutine = qemu_coroutine_self(),
5017             };
5018 
5019             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5020                                             bdrv_co_io_em_complete, &co);
5021             if (acb == NULL) {
5022                 return -EIO;
5023             } else {
5024                 qemu_coroutine_yield();
5025                 ret = co.ret;
5026             }
5027         }
5028         if (ret && ret != -ENOTSUP) {
5029             return ret;
5030         }
5031 
5032         sector_num += num;
5033         nb_sectors -= num;
5034     }
5035     return 0;
5036 }
5037 
5038 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5039 {
5040     Coroutine *co;
5041     DiscardCo rwco = {
5042         .bs = bs,
5043         .sector_num = sector_num,
5044         .nb_sectors = nb_sectors,
5045         .ret = NOT_DONE,
5046     };
5047 
5048     if (qemu_in_coroutine()) {
5049         /* Fast-path if already in coroutine context */
5050         bdrv_discard_co_entry(&rwco);
5051     } else {
5052         co = qemu_coroutine_create(bdrv_discard_co_entry);
5053         qemu_coroutine_enter(co, &rwco);
5054         while (rwco.ret == NOT_DONE) {
5055             qemu_aio_wait();
5056         }
5057     }
5058 
5059     return rwco.ret;
5060 }
5061 
5062 /**************************************************************/
5063 /* removable device support */
5064 
5065 /**
5066  * Return TRUE if the media is present
5067  */
5068 int bdrv_is_inserted(BlockDriverState *bs)
5069 {
5070     BlockDriver *drv = bs->drv;
5071 
5072     if (!drv)
5073         return 0;
5074     if (!drv->bdrv_is_inserted)
5075         return 1;
5076     return drv->bdrv_is_inserted(bs);
5077 }
5078 
5079 /**
5080  * Return whether the media changed since the last call to this
5081  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5082  */
5083 int bdrv_media_changed(BlockDriverState *bs)
5084 {
5085     BlockDriver *drv = bs->drv;
5086 
5087     if (drv && drv->bdrv_media_changed) {
5088         return drv->bdrv_media_changed(bs);
5089     }
5090     return -ENOTSUP;
5091 }
5092 
5093 /**
5094  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5095  */
5096 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5097 {
5098     BlockDriver *drv = bs->drv;
5099 
5100     if (drv && drv->bdrv_eject) {
5101         drv->bdrv_eject(bs, eject_flag);
5102     }
5103 
5104     if (bs->device_name[0] != '\0') {
5105         bdrv_emit_qmp_eject_event(bs, eject_flag);
5106     }
5107 }
5108 
5109 /**
5110  * Lock or unlock the media (if it is locked, the user won't be able
5111  * to eject it manually).
5112  */
5113 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5114 {
5115     BlockDriver *drv = bs->drv;
5116 
5117     trace_bdrv_lock_medium(bs, locked);
5118 
5119     if (drv && drv->bdrv_lock_medium) {
5120         drv->bdrv_lock_medium(bs, locked);
5121     }
5122 }
5123 
5124 /* needed for generic scsi interface */
5125 
5126 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5127 {
5128     BlockDriver *drv = bs->drv;
5129 
5130     if (drv && drv->bdrv_ioctl)
5131         return drv->bdrv_ioctl(bs, req, buf);
5132     return -ENOTSUP;
5133 }
5134 
5135 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5136         unsigned long int req, void *buf,
5137         BlockDriverCompletionFunc *cb, void *opaque)
5138 {
5139     BlockDriver *drv = bs->drv;
5140 
5141     if (drv && drv->bdrv_aio_ioctl)
5142         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5143     return NULL;
5144 }
5145 
5146 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5147 {
5148     bs->guest_block_size = align;
5149 }
5150 
5151 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5152 {
5153     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5154 }
5155 
5156 /*
5157  * Check if all memory in this vector is sector aligned.
5158  */
5159 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5160 {
5161     int i;
5162     size_t alignment = bdrv_opt_mem_align(bs);
5163 
5164     for (i = 0; i < qiov->niov; i++) {
5165         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5166             return false;
5167         }
5168         if (qiov->iov[i].iov_len % alignment) {
5169             return false;
5170         }
5171     }
5172 
5173     return true;
5174 }
5175 
5176 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5177                                           Error **errp)
5178 {
5179     int64_t bitmap_size;
5180     BdrvDirtyBitmap *bitmap;
5181 
5182     assert((granularity & (granularity - 1)) == 0);
5183 
5184     granularity >>= BDRV_SECTOR_BITS;
5185     assert(granularity);
5186     bitmap_size = bdrv_getlength(bs);
5187     if (bitmap_size < 0) {
5188         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5189         errno = -bitmap_size;
5190         return NULL;
5191     }
5192     bitmap_size >>= BDRV_SECTOR_BITS;
5193     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5194     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5195     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5196     return bitmap;
5197 }
5198 
5199 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5200 {
5201     BdrvDirtyBitmap *bm, *next;
5202     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5203         if (bm == bitmap) {
5204             QLIST_REMOVE(bitmap, list);
5205             hbitmap_free(bitmap->bitmap);
5206             g_free(bitmap);
5207             return;
5208         }
5209     }
5210 }
5211 
5212 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5213 {
5214     BdrvDirtyBitmap *bm;
5215     BlockDirtyInfoList *list = NULL;
5216     BlockDirtyInfoList **plist = &list;
5217 
5218     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5219         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5220         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5221         info->count = bdrv_get_dirty_count(bs, bm);
5222         info->granularity =
5223             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5224         entry->value = info;
5225         *plist = entry;
5226         plist = &entry->next;
5227     }
5228 
5229     return list;
5230 }
5231 
5232 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5233 {
5234     if (bitmap) {
5235         return hbitmap_get(bitmap->bitmap, sector);
5236     } else {
5237         return 0;
5238     }
5239 }
5240 
5241 void bdrv_dirty_iter_init(BlockDriverState *bs,
5242                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5243 {
5244     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5245 }
5246 
5247 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5248                     int nr_sectors)
5249 {
5250     BdrvDirtyBitmap *bitmap;
5251     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5252         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5253     }
5254 }
5255 
5256 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5257 {
5258     BdrvDirtyBitmap *bitmap;
5259     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5260         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5261     }
5262 }
5263 
5264 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5265 {
5266     return hbitmap_count(bitmap->bitmap);
5267 }
5268 
5269 /* Get a reference to bs */
5270 void bdrv_ref(BlockDriverState *bs)
5271 {
5272     bs->refcnt++;
5273 }
5274 
5275 /* Release a previously grabbed reference to bs.
5276  * If after releasing, reference count is zero, the BlockDriverState is
5277  * deleted. */
5278 void bdrv_unref(BlockDriverState *bs)
5279 {
5280     assert(bs->refcnt > 0);
5281     if (--bs->refcnt == 0) {
5282         bdrv_delete(bs);
5283     }
5284 }
5285 
5286 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5287 {
5288     assert(bs->in_use != in_use);
5289     bs->in_use = in_use;
5290 }
5291 
5292 int bdrv_in_use(BlockDriverState *bs)
5293 {
5294     return bs->in_use;
5295 }
5296 
5297 void bdrv_iostatus_enable(BlockDriverState *bs)
5298 {
5299     bs->iostatus_enabled = true;
5300     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5301 }
5302 
5303 /* The I/O status is only enabled if the drive explicitly
5304  * enables it _and_ the VM is configured to stop on errors */
5305 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5306 {
5307     return (bs->iostatus_enabled &&
5308            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5309             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5310             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5311 }
5312 
5313 void bdrv_iostatus_disable(BlockDriverState *bs)
5314 {
5315     bs->iostatus_enabled = false;
5316 }
5317 
5318 void bdrv_iostatus_reset(BlockDriverState *bs)
5319 {
5320     if (bdrv_iostatus_is_enabled(bs)) {
5321         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5322         if (bs->job) {
5323             block_job_iostatus_reset(bs->job);
5324         }
5325     }
5326 }
5327 
5328 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5329 {
5330     assert(bdrv_iostatus_is_enabled(bs));
5331     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5332         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5333                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5334     }
5335 }
5336 
5337 void
5338 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5339         enum BlockAcctType type)
5340 {
5341     assert(type < BDRV_MAX_IOTYPE);
5342 
5343     cookie->bytes = bytes;
5344     cookie->start_time_ns = get_clock();
5345     cookie->type = type;
5346 }
5347 
5348 void
5349 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5350 {
5351     assert(cookie->type < BDRV_MAX_IOTYPE);
5352 
5353     bs->nr_bytes[cookie->type] += cookie->bytes;
5354     bs->nr_ops[cookie->type]++;
5355     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5356 }
5357 
5358 void bdrv_img_create(const char *filename, const char *fmt,
5359                      const char *base_filename, const char *base_fmt,
5360                      char *options, uint64_t img_size, int flags,
5361                      Error **errp, bool quiet)
5362 {
5363     QEMUOptionParameter *param = NULL, *create_options = NULL;
5364     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5365     BlockDriver *drv, *proto_drv;
5366     BlockDriver *backing_drv = NULL;
5367     Error *local_err = NULL;
5368     int ret = 0;
5369 
5370     /* Find driver and parse its options */
5371     drv = bdrv_find_format(fmt);
5372     if (!drv) {
5373         error_setg(errp, "Unknown file format '%s'", fmt);
5374         return;
5375     }
5376 
5377     proto_drv = bdrv_find_protocol(filename, true);
5378     if (!proto_drv) {
5379         error_setg(errp, "Unknown protocol '%s'", filename);
5380         return;
5381     }
5382 
5383     create_options = append_option_parameters(create_options,
5384                                               drv->create_options);
5385     create_options = append_option_parameters(create_options,
5386                                               proto_drv->create_options);
5387 
5388     /* Create parameter list with default values */
5389     param = parse_option_parameters("", create_options, param);
5390 
5391     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5392 
5393     /* Parse -o options */
5394     if (options) {
5395         param = parse_option_parameters(options, create_options, param);
5396         if (param == NULL) {
5397             error_setg(errp, "Invalid options for file format '%s'.", fmt);
5398             goto out;
5399         }
5400     }
5401 
5402     if (base_filename) {
5403         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5404                                  base_filename)) {
5405             error_setg(errp, "Backing file not supported for file format '%s'",
5406                        fmt);
5407             goto out;
5408         }
5409     }
5410 
5411     if (base_fmt) {
5412         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5413             error_setg(errp, "Backing file format not supported for file "
5414                              "format '%s'", fmt);
5415             goto out;
5416         }
5417     }
5418 
5419     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5420     if (backing_file && backing_file->value.s) {
5421         if (!strcmp(filename, backing_file->value.s)) {
5422             error_setg(errp, "Error: Trying to create an image with the "
5423                              "same filename as the backing file");
5424             goto out;
5425         }
5426     }
5427 
5428     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5429     if (backing_fmt && backing_fmt->value.s) {
5430         backing_drv = bdrv_find_format(backing_fmt->value.s);
5431         if (!backing_drv) {
5432             error_setg(errp, "Unknown backing file format '%s'",
5433                        backing_fmt->value.s);
5434             goto out;
5435         }
5436     }
5437 
5438     // The size for the image must always be specified, with one exception:
5439     // If we are using a backing file, we can obtain the size from there
5440     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5441     if (size && size->value.n == -1) {
5442         if (backing_file && backing_file->value.s) {
5443             BlockDriverState *bs;
5444             uint64_t size;
5445             char buf[32];
5446             int back_flags;
5447 
5448             /* backing files always opened read-only */
5449             back_flags =
5450                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5451 
5452             bs = NULL;
5453             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5454                             backing_drv, &local_err);
5455             if (ret < 0) {
5456                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5457                                  backing_file->value.s,
5458                                  error_get_pretty(local_err));
5459                 error_free(local_err);
5460                 local_err = NULL;
5461                 goto out;
5462             }
5463             bdrv_get_geometry(bs, &size);
5464             size *= 512;
5465 
5466             snprintf(buf, sizeof(buf), "%" PRId64, size);
5467             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5468 
5469             bdrv_unref(bs);
5470         } else {
5471             error_setg(errp, "Image creation needs a size parameter");
5472             goto out;
5473         }
5474     }
5475 
5476     if (!quiet) {
5477         printf("Formatting '%s', fmt=%s ", filename, fmt);
5478         print_option_parameters(param);
5479         puts("");
5480     }
5481     ret = bdrv_create(drv, filename, param, &local_err);
5482     if (ret == -EFBIG) {
5483         /* This is generally a better message than whatever the driver would
5484          * deliver (especially because of the cluster_size_hint), since that
5485          * is most probably not much different from "image too large". */
5486         const char *cluster_size_hint = "";
5487         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5488             cluster_size_hint = " (try using a larger cluster size)";
5489         }
5490         error_setg(errp, "The image size is too large for file format '%s'"
5491                    "%s", fmt, cluster_size_hint);
5492         error_free(local_err);
5493         local_err = NULL;
5494     }
5495 
5496 out:
5497     free_option_parameters(create_options);
5498     free_option_parameters(param);
5499 
5500     if (local_err) {
5501         error_propagate(errp, local_err);
5502     }
5503 }
5504 
5505 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5506 {
5507     /* Currently BlockDriverState always uses the main loop AioContext */
5508     return qemu_get_aio_context();
5509 }
5510 
5511 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5512                                     NotifierWithReturn *notifier)
5513 {
5514     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5515 }
5516 
5517 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5518 {
5519     if (bs->drv->bdrv_amend_options == NULL) {
5520         return -ENOTSUP;
5521     }
5522     return bs->drv->bdrv_amend_options(bs, options);
5523 }
5524 
5525 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5526  * of block filter and by bdrv_is_first_non_filter.
5527  * It is used to test if the given bs is the candidate or recurse more in the
5528  * node graph.
5529  */
5530 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5531                                       BlockDriverState *candidate)
5532 {
5533     /* return false if basic checks fails */
5534     if (!bs || !bs->drv) {
5535         return false;
5536     }
5537 
5538     /* the code reached a non block filter driver -> check if the bs is
5539      * the same as the candidate. It's the recursion termination condition.
5540      */
5541     if (!bs->drv->is_filter) {
5542         return bs == candidate;
5543     }
5544     /* Down this path the driver is a block filter driver */
5545 
5546     /* If the block filter recursion method is defined use it to recurse down
5547      * the node graph.
5548      */
5549     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5550         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5551     }
5552 
5553     /* the driver is a block filter but don't allow to recurse -> return false
5554      */
5555     return false;
5556 }
5557 
5558 /* This function checks if the candidate is the first non filter bs down it's
5559  * bs chain. Since we don't have pointers to parents it explore all bs chains
5560  * from the top. Some filters can choose not to pass down the recursion.
5561  */
5562 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5563 {
5564     BlockDriverState *bs;
5565 
5566     /* walk down the bs forest recursively */
5567     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5568         bool perm;
5569 
5570         /* try to recurse in this top level bs */
5571         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5572 
5573         /* candidate is the first non filter */
5574         if (perm) {
5575             return true;
5576         }
5577     }
5578 
5579     return false;
5580 }
5581