xref: /openbmc/qemu/block.c (revision b8bcf811)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 
39 #ifdef CONFIG_BSD
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/queue.h>
44 #ifndef __DragonFly__
45 #include <sys/disk.h>
46 #endif
47 #endif
48 
49 #ifdef _WIN32
50 #include <windows.h>
51 #endif
52 
/*
 * One dirty bitmap record.  Records are chained through 'list'; bdrv_new()
 * initialises a per-BlockDriverState list head (bs->dirty_bitmaps) —
 * presumably these records are linked there; confirm against the users
 * further down the file.
 */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;                    /* the underlying HBitmap */
    QLIST_ENTRY(BdrvDirtyBitmap) list;  /* QLIST linkage between records */
};
57 
/*
 * Sentinel stored in a result field while a synchronous wrapper is still
 * waiting for its coroutine to finish (see bdrv_create(), which polls until
 * the field no longer holds this value).
 */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/*
 * Forward declarations of file-local helpers that are referenced before
 * their definitions appear later in this file.
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
/* AIO emulation entry points, installed by bdrv_register() when a driver
 * provides neither coroutine nor AIO read/write callbacks. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
/* Coroutine emulation entry points, installed by bdrv_register() when a
 * driver lacks bdrv_co_readv. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
/* Byte-granularity vectored read/write helpers (offset and length in bytes). */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90 
/* BlockDriverStates created with a non-empty device name; bdrv_new() appends
 * to this list via the 'device_list' link. */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* BlockDriverStates that were given a node name; bdrv_assign_node_name()
 * appends to this list via the 'node_list' link. */
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

/* Every block driver registered through bdrv_register() (newest first,
 * because registration inserts at the head). */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
102 
103 #ifdef _WIN32
/*
 * Return 1 when filename starts with an ASCII letter immediately followed by
 * a colon (e.g. "c:" or "D:\dir"), otherwise 0.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char first = filename[0];
    const int is_letter = (first >= 'a' && first <= 'z') ||
                          (first >= 'A' && first <= 'Z');

    /* Only look at the second character once the first one qualified, so an
     * empty string is never read past its terminator. */
    if (!is_letter) {
        return 0;
    }
    return filename[1] == ':';
}
110 
/*
 * Return 1 when filename is either a bare drive specifier ("x:" and nothing
 * else) or starts with one of the device prefixes "\\.\" or "//./";
 * otherwise return 0.
 */
int is_windows_drive(const char *filename)
{
    /* Exactly "<letter>:" with no trailing path component. */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
121 #endif
122 
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125                         ThrottleConfig *cfg)
126 {
127     int i;
128 
129     throttle_config(&bs->throttle_state, cfg);
130 
131     for (i = 0; i < 2; i++) {
132         qemu_co_enter_next(&bs->throttled_reqs[i]);
133     }
134 }
135 
136 /* this function drain all the throttled IOs */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138 {
139     bool drained = false;
140     bool enabled = bs->io_limits_enabled;
141     int i;
142 
143     bs->io_limits_enabled = false;
144 
145     for (i = 0; i < 2; i++) {
146         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147             drained = true;
148         }
149     }
150 
151     bs->io_limits_enabled = enabled;
152 
153     return drained;
154 }
155 
156 void bdrv_io_limits_disable(BlockDriverState *bs)
157 {
158     bs->io_limits_enabled = false;
159 
160     bdrv_start_throttled_reqs(bs);
161 
162     throttle_destroy(&bs->throttle_state);
163 }
164 
165 static void bdrv_throttle_read_timer_cb(void *opaque)
166 {
167     BlockDriverState *bs = opaque;
168     qemu_co_enter_next(&bs->throttled_reqs[0]);
169 }
170 
171 static void bdrv_throttle_write_timer_cb(void *opaque)
172 {
173     BlockDriverState *bs = opaque;
174     qemu_co_enter_next(&bs->throttled_reqs[1]);
175 }
176 
177 /* should be called before bdrv_set_io_limits if a limit is set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
179 {
180     assert(!bs->io_limits_enabled);
181     throttle_init(&bs->throttle_state,
182                   QEMU_CLOCK_VIRTUAL,
183                   bdrv_throttle_read_timer_cb,
184                   bdrv_throttle_write_timer_cb,
185                   bs);
186     bs->io_limits_enabled = true;
187 }
188 
189 /* This function makes an IO wait if needed
190  *
191  * @nb_sectors: the number of sectors of the IO
192  * @is_write:   is the IO a write
193  */
194 static void bdrv_io_limits_intercept(BlockDriverState *bs,
195                                      unsigned int bytes,
196                                      bool is_write)
197 {
198     /* does this io must wait */
199     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
200 
201     /* if must wait or any request of this type throttled queue the IO */
202     if (must_wait ||
203         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
205     }
206 
207     /* the IO will be executed, do the accounting */
208     throttle_account(&bs->throttle_state, is_write, bytes);
209 
210 
211     /* if the next request must wait -> do nothing */
212     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213         return;
214     }
215 
216     /* else queue next request for execution */
217     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
218 }
219 
220 size_t bdrv_opt_mem_align(BlockDriverState *bs)
221 {
222     if (!bs || !bs->drv) {
223         /* 4k should be on the safe side */
224         return 4096;
225     }
226 
227     return bs->bl.opt_mem_alignment;
228 }
229 
/*
 * Return 1 if path starts with "<protocol>:", i.e. the first of the
 * characters ':' and '/' (plus '\' on Windows) found in path is a ':'.
 * On Windows, drive specifiers are never treated as a protocol.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* strcspn() yields the index of the first stop character, or of the
     * terminating NUL when none is present. */
    return path[strcspn(path, stop_chars)] == ':';
}
247 
/*
 * Return 1 if path is absolute: it begins with '/', or on Windows it also
 * begins with '\' or is a drive/device specifier (e.g. "\\.\d:").
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (path[0] == '\\') {
        return 1;
    }
#endif
    return path[0] == '/';
}
260 
/*
 * Build in dest (capacity dest_size bytes, always NUL-terminated when
 * dest_size > 0) the path of filename interpreted relative to base_path.
 *
 * An absolute filename is copied unchanged.  Otherwise the directory part of
 * base_path is kept -- everything up to and including the last '/' (or '\'
 * on Windows), or up to and including the first ':' if that comes later, so
 * URL-style bases are supported -- and filename is appended to it.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Position just past the first ':' of the base, if it has one. */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* Position just past the last directory separator, if there is one. */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* Keep whichever prefix is longer, clamped to the destination size. */
    prefix_end = (after_sep > after_colon) ? after_sep : after_colon;
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }

    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
304 
305 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
306 {
307     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
308         pstrcpy(dest, sz, bs->backing_file);
309     } else {
310         path_combine(dest, sz, bs->filename, bs->backing_file);
311     }
312 }
313 
314 void bdrv_register(BlockDriver *bdrv)
315 {
316     /* Block drivers without coroutine functions need emulation */
317     if (!bdrv->bdrv_co_readv) {
318         bdrv->bdrv_co_readv = bdrv_co_readv_em;
319         bdrv->bdrv_co_writev = bdrv_co_writev_em;
320 
321         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
322          * the block driver lacks aio we need to emulate that too.
323          */
324         if (!bdrv->bdrv_aio_readv) {
325             /* add AIO emulation layer */
326             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
327             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
328         }
329     }
330 
331     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
332 }
333 
334 /* create a new block device (by default it is empty) */
335 BlockDriverState *bdrv_new(const char *device_name)
336 {
337     BlockDriverState *bs;
338 
339     bs = g_malloc0(sizeof(BlockDriverState));
340     QLIST_INIT(&bs->dirty_bitmaps);
341     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
342     if (device_name[0] != '\0') {
343         QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
344     }
345     bdrv_iostatus_disable(bs);
346     notifier_list_init(&bs->close_notifiers);
347     notifier_with_return_list_init(&bs->before_write_notifiers);
348     qemu_co_queue_init(&bs->throttled_reqs[0]);
349     qemu_co_queue_init(&bs->throttled_reqs[1]);
350     bs->refcnt = 1;
351 
352     return bs;
353 }
354 
355 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
356 {
357     notifier_list_add(&bs->close_notifiers, notify);
358 }
359 
360 BlockDriver *bdrv_find_format(const char *format_name)
361 {
362     BlockDriver *drv1;
363     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
364         if (!strcmp(drv1->format_name, format_name)) {
365             return drv1;
366         }
367     }
368     return NULL;
369 }
370 
371 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
372 {
373     static const char *whitelist_rw[] = {
374         CONFIG_BDRV_RW_WHITELIST
375     };
376     static const char *whitelist_ro[] = {
377         CONFIG_BDRV_RO_WHITELIST
378     };
379     const char **p;
380 
381     if (!whitelist_rw[0] && !whitelist_ro[0]) {
382         return 1;               /* no whitelist, anything goes */
383     }
384 
385     for (p = whitelist_rw; *p; p++) {
386         if (!strcmp(drv->format_name, *p)) {
387             return 1;
388         }
389     }
390     if (read_only) {
391         for (p = whitelist_ro; *p; p++) {
392             if (!strcmp(drv->format_name, *p)) {
393                 return 1;
394             }
395         }
396     }
397     return 0;
398 }
399 
400 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
401                                           bool read_only)
402 {
403     BlockDriver *drv = bdrv_find_format(format_name);
404     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
405 }
406 
/*
 * Argument/result bundle handed from bdrv_create() to the coroutine entry
 * point bdrv_create_co_entry().
 */
typedef struct CreateCo {
    BlockDriver *drv;               /* driver whose bdrv_create is called */
    char *filename;                 /* g_strdup()ed copy, freed by bdrv_create() */
    QEMUOptionParameter *options;   /* creation options, passed through */
    int ret;                        /* NOT_DONE until the entry point stores the result */
    Error *err;                     /* error reported by the driver, if any */
} CreateCo;
414 
415 static void coroutine_fn bdrv_create_co_entry(void *opaque)
416 {
417     Error *local_err = NULL;
418     int ret;
419 
420     CreateCo *cco = opaque;
421     assert(cco->drv);
422 
423     ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
424     if (error_is_set(&local_err)) {
425         error_propagate(&cco->err, local_err);
426     }
427     cco->ret = ret;
428 }
429 
430 int bdrv_create(BlockDriver *drv, const char* filename,
431     QEMUOptionParameter *options, Error **errp)
432 {
433     int ret;
434 
435     Coroutine *co;
436     CreateCo cco = {
437         .drv = drv,
438         .filename = g_strdup(filename),
439         .options = options,
440         .ret = NOT_DONE,
441         .err = NULL,
442     };
443 
444     if (!drv->bdrv_create) {
445         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
446         ret = -ENOTSUP;
447         goto out;
448     }
449 
450     if (qemu_in_coroutine()) {
451         /* Fast-path if already in coroutine context */
452         bdrv_create_co_entry(&cco);
453     } else {
454         co = qemu_coroutine_create(bdrv_create_co_entry);
455         qemu_coroutine_enter(co, &cco);
456         while (cco.ret == NOT_DONE) {
457             qemu_aio_wait();
458         }
459     }
460 
461     ret = cco.ret;
462     if (ret < 0) {
463         if (error_is_set(&cco.err)) {
464             error_propagate(errp, cco.err);
465         } else {
466             error_setg_errno(errp, -ret, "Could not create image");
467         }
468     }
469 
470 out:
471     g_free(cco.filename);
472     return ret;
473 }
474 
475 int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
476                      Error **errp)
477 {
478     BlockDriver *drv;
479     Error *local_err = NULL;
480     int ret;
481 
482     drv = bdrv_find_protocol(filename, true);
483     if (drv == NULL) {
484         error_setg(errp, "Could not find protocol for file '%s'", filename);
485         return -ENOENT;
486     }
487 
488     ret = bdrv_create(drv, filename, options, &local_err);
489     if (error_is_set(&local_err)) {
490         error_propagate(errp, local_err);
491     }
492     return ret;
493 }
494 
495 int bdrv_refresh_limits(BlockDriverState *bs)
496 {
497     BlockDriver *drv = bs->drv;
498 
499     memset(&bs->bl, 0, sizeof(bs->bl));
500 
501     if (!drv) {
502         return 0;
503     }
504 
505     /* Take some limits from the children as a default */
506     if (bs->file) {
507         bdrv_refresh_limits(bs->file);
508         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
509         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
510     } else {
511         bs->bl.opt_mem_alignment = 512;
512     }
513 
514     if (bs->backing_hd) {
515         bdrv_refresh_limits(bs->backing_hd);
516         bs->bl.opt_transfer_length =
517             MAX(bs->bl.opt_transfer_length,
518                 bs->backing_hd->bl.opt_transfer_length);
519         bs->bl.opt_mem_alignment =
520             MAX(bs->bl.opt_mem_alignment,
521                 bs->backing_hd->bl.opt_mem_alignment);
522     }
523 
524     /* Then let the driver override it */
525     if (drv->bdrv_refresh_limits) {
526         return drv->bdrv_refresh_limits(bs);
527     }
528 
529     return 0;
530 }
531 
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: output buffer that receives the name of the created file
 * @size:     capacity of @filename in bytes (on Windows it must be at least
 *            MAX_PATH, which is asserted)
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /tmp when TMPDIR is
 * not set, with the name pattern "vl.XXXXXX".
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW when
 * the name does not fit into @filename).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int saved_errno;
    const char *tmpdir;

    /* A non-positive size can never hold the name; rejecting it here also
     * keeps a negative value from reaching snprintf() as a huge size_t. */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* Remember why close() failed: unlink() may overwrite errno. */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
566 
567 /*
568  * Detect host devices. By convention, /dev/cdrom[N] is always
569  * recognized as a host CDROM.
570  */
571 static BlockDriver *find_hdev_driver(const char *filename)
572 {
573     int score_max = 0, score;
574     BlockDriver *drv = NULL, *d;
575 
576     QLIST_FOREACH(d, &bdrv_drivers, list) {
577         if (d->bdrv_probe_device) {
578             score = d->bdrv_probe_device(filename);
579             if (score > score_max) {
580                 score_max = score;
581                 drv = d;
582             }
583         }
584     }
585 
586     return drv;
587 }
588 
589 BlockDriver *bdrv_find_protocol(const char *filename,
590                                 bool allow_protocol_prefix)
591 {
592     BlockDriver *drv1;
593     char protocol[128];
594     int len;
595     const char *p;
596 
597     /* TODO Drivers without bdrv_file_open must be specified explicitly */
598 
599     /*
600      * XXX(hch): we really should not let host device detection
601      * override an explicit protocol specification, but moving this
602      * later breaks access to device names with colons in them.
603      * Thanks to the brain-dead persistent naming schemes on udev-
604      * based Linux systems those actually are quite common.
605      */
606     drv1 = find_hdev_driver(filename);
607     if (drv1) {
608         return drv1;
609     }
610 
611     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
612         return bdrv_find_format("file");
613     }
614 
615     p = strchr(filename, ':');
616     assert(p != NULL);
617     len = p - filename;
618     if (len > sizeof(protocol) - 1)
619         len = sizeof(protocol) - 1;
620     memcpy(protocol, filename, len);
621     protocol[len] = '\0';
622     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
623         if (drv1->protocol_name &&
624             !strcmp(drv1->protocol_name, protocol)) {
625             return drv1;
626         }
627     }
628     return NULL;
629 }
630 
631 static int find_image_format(BlockDriverState *bs, const char *filename,
632                              BlockDriver **pdrv, Error **errp)
633 {
634     int score, score_max;
635     BlockDriver *drv1, *drv;
636     uint8_t buf[2048];
637     int ret = 0;
638 
639     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
640     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
641         drv = bdrv_find_format("raw");
642         if (!drv) {
643             error_setg(errp, "Could not find raw image format");
644             ret = -ENOENT;
645         }
646         *pdrv = drv;
647         return ret;
648     }
649 
650     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
651     if (ret < 0) {
652         error_setg_errno(errp, -ret, "Could not read image for determining its "
653                          "format");
654         *pdrv = NULL;
655         return ret;
656     }
657 
658     score_max = 0;
659     drv = NULL;
660     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
661         if (drv1->bdrv_probe) {
662             score = drv1->bdrv_probe(buf, ret, filename);
663             if (score > score_max) {
664                 score_max = score;
665                 drv = drv1;
666             }
667         }
668     }
669     if (!drv) {
670         error_setg(errp, "Could not determine image format: No compatible "
671                    "driver found");
672         ret = -ENOENT;
673     }
674     *pdrv = drv;
675     return ret;
676 }
677 
678 /**
679  * Set the current 'total_sectors' value
680  */
681 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
682 {
683     BlockDriver *drv = bs->drv;
684 
685     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
686     if (bs->sg)
687         return 0;
688 
689     /* query actual device if possible, otherwise just trust the hint */
690     if (drv->bdrv_getlength) {
691         int64_t length = drv->bdrv_getlength(bs);
692         if (length < 0) {
693             return length;
694         }
695         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
696     }
697 
698     bs->total_sectors = hint;
699     return 0;
700 }
701 
702 /**
703  * Set open flags for a given discard mode
704  *
705  * Return 0 on success, -1 if the discard mode was invalid.
706  */
707 int bdrv_parse_discard_flags(const char *mode, int *flags)
708 {
709     *flags &= ~BDRV_O_UNMAP;
710 
711     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
712         /* do nothing */
713     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
714         *flags |= BDRV_O_UNMAP;
715     } else {
716         return -1;
717     }
718 
719     return 0;
720 }
721 
722 /**
723  * Set open flags for a given cache mode
724  *
725  * Return 0 on success, -1 if the cache mode was invalid.
726  */
727 int bdrv_parse_cache_flags(const char *mode, int *flags)
728 {
729     *flags &= ~BDRV_O_CACHE_MASK;
730 
731     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
732         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
733     } else if (!strcmp(mode, "directsync")) {
734         *flags |= BDRV_O_NOCACHE;
735     } else if (!strcmp(mode, "writeback")) {
736         *flags |= BDRV_O_CACHE_WB;
737     } else if (!strcmp(mode, "unsafe")) {
738         *flags |= BDRV_O_CACHE_WB;
739         *flags |= BDRV_O_NO_FLUSH;
740     } else if (!strcmp(mode, "writethrough")) {
741         /* this is the default */
742     } else {
743         return -1;
744     }
745 
746     return 0;
747 }
748 
749 /**
750  * The copy-on-read flag is actually a reference count so multiple users may
751  * use the feature without worrying about clobbering its previous state.
752  * Copy-on-read stays enabled until all users have called to disable it.
753  */
754 void bdrv_enable_copy_on_read(BlockDriverState *bs)
755 {
756     bs->copy_on_read++;
757 }
758 
759 void bdrv_disable_copy_on_read(BlockDriverState *bs)
760 {
761     assert(bs->copy_on_read > 0);
762     bs->copy_on_read--;
763 }
764 
765 static int bdrv_open_flags(BlockDriverState *bs, int flags)
766 {
767     int open_flags = flags | BDRV_O_CACHE_WB;
768 
769     /*
770      * Clear flags that are internal to the block layer before opening the
771      * image.
772      */
773     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
774 
775     /*
776      * Snapshots should be writable.
777      */
778     if (bs->is_temporary) {
779         open_flags |= BDRV_O_RDWR;
780     }
781 
782     return open_flags;
783 }
784 
785 static int bdrv_assign_node_name(BlockDriverState *bs,
786                                  const char *node_name,
787                                  Error **errp)
788 {
789     if (!node_name) {
790         return 0;
791     }
792 
793     /* empty string node name is invalid */
794     if (node_name[0] == '\0') {
795         error_setg(errp, "Empty node name");
796         return -EINVAL;
797     }
798 
799     /* takes care of avoiding duplicates node names */
800     if (bdrv_find_node(node_name)) {
801         error_setg(errp, "Duplicate node name");
802         return -EINVAL;
803     }
804 
805     /* copy node name into the bs and insert it into the graph list */
806     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
807     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
808 
809     return 0;
810 }
811 
/*
 * Common part for opening disk images and files
 *
 * @bs:      the state being opened; must not have a file attached yet
 * @file:    an already opened protocol-level state, or NULL when drv itself
 *           is opened at the protocol level
 * @options: non-NULL option dictionary, distinct from bs->options
 * @flags:   BDRV_O_* flags requested by the caller
 * @drv:     the driver to open bs with (must not be NULL)
 * @errp:    receives the error description on failure
 *
 * Removes all processed options from *options.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    /* The file name comes from the underlying file if there is one,
     * otherwise from the "filename" option (which may be absent). */
    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    /* NOTE(review): a node name assigned here is not undone by the failure
     * paths below — presumably the caller's teardown removes it; confirm. */
    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() with directly using a protocol as drv. This layer is already
     * opened, so assign it to bs (while file becomes a closed BlockDriverState)
     * and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    /* Defaults that are in place before the driver's open callback runs. */
    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* The message distinguishes a driver that is whitelisted for read-only
     * use only from one that is not whitelisted at all. */
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    /* From here on failures must go through free_and_fail, which releases
     * the driver state allocated below. */
    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        /* A driver without bdrv_file_open needs an underlying file. */
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        /* Prefer the driver's own error; otherwise build a generic one from
         * the errno value it returned. */
        if (error_is_set(&local_err)) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    /* The return value of bdrv_refresh_limits() is not checked here; only
     * the resulting alignments are asserted to be non-zero. */
    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    /* A temporary image is unlinked as soon as it has been opened. */
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    /* Detach the file (without closing it here) and drop the driver state. */
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
947 
948 /*
949  * Opens a file using a protocol (file, host_device, nbd, ...)
950  *
951  * options is a QDict of options to pass to the block drivers, or NULL for an
952  * empty set of options. The reference to the QDict belongs to the block layer
953  * after the call (even on failure), so if the caller intends to reuse the
954  * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
955  */
956 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
957                    const char *reference, QDict *options, int flags,
958                    Error **errp)
959 {
960     BlockDriverState *bs = NULL;
961     BlockDriver *drv;
962     const char *drvname;
963     bool allow_protocol_prefix = false;
964     Error *local_err = NULL;
965     int ret;
966 
967     /* NULL means an empty set of options */
968     if (options == NULL) {
969         options = qdict_new();
970     }
971 
972     if (reference) {
973         if (filename || qdict_size(options)) {
974             error_setg(errp, "Cannot reference an existing block device with "
975                        "additional options or a new filename");
976             return -EINVAL;
977         }
978         QDECREF(options);
979 
980         bs = bdrv_find(reference);
981         if (!bs) {
982             error_setg(errp, "Cannot find block device '%s'", reference);
983             return -ENODEV;
984         }
985         bdrv_ref(bs);
986         *pbs = bs;
987         return 0;
988     }
989 
990     bs = bdrv_new("");
991     bs->options = options;
992     options = qdict_clone_shallow(options);
993 
994     /* Fetch the file name from the options QDict if necessary */
995     if (!filename) {
996         filename = qdict_get_try_str(options, "filename");
997     } else if (filename && !qdict_haskey(options, "filename")) {
998         qdict_put(options, "filename", qstring_from_str(filename));
999         allow_protocol_prefix = true;
1000     } else {
1001         error_setg(errp, "Can't specify 'file' and 'filename' options at the "
1002                    "same time");
1003         ret = -EINVAL;
1004         goto fail;
1005     }
1006 
1007     /* Find the right block driver */
1008     drvname = qdict_get_try_str(options, "driver");
1009     if (drvname) {
1010         drv = bdrv_find_format(drvname);
1011         if (!drv) {
1012             error_setg(errp, "Unknown driver '%s'", drvname);
1013         }
1014         qdict_del(options, "driver");
1015     } else if (filename) {
1016         drv = bdrv_find_protocol(filename, allow_protocol_prefix);
1017         if (!drv) {
1018             error_setg(errp, "Unknown protocol");
1019         }
1020     } else {
1021         error_setg(errp, "Must specify either driver or file");
1022         drv = NULL;
1023     }
1024 
1025     if (!drv) {
1026         /* errp has been set already */
1027         ret = -ENOENT;
1028         goto fail;
1029     }
1030 
1031     /* Parse the filename and open it */
1032     if (drv->bdrv_parse_filename && filename) {
1033         drv->bdrv_parse_filename(filename, options, &local_err);
1034         if (error_is_set(&local_err)) {
1035             error_propagate(errp, local_err);
1036             ret = -EINVAL;
1037             goto fail;
1038         }
1039         qdict_del(options, "filename");
1040     }
1041 
1042     if (!drv->bdrv_file_open) {
1043         ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
1044         options = NULL;
1045     } else {
1046         ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
1047     }
1048     if (ret < 0) {
1049         error_propagate(errp, local_err);
1050         goto fail;
1051     }
1052 
1053     /* Check if any unknown options were used */
1054     if (options && (qdict_size(options) != 0)) {
1055         const QDictEntry *entry = qdict_first(options);
1056         error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
1057                    drv->format_name, entry->key);
1058         ret = -EINVAL;
1059         goto fail;
1060     }
1061     QDECREF(options);
1062 
1063     bs->growable = 1;
1064     *pbs = bs;
1065     return 0;
1066 
1067 fail:
1068     QDECREF(options);
1069     if (!bs->drv) {
1070         QDECREF(bs->options);
1071     }
1072     bdrv_unref(bs);
1073     return ret;
1074 }
1075 
1076 /*
1077  * Opens the backing file for a BlockDriverState if not yet open
1078  *
1079  * options is a QDict of options to pass to the block drivers, or NULL for an
1080  * empty set of options. The reference to the QDict is transferred to this
1081  * function (even on failure), so if the caller intends to reuse the dictionary,
1082  * it needs to use QINCREF() before calling bdrv_file_open.
1083  */
1084 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1085 {
1086     char backing_filename[PATH_MAX];
1087     int back_flags, ret;
1088     BlockDriver *back_drv = NULL;
1089     Error *local_err = NULL;
1090 
1091     if (bs->backing_hd != NULL) {
1092         QDECREF(options);
1093         return 0;
1094     }
1095 
1096     /* NULL means an empty set of options */
1097     if (options == NULL) {
1098         options = qdict_new();
1099     }
1100 
1101     bs->open_flags &= ~BDRV_O_NO_BACKING;
1102     if (qdict_haskey(options, "file.filename")) {
1103         backing_filename[0] = '\0';
1104     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1105         QDECREF(options);
1106         return 0;
1107     } else {
1108         bdrv_get_full_backing_filename(bs, backing_filename,
1109                                        sizeof(backing_filename));
1110     }
1111 
1112     bs->backing_hd = bdrv_new("");
1113 
1114     if (bs->backing_format[0] != '\0') {
1115         back_drv = bdrv_find_format(bs->backing_format);
1116     }
1117 
1118     /* backing files always opened read-only */
1119     back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1120                                     BDRV_O_COPY_ON_READ);
1121 
1122     ret = bdrv_open(bs->backing_hd,
1123                     *backing_filename ? backing_filename : NULL, options,
1124                     back_flags, back_drv, &local_err);
1125     if (ret < 0) {
1126         bdrv_unref(bs->backing_hd);
1127         bs->backing_hd = NULL;
1128         bs->open_flags |= BDRV_O_NO_BACKING;
1129         error_setg(errp, "Could not open backing file: %s",
1130                    error_get_pretty(local_err));
1131         error_free(local_err);
1132         return ret;
1133     }
1134 
1135     if (bs->backing_hd->file) {
1136         pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1137                 bs->backing_hd->file->filename);
1138     }
1139 
1140     /* Recalculate the BlockLimits with the backing file */
1141     bdrv_refresh_limits(bs);
1142 
1143     return 0;
1144 }
1145 
1146 /*
1147  * Opens a disk image whose options are given as BlockdevRef in another block
1148  * device's options.
1149  *
1150  * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
1151  * image format auto-detection. If it is false and a filename is given,
1152  * bdrv_open() will be used for auto-detection.
1153  *
1154  * If allow_none is true, no image will be opened if filename is false and no
1155  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1156  *
1157  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1158  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1159  * itself, all options starting with "${bdref_key}." are considered part of the
1160  * BlockdevRef.
1161  *
1162  * The BlockdevRef will be removed from the options QDict.
1163  */
1164 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1165                     QDict *options, const char *bdref_key, int flags,
1166                     bool force_raw, bool allow_none, Error **errp)
1167 {
1168     QDict *image_options;
1169     int ret;
1170     char *bdref_key_dot;
1171     const char *reference;
1172 
1173     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1174     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1175     g_free(bdref_key_dot);
1176 
1177     reference = qdict_get_try_str(options, bdref_key);
1178     if (!filename && !reference && !qdict_size(image_options)) {
1179         if (allow_none) {
1180             ret = 0;
1181         } else {
1182             error_setg(errp, "A block device must be specified for \"%s\"",
1183                        bdref_key);
1184             ret = -EINVAL;
1185         }
1186         goto done;
1187     }
1188 
1189     if (filename && !force_raw) {
1190         /* If a filename is given and the block driver should be detected
1191            automatically (instead of using none), use bdrv_open() in order to do
1192            that auto-detection. */
1193         BlockDriverState *bs;
1194 
1195         if (reference) {
1196             error_setg(errp, "Cannot reference an existing block device while "
1197                        "giving a filename");
1198             ret = -EINVAL;
1199             goto done;
1200         }
1201 
1202         bs = bdrv_new("");
1203         ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
1204         if (ret < 0) {
1205             bdrv_unref(bs);
1206         } else {
1207             *pbs = bs;
1208         }
1209     } else {
1210         ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
1211                              errp);
1212     }
1213 
1214 done:
1215     qdict_del(options, bdref_key);
1216     return ret;
1217 }
1218 
1219 /*
1220  * Opens a disk image (raw, qcow2, vmdk, ...)
1221  *
1222  * options is a QDict of options to pass to the block drivers, or NULL for an
1223  * empty set of options. The reference to the QDict belongs to the block layer
1224  * after the call (even on failure), so if the caller intends to reuse the
1225  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1226  */
1227 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
1228               int flags, BlockDriver *drv, Error **errp)
1229 {
1230     int ret;
1231     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1232     char tmp_filename[PATH_MAX + 1];
1233     BlockDriverState *file = NULL;
1234     const char *drvname;
1235     Error *local_err = NULL;
1236 
1237     /* NULL means an empty set of options */
1238     if (options == NULL) {
1239         options = qdict_new();
1240     }
1241 
1242     bs->options = options;
1243     options = qdict_clone_shallow(options);
1244 
1245     /* For snapshot=on, create a temporary qcow2 overlay */
1246     if (flags & BDRV_O_SNAPSHOT) {
1247         BlockDriverState *bs1;
1248         int64_t total_size;
1249         BlockDriver *bdrv_qcow2;
1250         QEMUOptionParameter *create_options;
1251         QDict *snapshot_options;
1252 
1253         /* if snapshot, we create a temporary backing file and open it
1254            instead of opening 'filename' directly */
1255 
1256         /* Get the required size from the image */
1257         bs1 = bdrv_new("");
1258         QINCREF(options);
1259         ret = bdrv_open(bs1, filename, options, BDRV_O_NO_BACKING,
1260                         drv, &local_err);
1261         if (ret < 0) {
1262             bdrv_unref(bs1);
1263             goto fail;
1264         }
1265         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1266 
1267         bdrv_unref(bs1);
1268 
1269         /* Create the temporary image */
1270         ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1271         if (ret < 0) {
1272             error_setg_errno(errp, -ret, "Could not get temporary filename");
1273             goto fail;
1274         }
1275 
1276         bdrv_qcow2 = bdrv_find_format("qcow2");
1277         create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1278                                                  NULL);
1279 
1280         set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1281 
1282         ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1283         free_option_parameters(create_options);
1284         if (ret < 0) {
1285             error_setg_errno(errp, -ret, "Could not create temporary overlay "
1286                              "'%s': %s", tmp_filename,
1287                              error_get_pretty(local_err));
1288             error_free(local_err);
1289             local_err = NULL;
1290             goto fail;
1291         }
1292 
1293         /* Prepare a new options QDict for the temporary file, where user
1294          * options refer to the backing file */
1295         if (filename) {
1296             qdict_put(options, "file.filename", qstring_from_str(filename));
1297         }
1298         if (drv) {
1299             qdict_put(options, "driver", qstring_from_str(drv->format_name));
1300         }
1301 
1302         snapshot_options = qdict_new();
1303         qdict_put(snapshot_options, "backing", options);
1304         qdict_flatten(snapshot_options);
1305 
1306         bs->options = snapshot_options;
1307         options = qdict_clone_shallow(bs->options);
1308 
1309         filename = tmp_filename;
1310         drv = bdrv_qcow2;
1311         bs->is_temporary = 1;
1312     }
1313 
1314     /* Open image file without format layer */
1315     if (flags & BDRV_O_RDWR) {
1316         flags |= BDRV_O_ALLOW_RDWR;
1317     }
1318 
1319     ret = bdrv_open_image(&file, filename, options, "file",
1320                           bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
1321                           &local_err);
1322     if (ret < 0) {
1323         goto fail;
1324     }
1325 
1326     /* Find the right image format driver */
1327     drvname = qdict_get_try_str(options, "driver");
1328     if (drvname) {
1329         drv = bdrv_find_format(drvname);
1330         qdict_del(options, "driver");
1331         if (!drv) {
1332             error_setg(errp, "Invalid driver: '%s'", drvname);
1333             ret = -EINVAL;
1334             goto unlink_and_fail;
1335         }
1336     }
1337 
1338     if (!drv) {
1339         if (file) {
1340             ret = find_image_format(file, filename, &drv, &local_err);
1341         } else {
1342             error_setg(errp, "Must specify either driver or file");
1343             ret = -EINVAL;
1344             goto unlink_and_fail;
1345         }
1346     }
1347 
1348     if (!drv) {
1349         goto unlink_and_fail;
1350     }
1351 
1352     /* Open the image */
1353     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1354     if (ret < 0) {
1355         goto unlink_and_fail;
1356     }
1357 
1358     if (file && (bs->file != file)) {
1359         bdrv_unref(file);
1360         file = NULL;
1361     }
1362 
1363     /* If there is a backing file, use it */
1364     if ((flags & BDRV_O_NO_BACKING) == 0) {
1365         QDict *backing_options;
1366 
1367         qdict_extract_subqdict(options, &backing_options, "backing.");
1368         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1369         if (ret < 0) {
1370             goto close_and_fail;
1371         }
1372     }
1373 
1374     /* Check if any unknown options were used */
1375     if (qdict_size(options) != 0) {
1376         const QDictEntry *entry = qdict_first(options);
1377         error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1378                    "support the option '%s'", drv->format_name, bs->device_name,
1379                    entry->key);
1380 
1381         ret = -EINVAL;
1382         goto close_and_fail;
1383     }
1384     QDECREF(options);
1385 
1386     if (!bdrv_key_required(bs)) {
1387         bdrv_dev_change_media_cb(bs, true);
1388     }
1389 
1390     return 0;
1391 
1392 unlink_and_fail:
1393     if (file != NULL) {
1394         bdrv_unref(file);
1395     }
1396     if (bs->is_temporary) {
1397         unlink(filename);
1398     }
1399 fail:
1400     QDECREF(bs->options);
1401     QDECREF(options);
1402     bs->options = NULL;
1403     if (error_is_set(&local_err)) {
1404         error_propagate(errp, local_err);
1405     }
1406     return ret;
1407 
1408 close_and_fail:
1409     bdrv_close(bs);
1410     QDECREF(options);
1411     if (error_is_set(&local_err)) {
1412         error_propagate(errp, local_err);
1413     }
1414     return ret;
1415 }
1416 
/* One element of a BlockReopenQueue: a single BlockDriverState to reopen. */
typedef struct BlockReopenQueueEntry {
     /* Set by bdrv_reopen_multiple() once bdrv_reopen_prepare() succeeded for
      * this entry; only prepared entries are aborted on failure. */
     bool prepared;
     /* Reopen request; bdrv_reopen_queue() fills in its bs and flags. */
     BDRVReopenState state;
     /* Linkage within the reopen queue. */
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1422 
1423 /*
1424  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1425  * reopen of multiple devices.
1426  *
1427  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1428  * already performed, or alternatively may be NULL a new BlockReopenQueue will
1429  * be created and initialized. This newly created BlockReopenQueue should be
1430  * passed back in for subsequent calls that are intended to be of the same
1431  * atomic 'set'.
1432  *
1433  * bs is the BlockDriverState to add to the reopen queue.
1434  *
1435  * flags contains the open flags for the associated bs
1436  *
1437  * returns a pointer to bs_queue, which is either the newly allocated
1438  * bs_queue, or the existing bs_queue being used.
1439  *
1440  */
1441 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1442                                     BlockDriverState *bs, int flags)
1443 {
1444     assert(bs != NULL);
1445 
1446     BlockReopenQueueEntry *bs_entry;
1447     if (bs_queue == NULL) {
1448         bs_queue = g_new0(BlockReopenQueue, 1);
1449         QSIMPLEQ_INIT(bs_queue);
1450     }
1451 
1452     if (bs->file) {
1453         bdrv_reopen_queue(bs_queue, bs->file, flags);
1454     }
1455 
1456     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1457     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1458 
1459     bs_entry->state.bs = bs;
1460     bs_entry->state.flags = flags;
1461 
1462     return bs_queue;
1463 }
1464 
1465 /*
1466  * Reopen multiple BlockDriverStates atomically & transactionally.
1467  *
1468  * The queue passed in (bs_queue) must have been built up previous
1469  * via bdrv_reopen_queue().
1470  *
1471  * Reopens all BDS specified in the queue, with the appropriate
1472  * flags.  All devices are prepared for reopen, and failure of any
1473  * device will cause all device changes to be abandonded, and intermediate
1474  * data cleaned up.
1475  *
1476  * If all devices prepare successfully, then the changes are committed
1477  * to all devices.
1478  *
1479  */
1480 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1481 {
1482     int ret = -1;
1483     BlockReopenQueueEntry *bs_entry, *next;
1484     Error *local_err = NULL;
1485 
1486     assert(bs_queue != NULL);
1487 
1488     bdrv_drain_all();
1489 
1490     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1491         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1492             error_propagate(errp, local_err);
1493             goto cleanup;
1494         }
1495         bs_entry->prepared = true;
1496     }
1497 
1498     /* If we reach this point, we have success and just need to apply the
1499      * changes
1500      */
1501     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1502         bdrv_reopen_commit(&bs_entry->state);
1503     }
1504 
1505     ret = 0;
1506 
1507 cleanup:
1508     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1509         if (ret && bs_entry->prepared) {
1510             bdrv_reopen_abort(&bs_entry->state);
1511         }
1512         g_free(bs_entry);
1513     }
1514     g_free(bs_queue);
1515     return ret;
1516 }
1517 
1518 
1519 /* Reopen a single BlockDriverState with the specified flags. */
1520 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1521 {
1522     int ret = -1;
1523     Error *local_err = NULL;
1524     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1525 
1526     ret = bdrv_reopen_multiple(queue, &local_err);
1527     if (local_err != NULL) {
1528         error_propagate(errp, local_err);
1529     }
1530     return ret;
1531 }
1532 
1533 
1534 /*
1535  * Prepares a BlockDriverState for reopen. All changes are staged in the
1536  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1537  * the block driver layer .bdrv_reopen_prepare()
1538  *
1539  * bs is the BlockDriverState to reopen
1540  * flags are the new open flags
1541  * queue is the reopen queue
1542  *
1543  * Returns 0 on success, non-zero on error.  On error errp will be set
1544  * as well.
1545  *
1546  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1547  * It is the responsibility of the caller to then call the abort() or
1548  * commit() for any other BDS that have been left in a prepare() state
1549  *
1550  */
1551 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1552                         Error **errp)
1553 {
1554     int ret = -1;
1555     Error *local_err = NULL;
1556     BlockDriver *drv;
1557 
1558     assert(reopen_state != NULL);
1559     assert(reopen_state->bs->drv != NULL);
1560     drv = reopen_state->bs->drv;
1561 
1562     /* if we are to stay read-only, do not allow permission change
1563      * to r/w */
1564     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1565         reopen_state->flags & BDRV_O_RDWR) {
1566         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1567                   reopen_state->bs->device_name);
1568         goto error;
1569     }
1570 
1571 
1572     ret = bdrv_flush(reopen_state->bs);
1573     if (ret) {
1574         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1575                   strerror(-ret));
1576         goto error;
1577     }
1578 
1579     if (drv->bdrv_reopen_prepare) {
1580         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1581         if (ret) {
1582             if (local_err != NULL) {
1583                 error_propagate(errp, local_err);
1584             } else {
1585                 error_setg(errp, "failed while preparing to reopen image '%s'",
1586                            reopen_state->bs->filename);
1587             }
1588             goto error;
1589         }
1590     } else {
1591         /* It is currently mandatory to have a bdrv_reopen_prepare()
1592          * handler for each supported drv. */
1593         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1594                   drv->format_name, reopen_state->bs->device_name,
1595                  "reopening of file");
1596         ret = -1;
1597         goto error;
1598     }
1599 
1600     ret = 0;
1601 
1602 error:
1603     return ret;
1604 }
1605 
1606 /*
1607  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1608  * makes them final by swapping the staging BlockDriverState contents into
1609  * the active BlockDriverState contents.
1610  */
1611 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1612 {
1613     BlockDriver *drv;
1614 
1615     assert(reopen_state != NULL);
1616     drv = reopen_state->bs->drv;
1617     assert(drv != NULL);
1618 
1619     /* If there are any driver level actions to take */
1620     if (drv->bdrv_reopen_commit) {
1621         drv->bdrv_reopen_commit(reopen_state);
1622     }
1623 
1624     /* set BDS specific flags now */
1625     reopen_state->bs->open_flags         = reopen_state->flags;
1626     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1627                                               BDRV_O_CACHE_WB);
1628     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1629 
1630     bdrv_refresh_limits(reopen_state->bs);
1631 }
1632 
1633 /*
1634  * Abort the reopen, and delete and free the staged changes in
1635  * reopen_state
1636  */
1637 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1638 {
1639     BlockDriver *drv;
1640 
1641     assert(reopen_state != NULL);
1642     drv = reopen_state->bs->drv;
1643     assert(drv != NULL);
1644 
1645     if (drv->bdrv_reopen_abort) {
1646         drv->bdrv_reopen_abort(reopen_state);
1647     }
1648 }
1649 
1650 
/*
 * Closes bs: cancels its block job (if any), waits for and flushes
 * outstanding I/O, notifies the close notifiers, and, if a driver is
 * attached, tears down the driver state together with the references to the
 * backing file, the options and the underlying file. Finally reports the
 * media change and disables I/O throttling if it was enabled.
 *
 * The BlockDriverState itself is not freed.
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        /* Release the backing file before the driver is shut down */
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* Only on Windows is the temporary file removed here, at close */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        /* Reset the per-image state so the BDS reads as "nothing open" */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        /* Drop the reference to the underlying (protocol) layer */
        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    /* Tell the attached device, if any, that the medium is gone */
    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1700 
1701 void bdrv_close_all(void)
1702 {
1703     BlockDriverState *bs;
1704 
1705     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1706         bdrv_close(bs);
1707     }
1708 }
1709 
1710 /* Check if any requests are in-flight (including throttled requests) */
1711 static bool bdrv_requests_pending(BlockDriverState *bs)
1712 {
1713     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1714         return true;
1715     }
1716     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1717         return true;
1718     }
1719     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1720         return true;
1721     }
1722     if (bs->file && bdrv_requests_pending(bs->file)) {
1723         return true;
1724     }
1725     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1726         return true;
1727     }
1728     return false;
1729 }
1730 
1731 static bool bdrv_requests_pending_all(void)
1732 {
1733     BlockDriverState *bs;
1734     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1735         if (bdrv_requests_pending(bs)) {
1736             return true;
1737         }
1738     }
1739     return false;
1740 }
1741 
1742 /*
1743  * Wait for pending requests to complete across all BlockDriverStates
1744  *
1745  * This function does not flush data to disk, use bdrv_flush_all() for that
1746  * after calling this function.
1747  *
1748  * Note that completion of an asynchronous I/O operation can trigger any
1749  * number of other I/O operations on other devices---for example a coroutine
1750  * can be arbitrarily complex and a constant flow of I/O can come until the
1751  * coroutine is complete.  Because of this, it is not possible to have a
1752  * function to drain a single device's I/O queue.
1753  */
1754 void bdrv_drain_all(void)
1755 {
1756     /* Always run first iteration so any pending completion BHs run */
1757     bool busy = true;
1758     BlockDriverState *bs;
1759 
1760     while (busy) {
1761         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1762             bdrv_start_throttled_reqs(bs);
1763         }
1764 
1765         busy = bdrv_requests_pending_all();
1766         busy |= aio_poll(qemu_get_aio_context(), busy);
1767     }
1768 }
1769 
1770 /* make a BlockDriverState anonymous by removing from bdrv_state and
1771  * graph_bdrv_state list.
1772    Also, NULL terminate the device_name to prevent double remove */
1773 void bdrv_make_anon(BlockDriverState *bs)
1774 {
1775     if (bs->device_name[0] != '\0') {
1776         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1777     }
1778     bs->device_name[0] = '\0';
1779     if (bs->node_name[0] != '\0') {
1780         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1781     }
1782     bs->node_name[0] = '\0';
1783 }
1784 
1785 static void bdrv_rebind(BlockDriverState *bs)
1786 {
1787     if (bs->drv && bs->drv->bdrv_rebind) {
1788         bs->drv->bdrv_rebind(bs);
1789     }
1790 }
1791 
1792 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1793                                      BlockDriverState *bs_src)
1794 {
1795     /* move some fields that need to stay attached to the device */
1796     bs_dest->open_flags         = bs_src->open_flags;
1797 
1798     /* dev info */
1799     bs_dest->dev_ops            = bs_src->dev_ops;
1800     bs_dest->dev_opaque         = bs_src->dev_opaque;
1801     bs_dest->dev                = bs_src->dev;
1802     bs_dest->guest_block_size   = bs_src->guest_block_size;
1803     bs_dest->copy_on_read       = bs_src->copy_on_read;
1804 
1805     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1806 
1807     /* i/o throttled req */
1808     memcpy(&bs_dest->throttle_state,
1809            &bs_src->throttle_state,
1810            sizeof(ThrottleState));
1811     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1812     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1813     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1814 
1815     /* r/w error */
1816     bs_dest->on_read_error      = bs_src->on_read_error;
1817     bs_dest->on_write_error     = bs_src->on_write_error;
1818 
1819     /* i/o status */
1820     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1821     bs_dest->iostatus           = bs_src->iostatus;
1822 
1823     /* dirty bitmap */
1824     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1825 
1826     /* reference count */
1827     bs_dest->refcnt             = bs_src->refcnt;
1828 
1829     /* job */
1830     bs_dest->in_use             = bs_src->in_use;
1831     bs_dest->job                = bs_src->job;
1832 
1833     /* keep the same entry in bdrv_states */
1834     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1835             bs_src->device_name);
1836     bs_dest->device_list = bs_src->device_list;
1837 
1838     /* keep the same entry in graph_bdrv_states
1839      * We do want to swap name but don't want to swap linked list entries
1840      */
1841     bs_dest->node_list   = bs_src->node_list;
1842 }
1843 
1844 /*
1845  * Swap bs contents for two image chains while they are live,
1846  * while keeping required fields on the BlockDriverState that is
1847  * actually attached to a device.
1848  *
1849  * This will modify the BlockDriverState fields, and swap contents
1850  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1851  *
1852  * bs_new is required to be anonymous.
1853  *
1854  * This function does not create any image files.
1855  */
1856 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1857 {
1858     BlockDriverState tmp;
1859 
1860     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1861     assert(bs_new->device_name[0] == '\0');
1862     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1863     assert(bs_new->job == NULL);
1864     assert(bs_new->dev == NULL);
1865     assert(bs_new->in_use == 0);
1866     assert(bs_new->io_limits_enabled == false);
1867     assert(!throttle_have_timer(&bs_new->throttle_state));
1868 
1869     tmp = *bs_new;
1870     *bs_new = *bs_old;
1871     *bs_old = tmp;
1872 
1873     /* there are some fields that should not be swapped, move them back */
1874     bdrv_move_feature_fields(&tmp, bs_old);
1875     bdrv_move_feature_fields(bs_old, bs_new);
1876     bdrv_move_feature_fields(bs_new, &tmp);
1877 
1878     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1879     assert(bs_new->device_name[0] == '\0');
1880 
1881     /* Check a few fields that should remain attached to the device */
1882     assert(bs_new->dev == NULL);
1883     assert(bs_new->job == NULL);
1884     assert(bs_new->in_use == 0);
1885     assert(bs_new->io_limits_enabled == false);
1886     assert(!throttle_have_timer(&bs_new->throttle_state));
1887 
1888     bdrv_rebind(bs_new);
1889     bdrv_rebind(bs_old);
1890 }
1891 
1892 /*
1893  * Add new bs contents at the top of an image chain while the chain is
1894  * live, while keeping required fields on the top layer.
1895  *
1896  * This will modify the BlockDriverState fields, and swap contents
1897  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1898  *
1899  * bs_new is required to be anonymous.
1900  *
1901  * This function does not create any image files.
1902  */
1903 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1904 {
1905     bdrv_swap(bs_new, bs_top);
1906 
1907     /* The contents of 'tmp' will become bs_top, as we are
1908      * swapping bs_new and bs_top contents. */
1909     bs_top->backing_hd = bs_new;
1910     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1911     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1912             bs_new->filename);
1913     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1914             bs_new->drv ? bs_new->drv->format_name : "");
1915 }
1916 
1917 static void bdrv_delete(BlockDriverState *bs)
1918 {
1919     assert(!bs->dev);
1920     assert(!bs->job);
1921     assert(!bs->in_use);
1922     assert(!bs->refcnt);
1923     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1924 
1925     bdrv_close(bs);
1926 
1927     /* remove from list, if necessary */
1928     bdrv_make_anon(bs);
1929 
1930     g_free(bs);
1931 }
1932 
1933 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1934 /* TODO change to DeviceState *dev when all users are qdevified */
1935 {
1936     if (bs->dev) {
1937         return -EBUSY;
1938     }
1939     bs->dev = dev;
1940     bdrv_iostatus_reset(bs);
1941     return 0;
1942 }
1943 
1944 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1945 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1946 {
1947     if (bdrv_attach_dev(bs, dev) < 0) {
1948         abort();
1949     }
1950 }
1951 
1952 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1953 /* TODO change to DeviceState *dev when all users are qdevified */
1954 {
1955     assert(bs->dev == dev);
1956     bs->dev = NULL;
1957     bs->dev_ops = NULL;
1958     bs->dev_opaque = NULL;
1959     bs->guest_block_size = 512;
1960 }
1961 
1962 /* TODO change to return DeviceState * when all users are qdevified */
1963 void *bdrv_get_attached_dev(BlockDriverState *bs)
1964 {
1965     return bs->dev;
1966 }
1967 
1968 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1969                       void *opaque)
1970 {
1971     bs->dev_ops = ops;
1972     bs->dev_opaque = opaque;
1973 }
1974 
1975 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1976                                enum MonitorEvent ev,
1977                                BlockErrorAction action, bool is_read)
1978 {
1979     QObject *data;
1980     const char *action_str;
1981 
1982     switch (action) {
1983     case BDRV_ACTION_REPORT:
1984         action_str = "report";
1985         break;
1986     case BDRV_ACTION_IGNORE:
1987         action_str = "ignore";
1988         break;
1989     case BDRV_ACTION_STOP:
1990         action_str = "stop";
1991         break;
1992     default:
1993         abort();
1994     }
1995 
1996     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1997                               bdrv->device_name,
1998                               action_str,
1999                               is_read ? "read" : "write");
2000     monitor_protocol_event(ev, data);
2001 
2002     qobject_decref(data);
2003 }
2004 
2005 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2006 {
2007     QObject *data;
2008 
2009     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2010                               bdrv_get_device_name(bs), ejected);
2011     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2012 
2013     qobject_decref(data);
2014 }
2015 
2016 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2017 {
2018     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2019         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2020         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2021         if (tray_was_closed) {
2022             /* tray open */
2023             bdrv_emit_qmp_eject_event(bs, true);
2024         }
2025         if (load) {
2026             /* tray close */
2027             bdrv_emit_qmp_eject_event(bs, false);
2028         }
2029     }
2030 }
2031 
2032 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2033 {
2034     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2035 }
2036 
2037 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2038 {
2039     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2040         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2041     }
2042 }
2043 
2044 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2045 {
2046     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2047         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2048     }
2049     return false;
2050 }
2051 
2052 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2053 {
2054     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2055         bs->dev_ops->resize_cb(bs->dev_opaque);
2056     }
2057 }
2058 
2059 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2060 {
2061     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2062         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2063     }
2064     return false;
2065 }
2066 
2067 /*
2068  * Run consistency checks on an image
2069  *
2070  * Returns 0 if the check could be completed (it doesn't mean that the image is
2071  * free of errors) or -errno when an internal error occurred. The results of the
2072  * check are stored in res.
2073  */
2074 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2075 {
2076     if (bs->drv->bdrv_check == NULL) {
2077         return -ENOTSUP;
2078     }
2079 
2080     memset(res, 0, sizeof(*res));
2081     return bs->drv->bdrv_check(bs, res, fix);
2082 }
2083 
2084 #define COMMIT_BUF_SECTORS 2048
2085 
2086 /* commit COW file into the raw image */
2087 int bdrv_commit(BlockDriverState *bs)
2088 {
2089     BlockDriver *drv = bs->drv;
2090     int64_t sector, total_sectors, length, backing_length;
2091     int n, ro, open_flags;
2092     int ret = 0;
2093     uint8_t *buf = NULL;
2094     char filename[PATH_MAX];
2095 
2096     if (!drv)
2097         return -ENOMEDIUM;
2098 
2099     if (!bs->backing_hd) {
2100         return -ENOTSUP;
2101     }
2102 
2103     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2104         return -EBUSY;
2105     }
2106 
2107     ro = bs->backing_hd->read_only;
2108     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2109     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2110     open_flags =  bs->backing_hd->open_flags;
2111 
2112     if (ro) {
2113         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2114             return -EACCES;
2115         }
2116     }
2117 
2118     length = bdrv_getlength(bs);
2119     if (length < 0) {
2120         ret = length;
2121         goto ro_cleanup;
2122     }
2123 
2124     backing_length = bdrv_getlength(bs->backing_hd);
2125     if (backing_length < 0) {
2126         ret = backing_length;
2127         goto ro_cleanup;
2128     }
2129 
2130     /* If our top snapshot is larger than the backing file image,
2131      * grow the backing file image if possible.  If not possible,
2132      * we must return an error */
2133     if (length > backing_length) {
2134         ret = bdrv_truncate(bs->backing_hd, length);
2135         if (ret < 0) {
2136             goto ro_cleanup;
2137         }
2138     }
2139 
2140     total_sectors = length >> BDRV_SECTOR_BITS;
2141     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2142 
2143     for (sector = 0; sector < total_sectors; sector += n) {
2144         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2145         if (ret < 0) {
2146             goto ro_cleanup;
2147         }
2148         if (ret) {
2149             ret = bdrv_read(bs, sector, buf, n);
2150             if (ret < 0) {
2151                 goto ro_cleanup;
2152             }
2153 
2154             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2155             if (ret < 0) {
2156                 goto ro_cleanup;
2157             }
2158         }
2159     }
2160 
2161     if (drv->bdrv_make_empty) {
2162         ret = drv->bdrv_make_empty(bs);
2163         if (ret < 0) {
2164             goto ro_cleanup;
2165         }
2166         bdrv_flush(bs);
2167     }
2168 
2169     /*
2170      * Make sure all data we wrote to the backing device is actually
2171      * stable on disk.
2172      */
2173     if (bs->backing_hd) {
2174         bdrv_flush(bs->backing_hd);
2175     }
2176 
2177     ret = 0;
2178 ro_cleanup:
2179     g_free(buf);
2180 
2181     if (ro) {
2182         /* ignoring error return here */
2183         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2184     }
2185 
2186     return ret;
2187 }
2188 
2189 int bdrv_commit_all(void)
2190 {
2191     BlockDriverState *bs;
2192 
2193     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2194         if (bs->drv && bs->backing_hd) {
2195             int ret = bdrv_commit(bs);
2196             if (ret < 0) {
2197                 return ret;
2198             }
2199         }
2200     }
2201     return 0;
2202 }
2203 
2204 /**
2205  * Remove an active request from the tracked requests list
2206  *
2207  * This function should be called when a tracked request is completing.
2208  */
2209 static void tracked_request_end(BdrvTrackedRequest *req)
2210 {
2211     if (req->serialising) {
2212         req->bs->serialising_in_flight--;
2213     }
2214 
2215     QLIST_REMOVE(req, list);
2216     qemu_co_queue_restart_all(&req->wait_queue);
2217 }
2218 
2219 /**
2220  * Add an active request to the tracked requests list
2221  */
2222 static void tracked_request_begin(BdrvTrackedRequest *req,
2223                                   BlockDriverState *bs,
2224                                   int64_t offset,
2225                                   unsigned int bytes, bool is_write)
2226 {
2227     *req = (BdrvTrackedRequest){
2228         .bs = bs,
2229         .offset         = offset,
2230         .bytes          = bytes,
2231         .is_write       = is_write,
2232         .co             = qemu_coroutine_self(),
2233         .serialising    = false,
2234         .overlap_offset = offset,
2235         .overlap_bytes  = bytes,
2236     };
2237 
2238     qemu_co_queue_init(&req->wait_queue);
2239 
2240     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2241 }
2242 
2243 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2244 {
2245     int64_t overlap_offset = req->offset & ~(align - 1);
2246     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2247                                - overlap_offset;
2248 
2249     if (!req->serialising) {
2250         req->bs->serialising_in_flight++;
2251         req->serialising = true;
2252     }
2253 
2254     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2255     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2256 }
2257 
2258 /**
2259  * Round a region to cluster boundaries
2260  */
2261 void bdrv_round_to_clusters(BlockDriverState *bs,
2262                             int64_t sector_num, int nb_sectors,
2263                             int64_t *cluster_sector_num,
2264                             int *cluster_nb_sectors)
2265 {
2266     BlockDriverInfo bdi;
2267 
2268     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2269         *cluster_sector_num = sector_num;
2270         *cluster_nb_sectors = nb_sectors;
2271     } else {
2272         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2273         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2274         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2275                                             nb_sectors, c);
2276     }
2277 }
2278 
2279 static int bdrv_get_cluster_size(BlockDriverState *bs)
2280 {
2281     BlockDriverInfo bdi;
2282     int ret;
2283 
2284     ret = bdrv_get_info(bs, &bdi);
2285     if (ret < 0 || bdi.cluster_size == 0) {
2286         return bs->request_alignment;
2287     } else {
2288         return bdi.cluster_size;
2289     }
2290 }
2291 
2292 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2293                                      int64_t offset, unsigned int bytes)
2294 {
2295     /*        aaaa   bbbb */
2296     if (offset >= req->overlap_offset + req->overlap_bytes) {
2297         return false;
2298     }
2299     /* bbbb   aaaa        */
2300     if (req->overlap_offset >= offset + bytes) {
2301         return false;
2302     }
2303     return true;
2304 }
2305 
2306 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2307 {
2308     BlockDriverState *bs = self->bs;
2309     BdrvTrackedRequest *req;
2310     bool retry;
2311     bool waited = false;
2312 
2313     if (!bs->serialising_in_flight) {
2314         return false;
2315     }
2316 
2317     do {
2318         retry = false;
2319         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2320             if (req == self || (!req->serialising && !self->serialising)) {
2321                 continue;
2322             }
2323             if (tracked_request_overlaps(req, self->overlap_offset,
2324                                          self->overlap_bytes))
2325             {
2326                 /* Hitting this means there was a reentrant request, for
2327                  * example, a block driver issuing nested requests.  This must
2328                  * never happen since it means deadlock.
2329                  */
2330                 assert(qemu_coroutine_self() != req->co);
2331 
2332                 /* If the request is already (indirectly) waiting for us, or
2333                  * will wait for us as soon as it wakes up, then just go on
2334                  * (instead of producing a deadlock in the former case). */
2335                 if (!req->waiting_for) {
2336                     self->waiting_for = req;
2337                     qemu_co_queue_wait(&req->wait_queue);
2338                     self->waiting_for = NULL;
2339                     retry = true;
2340                     waited = true;
2341                     break;
2342                 }
2343             }
2344         }
2345     } while (retry);
2346 
2347     return waited;
2348 }
2349 
2350 /*
2351  * Return values:
2352  * 0        - success
2353  * -EINVAL  - backing format specified, but no file
2354  * -ENOSPC  - can't update the backing file because no space is left in the
2355  *            image file header
2356  * -ENOTSUP - format driver doesn't support changing the backing file
2357  */
2358 int bdrv_change_backing_file(BlockDriverState *bs,
2359     const char *backing_file, const char *backing_fmt)
2360 {
2361     BlockDriver *drv = bs->drv;
2362     int ret;
2363 
2364     /* Backing file format doesn't make sense without a backing file */
2365     if (backing_fmt && !backing_file) {
2366         return -EINVAL;
2367     }
2368 
2369     if (drv->bdrv_change_backing_file != NULL) {
2370         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2371     } else {
2372         ret = -ENOTSUP;
2373     }
2374 
2375     if (ret == 0) {
2376         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2377         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2378     }
2379     return ret;
2380 }
2381 
2382 /*
2383  * Finds the image layer in the chain that has 'bs' as its backing file.
2384  *
2385  * active is the current topmost image.
2386  *
2387  * Returns NULL if bs is not found in active's image chain,
2388  * or if active == bs.
2389  */
2390 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2391                                     BlockDriverState *bs)
2392 {
2393     BlockDriverState *overlay = NULL;
2394     BlockDriverState *intermediate;
2395 
2396     assert(active != NULL);
2397     assert(bs != NULL);
2398 
2399     /* if bs is the same as active, then by definition it has no overlay
2400      */
2401     if (active == bs) {
2402         return NULL;
2403     }
2404 
2405     intermediate = active;
2406     while (intermediate->backing_hd) {
2407         if (intermediate->backing_hd == bs) {
2408             overlay = intermediate;
2409             break;
2410         }
2411         intermediate = intermediate->backing_hd;
2412     }
2413 
2414     return overlay;
2415 }
2416 
2417 typedef struct BlkIntermediateStates {
2418     BlockDriverState *bs;
2419     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2420 } BlkIntermediateStates;
2421 
2422 
2423 /*
2424  * Drops images above 'base' up to and including 'top', and sets the image
2425  * above 'top' to have base as its backing file.
2426  *
2427  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2428  * information in 'bs' can be properly updated.
2429  *
2430  * E.g., this will convert the following chain:
2431  * bottom <- base <- intermediate <- top <- active
2432  *
2433  * to
2434  *
2435  * bottom <- base <- active
2436  *
2437  * It is allowed for bottom==base, in which case it converts:
2438  *
2439  * base <- intermediate <- top <- active
2440  *
2441  * to
2442  *
2443  * base <- active
2444  *
2445  * Error conditions:
2446  *  if active == top, that is considered an error
2447  *
2448  */
2449 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2450                            BlockDriverState *base)
2451 {
2452     BlockDriverState *intermediate;
2453     BlockDriverState *base_bs = NULL;
2454     BlockDriverState *new_top_bs = NULL;
2455     BlkIntermediateStates *intermediate_state, *next;
2456     int ret = -EIO;
2457 
2458     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2459     QSIMPLEQ_INIT(&states_to_delete);
2460 
2461     if (!top->drv || !base->drv) {
2462         goto exit;
2463     }
2464 
2465     new_top_bs = bdrv_find_overlay(active, top);
2466 
2467     if (new_top_bs == NULL) {
2468         /* we could not find the image above 'top', this is an error */
2469         goto exit;
2470     }
2471 
2472     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2473      * to do, no intermediate images */
2474     if (new_top_bs->backing_hd == base) {
2475         ret = 0;
2476         goto exit;
2477     }
2478 
2479     intermediate = top;
2480 
2481     /* now we will go down through the list, and add each BDS we find
2482      * into our deletion queue, until we hit the 'base'
2483      */
2484     while (intermediate) {
2485         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2486         intermediate_state->bs = intermediate;
2487         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2488 
2489         if (intermediate->backing_hd == base) {
2490             base_bs = intermediate->backing_hd;
2491             break;
2492         }
2493         intermediate = intermediate->backing_hd;
2494     }
2495     if (base_bs == NULL) {
2496         /* something went wrong, we did not end at the base. safely
2497          * unravel everything, and exit with error */
2498         goto exit;
2499     }
2500 
2501     /* success - we can delete the intermediate states, and link top->base */
2502     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2503                                    base_bs->drv ? base_bs->drv->format_name : "");
2504     if (ret) {
2505         goto exit;
2506     }
2507     new_top_bs->backing_hd = base_bs;
2508 
2509     bdrv_refresh_limits(new_top_bs);
2510 
2511     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2512         /* so that bdrv_close() does not recursively close the chain */
2513         intermediate_state->bs->backing_hd = NULL;
2514         bdrv_unref(intermediate_state->bs);
2515     }
2516     ret = 0;
2517 
2518 exit:
2519     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2520         g_free(intermediate_state);
2521     }
2522     return ret;
2523 }
2524 
2525 
2526 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2527                                    size_t size)
2528 {
2529     int64_t len;
2530 
2531     if (!bdrv_is_inserted(bs))
2532         return -ENOMEDIUM;
2533 
2534     if (bs->growable)
2535         return 0;
2536 
2537     len = bdrv_getlength(bs);
2538 
2539     if (offset < 0)
2540         return -EIO;
2541 
2542     if ((offset > len) || (len - offset < size))
2543         return -EIO;
2544 
2545     return 0;
2546 }
2547 
2548 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2549                               int nb_sectors)
2550 {
2551     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2552                                    nb_sectors * BDRV_SECTOR_SIZE);
2553 }
2554 
2555 typedef struct RwCo {
2556     BlockDriverState *bs;
2557     int64_t offset;
2558     QEMUIOVector *qiov;
2559     bool is_write;
2560     int ret;
2561     BdrvRequestFlags flags;
2562 } RwCo;
2563 
2564 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2565 {
2566     RwCo *rwco = opaque;
2567 
2568     if (!rwco->is_write) {
2569         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2570                                       rwco->qiov->size, rwco->qiov,
2571                                       rwco->flags);
2572     } else {
2573         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2574                                        rwco->qiov->size, rwco->qiov,
2575                                        rwco->flags);
2576     }
2577 }
2578 
2579 /*
2580  * Process a vectored synchronous request using coroutines
2581  */
2582 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2583                         QEMUIOVector *qiov, bool is_write,
2584                         BdrvRequestFlags flags)
2585 {
2586     Coroutine *co;
2587     RwCo rwco = {
2588         .bs = bs,
2589         .offset = offset,
2590         .qiov = qiov,
2591         .is_write = is_write,
2592         .ret = NOT_DONE,
2593         .flags = flags,
2594     };
2595 
2596     /**
2597      * In sync call context, when the vcpu is blocked, this throttling timer
2598      * will not fire; so the I/O throttling function has to be disabled here
2599      * if it has been enabled.
2600      */
2601     if (bs->io_limits_enabled) {
2602         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2603                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2604         bdrv_io_limits_disable(bs);
2605     }
2606 
2607     if (qemu_in_coroutine()) {
2608         /* Fast-path if already in coroutine context */
2609         bdrv_rw_co_entry(&rwco);
2610     } else {
2611         co = qemu_coroutine_create(bdrv_rw_co_entry);
2612         qemu_coroutine_enter(co, &rwco);
2613         while (rwco.ret == NOT_DONE) {
2614             qemu_aio_wait();
2615         }
2616     }
2617     return rwco.ret;
2618 }
2619 
2620 /*
2621  * Process a synchronous request using coroutines
2622  */
2623 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2624                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2625 {
2626     QEMUIOVector qiov;
2627     struct iovec iov = {
2628         .iov_base = (void *)buf,
2629         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2630     };
2631 
2632     qemu_iovec_init_external(&qiov, &iov, 1);
2633     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2634                         &qiov, is_write, flags);
2635 }
2636 
2637 /* return < 0 if error. See bdrv_write() for the return codes */
2638 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2639               uint8_t *buf, int nb_sectors)
2640 {
2641     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2642 }
2643 
2644 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2645 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2646                           uint8_t *buf, int nb_sectors)
2647 {
2648     bool enabled;
2649     int ret;
2650 
2651     enabled = bs->io_limits_enabled;
2652     bs->io_limits_enabled = false;
2653     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2654     bs->io_limits_enabled = enabled;
2655     return ret;
2656 }
2657 
2658 /* Return < 0 if error. Important errors are:
2659   -EIO         generic I/O error (may happen for all errors)
2660   -ENOMEDIUM   No media inserted.
2661   -EINVAL      Invalid sector number or nb_sectors
2662   -EACCES      Trying to write a read-only device
2663 */
2664 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2665                const uint8_t *buf, int nb_sectors)
2666 {
2667     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2668 }
2669 
2670 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2671                       int nb_sectors, BdrvRequestFlags flags)
2672 {
2673     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2674                       BDRV_REQ_ZERO_WRITE | flags);
2675 }
2676 
2677 /*
2678  * Completely zero out a block device with the help of bdrv_write_zeroes.
2679  * The operation is sped up by checking the block status and only writing
2680  * zeroes to the device if they currently do not return zeroes. Optional
2681  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2682  *
2683  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2684  */
2685 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2686 {
2687     int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2688     int64_t ret, nb_sectors, sector_num = 0;
2689     int n;
2690 
2691     for (;;) {
2692         nb_sectors = target_size - sector_num;
2693         if (nb_sectors <= 0) {
2694             return 0;
2695         }
2696         if (nb_sectors > INT_MAX) {
2697             nb_sectors = INT_MAX;
2698         }
2699         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2700         if (ret < 0) {
2701             error_report("error getting block status at sector %" PRId64 ": %s",
2702                          sector_num, strerror(-ret));
2703             return ret;
2704         }
2705         if (ret & BDRV_BLOCK_ZERO) {
2706             sector_num += n;
2707             continue;
2708         }
2709         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2710         if (ret < 0) {
2711             error_report("error writing zeroes at sector %" PRId64 ": %s",
2712                          sector_num, strerror(-ret));
2713             return ret;
2714         }
2715         sector_num += n;
2716     }
2717 }
2718 
2719 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2720 {
2721     QEMUIOVector qiov;
2722     struct iovec iov = {
2723         .iov_base = (void *)buf,
2724         .iov_len = bytes,
2725     };
2726     int ret;
2727 
2728     if (bytes < 0) {
2729         return -EINVAL;
2730     }
2731 
2732     qemu_iovec_init_external(&qiov, &iov, 1);
2733     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2734     if (ret < 0) {
2735         return ret;
2736     }
2737 
2738     return bytes;
2739 }
2740 
2741 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2742 {
2743     int ret;
2744 
2745     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2746     if (ret < 0) {
2747         return ret;
2748     }
2749 
2750     return qiov->size;
2751 }
2752 
2753 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2754                 const void *buf, int bytes)
2755 {
2756     QEMUIOVector qiov;
2757     struct iovec iov = {
2758         .iov_base   = (void *) buf,
2759         .iov_len    = bytes,
2760     };
2761 
2762     if (bytes < 0) {
2763         return -EINVAL;
2764     }
2765 
2766     qemu_iovec_init_external(&qiov, &iov, 1);
2767     return bdrv_pwritev(bs, offset, &qiov);
2768 }
2769 
2770 /*
2771  * Writes to the file and ensures that no writes are reordered across this
2772  * request (acts as a barrier)
2773  *
2774  * Returns 0 on success, -errno in error cases.
2775  */
2776 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2777     const void *buf, int count)
2778 {
2779     int ret;
2780 
2781     ret = bdrv_pwrite(bs, offset, buf, count);
2782     if (ret < 0) {
2783         return ret;
2784     }
2785 
2786     /* No flush needed for cache modes that already do it */
2787     if (bs->enable_write_cache) {
2788         bdrv_flush(bs);
2789     }
2790 
2791     return 0;
2792 }
2793 
2794 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2795         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2796 {
2797     /* Perform I/O through a temporary buffer so that users who scribble over
2798      * their read buffer while the operation is in progress do not end up
2799      * modifying the image file.  This is critical for zero-copy guest I/O
2800      * where anything might happen inside guest memory.
2801      */
2802     void *bounce_buffer;
2803 
2804     BlockDriver *drv = bs->drv;
2805     struct iovec iov;
2806     QEMUIOVector bounce_qiov;
2807     int64_t cluster_sector_num;
2808     int cluster_nb_sectors;
2809     size_t skip_bytes;
2810     int ret;
2811 
2812     /* Cover entire cluster so no additional backing file I/O is required when
2813      * allocating cluster in the image file.
2814      */
2815     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2816                            &cluster_sector_num, &cluster_nb_sectors);
2817 
2818     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2819                                    cluster_sector_num, cluster_nb_sectors);
2820 
2821     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2822     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2823     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2824 
2825     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2826                              &bounce_qiov);
2827     if (ret < 0) {
2828         goto err;
2829     }
2830 
2831     if (drv->bdrv_co_write_zeroes &&
2832         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2833         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2834                                       cluster_nb_sectors, 0);
2835     } else {
2836         /* This does not change the data on the disk, it is not necessary
2837          * to flush even in cache=writethrough mode.
2838          */
2839         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2840                                   &bounce_qiov);
2841     }
2842 
2843     if (ret < 0) {
2844         /* It might be okay to ignore write errors for guest requests.  If this
2845          * is a deliberate copy-on-read then we don't want to ignore the error.
2846          * Simply report it in all cases.
2847          */
2848         goto err;
2849     }
2850 
2851     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2852     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2853                         nb_sectors * BDRV_SECTOR_SIZE);
2854 
2855 err:
2856     qemu_vfree(bounce_buffer);
2857     return ret;
2858 }
2859 
2860 /*
2861  * Forwards an already correctly aligned request to the BlockDriver. This
2862  * handles copy on read and zeroing after EOF; any other features must be
2863  * implemented by the caller.
2864  */
2865 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2866     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2867     int64_t align, QEMUIOVector *qiov, int flags)
2868 {
2869     BlockDriver *drv = bs->drv;
2870     int ret;
2871 
2872     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2873     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2874 
2875     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2876     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2877 
2878     /* Handle Copy on Read and associated serialisation */
2879     if (flags & BDRV_REQ_COPY_ON_READ) {
2880         /* If we touch the same cluster it counts as an overlap.  This
2881          * guarantees that allocating writes will be serialized and not race
2882          * with each other for the same cluster.  For example, in copy-on-read
2883          * it ensures that the CoR read and write operations are atomic and
2884          * guest writes cannot interleave between them. */
2885         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2886     }
2887 
2888     wait_serialising_requests(req);
2889 
2890     if (flags & BDRV_REQ_COPY_ON_READ) {
2891         int pnum;
2892 
2893         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2894         if (ret < 0) {
2895             goto out;
2896         }
2897 
2898         if (!ret || pnum != nb_sectors) {
2899             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2900             goto out;
2901         }
2902     }
2903 
2904     /* Forward the request to the BlockDriver */
2905     if (!(bs->zero_beyond_eof && bs->growable)) {
2906         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2907     } else {
2908         /* Read zeros after EOF of growable BDSes */
2909         int64_t len, total_sectors, max_nb_sectors;
2910 
2911         len = bdrv_getlength(bs);
2912         if (len < 0) {
2913             ret = len;
2914             goto out;
2915         }
2916 
2917         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2918         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2919                                   align >> BDRV_SECTOR_BITS);
2920         if (max_nb_sectors > 0) {
2921             ret = drv->bdrv_co_readv(bs, sector_num,
2922                                      MIN(nb_sectors, max_nb_sectors), qiov);
2923         } else {
2924             ret = 0;
2925         }
2926 
2927         /* Reading beyond end of file is supposed to produce zeroes */
2928         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2929             uint64_t offset = MAX(0, total_sectors - sector_num);
2930             uint64_t bytes = (sector_num + nb_sectors - offset) *
2931                               BDRV_SECTOR_SIZE;
2932             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2933         }
2934     }
2935 
2936 out:
2937     return ret;
2938 }
2939 
2940 /*
2941  * Handle a read request in coroutine context
2942  */
2943 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2944     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2945     BdrvRequestFlags flags)
2946 {
2947     BlockDriver *drv = bs->drv;
2948     BdrvTrackedRequest req;
2949 
2950     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2951     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2952     uint8_t *head_buf = NULL;
2953     uint8_t *tail_buf = NULL;
2954     QEMUIOVector local_qiov;
2955     bool use_local_qiov = false;
2956     int ret;
2957 
2958     if (!drv) {
2959         return -ENOMEDIUM;
2960     }
2961     if (bdrv_check_byte_request(bs, offset, bytes)) {
2962         return -EIO;
2963     }
2964 
2965     if (bs->copy_on_read) {
2966         flags |= BDRV_REQ_COPY_ON_READ;
2967     }
2968 
2969     /* throttling disk I/O */
2970     if (bs->io_limits_enabled) {
2971         bdrv_io_limits_intercept(bs, bytes, false);
2972     }
2973 
2974     /* Align read if necessary by padding qiov */
2975     if (offset & (align - 1)) {
2976         head_buf = qemu_blockalign(bs, align);
2977         qemu_iovec_init(&local_qiov, qiov->niov + 2);
2978         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
2979         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
2980         use_local_qiov = true;
2981 
2982         bytes += offset & (align - 1);
2983         offset = offset & ~(align - 1);
2984     }
2985 
2986     if ((offset + bytes) & (align - 1)) {
2987         if (!use_local_qiov) {
2988             qemu_iovec_init(&local_qiov, qiov->niov + 1);
2989             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
2990             use_local_qiov = true;
2991         }
2992         tail_buf = qemu_blockalign(bs, align);
2993         qemu_iovec_add(&local_qiov, tail_buf,
2994                        align - ((offset + bytes) & (align - 1)));
2995 
2996         bytes = ROUND_UP(bytes, align);
2997     }
2998 
2999     tracked_request_begin(&req, bs, offset, bytes, false);
3000     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3001                               use_local_qiov ? &local_qiov : qiov,
3002                               flags);
3003     tracked_request_end(&req);
3004 
3005     if (use_local_qiov) {
3006         qemu_iovec_destroy(&local_qiov);
3007         qemu_vfree(head_buf);
3008         qemu_vfree(tail_buf);
3009     }
3010 
3011     return ret;
3012 }
3013 
3014 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3015     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3016     BdrvRequestFlags flags)
3017 {
3018     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3019         return -EINVAL;
3020     }
3021 
3022     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3023                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3024 }
3025 
3026 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3027     int nb_sectors, QEMUIOVector *qiov)
3028 {
3029     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3030 
3031     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3032 }
3033 
3034 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3035     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3036 {
3037     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3038 
3039     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3040                             BDRV_REQ_COPY_ON_READ);
3041 }
3042 
3043 /* if no limit is specified in the BlockLimits use a default
3044  * of 32768 512-byte sectors (16 MiB) per request.
3045  */
3046 #define MAX_WRITE_ZEROES_DEFAULT 32768
3047 
3048 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3049     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3050 {
3051     BlockDriver *drv = bs->drv;
3052     QEMUIOVector qiov;
3053     struct iovec iov = {0};
3054     int ret = 0;
3055 
3056     int max_write_zeroes = bs->bl.max_write_zeroes ?
3057                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3058 
3059     while (nb_sectors > 0 && !ret) {
3060         int num = nb_sectors;
3061 
3062         /* Align request.  Block drivers can expect the "bulk" of the request
3063          * to be aligned.
3064          */
3065         if (bs->bl.write_zeroes_alignment
3066             && num > bs->bl.write_zeroes_alignment) {
3067             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3068                 /* Make a small request up to the first aligned sector.  */
3069                 num = bs->bl.write_zeroes_alignment;
3070                 num -= sector_num % bs->bl.write_zeroes_alignment;
3071             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3072                 /* Shorten the request to the last aligned sector.  num cannot
3073                  * underflow because num > bs->bl.write_zeroes_alignment.
3074                  */
3075                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3076             }
3077         }
3078 
3079         /* limit request size */
3080         if (num > max_write_zeroes) {
3081             num = max_write_zeroes;
3082         }
3083 
3084         ret = -ENOTSUP;
3085         /* First try the efficient write zeroes operation */
3086         if (drv->bdrv_co_write_zeroes) {
3087             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3088         }
3089 
3090         if (ret == -ENOTSUP) {
3091             /* Fall back to bounce buffer if write zeroes is unsupported */
3092             iov.iov_len = num * BDRV_SECTOR_SIZE;
3093             if (iov.iov_base == NULL) {
3094                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3095                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3096             }
3097             qemu_iovec_init_external(&qiov, &iov, 1);
3098 
3099             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3100 
3101             /* Keep bounce buffer around if it is big enough for all
3102              * all future requests.
3103              */
3104             if (num < max_write_zeroes) {
3105                 qemu_vfree(iov.iov_base);
3106                 iov.iov_base = NULL;
3107             }
3108         }
3109 
3110         sector_num += num;
3111         nb_sectors -= num;
3112     }
3113 
3114     qemu_vfree(iov.iov_base);
3115     return ret;
3116 }
3117 
3118 /*
3119  * Forwards an already correctly aligned write request to the BlockDriver.
3120  */
3121 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3122     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3123     QEMUIOVector *qiov, int flags)
3124 {
3125     BlockDriver *drv = bs->drv;
3126     bool waited;
3127     int ret;
3128 
3129     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3130     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3131 
3132     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3133     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3134 
3135     waited = wait_serialising_requests(req);
3136     assert(!waited || !req->serialising);
3137     assert(req->overlap_offset <= offset);
3138     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3139 
3140     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3141 
3142     if (ret < 0) {
3143         /* Do nothing, write notifier decided to fail this request */
3144     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3145         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3146         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3147     } else {
3148         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3149         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3150     }
3151     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3152 
3153     if (ret == 0 && !bs->enable_write_cache) {
3154         ret = bdrv_co_flush(bs);
3155     }
3156 
3157     bdrv_set_dirty(bs, sector_num, nb_sectors);
3158 
3159     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3160         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3161     }
3162     if (bs->growable && ret >= 0) {
3163         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3164     }
3165 
3166     return ret;
3167 }
3168 
3169 /*
3170  * Handle a write request in coroutine context
3171  */
3172 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3173     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3174     BdrvRequestFlags flags)
3175 {
3176     BdrvTrackedRequest req;
3177     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3178     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3179     uint8_t *head_buf = NULL;
3180     uint8_t *tail_buf = NULL;
3181     QEMUIOVector local_qiov;
3182     bool use_local_qiov = false;
3183     int ret;
3184 
3185     if (!bs->drv) {
3186         return -ENOMEDIUM;
3187     }
3188     if (bs->read_only) {
3189         return -EACCES;
3190     }
3191     if (bdrv_check_byte_request(bs, offset, bytes)) {
3192         return -EIO;
3193     }
3194 
3195     /* throttling disk I/O */
3196     if (bs->io_limits_enabled) {
3197         bdrv_io_limits_intercept(bs, bytes, true);
3198     }
3199 
3200     /*
3201      * Align write if necessary by performing a read-modify-write cycle.
3202      * Pad qiov with the read parts and be sure to have a tracked request not
3203      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3204      */
3205     tracked_request_begin(&req, bs, offset, bytes, true);
3206 
3207     if (offset & (align - 1)) {
3208         QEMUIOVector head_qiov;
3209         struct iovec head_iov;
3210 
3211         mark_request_serialising(&req, align);
3212         wait_serialising_requests(&req);
3213 
3214         head_buf = qemu_blockalign(bs, align);
3215         head_iov = (struct iovec) {
3216             .iov_base   = head_buf,
3217             .iov_len    = align,
3218         };
3219         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3220 
3221         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3222         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3223                                   align, &head_qiov, 0);
3224         if (ret < 0) {
3225             goto fail;
3226         }
3227         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3228 
3229         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3230         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3231         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3232         use_local_qiov = true;
3233 
3234         bytes += offset & (align - 1);
3235         offset = offset & ~(align - 1);
3236     }
3237 
3238     if ((offset + bytes) & (align - 1)) {
3239         QEMUIOVector tail_qiov;
3240         struct iovec tail_iov;
3241         size_t tail_bytes;
3242         bool waited;
3243 
3244         mark_request_serialising(&req, align);
3245         waited = wait_serialising_requests(&req);
3246         assert(!waited || !use_local_qiov);
3247 
3248         tail_buf = qemu_blockalign(bs, align);
3249         tail_iov = (struct iovec) {
3250             .iov_base   = tail_buf,
3251             .iov_len    = align,
3252         };
3253         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3254 
3255         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3256         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3257                                   align, &tail_qiov, 0);
3258         if (ret < 0) {
3259             goto fail;
3260         }
3261         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3262 
3263         if (!use_local_qiov) {
3264             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3265             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3266             use_local_qiov = true;
3267         }
3268 
3269         tail_bytes = (offset + bytes) & (align - 1);
3270         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3271 
3272         bytes = ROUND_UP(bytes, align);
3273     }
3274 
3275     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3276                                use_local_qiov ? &local_qiov : qiov,
3277                                flags);
3278 
3279 fail:
3280     tracked_request_end(&req);
3281 
3282     if (use_local_qiov) {
3283         qemu_iovec_destroy(&local_qiov);
3284     }
3285     qemu_vfree(head_buf);
3286     qemu_vfree(tail_buf);
3287 
3288     return ret;
3289 }
3290 
3291 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3292     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3293     BdrvRequestFlags flags)
3294 {
3295     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3296         return -EINVAL;
3297     }
3298 
3299     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3300                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3301 }
3302 
3303 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3304     int nb_sectors, QEMUIOVector *qiov)
3305 {
3306     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3307 
3308     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3309 }
3310 
3311 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3312                                       int64_t sector_num, int nb_sectors,
3313                                       BdrvRequestFlags flags)
3314 {
3315     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3316 
3317     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3318         flags &= ~BDRV_REQ_MAY_UNMAP;
3319     }
3320 
3321     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3322                              BDRV_REQ_ZERO_WRITE | flags);
3323 }
3324 
3325 /**
3326  * Truncate file to 'offset' bytes (needed only for file protocols)
3327  */
3328 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3329 {
3330     BlockDriver *drv = bs->drv;
3331     int ret;
3332     if (!drv)
3333         return -ENOMEDIUM;
3334     if (!drv->bdrv_truncate)
3335         return -ENOTSUP;
3336     if (bs->read_only)
3337         return -EACCES;
3338     if (bdrv_in_use(bs))
3339         return -EBUSY;
3340     ret = drv->bdrv_truncate(bs, offset);
3341     if (ret == 0) {
3342         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3343         bdrv_dev_resize_cb(bs);
3344     }
3345     return ret;
3346 }
3347 
3348 /**
3349  * Length of a allocated file in bytes. Sparse files are counted by actual
3350  * allocated space. Return < 0 if error or unknown.
3351  */
3352 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3353 {
3354     BlockDriver *drv = bs->drv;
3355     if (!drv) {
3356         return -ENOMEDIUM;
3357     }
3358     if (drv->bdrv_get_allocated_file_size) {
3359         return drv->bdrv_get_allocated_file_size(bs);
3360     }
3361     if (bs->file) {
3362         return bdrv_get_allocated_file_size(bs->file);
3363     }
3364     return -ENOTSUP;
3365 }
3366 
3367 /**
3368  * Length of a file in bytes. Return < 0 if error or unknown.
3369  */
3370 int64_t bdrv_getlength(BlockDriverState *bs)
3371 {
3372     BlockDriver *drv = bs->drv;
3373     if (!drv)
3374         return -ENOMEDIUM;
3375 
3376     if (drv->has_variable_length) {
3377         int ret = refresh_total_sectors(bs, bs->total_sectors);
3378         if (ret < 0) {
3379             return ret;
3380         }
3381     }
3382     return bs->total_sectors * BDRV_SECTOR_SIZE;
3383 }
3384 
3385 /* return 0 as number of sectors if no device present or error */
3386 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3387 {
3388     int64_t length;
3389     length = bdrv_getlength(bs);
3390     if (length < 0)
3391         length = 0;
3392     else
3393         length = length >> BDRV_SECTOR_BITS;
3394     *nb_sectors_ptr = length;
3395 }
3396 
3397 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3398                        BlockdevOnError on_write_error)
3399 {
3400     bs->on_read_error = on_read_error;
3401     bs->on_write_error = on_write_error;
3402 }
3403 
3404 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3405 {
3406     return is_read ? bs->on_read_error : bs->on_write_error;
3407 }
3408 
3409 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3410 {
3411     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3412 
3413     switch (on_err) {
3414     case BLOCKDEV_ON_ERROR_ENOSPC:
3415         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3416     case BLOCKDEV_ON_ERROR_STOP:
3417         return BDRV_ACTION_STOP;
3418     case BLOCKDEV_ON_ERROR_REPORT:
3419         return BDRV_ACTION_REPORT;
3420     case BLOCKDEV_ON_ERROR_IGNORE:
3421         return BDRV_ACTION_IGNORE;
3422     default:
3423         abort();
3424     }
3425 }
3426 
3427 /* This is done by device models because, while the block layer knows
3428  * about the error, it does not know whether an operation comes from
3429  * the device or the block layer (from a job, for example).
3430  */
3431 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3432                        bool is_read, int error)
3433 {
3434     assert(error >= 0);
3435     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3436     if (action == BDRV_ACTION_STOP) {
3437         vm_stop(RUN_STATE_IO_ERROR);
3438         bdrv_iostatus_set_err(bs, error);
3439     }
3440 }
3441 
3442 int bdrv_is_read_only(BlockDriverState *bs)
3443 {
3444     return bs->read_only;
3445 }
3446 
3447 int bdrv_is_sg(BlockDriverState *bs)
3448 {
3449     return bs->sg;
3450 }
3451 
3452 int bdrv_enable_write_cache(BlockDriverState *bs)
3453 {
3454     return bs->enable_write_cache;
3455 }
3456 
3457 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3458 {
3459     bs->enable_write_cache = wce;
3460 
3461     /* so a reopen() will preserve wce */
3462     if (wce) {
3463         bs->open_flags |= BDRV_O_CACHE_WB;
3464     } else {
3465         bs->open_flags &= ~BDRV_O_CACHE_WB;
3466     }
3467 }
3468 
3469 int bdrv_is_encrypted(BlockDriverState *bs)
3470 {
3471     if (bs->backing_hd && bs->backing_hd->encrypted)
3472         return 1;
3473     return bs->encrypted;
3474 }
3475 
3476 int bdrv_key_required(BlockDriverState *bs)
3477 {
3478     BlockDriverState *backing_hd = bs->backing_hd;
3479 
3480     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3481         return 1;
3482     return (bs->encrypted && !bs->valid_key);
3483 }
3484 
3485 int bdrv_set_key(BlockDriverState *bs, const char *key)
3486 {
3487     int ret;
3488     if (bs->backing_hd && bs->backing_hd->encrypted) {
3489         ret = bdrv_set_key(bs->backing_hd, key);
3490         if (ret < 0)
3491             return ret;
3492         if (!bs->encrypted)
3493             return 0;
3494     }
3495     if (!bs->encrypted) {
3496         return -EINVAL;
3497     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3498         return -ENOMEDIUM;
3499     }
3500     ret = bs->drv->bdrv_set_key(bs, key);
3501     if (ret < 0) {
3502         bs->valid_key = 0;
3503     } else if (!bs->valid_key) {
3504         bs->valid_key = 1;
3505         /* call the change callback now, we skipped it on open */
3506         bdrv_dev_change_media_cb(bs, true);
3507     }
3508     return ret;
3509 }
3510 
3511 const char *bdrv_get_format_name(BlockDriverState *bs)
3512 {
3513     return bs->drv ? bs->drv->format_name : NULL;
3514 }
3515 
3516 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3517                          void *opaque)
3518 {
3519     BlockDriver *drv;
3520 
3521     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3522         it(opaque, drv->format_name);
3523     }
3524 }
3525 
3526 /* This function is to find block backend bs */
3527 BlockDriverState *bdrv_find(const char *name)
3528 {
3529     BlockDriverState *bs;
3530 
3531     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3532         if (!strcmp(name, bs->device_name)) {
3533             return bs;
3534         }
3535     }
3536     return NULL;
3537 }
3538 
3539 /* This function is to find a node in the bs graph */
3540 BlockDriverState *bdrv_find_node(const char *node_name)
3541 {
3542     BlockDriverState *bs;
3543 
3544     assert(node_name);
3545 
3546     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3547         if (!strcmp(node_name, bs->node_name)) {
3548             return bs;
3549         }
3550     }
3551     return NULL;
3552 }
3553 
3554 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3555 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3556 {
3557     BlockDeviceInfoList *list, *entry;
3558     BlockDriverState *bs;
3559 
3560     list = NULL;
3561     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3562         entry = g_malloc0(sizeof(*entry));
3563         entry->value = bdrv_block_device_info(bs);
3564         entry->next = list;
3565         list = entry;
3566     }
3567 
3568     return list;
3569 }
3570 
3571 BlockDriverState *bdrv_lookup_bs(const char *device,
3572                                  const char *node_name,
3573                                  Error **errp)
3574 {
3575     BlockDriverState *bs = NULL;
3576 
3577     if ((!device && !node_name) || (device && node_name)) {
3578         error_setg(errp, "Use either device or node-name but not both");
3579         return NULL;
3580     }
3581 
3582     if (device) {
3583         bs = bdrv_find(device);
3584 
3585         if (!bs) {
3586             error_set(errp, QERR_DEVICE_NOT_FOUND, device);
3587             return NULL;
3588         }
3589 
3590         return bs;
3591     }
3592 
3593     bs = bdrv_find_node(node_name);
3594 
3595     if (!bs) {
3596         error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
3597         return NULL;
3598     }
3599 
3600     return bs;
3601 }
3602 
3603 BlockDriverState *bdrv_next(BlockDriverState *bs)
3604 {
3605     if (!bs) {
3606         return QTAILQ_FIRST(&bdrv_states);
3607     }
3608     return QTAILQ_NEXT(bs, device_list);
3609 }
3610 
3611 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3612 {
3613     BlockDriverState *bs;
3614 
3615     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3616         it(opaque, bs);
3617     }
3618 }
3619 
3620 const char *bdrv_get_device_name(BlockDriverState *bs)
3621 {
3622     return bs->device_name;
3623 }
3624 
3625 int bdrv_get_flags(BlockDriverState *bs)
3626 {
3627     return bs->open_flags;
3628 }
3629 
3630 int bdrv_flush_all(void)
3631 {
3632     BlockDriverState *bs;
3633     int result = 0;
3634 
3635     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3636         int ret = bdrv_flush(bs);
3637         if (ret < 0 && !result) {
3638             result = ret;
3639         }
3640     }
3641 
3642     return result;
3643 }
3644 
3645 int bdrv_has_zero_init_1(BlockDriverState *bs)
3646 {
3647     return 1;
3648 }
3649 
3650 int bdrv_has_zero_init(BlockDriverState *bs)
3651 {
3652     assert(bs->drv);
3653 
3654     /* If BS is a copy on write image, it is initialized to
3655        the contents of the base image, which may not be zeroes.  */
3656     if (bs->backing_hd) {
3657         return 0;
3658     }
3659     if (bs->drv->bdrv_has_zero_init) {
3660         return bs->drv->bdrv_has_zero_init(bs);
3661     }
3662 
3663     /* safe default */
3664     return 0;
3665 }
3666 
3667 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3668 {
3669     BlockDriverInfo bdi;
3670 
3671     if (bs->backing_hd) {
3672         return false;
3673     }
3674 
3675     if (bdrv_get_info(bs, &bdi) == 0) {
3676         return bdi.unallocated_blocks_are_zero;
3677     }
3678 
3679     return false;
3680 }
3681 
3682 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3683 {
3684     BlockDriverInfo bdi;
3685 
3686     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3687         return false;
3688     }
3689 
3690     if (bdrv_get_info(bs, &bdi) == 0) {
3691         return bdi.can_write_zeroes_with_unmap;
3692     }
3693 
3694     return false;
3695 }
3696 
3697 typedef struct BdrvCoGetBlockStatusData {
3698     BlockDriverState *bs;
3699     BlockDriverState *base;
3700     int64_t sector_num;
3701     int nb_sectors;
3702     int *pnum;
3703     int64_t ret;
3704     bool done;
3705 } BdrvCoGetBlockStatusData;
3706 
3707 /*
3708  * Returns true iff the specified sector is present in the disk image. Drivers
3709  * not implementing the functionality are assumed to not support backing files,
3710  * hence all their sectors are reported as allocated.
3711  *
3712  * If 'sector_num' is beyond the end of the disk image the return value is 0
3713  * and 'pnum' is set to 0.
3714  *
3715  * 'pnum' is set to the number of sectors (including and immediately following
3716  * the specified sector) that are known to be in the same
3717  * allocated/unallocated state.
3718  *
3719  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3720  * beyond the end of the disk image it will be clamped.
3721  */
3722 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3723                                                      int64_t sector_num,
3724                                                      int nb_sectors, int *pnum)
3725 {
3726     int64_t length;
3727     int64_t n;
3728     int64_t ret, ret2;
3729 
3730     length = bdrv_getlength(bs);
3731     if (length < 0) {
3732         return length;
3733     }
3734 
3735     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3736         *pnum = 0;
3737         return 0;
3738     }
3739 
3740     n = bs->total_sectors - sector_num;
3741     if (n < nb_sectors) {
3742         nb_sectors = n;
3743     }
3744 
3745     if (!bs->drv->bdrv_co_get_block_status) {
3746         *pnum = nb_sectors;
3747         ret = BDRV_BLOCK_DATA;
3748         if (bs->drv->protocol_name) {
3749             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3750         }
3751         return ret;
3752     }
3753 
3754     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3755     if (ret < 0) {
3756         *pnum = 0;
3757         return ret;
3758     }
3759 
3760     if (ret & BDRV_BLOCK_RAW) {
3761         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3762         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3763                                      *pnum, pnum);
3764     }
3765 
3766     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3767         if (bdrv_unallocated_blocks_are_zero(bs)) {
3768             ret |= BDRV_BLOCK_ZERO;
3769         } else if (bs->backing_hd) {
3770             BlockDriverState *bs2 = bs->backing_hd;
3771             int64_t length2 = bdrv_getlength(bs2);
3772             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3773                 ret |= BDRV_BLOCK_ZERO;
3774             }
3775         }
3776     }
3777 
3778     if (bs->file &&
3779         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3780         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3781         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3782                                         *pnum, pnum);
3783         if (ret2 >= 0) {
3784             /* Ignore errors.  This is just providing extra information, it
3785              * is useful but not necessary.
3786              */
3787             ret |= (ret2 & BDRV_BLOCK_ZERO);
3788         }
3789     }
3790 
3791     return ret;
3792 }
3793 
3794 /* Coroutine wrapper for bdrv_get_block_status() */
3795 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3796 {
3797     BdrvCoGetBlockStatusData *data = opaque;
3798     BlockDriverState *bs = data->bs;
3799 
3800     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3801                                          data->pnum);
3802     data->done = true;
3803 }
3804 
3805 /*
3806  * Synchronous wrapper around bdrv_co_get_block_status().
3807  *
3808  * See bdrv_co_get_block_status() for details.
3809  */
3810 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3811                               int nb_sectors, int *pnum)
3812 {
3813     Coroutine *co;
3814     BdrvCoGetBlockStatusData data = {
3815         .bs = bs,
3816         .sector_num = sector_num,
3817         .nb_sectors = nb_sectors,
3818         .pnum = pnum,
3819         .done = false,
3820     };
3821 
3822     if (qemu_in_coroutine()) {
3823         /* Fast-path if already in coroutine context */
3824         bdrv_get_block_status_co_entry(&data);
3825     } else {
3826         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3827         qemu_coroutine_enter(co, &data);
3828         while (!data.done) {
3829             qemu_aio_wait();
3830         }
3831     }
3832     return data.ret;
3833 }
3834 
3835 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3836                                    int nb_sectors, int *pnum)
3837 {
3838     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3839     if (ret < 0) {
3840         return ret;
3841     }
3842     return
3843         (ret & BDRV_BLOCK_DATA) ||
3844         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3845 }
3846 
3847 /*
3848  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3849  *
3850  * Return true if the given sector is allocated in any image between
3851  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3852  * sector is allocated in any image of the chain.  Return false otherwise.
3853  *
3854  * 'pnum' is set to the number of sectors (including and immediately following
3855  *  the specified sector) that are known to be in the same
3856  *  allocated/unallocated state.
3857  *
3858  */
3859 int bdrv_is_allocated_above(BlockDriverState *top,
3860                             BlockDriverState *base,
3861                             int64_t sector_num,
3862                             int nb_sectors, int *pnum)
3863 {
3864     BlockDriverState *intermediate;
3865     int ret, n = nb_sectors;
3866 
3867     intermediate = top;
3868     while (intermediate && intermediate != base) {
3869         int pnum_inter;
3870         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3871                                 &pnum_inter);
3872         if (ret < 0) {
3873             return ret;
3874         } else if (ret) {
3875             *pnum = pnum_inter;
3876             return 1;
3877         }
3878 
3879         /*
3880          * [sector_num, nb_sectors] is unallocated on top but intermediate
3881          * might have
3882          *
3883          * [sector_num+x, nr_sectors] allocated.
3884          */
3885         if (n > pnum_inter &&
3886             (intermediate == top ||
3887              sector_num + pnum_inter < intermediate->total_sectors)) {
3888             n = pnum_inter;
3889         }
3890 
3891         intermediate = intermediate->backing_hd;
3892     }
3893 
3894     *pnum = n;
3895     return 0;
3896 }
3897 
3898 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3899 {
3900     if (bs->backing_hd && bs->backing_hd->encrypted)
3901         return bs->backing_file;
3902     else if (bs->encrypted)
3903         return bs->filename;
3904     else
3905         return NULL;
3906 }
3907 
/*
 * Copy bs->backing_file into @filename, a buffer of @filename_size bytes,
 * using pstrcpy().
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
3913 
3914 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3915                           const uint8_t *buf, int nb_sectors)
3916 {
3917     BlockDriver *drv = bs->drv;
3918     if (!drv)
3919         return -ENOMEDIUM;
3920     if (!drv->bdrv_write_compressed)
3921         return -ENOTSUP;
3922     if (bdrv_check_request(bs, sector_num, nb_sectors))
3923         return -EIO;
3924 
3925     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3926 
3927     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3928 }
3929 
3930 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3931 {
3932     BlockDriver *drv = bs->drv;
3933     if (!drv)
3934         return -ENOMEDIUM;
3935     if (!drv->bdrv_get_info)
3936         return -ENOTSUP;
3937     memset(bdi, 0, sizeof(*bdi));
3938     return drv->bdrv_get_info(bs, bdi);
3939 }
3940 
3941 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3942 {
3943     BlockDriver *drv = bs->drv;
3944     if (drv && drv->bdrv_get_specific_info) {
3945         return drv->bdrv_get_specific_info(bs);
3946     }
3947     return NULL;
3948 }
3949 
3950 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3951                       int64_t pos, int size)
3952 {
3953     QEMUIOVector qiov;
3954     struct iovec iov = {
3955         .iov_base   = (void *) buf,
3956         .iov_len    = size,
3957     };
3958 
3959     qemu_iovec_init_external(&qiov, &iov, 1);
3960     return bdrv_writev_vmstate(bs, &qiov, pos);
3961 }
3962 
3963 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3964 {
3965     BlockDriver *drv = bs->drv;
3966 
3967     if (!drv) {
3968         return -ENOMEDIUM;
3969     } else if (drv->bdrv_save_vmstate) {
3970         return drv->bdrv_save_vmstate(bs, qiov, pos);
3971     } else if (bs->file) {
3972         return bdrv_writev_vmstate(bs->file, qiov, pos);
3973     }
3974 
3975     return -ENOTSUP;
3976 }
3977 
3978 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3979                       int64_t pos, int size)
3980 {
3981     BlockDriver *drv = bs->drv;
3982     if (!drv)
3983         return -ENOMEDIUM;
3984     if (drv->bdrv_load_vmstate)
3985         return drv->bdrv_load_vmstate(bs, buf, pos, size);
3986     if (bs->file)
3987         return bdrv_load_vmstate(bs->file, buf, pos, size);
3988     return -ENOTSUP;
3989 }
3990 
3991 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
3992 {
3993     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
3994         return;
3995     }
3996 
3997     bs->drv->bdrv_debug_event(bs, event);
3998 }
3999 
4000 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4001                           const char *tag)
4002 {
4003     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4004         bs = bs->file;
4005     }
4006 
4007     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4008         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4009     }
4010 
4011     return -ENOTSUP;
4012 }
4013 
4014 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4015 {
4016     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4017         bs = bs->file;
4018     }
4019 
4020     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4021         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4022     }
4023 
4024     return -ENOTSUP;
4025 }
4026 
4027 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4028 {
4029     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4030         bs = bs->file;
4031     }
4032 
4033     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4034         return bs->drv->bdrv_debug_resume(bs, tag);
4035     }
4036 
4037     return -ENOTSUP;
4038 }
4039 
4040 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4041 {
4042     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4043         bs = bs->file;
4044     }
4045 
4046     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4047         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4048     }
4049 
4050     return false;
4051 }
4052 
4053 int bdrv_is_snapshot(BlockDriverState *bs)
4054 {
4055     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4056 }
4057 
4058 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4059  * relative, it must be relative to the chain.  So, passing in bs->filename
4060  * from a BDS as backing_file should not be done, as that may be relative to
4061  * the CWD rather than the chain. */
4062 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4063         const char *backing_file)
4064 {
4065     char *filename_full = NULL;
4066     char *backing_file_full = NULL;
4067     char *filename_tmp = NULL;
4068     int is_protocol = 0;
4069     BlockDriverState *curr_bs = NULL;
4070     BlockDriverState *retval = NULL;
4071 
4072     if (!bs || !bs->drv || !backing_file) {
4073         return NULL;
4074     }
4075 
4076     filename_full     = g_malloc(PATH_MAX);
4077     backing_file_full = g_malloc(PATH_MAX);
4078     filename_tmp      = g_malloc(PATH_MAX);
4079 
4080     is_protocol = path_has_protocol(backing_file);
4081 
4082     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4083 
4084         /* If either of the filename paths is actually a protocol, then
4085          * compare unmodified paths; otherwise make paths relative */
4086         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4087             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4088                 retval = curr_bs->backing_hd;
4089                 break;
4090             }
4091         } else {
4092             /* If not an absolute filename path, make it relative to the current
4093              * image's filename path */
4094             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4095                          backing_file);
4096 
4097             /* We are going to compare absolute pathnames */
4098             if (!realpath(filename_tmp, filename_full)) {
4099                 continue;
4100             }
4101 
4102             /* We need to make sure the backing filename we are comparing against
4103              * is relative to the current image filename (or absolute) */
4104             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4105                          curr_bs->backing_file);
4106 
4107             if (!realpath(filename_tmp, backing_file_full)) {
4108                 continue;
4109             }
4110 
4111             if (strcmp(backing_file_full, filename_full) == 0) {
4112                 retval = curr_bs->backing_hd;
4113                 break;
4114             }
4115         }
4116     }
4117 
4118     g_free(filename_full);
4119     g_free(backing_file_full);
4120     g_free(filename_tmp);
4121     return retval;
4122 }
4123 
4124 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4125 {
4126     if (!bs->drv) {
4127         return 0;
4128     }
4129 
4130     if (!bs->backing_hd) {
4131         return 0;
4132     }
4133 
4134     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4135 }
4136 
4137 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4138 {
4139     BlockDriverState *curr_bs = NULL;
4140 
4141     if (!bs) {
4142         return NULL;
4143     }
4144 
4145     curr_bs = bs;
4146 
4147     while (curr_bs->backing_hd) {
4148         curr_bs = curr_bs->backing_hd;
4149     }
4150     return curr_bs;
4151 }
4152 
4153 /**************************************************************/
4154 /* async I/Os */
4155 
4156 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4157                                  QEMUIOVector *qiov, int nb_sectors,
4158                                  BlockDriverCompletionFunc *cb, void *opaque)
4159 {
4160     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4161 
4162     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4163                                  cb, opaque, false);
4164 }
4165 
4166 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4167                                   QEMUIOVector *qiov, int nb_sectors,
4168                                   BlockDriverCompletionFunc *cb, void *opaque)
4169 {
4170     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4171 
4172     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4173                                  cb, opaque, true);
4174 }
4175 
4176 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4177         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4178         BlockDriverCompletionFunc *cb, void *opaque)
4179 {
4180     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4181 
4182     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4183                                  BDRV_REQ_ZERO_WRITE | flags,
4184                                  cb, opaque, true);
4185 }
4186 
4187 
/*
 * Shared completion state for one bdrv_aio_multiwrite() call.  It is
 * allocated with room for one callbacks[] entry per original request and
 * freed by multiwrite_cb() once the last submitted request has completed.
 */
typedef struct MultiwriteCB {
    int error;          /* first negative completion result seen, else 0 */
    int num_requests;   /* submitted requests that have not completed yet */
    int num_callbacks;  /* number of valid entries in callbacks[] */
    struct {
        BlockDriverCompletionFunc *cb;  /* caller's completion function */
        void *opaque;                   /* argument passed to cb */
        /* merged vector created by multiwrite_merge(); destroyed and freed
         * after cb has run, NULL if none */
        QEMUIOVector *free_qiov;
    } callbacks[];
} MultiwriteCB;
4198 
4199 static void multiwrite_user_cb(MultiwriteCB *mcb)
4200 {
4201     int i;
4202 
4203     for (i = 0; i < mcb->num_callbacks; i++) {
4204         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4205         if (mcb->callbacks[i].free_qiov) {
4206             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4207         }
4208         g_free(mcb->callbacks[i].free_qiov);
4209     }
4210 }
4211 
4212 static void multiwrite_cb(void *opaque, int ret)
4213 {
4214     MultiwriteCB *mcb = opaque;
4215 
4216     trace_multiwrite_cb(mcb, ret);
4217 
4218     if (ret < 0 && !mcb->error) {
4219         mcb->error = ret;
4220     }
4221 
4222     mcb->num_requests--;
4223     if (mcb->num_requests == 0) {
4224         multiwrite_user_cb(mcb);
4225         g_free(mcb);
4226     }
4227 }
4228 
4229 static int multiwrite_req_compare(const void *a, const void *b)
4230 {
4231     const BlockRequest *req1 = a, *req2 = b;
4232 
4233     /*
4234      * Note that we can't simply subtract req2->sector from req1->sector
4235      * here as that could overflow the return value.
4236      */
4237     if (req1->sector > req2->sector) {
4238         return 1;
4239     } else if (req1->sector < req2->sector) {
4240         return -1;
4241     } else {
4242         return 0;
4243     }
4244 }
4245 
4246 /*
4247  * Takes a bunch of requests and tries to merge them. Returns the number of
4248  * requests that remain after merging.
4249  */
4250 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4251     int num_reqs, MultiwriteCB *mcb)
4252 {
4253     int i, outidx;
4254 
4255     // Sort requests by start sector
4256     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4257 
4258     // Check if adjacent requests touch the same clusters. If so, combine them,
4259     // filling up gaps with zero sectors.
4260     outidx = 0;
4261     for (i = 1; i < num_reqs; i++) {
4262         int merge = 0;
4263         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4264 
4265         // Handle exactly sequential writes and overlapping writes.
4266         if (reqs[i].sector <= oldreq_last) {
4267             merge = 1;
4268         }
4269 
4270         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4271             merge = 0;
4272         }
4273 
4274         if (merge) {
4275             size_t size;
4276             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4277             qemu_iovec_init(qiov,
4278                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4279 
4280             // Add the first request to the merged one. If the requests are
4281             // overlapping, drop the last sectors of the first request.
4282             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4283             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4284 
4285             // We should need to add any zeros between the two requests
4286             assert (reqs[i].sector <= oldreq_last);
4287 
4288             // Add the second request
4289             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4290 
4291             reqs[outidx].nb_sectors = qiov->size >> 9;
4292             reqs[outidx].qiov = qiov;
4293 
4294             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4295         } else {
4296             outidx++;
4297             reqs[outidx].sector     = reqs[i].sector;
4298             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4299             reqs[outidx].qiov       = reqs[i].qiov;
4300         }
4301     }
4302 
4303     return outidx + 1;
4304 }
4305 
4306 /*
4307  * Submit multiple AIO write requests at once.
4308  *
4309  * On success, the function returns 0 and all requests in the reqs array have
4310  * been submitted. In error case this function returns -1, and any of the
4311  * requests may or may not be submitted yet. In particular, this means that the
4312  * callback will be called for some of the requests, for others it won't. The
4313  * caller must check the error field of the BlockRequest to wait for the right
4314  * callbacks (if error != 0, no callback will be called).
4315  *
4316  * The implementation may modify the contents of the reqs array, e.g. to merge
4317  * requests. However, the fields opaque and error are left unmodified as they
4318  * are used to signal failure for a single request to the caller.
4319  */
4320 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4321 {
4322     MultiwriteCB *mcb;
4323     int i;
4324 
4325     /* don't submit writes if we don't have a medium */
4326     if (bs->drv == NULL) {
4327         for (i = 0; i < num_reqs; i++) {
4328             reqs[i].error = -ENOMEDIUM;
4329         }
4330         return -1;
4331     }
4332 
4333     if (num_reqs == 0) {
4334         return 0;
4335     }
4336 
4337     // Create MultiwriteCB structure
4338     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4339     mcb->num_requests = 0;
4340     mcb->num_callbacks = num_reqs;
4341 
4342     for (i = 0; i < num_reqs; i++) {
4343         mcb->callbacks[i].cb = reqs[i].cb;
4344         mcb->callbacks[i].opaque = reqs[i].opaque;
4345     }
4346 
4347     // Check for mergable requests
4348     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4349 
4350     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4351 
4352     /* Run the aio requests. */
4353     mcb->num_requests = num_reqs;
4354     for (i = 0; i < num_reqs; i++) {
4355         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4356                               reqs[i].nb_sectors, reqs[i].flags,
4357                               multiwrite_cb, mcb,
4358                               true);
4359     }
4360 
4361     return 0;
4362 }
4363 
/*
 * Cancel the asynchronous request @acb by calling the cancel handler from
 * its AIOCBInfo.
 */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}
4368 
4369 /**************************************************************/
4370 /* async block device emulation */
4371 
/*
 * AIOCB used by bdrv_aio_rw_vector(), which performs the I/O through the
 * driver's bdrv_read/bdrv_write callbacks and reports the result from a
 * bottom half.
 */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;         /* bottom half that delivers the completion */
    int ret;            /* result of the driver's bdrv_read/bdrv_write */
    /* vector translation state */
    QEMUIOVector *qiov; /* caller's I/O vector */
    uint8_t *bounce;    /* linear buffer handed to the driver */
    int is_write;       /* non-zero for writes, zero for reads */
} BlockDriverAIOCBSync;
4381 
4382 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4383 {
4384     BlockDriverAIOCBSync *acb =
4385         container_of(blockacb, BlockDriverAIOCBSync, common);
4386     qemu_bh_delete(acb->bh);
4387     acb->bh = NULL;
4388     qemu_aio_release(acb);
4389 }
4390 
/* AIOCB description (size and cancel handler) for BlockDriverAIOCBSync */
static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
4395 
4396 static void bdrv_aio_bh_cb(void *opaque)
4397 {
4398     BlockDriverAIOCBSync *acb = opaque;
4399 
4400     if (!acb->is_write)
4401         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4402     qemu_vfree(acb->bounce);
4403     acb->common.cb(acb->common.opaque, acb->ret);
4404     qemu_bh_delete(acb->bh);
4405     acb->bh = NULL;
4406     qemu_aio_release(acb);
4407 }
4408 
4409 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4410                                             int64_t sector_num,
4411                                             QEMUIOVector *qiov,
4412                                             int nb_sectors,
4413                                             BlockDriverCompletionFunc *cb,
4414                                             void *opaque,
4415                                             int is_write)
4416 
4417 {
4418     BlockDriverAIOCBSync *acb;
4419 
4420     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4421     acb->is_write = is_write;
4422     acb->qiov = qiov;
4423     acb->bounce = qemu_blockalign(bs, qiov->size);
4424     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4425 
4426     if (is_write) {
4427         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4428         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4429     } else {
4430         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4431     }
4432 
4433     qemu_bh_schedule(acb->bh);
4434 
4435     return &acb->common;
4436 }
4437 
/* Read flavour of bdrv_aio_rw_vector() (is_write = 0) */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
4444 
/* Write flavour of bdrv_aio_rw_vector() (is_write = 1) */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
4451 
4452 
/*
 * AIOCB for requests that are carried out by a coroutine and completed
 * from a bottom half (bdrv_co_em_bh).
 */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* request parameters; req.error holds the result */
    bool is_write;      /* selects the write path in bdrv_co_do_rw() */
    bool *done;         /* if non-NULL, set to true once the callback ran */
    QEMUBH* bh;         /* bottom half that delivers the completion */
} BlockDriverAIOCBCoroutine;
4460 
4461 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4462 {
4463     BlockDriverAIOCBCoroutine *acb =
4464         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4465     bool done = false;
4466 
4467     acb->done = &done;
4468     while (!done) {
4469         qemu_aio_wait();
4470     }
4471 }
4472 
/* AIOCB description (size and cancel handler) for BlockDriverAIOCBCoroutine */
static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
4477 
4478 static void bdrv_co_em_bh(void *opaque)
4479 {
4480     BlockDriverAIOCBCoroutine *acb = opaque;
4481 
4482     acb->common.cb(acb->common.opaque, acb->req.error);
4483 
4484     if (acb->done) {
4485         *acb->done = true;
4486     }
4487 
4488     qemu_bh_delete(acb->bh);
4489     qemu_aio_release(acb);
4490 }
4491 
4492 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4493 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4494 {
4495     BlockDriverAIOCBCoroutine *acb = opaque;
4496     BlockDriverState *bs = acb->common.bs;
4497 
4498     if (!acb->is_write) {
4499         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4500             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4501     } else {
4502         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4503             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4504     }
4505 
4506     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4507     qemu_bh_schedule(acb->bh);
4508 }
4509 
4510 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4511                                                int64_t sector_num,
4512                                                QEMUIOVector *qiov,
4513                                                int nb_sectors,
4514                                                BdrvRequestFlags flags,
4515                                                BlockDriverCompletionFunc *cb,
4516                                                void *opaque,
4517                                                bool is_write)
4518 {
4519     Coroutine *co;
4520     BlockDriverAIOCBCoroutine *acb;
4521 
4522     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4523     acb->req.sector = sector_num;
4524     acb->req.nb_sectors = nb_sectors;
4525     acb->req.qiov = qiov;
4526     acb->req.flags = flags;
4527     acb->is_write = is_write;
4528     acb->done = NULL;
4529 
4530     co = qemu_coroutine_create(bdrv_co_do_rw);
4531     qemu_coroutine_enter(co, acb);
4532 
4533     return &acb->common;
4534 }
4535 
4536 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4537 {
4538     BlockDriverAIOCBCoroutine *acb = opaque;
4539     BlockDriverState *bs = acb->common.bs;
4540 
4541     acb->req.error = bdrv_co_flush(bs);
4542     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4543     qemu_bh_schedule(acb->bh);
4544 }
4545 
4546 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4547         BlockDriverCompletionFunc *cb, void *opaque)
4548 {
4549     trace_bdrv_aio_flush(bs, opaque);
4550 
4551     Coroutine *co;
4552     BlockDriverAIOCBCoroutine *acb;
4553 
4554     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4555     acb->done = NULL;
4556 
4557     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4558     qemu_coroutine_enter(co, acb);
4559 
4560     return &acb->common;
4561 }
4562 
4563 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4564 {
4565     BlockDriverAIOCBCoroutine *acb = opaque;
4566     BlockDriverState *bs = acb->common.bs;
4567 
4568     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4569     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4570     qemu_bh_schedule(acb->bh);
4571 }
4572 
4573 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4574         int64_t sector_num, int nb_sectors,
4575         BlockDriverCompletionFunc *cb, void *opaque)
4576 {
4577     Coroutine *co;
4578     BlockDriverAIOCBCoroutine *acb;
4579 
4580     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4581 
4582     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4583     acb->req.sector = sector_num;
4584     acb->req.nb_sectors = nb_sectors;
4585     acb->done = NULL;
4586     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4587     qemu_coroutine_enter(co, acb);
4588 
4589     return &acb->common;
4590 }
4591 
/* Initialise the block layer by calling module_call_init() for
 * MODULE_INIT_BLOCK. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
4596 
/* Same as bdrv_init(), but sets use_bdrv_whitelist first. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
4602 
4603 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4604                    BlockDriverCompletionFunc *cb, void *opaque)
4605 {
4606     BlockDriverAIOCB *acb;
4607 
4608     acb = g_slice_alloc(aiocb_info->aiocb_size);
4609     acb->aiocb_info = aiocb_info;
4610     acb->bs = bs;
4611     acb->cb = cb;
4612     acb->opaque = opaque;
4613     return acb;
4614 }
4615 
/*
 * Free an AIOCB obtained from qemu_aio_get().  The size to hand back to
 * the slice allocator is taken from the AIOCB's own aiocb_info.
 */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
4621 
4622 /**************************************************************/
4623 /* Coroutine block device emulation */
4624 
/*
 * Links an AIO completion callback back to the coroutine that issued the
 * request and is waiting for it (see bdrv_co_io_em_complete()).
 */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* result passed to the completion callback */
} CoroutineIOCompletion;
4629 
4630 static void bdrv_co_io_em_complete(void *opaque, int ret)
4631 {
4632     CoroutineIOCompletion *co = opaque;
4633 
4634     co->ret = ret;
4635     qemu_coroutine_enter(co->coroutine, NULL);
4636 }
4637 
4638 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4639                                       int nb_sectors, QEMUIOVector *iov,
4640                                       bool is_write)
4641 {
4642     CoroutineIOCompletion co = {
4643         .coroutine = qemu_coroutine_self(),
4644     };
4645     BlockDriverAIOCB *acb;
4646 
4647     if (is_write) {
4648         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4649                                        bdrv_co_io_em_complete, &co);
4650     } else {
4651         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4652                                       bdrv_co_io_em_complete, &co);
4653     }
4654 
4655     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4656     if (!acb) {
4657         return -EIO;
4658     }
4659     qemu_coroutine_yield();
4660 
4661     return co.ret;
4662 }
4663 
/* Read flavour of bdrv_co_io_em() (is_write = false) */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
4670 
/* Write flavour of bdrv_co_io_em() (is_write = true) */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
4677 
/*
 * Coroutine entry point for bdrv_flush(): flush rwco->bs and store the
 * result in rwco->ret, which bdrv_flush() polls.
 */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
4684 
4685 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4686 {
4687     int ret;
4688 
4689     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4690         return 0;
4691     }
4692 
4693     /* Write back cached data to the OS even with cache=unsafe */
4694     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4695     if (bs->drv->bdrv_co_flush_to_os) {
4696         ret = bs->drv->bdrv_co_flush_to_os(bs);
4697         if (ret < 0) {
4698             return ret;
4699         }
4700     }
4701 
4702     /* But don't actually force it to the disk with cache=unsafe */
4703     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4704         goto flush_parent;
4705     }
4706 
4707     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4708     if (bs->drv->bdrv_co_flush_to_disk) {
4709         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4710     } else if (bs->drv->bdrv_aio_flush) {
4711         BlockDriverAIOCB *acb;
4712         CoroutineIOCompletion co = {
4713             .coroutine = qemu_coroutine_self(),
4714         };
4715 
4716         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4717         if (acb == NULL) {
4718             ret = -EIO;
4719         } else {
4720             qemu_coroutine_yield();
4721             ret = co.ret;
4722         }
4723     } else {
4724         /*
4725          * Some block drivers always operate in either writethrough or unsafe
4726          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4727          * know how the server works (because the behaviour is hardcoded or
4728          * depends on server-side configuration), so we can't ensure that
4729          * everything is safe on disk. Returning an error doesn't work because
4730          * that would break guests even if the server operates in writethrough
4731          * mode.
4732          *
4733          * Let's hope the user knows what he's doing.
4734          */
4735         ret = 0;
4736     }
4737     if (ret < 0) {
4738         return ret;
4739     }
4740 
4741     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4742      * in the case of cache=unsafe, so there are no useless flushes.
4743      */
4744 flush_parent:
4745     return bdrv_co_flush(bs->file);
4746 }
4747 
4748 void bdrv_invalidate_cache(BlockDriverState *bs)
4749 {
4750     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4751         bs->drv->bdrv_invalidate_cache(bs);
4752     }
4753 }
4754 
/* Call bdrv_invalidate_cache() on every device in the bdrv_states list. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs);
    }
}
4763 
4764 void bdrv_clear_incoming_migration_all(void)
4765 {
4766     BlockDriverState *bs;
4767 
4768     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4769         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4770     }
4771 }
4772 
4773 int bdrv_flush(BlockDriverState *bs)
4774 {
4775     Coroutine *co;
4776     RwCo rwco = {
4777         .bs = bs,
4778         .ret = NOT_DONE,
4779     };
4780 
4781     if (qemu_in_coroutine()) {
4782         /* Fast-path if already in coroutine context */
4783         bdrv_flush_co_entry(&rwco);
4784     } else {
4785         co = qemu_coroutine_create(bdrv_flush_co_entry);
4786         qemu_coroutine_enter(co, &rwco);
4787         while (rwco.ret == NOT_DONE) {
4788             qemu_aio_wait();
4789         }
4790     }
4791 
4792     return rwco.ret;
4793 }
4794 
/* Arguments and result slot handed to bdrv_discard_co_entry() */
typedef struct DiscardCo {
    BlockDriverState *bs;   /* device to discard on */
    int64_t sector_num;     /* first sector of the range */
    int nb_sectors;         /* number of sectors in the range */
    int ret;                /* result of bdrv_co_discard() */
} DiscardCo;
4801 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4802 {
4803     DiscardCo *rwco = opaque;
4804 
4805     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4806 }
4807 
/* If no limit is specified in the BlockLimits (bs->bl.max_discard is 0),
 * use a default of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768
4812 
4813 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4814                                  int nb_sectors)
4815 {
4816     int max_discard;
4817 
4818     if (!bs->drv) {
4819         return -ENOMEDIUM;
4820     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4821         return -EIO;
4822     } else if (bs->read_only) {
4823         return -EROFS;
4824     }
4825 
4826     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4827 
4828     /* Do nothing if disabled.  */
4829     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4830         return 0;
4831     }
4832 
4833     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4834         return 0;
4835     }
4836 
4837     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4838     while (nb_sectors > 0) {
4839         int ret;
4840         int num = nb_sectors;
4841 
4842         /* align request */
4843         if (bs->bl.discard_alignment &&
4844             num >= bs->bl.discard_alignment &&
4845             sector_num % bs->bl.discard_alignment) {
4846             if (num > bs->bl.discard_alignment) {
4847                 num = bs->bl.discard_alignment;
4848             }
4849             num -= sector_num % bs->bl.discard_alignment;
4850         }
4851 
4852         /* limit request size */
4853         if (num > max_discard) {
4854             num = max_discard;
4855         }
4856 
4857         if (bs->drv->bdrv_co_discard) {
4858             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4859         } else {
4860             BlockDriverAIOCB *acb;
4861             CoroutineIOCompletion co = {
4862                 .coroutine = qemu_coroutine_self(),
4863             };
4864 
4865             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
4866                                             bdrv_co_io_em_complete, &co);
4867             if (acb == NULL) {
4868                 return -EIO;
4869             } else {
4870                 qemu_coroutine_yield();
4871                 ret = co.ret;
4872             }
4873         }
4874         if (ret && ret != -ENOTSUP) {
4875             return ret;
4876         }
4877 
4878         sector_num += num;
4879         nb_sectors -= num;
4880     }
4881     return 0;
4882 }
4883 
4884 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4885 {
4886     Coroutine *co;
4887     DiscardCo rwco = {
4888         .bs = bs,
4889         .sector_num = sector_num,
4890         .nb_sectors = nb_sectors,
4891         .ret = NOT_DONE,
4892     };
4893 
4894     if (qemu_in_coroutine()) {
4895         /* Fast-path if already in coroutine context */
4896         bdrv_discard_co_entry(&rwco);
4897     } else {
4898         co = qemu_coroutine_create(bdrv_discard_co_entry);
4899         qemu_coroutine_enter(co, &rwco);
4900         while (rwco.ret == NOT_DONE) {
4901             qemu_aio_wait();
4902         }
4903     }
4904 
4905     return rwco.ret;
4906 }
4907 
4908 /**************************************************************/
4909 /* removable device support */
4910 
4911 /**
4912  * Return TRUE if the media is present
4913  */
4914 int bdrv_is_inserted(BlockDriverState *bs)
4915 {
4916     BlockDriver *drv = bs->drv;
4917 
4918     if (!drv)
4919         return 0;
4920     if (!drv->bdrv_is_inserted)
4921         return 1;
4922     return drv->bdrv_is_inserted(bs);
4923 }
4924 
4925 /**
4926  * Return whether the media changed since the last call to this
4927  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4928  */
4929 int bdrv_media_changed(BlockDriverState *bs)
4930 {
4931     BlockDriver *drv = bs->drv;
4932 
4933     if (drv && drv->bdrv_media_changed) {
4934         return drv->bdrv_media_changed(bs);
4935     }
4936     return -ENOTSUP;
4937 }
4938 
4939 /**
4940  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4941  */
4942 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4943 {
4944     BlockDriver *drv = bs->drv;
4945 
4946     if (drv && drv->bdrv_eject) {
4947         drv->bdrv_eject(bs, eject_flag);
4948     }
4949 
4950     if (bs->device_name[0] != '\0') {
4951         bdrv_emit_qmp_eject_event(bs, eject_flag);
4952     }
4953 }
4954 
4955 /**
4956  * Lock or unlock the media (if it is locked, the user won't be able
4957  * to eject it manually).
4958  */
4959 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4960 {
4961     BlockDriver *drv = bs->drv;
4962 
4963     trace_bdrv_lock_medium(bs, locked);
4964 
4965     if (drv && drv->bdrv_lock_medium) {
4966         drv->bdrv_lock_medium(bs, locked);
4967     }
4968 }
4969 
4970 /* needed for generic scsi interface */
4971 
4972 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4973 {
4974     BlockDriver *drv = bs->drv;
4975 
4976     if (drv && drv->bdrv_ioctl)
4977         return drv->bdrv_ioctl(bs, req, buf);
4978     return -ENOTSUP;
4979 }
4980 
4981 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4982         unsigned long int req, void *buf,
4983         BlockDriverCompletionFunc *cb, void *opaque)
4984 {
4985     BlockDriver *drv = bs->drv;
4986 
4987     if (drv && drv->bdrv_aio_ioctl)
4988         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4989     return NULL;
4990 }
4991 
4992 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
4993 {
4994     bs->guest_block_size = align;
4995 }
4996 
4997 void *qemu_blockalign(BlockDriverState *bs, size_t size)
4998 {
4999     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5000 }
5001 
5002 /*
5003  * Check if all memory in this vector is sector aligned.
5004  */
5005 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5006 {
5007     int i;
5008     size_t alignment = bdrv_opt_mem_align(bs);
5009 
5010     for (i = 0; i < qiov->niov; i++) {
5011         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5012             return false;
5013         }
5014         if (qiov->iov[i].iov_len % alignment) {
5015             return false;
5016         }
5017     }
5018 
5019     return true;
5020 }
5021 
5022 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
5023 {
5024     int64_t bitmap_size;
5025     BdrvDirtyBitmap *bitmap;
5026 
5027     assert((granularity & (granularity - 1)) == 0);
5028 
5029     granularity >>= BDRV_SECTOR_BITS;
5030     assert(granularity);
5031     bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
5032     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5033     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5034     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5035     return bitmap;
5036 }
5037 
5038 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5039 {
5040     BdrvDirtyBitmap *bm, *next;
5041     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5042         if (bm == bitmap) {
5043             QLIST_REMOVE(bitmap, list);
5044             hbitmap_free(bitmap->bitmap);
5045             g_free(bitmap);
5046             return;
5047         }
5048     }
5049 }
5050 
5051 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5052 {
5053     BdrvDirtyBitmap *bm;
5054     BlockDirtyInfoList *list = NULL;
5055     BlockDirtyInfoList **plist = &list;
5056 
5057     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5058         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5059         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5060         info->count = bdrv_get_dirty_count(bs, bm);
5061         info->granularity =
5062             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5063         entry->value = info;
5064         *plist = entry;
5065         plist = &entry->next;
5066     }
5067 
5068     return list;
5069 }
5070 
5071 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5072 {
5073     if (bitmap) {
5074         return hbitmap_get(bitmap->bitmap, sector);
5075     } else {
5076         return 0;
5077     }
5078 }
5079 
5080 void bdrv_dirty_iter_init(BlockDriverState *bs,
5081                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5082 {
5083     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5084 }
5085 
5086 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5087                     int nr_sectors)
5088 {
5089     BdrvDirtyBitmap *bitmap;
5090     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5091         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5092     }
5093 }
5094 
5095 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5096 {
5097     BdrvDirtyBitmap *bitmap;
5098     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5099         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5100     }
5101 }
5102 
5103 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5104 {
5105     return hbitmap_count(bitmap->bitmap);
5106 }
5107 
5108 /* Get a reference to bs */
5109 void bdrv_ref(BlockDriverState *bs)
5110 {
5111     bs->refcnt++;
5112 }
5113 
5114 /* Release a previously grabbed reference to bs.
5115  * If after releasing, reference count is zero, the BlockDriverState is
5116  * deleted. */
5117 void bdrv_unref(BlockDriverState *bs)
5118 {
5119     assert(bs->refcnt > 0);
5120     if (--bs->refcnt == 0) {
5121         bdrv_delete(bs);
5122     }
5123 }
5124 
5125 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5126 {
5127     assert(bs->in_use != in_use);
5128     bs->in_use = in_use;
5129 }
5130 
5131 int bdrv_in_use(BlockDriverState *bs)
5132 {
5133     return bs->in_use;
5134 }
5135 
5136 void bdrv_iostatus_enable(BlockDriverState *bs)
5137 {
5138     bs->iostatus_enabled = true;
5139     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5140 }
5141 
5142 /* The I/O status is only enabled if the drive explicitly
5143  * enables it _and_ the VM is configured to stop on errors */
5144 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5145 {
5146     return (bs->iostatus_enabled &&
5147            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5148             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5149             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5150 }
5151 
5152 void bdrv_iostatus_disable(BlockDriverState *bs)
5153 {
5154     bs->iostatus_enabled = false;
5155 }
5156 
5157 void bdrv_iostatus_reset(BlockDriverState *bs)
5158 {
5159     if (bdrv_iostatus_is_enabled(bs)) {
5160         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5161         if (bs->job) {
5162             block_job_iostatus_reset(bs->job);
5163         }
5164     }
5165 }
5166 
5167 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5168 {
5169     assert(bdrv_iostatus_is_enabled(bs));
5170     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5171         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5172                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5173     }
5174 }
5175 
5176 void
5177 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5178         enum BlockAcctType type)
5179 {
5180     assert(type < BDRV_MAX_IOTYPE);
5181 
5182     cookie->bytes = bytes;
5183     cookie->start_time_ns = get_clock();
5184     cookie->type = type;
5185 }
5186 
5187 void
5188 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5189 {
5190     assert(cookie->type < BDRV_MAX_IOTYPE);
5191 
5192     bs->nr_bytes[cookie->type] += cookie->bytes;
5193     bs->nr_ops[cookie->type]++;
5194     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5195 }
5196 
5197 void bdrv_img_create(const char *filename, const char *fmt,
5198                      const char *base_filename, const char *base_fmt,
5199                      char *options, uint64_t img_size, int flags,
5200                      Error **errp, bool quiet)
5201 {
5202     QEMUOptionParameter *param = NULL, *create_options = NULL;
5203     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5204     BlockDriver *drv, *proto_drv;
5205     BlockDriver *backing_drv = NULL;
5206     Error *local_err = NULL;
5207     int ret = 0;
5208 
5209     /* Find driver and parse its options */
5210     drv = bdrv_find_format(fmt);
5211     if (!drv) {
5212         error_setg(errp, "Unknown file format '%s'", fmt);
5213         return;
5214     }
5215 
5216     proto_drv = bdrv_find_protocol(filename, true);
5217     if (!proto_drv) {
5218         error_setg(errp, "Unknown protocol '%s'", filename);
5219         return;
5220     }
5221 
5222     create_options = append_option_parameters(create_options,
5223                                               drv->create_options);
5224     create_options = append_option_parameters(create_options,
5225                                               proto_drv->create_options);
5226 
5227     /* Create parameter list with default values */
5228     param = parse_option_parameters("", create_options, param);
5229 
5230     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5231 
5232     /* Parse -o options */
5233     if (options) {
5234         param = parse_option_parameters(options, create_options, param);
5235         if (param == NULL) {
5236             error_setg(errp, "Invalid options for file format '%s'.", fmt);
5237             goto out;
5238         }
5239     }
5240 
5241     if (base_filename) {
5242         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5243                                  base_filename)) {
5244             error_setg(errp, "Backing file not supported for file format '%s'",
5245                        fmt);
5246             goto out;
5247         }
5248     }
5249 
5250     if (base_fmt) {
5251         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5252             error_setg(errp, "Backing file format not supported for file "
5253                              "format '%s'", fmt);
5254             goto out;
5255         }
5256     }
5257 
5258     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5259     if (backing_file && backing_file->value.s) {
5260         if (!strcmp(filename, backing_file->value.s)) {
5261             error_setg(errp, "Error: Trying to create an image with the "
5262                              "same filename as the backing file");
5263             goto out;
5264         }
5265     }
5266 
5267     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5268     if (backing_fmt && backing_fmt->value.s) {
5269         backing_drv = bdrv_find_format(backing_fmt->value.s);
5270         if (!backing_drv) {
5271             error_setg(errp, "Unknown backing file format '%s'",
5272                        backing_fmt->value.s);
5273             goto out;
5274         }
5275     }
5276 
5277     // The size for the image must always be specified, with one exception:
5278     // If we are using a backing file, we can obtain the size from there
5279     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5280     if (size && size->value.n == -1) {
5281         if (backing_file && backing_file->value.s) {
5282             BlockDriverState *bs;
5283             uint64_t size;
5284             char buf[32];
5285             int back_flags;
5286 
5287             /* backing files always opened read-only */
5288             back_flags =
5289                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5290 
5291             bs = bdrv_new("");
5292 
5293             ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
5294                             backing_drv, &local_err);
5295             if (ret < 0) {
5296                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5297                                  backing_file->value.s,
5298                                  error_get_pretty(local_err));
5299                 error_free(local_err);
5300                 local_err = NULL;
5301                 bdrv_unref(bs);
5302                 goto out;
5303             }
5304             bdrv_get_geometry(bs, &size);
5305             size *= 512;
5306 
5307             snprintf(buf, sizeof(buf), "%" PRId64, size);
5308             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5309 
5310             bdrv_unref(bs);
5311         } else {
5312             error_setg(errp, "Image creation needs a size parameter");
5313             goto out;
5314         }
5315     }
5316 
5317     if (!quiet) {
5318         printf("Formatting '%s', fmt=%s ", filename, fmt);
5319         print_option_parameters(param);
5320         puts("");
5321     }
5322     ret = bdrv_create(drv, filename, param, &local_err);
5323     if (ret == -EFBIG) {
5324         /* This is generally a better message than whatever the driver would
5325          * deliver (especially because of the cluster_size_hint), since that
5326          * is most probably not much different from "image too large". */
5327         const char *cluster_size_hint = "";
5328         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5329             cluster_size_hint = " (try using a larger cluster size)";
5330         }
5331         error_setg(errp, "The image size is too large for file format '%s'"
5332                    "%s", fmt, cluster_size_hint);
5333         error_free(local_err);
5334         local_err = NULL;
5335     }
5336 
5337 out:
5338     free_option_parameters(create_options);
5339     free_option_parameters(param);
5340 
5341     if (error_is_set(&local_err)) {
5342         error_propagate(errp, local_err);
5343     }
5344 }
5345 
5346 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5347 {
5348     /* Currently BlockDriverState always uses the main loop AioContext */
5349     return qemu_get_aio_context();
5350 }
5351 
5352 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5353                                     NotifierWithReturn *notifier)
5354 {
5355     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5356 }
5357 
5358 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5359 {
5360     if (bs->drv->bdrv_amend_options == NULL) {
5361         return -ENOTSUP;
5362     }
5363     return bs->drv->bdrv_amend_options(bs, options);
5364 }
5365 
5366 /* Used to recurse on single child block filters.
5367  * Single child block filter will store their child in bs->file.
5368  */
5369 bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5370                                       BlockDriverState *candidate)
5371 {
5372     if (!bs->drv) {
5373         return false;
5374     }
5375 
5376     if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5377         if (bs == candidate) {
5378             return true;
5379         } else {
5380             return false;
5381         }
5382     }
5383 
5384     if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5385         return false;
5386     }
5387 
5388     if (!bs->file) {
5389         return false;
5390     }
5391 
5392     return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5393 }
5394 
5395 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5396                                       BlockDriverState *candidate)
5397 {
5398     if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5399         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5400     }
5401 
5402     return bdrv_generic_is_first_non_filter(bs, candidate);
5403 }
5404 
5405 /* This function checks if the candidate is the first non filter bs down it's
5406  * bs chain. Since we don't have pointers to parents it explore all bs chains
5407  * from the top. Some filters can choose not to pass down the recursion.
5408  */
5409 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5410 {
5411     BlockDriverState *bs;
5412 
5413     /* walk down the bs forest recursively */
5414     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5415         bool perm;
5416 
5417         if (!bs->file) {
5418             continue;
5419         }
5420 
5421         perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);
5422 
5423         /* candidate is the first non filter */
5424         if (perm) {
5425             return true;
5426         }
5427     }
5428 
5429     return false;
5430 }
5431