/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
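
/*
 * Illustrative examples (not part of the original file): with these
 * definitions, is_windows_drive("c:") and is_windows_drive("\\\\.\\PhysicalDrive0")
 * return 1, while is_windows_drive("c:\\disk.img") returns 0 -- only a bare
 * drive letter or a device path counts as a whole drive.
 * is_windows_drive_prefix() does accept "c:\\disk.img".
 */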
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
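
/*
 * Hypothetical usage sketch (an assumption, not part of the original file):
 * enabling throttling and then applying a total-bandwidth limit of 1 MB/s.
 * The ThrottleConfig/LeakyBucket field names below follow qemu/throttle.h
 * as of this era.
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1 * 1024 * 1024;
 *     bdrv_io_limits_enable(bs);    // must come first, see comment above
 *     bdrv_set_io_limits(bs, &cfg);
 */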

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
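
/*
 * Illustrative sketch (an assumption, not from the original file): a caller
 * allocating a bounce buffer suitable for O_DIRECT I/O on bs would typically
 * use this alignment, e.g.:
 *
 *     void *buf = qemu_memalign(bdrv_opt_mem_align(bs), len);
 *     ...
 *     qemu_vfree(buf);
 */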

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
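
/*
 * Examples (illustrative): path_has_protocol("nbd:unix:/tmp/sock") is true
 * because a ':' appears before any path separator, while
 * path_has_protocol("/dev/sda") is false; on Windows, "d:\\disk.img" is
 * also false because a drive letter is not treated as a protocol prefix.
 */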

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
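
/*
 * Example (illustrative): combining the base path "/vm/base.qcow2" with the
 * relative name "snap.qcow2" yields "/vm/snap.qcow2":
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/vm/base.qcow2", "snap.qcow2");
 */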

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
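
/*
 * Hypothetical usage sketch (an assumption; it mirrors the snapshot overlay
 * code in bdrv_open() below): creating a 1 GiB qcow2 image.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);
 *     ret = bdrv_create(drv, "test.qcow2", opts, &local_err);
 *     free_option_parameters(opts);
 */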

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
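
/*
 * Usage sketch (illustrative, not from the original file): callers reserve
 * the name and then create the real image over it, as the snapshot code in
 * bdrv_open() does:
 *
 *     char tmp[PATH_MAX + 1];
 *     ret = get_tmp_filename(tmp, sizeof(tmp));
 */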

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
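
/*
 * Examples (illustrative): "nbd:localhost:10809" resolves to the driver
 * whose protocol_name is "nbd"; "/dev/cdrom" is claimed first by a host
 * device driver via bdrv_probe_device(); a plain "disk.img" has no
 * protocol prefix and falls back to the "file" driver.
 */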

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
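
/*
 * Example (illustrative):
 *
 *     int flags = 0;
 *     if (bdrv_parse_discard_flags("unmap", &flags) == 0) {
 *         // flags now has BDRV_O_UNMAP set
 *     }
 */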

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
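
/*
 * Summary of the mapping implemented above (illustrative):
 *
 *     "none"/"off"   -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     "directsync"   -> BDRV_O_NOCACHE
 *     "writeback"    -> BDRV_O_CACHE_WB
 *     "unsafe"       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *     "writethrough" -> no flags (the default)
 */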

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* an empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called with a protocol driver directly as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(*options, "filename");
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
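
/*
 * Example (illustrative, with hypothetical option values): if the caller's
 * options QDict contains
 *
 *     "file.driver" = "file", "file.filename" = "disk.img"
 *
 * then bdrv_open_image(&file, NULL, options, "file", flags, false, errp)
 * strips the "file." prefix and opens the protocol layer with
 * { "driver": "file", "filename": "disk.img" }.
 */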

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        QINCREF(options);
        bs1 = NULL;
        ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP) |
                          BDRV_O_PROTOCOL, true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See the fail path above, but here the BDS always has to be closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
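
/*
 * Minimal usage sketch (an assumption, not part of the original file):
 * probing the image format automatically and opening read/write.
 *
 *     BlockDriverState *bs = NULL;
 *     Error *err = NULL;
 *     int ret = bdrv_open(&bs, "disk.qcow2", NULL, NULL,
 *                         BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, &err);
 *     ...
 *     bdrv_unref(bs);
 */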

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
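
/*
 * Usage sketch (illustrative, not from the original file): atomically
 * reopening two devices read-only.
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a,
 *                               bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b,
 *                               bs_b->open_flags & ~BDRV_O_RDWR);
 *     ret = bdrv_reopen_multiple(queue, &local_err);  // frees the queue
 */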

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error, errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call bdrv_reopen_abort()
 * or bdrv_reopen_commit() for any other BDS that have been left in a
 * prepared state.
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                 "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}
1753 
1754 /*
1755  * Wait for pending requests to complete across all BlockDriverStates
1756  *
1757  * This function does not flush data to disk, use bdrv_flush_all() for that
1758  * after calling this function.
1759  *
1760  * Note that completion of an asynchronous I/O operation can trigger any
1761  * number of other I/O operations on other devices; for example, a coroutine
1762  * can be arbitrarily complex, and a steady flow of I/O may continue until
1763  * the coroutine completes.  Because of this, it is not possible to provide
1764  * a function that drains a single device's I/O queue.
1765  */
1766 void bdrv_drain_all(void)
1767 {
1768     /* Always run first iteration so any pending completion BHs run */
1769     bool busy = true;
1770     BlockDriverState *bs;
1771 
1772     while (busy) {
1773         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1774             bdrv_start_throttled_reqs(bs);
1775         }
1776 
1777         busy = bdrv_requests_pending_all();
1778         busy |= aio_poll(qemu_get_aio_context(), busy);
1779     }
1780 }
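
/*
 * Illustrative sketch (not part of the original file): the drain/flush
 * ordering the comment above prescribes. example_quiesce_all() is
 * hypothetical.
 */
#if 0 /* usage sketch */
static void example_quiesce_all(void)
{
    bdrv_drain_all();  /* wait for all in-flight requests to complete */
    bdrv_flush_all();  /* then flush the completed writes to disk */
}
#endif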
1781 
1782 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
1783  * graph_bdrv_states lists.
1784  * Also, NUL-terminate the device_name to prevent a double remove. */
1785 void bdrv_make_anon(BlockDriverState *bs)
1786 {
1787     if (bs->device_name[0] != '\0') {
1788         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1789     }
1790     bs->device_name[0] = '\0';
1791     if (bs->node_name[0] != '\0') {
1792         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1793     }
1794     bs->node_name[0] = '\0';
1795 }
1796 
1797 static void bdrv_rebind(BlockDriverState *bs)
1798 {
1799     if (bs->drv && bs->drv->bdrv_rebind) {
1800         bs->drv->bdrv_rebind(bs);
1801     }
1802 }
1803 
1804 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1805                                      BlockDriverState *bs_src)
1806 {
1807     /* move some fields that need to stay attached to the device */
1808     bs_dest->open_flags         = bs_src->open_flags;
1809 
1810     /* dev info */
1811     bs_dest->dev_ops            = bs_src->dev_ops;
1812     bs_dest->dev_opaque         = bs_src->dev_opaque;
1813     bs_dest->dev                = bs_src->dev;
1814     bs_dest->guest_block_size   = bs_src->guest_block_size;
1815     bs_dest->copy_on_read       = bs_src->copy_on_read;
1816 
1817     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1818 
1819     /* i/o throttled req */
1820     memcpy(&bs_dest->throttle_state,
1821            &bs_src->throttle_state,
1822            sizeof(ThrottleState));
1823     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1824     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1825     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1826 
1827     /* r/w error */
1828     bs_dest->on_read_error      = bs_src->on_read_error;
1829     bs_dest->on_write_error     = bs_src->on_write_error;
1830 
1831     /* i/o status */
1832     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1833     bs_dest->iostatus           = bs_src->iostatus;
1834 
1835     /* dirty bitmap */
1836     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1837 
1838     /* reference count */
1839     bs_dest->refcnt             = bs_src->refcnt;
1840 
1841     /* job */
1842     bs_dest->in_use             = bs_src->in_use;
1843     bs_dest->job                = bs_src->job;
1844 
1845     /* keep the same entry in bdrv_states */
1846     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1847             bs_src->device_name);
1848     bs_dest->device_list = bs_src->device_list;
1849 
1850     /* keep the same entry in graph_bdrv_states
1851      * We do want to swap the node name, but not the linked-list entries
1852      */
1853     bs_dest->node_list   = bs_src->node_list;
1854 }
1855 
1856 /*
1857  * Swap the contents of two BlockDriverStates while their image chains are
1858  * live, keeping the fields that must stay with the device on the
1859  * BlockDriverState that is actually attached to it.
1860  *
1861  * This will modify the BlockDriverState fields, and swap contents
1862  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1863  *
1864  * bs_new is required to be anonymous.
1865  *
1866  * This function does not create any image files.
1867  */
1868 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1869 {
1870     BlockDriverState tmp;
1871 
1872     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1873     assert(bs_new->device_name[0] == '\0');
1874     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1875     assert(bs_new->job == NULL);
1876     assert(bs_new->dev == NULL);
1877     assert(bs_new->in_use == 0);
1878     assert(bs_new->io_limits_enabled == false);
1879     assert(!throttle_have_timer(&bs_new->throttle_state));
1880 
1881     tmp = *bs_new;
1882     *bs_new = *bs_old;
1883     *bs_old = tmp;
1884 
1885     /* there are some fields that should not be swapped, move them back */
1886     bdrv_move_feature_fields(&tmp, bs_old);
1887     bdrv_move_feature_fields(bs_old, bs_new);
1888     bdrv_move_feature_fields(bs_new, &tmp);
1889 
1890     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1891     assert(bs_new->device_name[0] == '\0');
1892 
1893     /* Check a few fields that should remain attached to the device */
1894     assert(bs_new->dev == NULL);
1895     assert(bs_new->job == NULL);
1896     assert(bs_new->in_use == 0);
1897     assert(bs_new->io_limits_enabled == false);
1898     assert(!throttle_have_timer(&bs_new->throttle_state));
1899 
1900     bdrv_rebind(bs_new);
1901     bdrv_rebind(bs_old);
1902 }
1903 
1904 /*
1905  * Add new bs contents at the top of an image chain while the chain is
1906  * live, while keeping required fields on the top layer.
1907  *
1908  * This will modify the BlockDriverState fields, and swap contents
1909  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1910  *
1911  * bs_new is required to be anonymous.
1912  *
1913  * This function does not create any image files.
1914  */
1915 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1916 {
1917     bdrv_swap(bs_new, bs_top);
1918 
1919     /* After the swap, bs_new holds what used to be bs_top's contents,
1920      * so it becomes the new backing file of bs_top. */
1921     bs_top->backing_hd = bs_new;
1922     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1923     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1924             bs_new->filename);
1925     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1926             bs_new->drv ? bs_new->drv->format_name : "");
1927 }
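
/*
 * Illustrative sketch (not part of the original file): how an external
 * snapshot layers a fresh, anonymous BlockDriverState on top of the active
 * image. example_take_external_snapshot() is hypothetical.
 */
#if 0 /* usage sketch */
static void example_take_external_snapshot(BlockDriverState *active,
                                           BlockDriverState *new_top)
{
    /* new_top must be anonymous, per the bdrv_append() contract above */
    bdrv_append(new_top, active);
    /* 'active' now names the new top image; its backing file is the
     * former active layer, which lives on in new_top after the swap. */
}
#endif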
1928 
1929 static void bdrv_delete(BlockDriverState *bs)
1930 {
1931     assert(!bs->dev);
1932     assert(!bs->job);
1933     assert(!bs->in_use);
1934     assert(!bs->refcnt);
1935     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1936 
1937     bdrv_close(bs);
1938 
1939     /* remove from list, if necessary */
1940     bdrv_make_anon(bs);
1941 
1942     g_free(bs);
1943 }
1944 
1945 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1946 /* TODO change to DeviceState *dev when all users are qdevified */
1947 {
1948     if (bs->dev) {
1949         return -EBUSY;
1950     }
1951     bs->dev = dev;
1952     bdrv_iostatus_reset(bs);
1953     return 0;
1954 }
1955 
1956 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1957 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1958 {
1959     if (bdrv_attach_dev(bs, dev) < 0) {
1960         abort();
1961     }
1962 }
1963 
1964 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1965 /* TODO change to DeviceState *dev when all users are qdevified */
1966 {
1967     assert(bs->dev == dev);
1968     bs->dev = NULL;
1969     bs->dev_ops = NULL;
1970     bs->dev_opaque = NULL;
1971     bs->guest_block_size = 512;
1972 }
1973 
1974 /* TODO change to return DeviceState * when all users are qdevified */
1975 void *bdrv_get_attached_dev(BlockDriverState *bs)
1976 {
1977     return bs->dev;
1978 }
1979 
1980 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1981                       void *opaque)
1982 {
1983     bs->dev_ops = ops;
1984     bs->dev_opaque = opaque;
1985 }
1986 
1987 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1988                                enum MonitorEvent ev,
1989                                BlockErrorAction action, bool is_read)
1990 {
1991     QObject *data;
1992     const char *action_str;
1993 
1994     switch (action) {
1995     case BDRV_ACTION_REPORT:
1996         action_str = "report";
1997         break;
1998     case BDRV_ACTION_IGNORE:
1999         action_str = "ignore";
2000         break;
2001     case BDRV_ACTION_STOP:
2002         action_str = "stop";
2003         break;
2004     default:
2005         abort();
2006     }
2007 
2008     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2009                               bdrv->device_name,
2010                               action_str,
2011                               is_read ? "read" : "write");
2012     monitor_protocol_event(ev, data);
2013 
2014     qobject_decref(data);
2015 }
2016 
2017 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2018 {
2019     QObject *data;
2020 
2021     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2022                               bdrv_get_device_name(bs), ejected);
2023     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2024 
2025     qobject_decref(data);
2026 }
2027 
2028 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2029 {
2030     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2031         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2032         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2033         if (tray_was_closed) {
2034             /* tray open */
2035             bdrv_emit_qmp_eject_event(bs, true);
2036         }
2037         if (load) {
2038             /* tray close */
2039             bdrv_emit_qmp_eject_event(bs, false);
2040         }
2041     }
2042 }
2043 
2044 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2045 {
2046     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2047 }
2048 
2049 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2050 {
2051     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2052         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2053     }
2054 }
2055 
2056 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2057 {
2058     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2059         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2060     }
2061     return false;
2062 }
2063 
2064 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2065 {
2066     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2067         bs->dev_ops->resize_cb(bs->dev_opaque);
2068     }
2069 }
2070 
2071 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2072 {
2073     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2074         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2075     }
2076     return false;
2077 }
2078 
2079 /*
2080  * Run consistency checks on an image
2081  *
2082  * Returns 0 if the check could be completed (it doesn't mean that the image is
2083  * free of errors) or -errno when an internal error occurred. The results of the
2084  * check are stored in res.
2085  */
2086 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2087 {
2088     if (bs->drv->bdrv_check == NULL) {
2089         return -ENOTSUP;
2090     }
2091 
2092     memset(res, 0, sizeof(*res));
2093     return bs->drv->bdrv_check(bs, res, fix);
2094 }
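
/*
 * Illustrative sketch (not part of the original file): a return value of 0
 * from bdrv_check() only means the check ran; res must still be inspected.
 * example_check_image() is hypothetical.
 */
#if 0 /* usage sketch */
static void example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS);

    if (ret < 0) {
        return; /* internal error, e.g. -ENOTSUP when the driver has no checker */
    }
    if (res.corruptions || res.leaks) {
        /* the image has problems even though the check itself succeeded */
    }
}
#endif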
2095 
2096 #define COMMIT_BUF_SECTORS 2048
2097 
2098 /* commit the COW image into its backing file */
2099 int bdrv_commit(BlockDriverState *bs)
2100 {
2101     BlockDriver *drv = bs->drv;
2102     int64_t sector, total_sectors, length, backing_length;
2103     int n, ro, open_flags;
2104     int ret = 0;
2105     uint8_t *buf = NULL;
2106     char filename[PATH_MAX];
2107 
2108     if (!drv)
2109         return -ENOMEDIUM;
2110 
2111     if (!bs->backing_hd) {
2112         return -ENOTSUP;
2113     }
2114 
2115     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2116         return -EBUSY;
2117     }
2118 
2119     ro = bs->backing_hd->read_only;
2120     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2121     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2122     open_flags =  bs->backing_hd->open_flags;
2123 
2124     if (ro) {
2125         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2126             return -EACCES;
2127         }
2128     }
2129 
2130     length = bdrv_getlength(bs);
2131     if (length < 0) {
2132         ret = length;
2133         goto ro_cleanup;
2134     }
2135 
2136     backing_length = bdrv_getlength(bs->backing_hd);
2137     if (backing_length < 0) {
2138         ret = backing_length;
2139         goto ro_cleanup;
2140     }
2141 
2142     /* If our top snapshot is larger than the backing file image,
2143      * grow the backing file image if possible.  If not possible,
2144      * we must return an error */
2145     if (length > backing_length) {
2146         ret = bdrv_truncate(bs->backing_hd, length);
2147         if (ret < 0) {
2148             goto ro_cleanup;
2149         }
2150     }
2151 
2152     total_sectors = length >> BDRV_SECTOR_BITS;
2153     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2154 
2155     for (sector = 0; sector < total_sectors; sector += n) {
2156         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2157         if (ret < 0) {
2158             goto ro_cleanup;
2159         }
2160         if (ret) {
2161             ret = bdrv_read(bs, sector, buf, n);
2162             if (ret < 0) {
2163                 goto ro_cleanup;
2164             }
2165 
2166             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2167             if (ret < 0) {
2168                 goto ro_cleanup;
2169             }
2170         }
2171     }
2172 
2173     if (drv->bdrv_make_empty) {
2174         ret = drv->bdrv_make_empty(bs);
2175         if (ret < 0) {
2176             goto ro_cleanup;
2177         }
2178         bdrv_flush(bs);
2179     }
2180 
2181     /*
2182      * Make sure all data we wrote to the backing device is actually
2183      * stable on disk.
2184      */
2185     if (bs->backing_hd) {
2186         bdrv_flush(bs->backing_hd);
2187     }
2188 
2189     ret = 0;
2190 ro_cleanup:
2191     g_free(buf);
2192 
2193     if (ro) {
2194         /* ignoring error return here */
2195         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2196     }
2197 
2198     return ret;
2199 }
2200 
2201 int bdrv_commit_all(void)
2202 {
2203     BlockDriverState *bs;
2204 
2205     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2206         if (bs->drv && bs->backing_hd) {
2207             int ret = bdrv_commit(bs);
2208             if (ret < 0) {
2209                 return ret;
2210             }
2211         }
2212     }
2213     return 0;
2214 }
2215 
2216 /**
2217  * Remove an active request from the tracked requests list
2218  *
2219  * This function should be called when a tracked request is completing.
2220  */
2221 static void tracked_request_end(BdrvTrackedRequest *req)
2222 {
2223     if (req->serialising) {
2224         req->bs->serialising_in_flight--;
2225     }
2226 
2227     QLIST_REMOVE(req, list);
2228     qemu_co_queue_restart_all(&req->wait_queue);
2229 }
2230 
2231 /**
2232  * Add an active request to the tracked requests list
2233  */
2234 static void tracked_request_begin(BdrvTrackedRequest *req,
2235                                   BlockDriverState *bs,
2236                                   int64_t offset,
2237                                   unsigned int bytes, bool is_write)
2238 {
2239     *req = (BdrvTrackedRequest){
2240         .bs = bs,
2241         .offset         = offset,
2242         .bytes          = bytes,
2243         .is_write       = is_write,
2244         .co             = qemu_coroutine_self(),
2245         .serialising    = false,
2246         .overlap_offset = offset,
2247         .overlap_bytes  = bytes,
2248     };
2249 
2250     qemu_co_queue_init(&req->wait_queue);
2251 
2252     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2253 }
2254 
2255 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2256 {
2257     int64_t overlap_offset = req->offset & ~(align - 1);
2258     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2259                                - overlap_offset;
2260 
2261     if (!req->serialising) {
2262         req->bs->serialising_in_flight++;
2263         req->serialising = true;
2264     }
2265 
2266     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2267     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2268 }
2269 
2270 /**
2271  * Round a region to cluster boundaries
2272  */
2273 void bdrv_round_to_clusters(BlockDriverState *bs,
2274                             int64_t sector_num, int nb_sectors,
2275                             int64_t *cluster_sector_num,
2276                             int *cluster_nb_sectors)
2277 {
2278     BlockDriverInfo bdi;
2279 
2280     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2281         *cluster_sector_num = sector_num;
2282         *cluster_nb_sectors = nb_sectors;
2283     } else {
2284         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2285         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2286         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2287                                             nb_sectors, c);
2288     }
2289 }
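
/*
 * Worked example (illustrative, assuming bs reports a 64 KiB cluster size,
 * i.e. 128 sectors): a request for sectors [130, 135) rounds out to the
 * whole containing cluster, [128, 256). example_round() is hypothetical.
 */
#if 0 /* usage sketch */
static void example_round(BlockDriverState *bs)
{
    int64_t cluster_start;
    int cluster_count;

    bdrv_round_to_clusters(bs, 130, 5, &cluster_start, &cluster_count);
    /* cluster_start == 128, cluster_count == 128 */
}
#endif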
2290 
2291 static int bdrv_get_cluster_size(BlockDriverState *bs)
2292 {
2293     BlockDriverInfo bdi;
2294     int ret;
2295 
2296     ret = bdrv_get_info(bs, &bdi);
2297     if (ret < 0 || bdi.cluster_size == 0) {
2298         return bs->request_alignment;
2299     } else {
2300         return bdi.cluster_size;
2301     }
2302 }
2303 
2304 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2305                                      int64_t offset, unsigned int bytes)
2306 {
2307     /*        aaaa   bbbb */
2308     if (offset >= req->overlap_offset + req->overlap_bytes) {
2309         return false;
2310     }
2311     /* bbbb   aaaa        */
2312     if (req->overlap_offset >= offset + bytes) {
2313         return false;
2314     }
2315     return true;
2316 }
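
/*
 * Worked example (illustrative): a request whose overlap region is
 * [4096, 8192) does not conflict with one covering [8192, 12288), since the
 * regions merely touch, but it does conflict with [0, 4097), which extends
 * one byte into it.
 */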
2317 
2318 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2319 {
2320     BlockDriverState *bs = self->bs;
2321     BdrvTrackedRequest *req;
2322     bool retry;
2323     bool waited = false;
2324 
2325     if (!bs->serialising_in_flight) {
2326         return false;
2327     }
2328 
2329     do {
2330         retry = false;
2331         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2332             if (req == self || (!req->serialising && !self->serialising)) {
2333                 continue;
2334             }
2335             if (tracked_request_overlaps(req, self->overlap_offset,
2336                                          self->overlap_bytes))
2337             {
2338                 /* Hitting this means there was a reentrant request, for
2339                  * example, a block driver issuing nested requests.  This must
2340                  * never happen since it means deadlock.
2341                  */
2342                 assert(qemu_coroutine_self() != req->co);
2343 
2344                 /* If the request is already (indirectly) waiting for us, or
2345                  * will wait for us as soon as it wakes up, then just go on
2346                  * (instead of producing a deadlock in the former case). */
2347                 if (!req->waiting_for) {
2348                     self->waiting_for = req;
2349                     qemu_co_queue_wait(&req->wait_queue);
2350                     self->waiting_for = NULL;
2351                     retry = true;
2352                     waited = true;
2353                     break;
2354                 }
2355             }
2356         }
2357     } while (retry);
2358 
2359     return waited;
2360 }
2361 
2362 /*
2363  * Return values:
2364  * 0        - success
2365  * -EINVAL  - backing format specified, but no file
2366  * -ENOSPC  - can't update the backing file because no space is left in the
2367  *            image file header
2368  * -ENOTSUP - format driver doesn't support changing the backing file
2369  */
2370 int bdrv_change_backing_file(BlockDriverState *bs,
2371     const char *backing_file, const char *backing_fmt)
2372 {
2373     BlockDriver *drv = bs->drv;
2374     int ret;
2375 
2376     /* Backing file format doesn't make sense without a backing file */
2377     if (backing_fmt && !backing_file) {
2378         return -EINVAL;
2379     }
2380 
2381     if (drv->bdrv_change_backing_file != NULL) {
2382         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2383     } else {
2384         ret = -ENOTSUP;
2385     }
2386 
2387     if (ret == 0) {
2388         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2389         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2390     }
2391     return ret;
2392 }
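
/*
 * Illustrative sketch (not part of the original file): retargeting an
 * overlay at a new backing image, mirroring the call made by
 * bdrv_drop_intermediate() below. example_retarget_backing() is hypothetical.
 */
#if 0 /* usage sketch */
static int example_retarget_backing(BlockDriverState *overlay,
                                    BlockDriverState *new_base)
{
    return bdrv_change_backing_file(overlay, new_base->filename,
                                    new_base->drv ?
                                    new_base->drv->format_name : "");
}
#endif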
2393 
2394 /*
2395  * Finds the image layer in the chain that has 'bs' as its backing file.
2396  *
2397  * active is the current topmost image.
2398  *
2399  * Returns NULL if bs is not found in active's image chain,
2400  * or if active == bs.
2401  */
2402 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2403                                     BlockDriverState *bs)
2404 {
2405     BlockDriverState *overlay = NULL;
2406     BlockDriverState *intermediate;
2407 
2408     assert(active != NULL);
2409     assert(bs != NULL);
2410 
2411     /* if bs is the same as active, then by definition it has no overlay
2412      */
2413     if (active == bs) {
2414         return NULL;
2415     }
2416 
2417     intermediate = active;
2418     while (intermediate->backing_hd) {
2419         if (intermediate->backing_hd == bs) {
2420             overlay = intermediate;
2421             break;
2422         }
2423         intermediate = intermediate->backing_hd;
2424     }
2425 
2426     return overlay;
2427 }
2428 
2429 typedef struct BlkIntermediateStates {
2430     BlockDriverState *bs;
2431     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2432 } BlkIntermediateStates;
2433 
2434 
2435 /*
2436  * Drops images above 'base' up to and including 'top', and sets the image
2437  * above 'top' to have base as its backing file.
2438  *
2439  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2440  * information in 'bs' can be properly updated.
2441  *
2442  * E.g., this will convert the following chain:
2443  * bottom <- base <- intermediate <- top <- active
2444  *
2445  * to
2446  *
2447  * bottom <- base <- active
2448  *
2449  * It is allowed for bottom==base, in which case it converts:
2450  *
2451  * base <- intermediate <- top <- active
2452  *
2453  * to
2454  *
2455  * base <- active
2456  *
2457  * Error conditions:
2458  *  if active == top, that is considered an error
2459  *
2460  */
2461 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2462                            BlockDriverState *base)
2463 {
2464     BlockDriverState *intermediate;
2465     BlockDriverState *base_bs = NULL;
2466     BlockDriverState *new_top_bs = NULL;
2467     BlkIntermediateStates *intermediate_state, *next;
2468     int ret = -EIO;
2469 
2470     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2471     QSIMPLEQ_INIT(&states_to_delete);
2472 
2473     if (!top->drv || !base->drv) {
2474         goto exit;
2475     }
2476 
2477     new_top_bs = bdrv_find_overlay(active, top);
2478 
2479     if (new_top_bs == NULL) {
2480         /* we could not find the image above 'top', this is an error */
2481         goto exit;
2482     }
2483 
2484     /* Special case: new_top_bs->backing_hd already points to base, so there
2485      * is nothing to do and no intermediate images to delete. */
2486     if (new_top_bs->backing_hd == base) {
2487         ret = 0;
2488         goto exit;
2489     }
2490 
2491     intermediate = top;
2492 
2493     /* now we will go down through the list, and add each BDS we find
2494      * into our deletion queue, until we hit the 'base'
2495      */
2496     while (intermediate) {
2497         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2498         intermediate_state->bs = intermediate;
2499         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2500 
2501         if (intermediate->backing_hd == base) {
2502             base_bs = intermediate->backing_hd;
2503             break;
2504         }
2505         intermediate = intermediate->backing_hd;
2506     }
2507     if (base_bs == NULL) {
2508         /* Something went wrong: we did not end at the base. Safely
2509          * unravel everything and exit with an error. */
2510         goto exit;
2511     }
2512 
2513     /* success - we can delete the intermediate states, and link top->base */
2514     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2515                                    base_bs->drv ? base_bs->drv->format_name : "");
2516     if (ret) {
2517         goto exit;
2518     }
2519     new_top_bs->backing_hd = base_bs;
2520 
2521     bdrv_refresh_limits(new_top_bs);
2522 
2523     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2524         /* so that bdrv_close() does not recursively close the chain */
2525         intermediate_state->bs->backing_hd = NULL;
2526         bdrv_unref(intermediate_state->bs);
2527     }
2528     ret = 0;
2529 
2530 exit:
2531     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2532         g_free(intermediate_state);
2533     }
2534     return ret;
2535 }
2536 
2537 
2538 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2539                                    size_t size)
2540 {
2541     int64_t len;
2542 
2543     if (!bdrv_is_inserted(bs))
2544         return -ENOMEDIUM;
2545 
2546     if (bs->growable)
2547         return 0;
2548 
2549     len = bdrv_getlength(bs);
2550 
2551     if (offset < 0)
2552         return -EIO;
2553 
2554     if ((offset > len) || (len - offset < size))
2555         return -EIO;
2556 
2557     return 0;
2558 }
2559 
2560 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2561                               int nb_sectors)
2562 {
2563     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2564                                    nb_sectors * BDRV_SECTOR_SIZE);
2565 }
2566 
2567 typedef struct RwCo {
2568     BlockDriverState *bs;
2569     int64_t offset;
2570     QEMUIOVector *qiov;
2571     bool is_write;
2572     int ret;
2573     BdrvRequestFlags flags;
2574 } RwCo;
2575 
2576 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2577 {
2578     RwCo *rwco = opaque;
2579 
2580     if (!rwco->is_write) {
2581         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2582                                       rwco->qiov->size, rwco->qiov,
2583                                       rwco->flags);
2584     } else {
2585         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2586                                        rwco->qiov->size, rwco->qiov,
2587                                        rwco->flags);
2588     }
2589 }
2590 
2591 /*
2592  * Process a vectored synchronous request using coroutines
2593  */
2594 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2595                         QEMUIOVector *qiov, bool is_write,
2596                         BdrvRequestFlags flags)
2597 {
2598     Coroutine *co;
2599     RwCo rwco = {
2600         .bs = bs,
2601         .offset = offset,
2602         .qiov = qiov,
2603         .is_write = is_write,
2604         .ret = NOT_DONE,
2605         .flags = flags,
2606     };
2607 
2608     /*
2609      * In a synchronous call context the vCPU is blocked, so the throttling
2610      * timer cannot fire; therefore I/O throttling has to be disabled here
2611      * if it has been enabled.
2612      */
2613     if (bs->io_limits_enabled) {
2614         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2615                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2616         bdrv_io_limits_disable(bs);
2617     }
2618 
2619     if (qemu_in_coroutine()) {
2620         /* Fast-path if already in coroutine context */
2621         bdrv_rw_co_entry(&rwco);
2622     } else {
2623         co = qemu_coroutine_create(bdrv_rw_co_entry);
2624         qemu_coroutine_enter(co, &rwco);
2625         while (rwco.ret == NOT_DONE) {
2626             qemu_aio_wait();
2627         }
2628     }
2629     return rwco.ret;
2630 }
2631 
2632 /*
2633  * Process a synchronous request using coroutines
2634  */
2635 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2636                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2637 {
2638     QEMUIOVector qiov;
2639     struct iovec iov = {
2640         .iov_base = (void *)buf,
2641         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2642     };
2643 
2644     qemu_iovec_init_external(&qiov, &iov, 1);
2645     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2646                         &qiov, is_write, flags);
2647 }
2648 
2649 /* return < 0 if error. See bdrv_write() for the return codes */
2650 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2651               uint8_t *buf, int nb_sectors)
2652 {
2653     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2654 }
2655 
2656 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2657 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2658                           uint8_t *buf, int nb_sectors)
2659 {
2660     bool enabled;
2661     int ret;
2662 
2663     enabled = bs->io_limits_enabled;
2664     bs->io_limits_enabled = false;
2665     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2666     bs->io_limits_enabled = enabled;
2667     return ret;
2668 }
2669 
2670 /* Return < 0 on error. Important errors are:
2671   -EIO         generic I/O error (may happen for all errors)
2672   -ENOMEDIUM   no media inserted
2673   -EINVAL      invalid sector number or nb_sectors
2674   -EACCES      trying to write to a read-only device
2675 */
2676 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2677                const uint8_t *buf, int nb_sectors)
2678 {
2679     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2680 }
2681 
2682 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2683                       int nb_sectors, BdrvRequestFlags flags)
2684 {
2685     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2686                       BDRV_REQ_ZERO_WRITE | flags);
2687 }
2688 
2689 /*
2690  * Completely zero out a block device with the help of bdrv_write_zeroes.
2691  * The operation is sped up by checking the block status and only writing
2692  * zeroes to the device if they currently do not return zeroes. Optional
2693  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2694  *
2695  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2696  */
2697 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2698 {
2699     int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2700     int64_t ret, nb_sectors, sector_num = 0;
2701     int n;
2702 
2703     for (;;) {
2704         nb_sectors = target_size - sector_num;
2705         if (nb_sectors <= 0) {
2706             return 0;
2707         }
2708         if (nb_sectors > INT_MAX) {
2709             nb_sectors = INT_MAX;
2710         }
2711         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2712         if (ret < 0) {
2713             error_report("error getting block status at sector %" PRId64 ": %s",
2714                          sector_num, strerror(-ret));
2715             return ret;
2716         }
2717         if (ret & BDRV_BLOCK_ZERO) {
2718             sector_num += n;
2719             continue;
2720         }
2721         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2722         if (ret < 0) {
2723             error_report("error writing zeroes at sector %" PRId64 ": %s",
2724                          sector_num, strerror(-ret));
2725             return ret;
2726         }
2727         sector_num += n;
2728     }
2729 }
2730 
2731 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2732 {
2733     QEMUIOVector qiov;
2734     struct iovec iov = {
2735         .iov_base = (void *)buf,
2736         .iov_len = bytes,
2737     };
2738     int ret;
2739 
2740     if (bytes < 0) {
2741         return -EINVAL;
2742     }
2743 
2744     qemu_iovec_init_external(&qiov, &iov, 1);
2745     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2746     if (ret < 0) {
2747         return ret;
2748     }
2749 
2750     return bytes;
2751 }
2752 
2753 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2754 {
2755     int ret;
2756 
2757     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2758     if (ret < 0) {
2759         return ret;
2760     }
2761 
2762     return qiov->size;
2763 }
2764 
2765 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2766                 const void *buf, int bytes)
2767 {
2768     QEMUIOVector qiov;
2769     struct iovec iov = {
2770         .iov_base   = (void *) buf,
2771         .iov_len    = bytes,
2772     };
2773 
2774     if (bytes < 0) {
2775         return -EINVAL;
2776     }
2777 
2778     qemu_iovec_init_external(&qiov, &iov, 1);
2779     return bdrv_pwritev(bs, offset, &qiov);
2780 }
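
/*
 * Illustrative sketch (not part of the original file): bdrv_pread() and
 * bdrv_pwrite() return the byte count on success, not 0.
 * example_copy_first_sector() is hypothetical.
 */
#if 0 /* usage sketch */
static int example_copy_first_sector(BlockDriverState *src,
                                     BlockDriverState *dst)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret = bdrv_pread(src, 0, buf, sizeof(buf));

    if (ret < 0) {
        return ret;
    }
    return bdrv_pwrite(dst, 0, buf, sizeof(buf));
}
#endif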
2781 
2782 /*
2783  * Writes to the file and ensures that no writes are reordered across this
2784  * request (acts as a barrier)
2785  *
2786  * Returns 0 on success, -errno in error cases.
2787  */
2788 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2789     const void *buf, int count)
2790 {
2791     int ret;
2792 
2793     ret = bdrv_pwrite(bs, offset, buf, count);
2794     if (ret < 0) {
2795         return ret;
2796     }
2797 
2798     /* No flush needed for cache modes that already do it */
2799     if (bs->enable_write_cache) {
2800         bdrv_flush(bs);
2801     }
2802 
2803     return 0;
2804 }
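
/*
 * Illustrative sketch (not part of the original file): the barrier semantics
 * make bdrv_pwrite_sync() suitable for ordering-sensitive metadata updates.
 * example_update_header() is hypothetical.
 */
#if 0 /* usage sketch */
static int example_update_header(BlockDriverState *file,
                                 const void *header, int len)
{
    /* the header is stable on disk before any later write can pass it */
    return bdrv_pwrite_sync(file, 0, header, len);
}
#endif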
2805 
2806 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2807         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2808 {
2809     /* Perform I/O through a temporary buffer so that users who scribble over
2810      * their read buffer while the operation is in progress do not end up
2811      * modifying the image file.  This is critical for zero-copy guest I/O
2812      * where anything might happen inside guest memory.
2813      */
2814     void *bounce_buffer;
2815 
2816     BlockDriver *drv = bs->drv;
2817     struct iovec iov;
2818     QEMUIOVector bounce_qiov;
2819     int64_t cluster_sector_num;
2820     int cluster_nb_sectors;
2821     size_t skip_bytes;
2822     int ret;
2823 
2824     /* Cover the entire cluster so no additional backing file I/O is required
2825      * when allocating the cluster in the image file.
2826      */
2827     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2828                            &cluster_sector_num, &cluster_nb_sectors);
2829 
2830     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2831                                    cluster_sector_num, cluster_nb_sectors);
2832 
2833     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2834     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2835     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2836 
2837     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2838                              &bounce_qiov);
2839     if (ret < 0) {
2840         goto err;
2841     }
2842 
2843     if (drv->bdrv_co_write_zeroes &&
2844         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2845         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2846                                       cluster_nb_sectors, 0);
2847     } else {
2848         /* This does not change the data on the disk, so it is not necessary
2849          * to flush even in cache=writethrough mode.
2850          */
2851         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2852                                   &bounce_qiov);
2853     }
2854 
2855     if (ret < 0) {
2856         /* It might be okay to ignore write errors for guest requests.  If this
2857          * is a deliberate copy-on-read then we don't want to ignore the error.
2858          * Simply report it in all cases.
2859          */
2860         goto err;
2861     }
2862 
2863     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2864     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2865                         nb_sectors * BDRV_SECTOR_SIZE);
2866 
2867 err:
2868     qemu_vfree(bounce_buffer);
2869     return ret;
2870 }
2871 
2872 /*
2873  * Forwards an already correctly aligned request to the BlockDriver. This
2874  * handles copy on read and zeroing after EOF; any other features must be
2875  * implemented by the caller.
2876  */
2877 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2878     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2879     int64_t align, QEMUIOVector *qiov, int flags)
2880 {
2881     BlockDriver *drv = bs->drv;
2882     int ret;
2883 
2884     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2885     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2886 
2887     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2888     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2889 
2890     /* Handle Copy on Read and associated serialisation */
2891     if (flags & BDRV_REQ_COPY_ON_READ) {
2892         /* If we touch the same cluster it counts as an overlap.  This
2893          * guarantees that allocating writes will be serialized and not race
2894          * with each other for the same cluster.  For example, in copy-on-read
2895          * it ensures that the CoR read and write operations are atomic and
2896          * guest writes cannot interleave between them. */
2897         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2898     }
2899 
2900     wait_serialising_requests(req);
2901 
2902     if (flags & BDRV_REQ_COPY_ON_READ) {
2903         int pnum;
2904 
2905         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2906         if (ret < 0) {
2907             goto out;
2908         }
2909 
2910         if (!ret || pnum != nb_sectors) {
2911             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2912             goto out;
2913         }
2914     }
2915 
2916     /* Forward the request to the BlockDriver */
2917     if (!(bs->zero_beyond_eof && bs->growable)) {
2918         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2919     } else {
2920         /* Read zeroes after EOF of growable BDSes */
2921         int64_t len, total_sectors, max_nb_sectors;
2922 
2923         len = bdrv_getlength(bs);
2924         if (len < 0) {
2925             ret = len;
2926             goto out;
2927         }
2928 
2929         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2930         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2931                                   align >> BDRV_SECTOR_BITS);
2932         if (max_nb_sectors > 0) {
2933             ret = drv->bdrv_co_readv(bs, sector_num,
2934                                      MIN(nb_sectors, max_nb_sectors), qiov);
2935         } else {
2936             ret = 0;
2937         }
2938 
2939         /* Reading beyond end of file is supposed to produce zeroes */
2940         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2941             uint64_t offset = MAX(0, total_sectors - sector_num);
2942             uint64_t bytes = (sector_num + nb_sectors - offset) *
2943                               BDRV_SECTOR_SIZE;
2944             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2945         }
2946     }
2947 
2948 out:
2949     return ret;
2950 }
2951 
2952 /*
2953  * Handle a read request in coroutine context
2954  */
2955 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2956     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2957     BdrvRequestFlags flags)
2958 {
2959     BlockDriver *drv = bs->drv;
2960     BdrvTrackedRequest req;
2961 
2962     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2963     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2964     uint8_t *head_buf = NULL;
2965     uint8_t *tail_buf = NULL;
2966     QEMUIOVector local_qiov;
2967     bool use_local_qiov = false;
2968     int ret;
2969 
2970     if (!drv) {
2971         return -ENOMEDIUM;
2972     }
2973     if (bdrv_check_byte_request(bs, offset, bytes)) {
2974         return -EIO;
2975     }
2976 
2977     if (bs->copy_on_read) {
2978         flags |= BDRV_REQ_COPY_ON_READ;
2979     }
2980 
2981     /* throttling disk I/O */
2982     if (bs->io_limits_enabled) {
2983         bdrv_io_limits_intercept(bs, bytes, false);
2984     }
2985 
2986     /* Align read if necessary by padding qiov */
2987     if (offset & (align - 1)) {
2988         head_buf = qemu_blockalign(bs, align);
2989         qemu_iovec_init(&local_qiov, qiov->niov + 2);
2990         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
2991         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
2992         use_local_qiov = true;
2993 
2994         bytes += offset & (align - 1);
2995         offset = offset & ~(align - 1);
2996     }
2997 
2998     if ((offset + bytes) & (align - 1)) {
2999         if (!use_local_qiov) {
3000             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3001             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3002             use_local_qiov = true;
3003         }
3004         tail_buf = qemu_blockalign(bs, align);
3005         qemu_iovec_add(&local_qiov, tail_buf,
3006                        align - ((offset + bytes) & (align - 1)));
3007 
3008         bytes = ROUND_UP(bytes, align);
3009     }
3010 
3011     tracked_request_begin(&req, bs, offset, bytes, false);
3012     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3013                               use_local_qiov ? &local_qiov : qiov,
3014                               flags);
3015     tracked_request_end(&req);
3016 
3017     if (use_local_qiov) {
3018         qemu_iovec_destroy(&local_qiov);
3019         qemu_vfree(head_buf);
3020         qemu_vfree(tail_buf);
3021     }
3022 
3023     return ret;
3024 }
3025 
3026 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3027     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3028     BdrvRequestFlags flags)
3029 {
3030     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3031         return -EINVAL;
3032     }
3033 
3034     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3035                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3036 }
3037 
3038 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3039     int nb_sectors, QEMUIOVector *qiov)
3040 {
3041     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3042 
3043     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3044 }
3045 
3046 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3047     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3048 {
3049     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3050 
3051     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3052                             BDRV_REQ_COPY_ON_READ);
3053 }
3054 
3055 /* If no limit is specified in the BlockLimits, use a default
3056  * of 32768 512-byte sectors (16 MiB) per request.
3057  */
3058 #define MAX_WRITE_ZEROES_DEFAULT 32768
3059 
3060 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3061     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3062 {
3063     BlockDriver *drv = bs->drv;
3064     QEMUIOVector qiov;
3065     struct iovec iov = {0};
3066     int ret = 0;
3067 
3068     int max_write_zeroes = bs->bl.max_write_zeroes ?
3069                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3070 
3071     while (nb_sectors > 0 && !ret) {
3072         int num = nb_sectors;
3073 
3074         /* Align request.  Block drivers can expect the "bulk" of the request
3075          * to be aligned.
3076          */
3077         if (bs->bl.write_zeroes_alignment
3078             && num > bs->bl.write_zeroes_alignment) {
3079             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3080                 /* Make a small request up to the first aligned sector.  */
3081                 num = bs->bl.write_zeroes_alignment;
3082                 num -= sector_num % bs->bl.write_zeroes_alignment;
3083             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3084                 /* Shorten the request to the last aligned sector.  num cannot
3085                  * underflow because num > bs->bl.write_zeroes_alignment.
3086                  */
3087                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3088             }
3089         }
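
        /* Worked example (illustrative): with write_zeroes_alignment == 8,
         * a request for sectors [5, 25) is issued as [5, 8) up to the first
         * aligned sector, then the aligned bulk [8, 24), then the unaligned
         * tail [24, 25). */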
3090 
3091         /* limit request size */
3092         if (num > max_write_zeroes) {
3093             num = max_write_zeroes;
3094         }
3095 
3096         ret = -ENOTSUP;
3097         /* First try the efficient write zeroes operation */
3098         if (drv->bdrv_co_write_zeroes) {
3099             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3100         }
3101 
3102         if (ret == -ENOTSUP) {
3103             /* Fall back to bounce buffer if write zeroes is unsupported */
3104             iov.iov_len = num * BDRV_SECTOR_SIZE;
3105             if (iov.iov_base == NULL) {
3106                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3107                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3108             }
3109             qemu_iovec_init_external(&qiov, &iov, 1);
3110 
3111             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3112 
3113             /* Keep the bounce buffer around if it is big enough for
3114              * all future requests.
3115              */
3116             if (num < max_write_zeroes) {
3117                 qemu_vfree(iov.iov_base);
3118                 iov.iov_base = NULL;
3119             }
3120         }
3121 
3122         sector_num += num;
3123         nb_sectors -= num;
3124     }
3125 
3126     qemu_vfree(iov.iov_base);
3127     return ret;
3128 }
3129 
3130 /*
3131  * Forwards an already correctly aligned write request to the BlockDriver.
3132  */
3133 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3134     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3135     QEMUIOVector *qiov, int flags)
3136 {
3137     BlockDriver *drv = bs->drv;
3138     bool waited;
3139     int ret;
3140 
3141     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3142     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3143 
3144     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3145     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3146 
3147     waited = wait_serialising_requests(req);
3148     assert(!waited || !req->serialising);
3149     assert(req->overlap_offset <= offset);
3150     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3151 
3152     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3153 
3154     if (ret < 0) {
3155         /* Do nothing: a write notifier decided to fail this request */
3156     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3157         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3158         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3159     } else {
3160         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3161         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3162     }
3163     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3164 
3165     if (ret == 0 && !bs->enable_write_cache) {
3166         ret = bdrv_co_flush(bs);
3167     }
3168 
3169     bdrv_set_dirty(bs, sector_num, nb_sectors);
3170 
3171     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3172         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3173     }
3174     if (bs->growable && ret >= 0) {
3175         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3176     }
3177 
3178     return ret;
3179 }
3180 
3181 /*
3182  * Handle a write request in coroutine context
3183  */
3184 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3185     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3186     BdrvRequestFlags flags)
3187 {
3188     BdrvTrackedRequest req;
3189     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3190     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3191     uint8_t *head_buf = NULL;
3192     uint8_t *tail_buf = NULL;
3193     QEMUIOVector local_qiov;
3194     bool use_local_qiov = false;
3195     int ret;
3196 
3197     if (!bs->drv) {
3198         return -ENOMEDIUM;
3199     }
3200     if (bs->read_only) {
3201         return -EACCES;
3202     }
3203     if (bdrv_check_byte_request(bs, offset, bytes)) {
3204         return -EIO;
3205     }
3206 
3207     /* throttling disk I/O */
3208     if (bs->io_limits_enabled) {
3209         bdrv_io_limits_intercept(bs, bytes, true);
3210     }
3211 
3212     /*
3213      * Align write if necessary by performing a read-modify-write cycle.
3214      * Pad qiov with the read parts and be sure to have a tracked request not
3215      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3216      */
3217     tracked_request_begin(&req, bs, offset, bytes, true);
3218 
3219     if (offset & (align - 1)) {
3220         QEMUIOVector head_qiov;
3221         struct iovec head_iov;
3222 
3223         mark_request_serialising(&req, align);
3224         wait_serialising_requests(&req);
3225 
3226         head_buf = qemu_blockalign(bs, align);
3227         head_iov = (struct iovec) {
3228             .iov_base   = head_buf,
3229             .iov_len    = align,
3230         };
3231         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3232 
3233         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3234         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3235                                   align, &head_qiov, 0);
3236         if (ret < 0) {
3237             goto fail;
3238         }
3239         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3240 
3241         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3242         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3243         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3244         use_local_qiov = true;
3245 
3246         bytes += offset & (align - 1);
3247         offset = offset & ~(align - 1);
3248     }
3249 
3250     if ((offset + bytes) & (align - 1)) {
3251         QEMUIOVector tail_qiov;
3252         struct iovec tail_iov;
3253         size_t tail_bytes;
3254         bool waited;
3255 
3256         mark_request_serialising(&req, align);
3257         waited = wait_serialising_requests(&req);
3258         assert(!waited || !use_local_qiov);
3259 
3260         tail_buf = qemu_blockalign(bs, align);
3261         tail_iov = (struct iovec) {
3262             .iov_base   = tail_buf,
3263             .iov_len    = align,
3264         };
3265         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3266 
3267         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3268         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3269                                   align, &tail_qiov, 0);
3270         if (ret < 0) {
3271             goto fail;
3272         }
3273         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3274 
3275         if (!use_local_qiov) {
3276             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3277             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3278             use_local_qiov = true;
3279         }
3280 
3281         tail_bytes = (offset + bytes) & (align - 1);
3282         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3283 
3284         bytes = ROUND_UP(bytes, align);
3285     }
3286 
3287     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3288                                use_local_qiov ? &local_qiov : qiov,
3289                                flags);
3290 
3291 fail:
3292     tracked_request_end(&req);
3293 
3294     if (use_local_qiov) {
3295         qemu_iovec_destroy(&local_qiov);
3296     }
3297     qemu_vfree(head_buf);
3298     qemu_vfree(tail_buf);
3299 
3300     return ret;
3301 }
3302 
3303 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3304     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3305     BdrvRequestFlags flags)
3306 {
3307     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3308         return -EINVAL;
3309     }
3310 
3311     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3312                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3313 }
3314 
3315 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3316     int nb_sectors, QEMUIOVector *qiov)
3317 {
3318     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3319 
3320     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3321 }
3322 
3323 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3324                                       int64_t sector_num, int nb_sectors,
3325                                       BdrvRequestFlags flags)
3326 {
3327     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3328 
3329     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3330         flags &= ~BDRV_REQ_MAY_UNMAP;
3331     }
3332 
3333     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3334                              BDRV_REQ_ZERO_WRITE | flags);
3335 }
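
/*
 * Illustrative sketch (not part of the original file): zeroing that is
 * allowed to unmap; the wrapper above strips BDRV_REQ_MAY_UNMAP when the
 * image was not opened with BDRV_O_UNMAP. example_discard_zero() is
 * hypothetical.
 */
#if 0 /* usage sketch */
static int coroutine_fn example_discard_zero(BlockDriverState *bs,
                                             int64_t sector_num,
                                             int nb_sectors)
{
    return bdrv_co_write_zeroes(bs, sector_num, nb_sectors,
                                BDRV_REQ_MAY_UNMAP);
}
#endif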
3336 
3337 /**
3338  * Truncate file to 'offset' bytes (needed only for file protocols)
3339  */
3340 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3341 {
3342     BlockDriver *drv = bs->drv;
3343     int ret;
3344     if (!drv)
3345         return -ENOMEDIUM;
3346     if (!drv->bdrv_truncate)
3347         return -ENOTSUP;
3348     if (bs->read_only)
3349         return -EACCES;
3350     if (bdrv_in_use(bs))
3351         return -EBUSY;
3352     ret = drv->bdrv_truncate(bs, offset);
3353     if (ret == 0) {
3354         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3355         bdrv_dev_resize_cb(bs);
3356     }
3357     return ret;
3358 }
3359 
3360 /**
3361  * Length of an allocated file in bytes. Sparse files are counted by the
3362  * space they actually occupy. Return < 0 on error or if unknown.
3363  */
3364 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3365 {
3366     BlockDriver *drv = bs->drv;
3367     if (!drv) {
3368         return -ENOMEDIUM;
3369     }
3370     if (drv->bdrv_get_allocated_file_size) {
3371         return drv->bdrv_get_allocated_file_size(bs);
3372     }
3373     if (bs->file) {
3374         return bdrv_get_allocated_file_size(bs->file);
3375     }
3376     return -ENOTSUP;
3377 }
3378 
3379 /**
3380  * Length of a file in bytes. Return < 0 if error or unknown.
3381  */
3382 int64_t bdrv_getlength(BlockDriverState *bs)
3383 {
3384     BlockDriver *drv = bs->drv;
3385     if (!drv)
3386         return -ENOMEDIUM;
3387 
3388     if (drv->has_variable_length) {
3389         int ret = refresh_total_sectors(bs, bs->total_sectors);
3390         if (ret < 0) {
3391             return ret;
3392         }
3393     }
3394     return bs->total_sectors * BDRV_SECTOR_SIZE;
3395 }
3396 
3397 /* Return 0 as the number of sectors if no device is present or on error */
3398 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3399 {
3400     int64_t length;
3401     length = bdrv_getlength(bs);
3402     if (length < 0)
3403         length = 0;
3404     else
3405         length = length >> BDRV_SECTOR_BITS;
3406     *nb_sectors_ptr = length;
3407 }
3408 
3409 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3410                        BlockdevOnError on_write_error)
3411 {
3412     bs->on_read_error = on_read_error;
3413     bs->on_write_error = on_write_error;
3414 }
3415 
3416 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3417 {
3418     return is_read ? bs->on_read_error : bs->on_write_error;
3419 }
3420 
3421 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3422 {
3423     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3424 
3425     switch (on_err) {
3426     case BLOCKDEV_ON_ERROR_ENOSPC:
3427         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3428     case BLOCKDEV_ON_ERROR_STOP:
3429         return BDRV_ACTION_STOP;
3430     case BLOCKDEV_ON_ERROR_REPORT:
3431         return BDRV_ACTION_REPORT;
3432     case BLOCKDEV_ON_ERROR_IGNORE:
3433         return BDRV_ACTION_IGNORE;
3434     default:
3435         abort();
3436     }
3437 }
3438 
3439 /* This is done by device models because, while the block layer knows
3440  * about the error, it does not know whether an operation comes from
3441  * the device or the block layer (from a job, for example).
3442  */
3443 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3444                        bool is_read, int error)
3445 {
3446     assert(error >= 0);
3447     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3448     if (action == BDRV_ACTION_STOP) {
3449         vm_stop(RUN_STATE_IO_ERROR);
3450         bdrv_iostatus_set_err(bs, error);
3451     }
3452 }
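
/*
 * Illustrative sketch (not part of the original source): a device model
 * would typically combine the two helpers above, first querying the
 * configured policy and then acting on it.  The function name is
 * hypothetical; note that bdrv_error_action() expects a positive errno.
 *
 *     static bool example_handle_rw_error(BlockDriverState *bs,
 *                                         int error, bool is_read)
 *     {
 *         BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);
 *
 *         bdrv_error_action(bs, action, is_read, error);
 *         if (action == BDRV_ACTION_STOP) {
 *             // keep the request around and retry it when the VM resumes
 *             return true;
 *         }
 *         return false;  // BDRV_ACTION_REPORT or BDRV_ACTION_IGNORE
 *     }
 */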
3453 
3454 int bdrv_is_read_only(BlockDriverState *bs)
3455 {
3456     return bs->read_only;
3457 }
3458 
3459 int bdrv_is_sg(BlockDriverState *bs)
3460 {
3461     return bs->sg;
3462 }
3463 
3464 int bdrv_enable_write_cache(BlockDriverState *bs)
3465 {
3466     return bs->enable_write_cache;
3467 }
3468 
3469 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3470 {
3471     bs->enable_write_cache = wce;
3472 
3473     /* so a reopen() will preserve wce */
3474     if (wce) {
3475         bs->open_flags |= BDRV_O_CACHE_WB;
3476     } else {
3477         bs->open_flags &= ~BDRV_O_CACHE_WB;
3478     }
3479 }
3480 
3481 int bdrv_is_encrypted(BlockDriverState *bs)
3482 {
3483     if (bs->backing_hd && bs->backing_hd->encrypted)
3484         return 1;
3485     return bs->encrypted;
3486 }
3487 
3488 int bdrv_key_required(BlockDriverState *bs)
3489 {
3490     BlockDriverState *backing_hd = bs->backing_hd;
3491 
3492     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3493         return 1;
3494     return (bs->encrypted && !bs->valid_key);
3495 }
3496 
3497 int bdrv_set_key(BlockDriverState *bs, const char *key)
3498 {
3499     int ret;
3500     if (bs->backing_hd && bs->backing_hd->encrypted) {
3501         ret = bdrv_set_key(bs->backing_hd, key);
3502         if (ret < 0)
3503             return ret;
3504         if (!bs->encrypted)
3505             return 0;
3506     }
3507     if (!bs->encrypted) {
3508         return -EINVAL;
3509     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3510         return -ENOMEDIUM;
3511     }
3512     ret = bs->drv->bdrv_set_key(bs, key);
3513     if (ret < 0) {
3514         bs->valid_key = 0;
3515     } else if (!bs->valid_key) {
3516         bs->valid_key = 1;
3517         /* call the change callback now, we skipped it on open */
3518         bdrv_dev_change_media_cb(bs, true);
3519     }
3520     return ret;
3521 }
3522 
3523 const char *bdrv_get_format_name(BlockDriverState *bs)
3524 {
3525     return bs->drv ? bs->drv->format_name : NULL;
3526 }
3527 
3528 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3529                          void *opaque)
3530 {
3531     BlockDriver *drv;
3532 
3533     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3534         it(opaque, drv->format_name);
3535     }
3536 }
3537 
3538 /* Find a block backend by its device name */
3539 BlockDriverState *bdrv_find(const char *name)
3540 {
3541     BlockDriverState *bs;
3542 
3543     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3544         if (!strcmp(name, bs->device_name)) {
3545             return bs;
3546         }
3547     }
3548     return NULL;
3549 }
3550 
3551 /* Find a node in the graph of named BDS nodes */
3552 BlockDriverState *bdrv_find_node(const char *node_name)
3553 {
3554     BlockDriverState *bs;
3555 
3556     assert(node_name);
3557 
3558     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3559         if (!strcmp(node_name, bs->node_name)) {
3560             return bs;
3561         }
3562     }
3563     return NULL;
3564 }
3565 
3566 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3567 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3568 {
3569     BlockDeviceInfoList *list, *entry;
3570     BlockDriverState *bs;
3571 
3572     list = NULL;
3573     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3574         entry = g_malloc0(sizeof(*entry));
3575         entry->value = bdrv_block_device_info(bs);
3576         entry->next = list;
3577         list = entry;
3578     }
3579 
3580     return list;
3581 }
3582 
3583 BlockDriverState *bdrv_lookup_bs(const char *device,
3584                                  const char *node_name,
3585                                  Error **errp)
3586 {
3587     BlockDriverState *bs = NULL;
3588 
3589     if (device) {
3590         bs = bdrv_find(device);
3591 
3592         if (bs) {
3593             return bs;
3594         }
3595     }
3596 
3597     if (node_name) {
3598         bs = bdrv_find_node(node_name);
3599 
3600         if (bs) {
3601             return bs;
3602         }
3603     }
3604 
3605     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3606                      device ? device : "",
3607                      node_name ? node_name : "");
3608     return NULL;
3609 }
3610 
3611 BlockDriverState *bdrv_next(BlockDriverState *bs)
3612 {
3613     if (!bs) {
3614         return QTAILQ_FIRST(&bdrv_states);
3615     }
3616     return QTAILQ_NEXT(bs, device_list);
3617 }
3618 
3619 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3620 {
3621     BlockDriverState *bs;
3622 
3623     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3624         it(opaque, bs);
3625     }
3626 }
3627 
3628 const char *bdrv_get_device_name(BlockDriverState *bs)
3629 {
3630     return bs->device_name;
3631 }
3632 
3633 int bdrv_get_flags(BlockDriverState *bs)
3634 {
3635     return bs->open_flags;
3636 }
3637 
3638 int bdrv_flush_all(void)
3639 {
3640     BlockDriverState *bs;
3641     int result = 0;
3642 
3643     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3644         int ret = bdrv_flush(bs);
3645         if (ret < 0 && !result) {
3646             result = ret;
3647         }
3648     }
3649 
3650     return result;
3651 }
3652 
3653 int bdrv_has_zero_init_1(BlockDriverState *bs)
3654 {
3655     return 1;
3656 }
3657 
3658 int bdrv_has_zero_init(BlockDriverState *bs)
3659 {
3660     assert(bs->drv);
3661 
3662     /* If BS is a copy-on-write image, it is initialized to
3663        the contents of the base image, which may not be zeroes.  */
3664     if (bs->backing_hd) {
3665         return 0;
3666     }
3667     if (bs->drv->bdrv_has_zero_init) {
3668         return bs->drv->bdrv_has_zero_init(bs);
3669     }
3670 
3671     /* safe default */
3672     return 0;
3673 }
3674 
3675 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3676 {
3677     BlockDriverInfo bdi;
3678 
3679     if (bs->backing_hd) {
3680         return false;
3681     }
3682 
3683     if (bdrv_get_info(bs, &bdi) == 0) {
3684         return bdi.unallocated_blocks_are_zero;
3685     }
3686 
3687     return false;
3688 }
3689 
3690 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3691 {
3692     BlockDriverInfo bdi;
3693 
3694     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3695         return false;
3696     }
3697 
3698     if (bdrv_get_info(bs, &bdi) == 0) {
3699         return bdi.can_write_zeroes_with_unmap;
3700     }
3701 
3702     return false;
3703 }
3704 
3705 typedef struct BdrvCoGetBlockStatusData {
3706     BlockDriverState *bs;
3707     BlockDriverState *base;
3708     int64_t sector_num;
3709     int nb_sectors;
3710     int *pnum;
3711     int64_t ret;
3712     bool done;
3713 } BdrvCoGetBlockStatusData;
3714 
3715 /*
3716  * Returns the allocation status of the specified sectors. Drivers that do
3717  * not implement the functionality are assumed to not support backing files,
3718  * hence all their sectors are reported as allocated.
3719  *
3720  * If 'sector_num' is beyond the end of the disk image the return value is 0
3721  * and 'pnum' is set to 0.
3722  *
3723  * 'pnum' is set to the number of sectors (including and immediately following
3724  * the specified sector) that are known to be in the same
3725  * allocated/unallocated state.
3726  *
3727  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3728  * beyond the end of the disk image it will be clamped.
3729  */
3730 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3731                                                      int64_t sector_num,
3732                                                      int nb_sectors, int *pnum)
3733 {
3734     int64_t length;
3735     int64_t n;
3736     int64_t ret, ret2;
3737 
3738     length = bdrv_getlength(bs);
3739     if (length < 0) {
3740         return length;
3741     }
3742 
3743     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3744         *pnum = 0;
3745         return 0;
3746     }
3747 
3748     n = bs->total_sectors - sector_num;
3749     if (n < nb_sectors) {
3750         nb_sectors = n;
3751     }
3752 
3753     if (!bs->drv->bdrv_co_get_block_status) {
3754         *pnum = nb_sectors;
3755         ret = BDRV_BLOCK_DATA;
3756         if (bs->drv->protocol_name) {
3757             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3758         }
3759         return ret;
3760     }
3761 
3762     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3763     if (ret < 0) {
3764         *pnum = 0;
3765         return ret;
3766     }
3767 
3768     if (ret & BDRV_BLOCK_RAW) {
3769         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3770         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3771                                      *pnum, pnum);
3772     }
3773 
3774     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3775         if (bdrv_unallocated_blocks_are_zero(bs)) {
3776             ret |= BDRV_BLOCK_ZERO;
3777         } else if (bs->backing_hd) {
3778             BlockDriverState *bs2 = bs->backing_hd;
3779             int64_t length2 = bdrv_getlength(bs2);
3780             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3781                 ret |= BDRV_BLOCK_ZERO;
3782             }
3783         }
3784     }
3785 
3786     if (bs->file &&
3787         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3788         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3789         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3790                                         *pnum, pnum);
3791         if (ret2 >= 0) {
3792             /* Ignore errors.  This is just providing extra information, it
3793              * is useful but not necessary.
3794              */
3795             ret |= (ret2 & BDRV_BLOCK_ZERO);
3796         }
3797     }
3798 
3799     return ret;
3800 }
3801 
3802 /* Coroutine wrapper for bdrv_get_block_status() */
3803 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3804 {
3805     BdrvCoGetBlockStatusData *data = opaque;
3806     BlockDriverState *bs = data->bs;
3807 
3808     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3809                                          data->pnum);
3810     data->done = true;
3811 }
3812 
3813 /*
3814  * Synchronous wrapper around bdrv_co_get_block_status().
3815  *
3816  * See bdrv_co_get_block_status() for details.
3817  */
3818 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3819                               int nb_sectors, int *pnum)
3820 {
3821     Coroutine *co;
3822     BdrvCoGetBlockStatusData data = {
3823         .bs = bs,
3824         .sector_num = sector_num,
3825         .nb_sectors = nb_sectors,
3826         .pnum = pnum,
3827         .done = false,
3828     };
3829 
3830     if (qemu_in_coroutine()) {
3831         /* Fast-path if already in coroutine context */
3832         bdrv_get_block_status_co_entry(&data);
3833     } else {
3834         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3835         qemu_coroutine_enter(co, &data);
3836         while (!data.done) {
3837             qemu_aio_wait();
3838         }
3839     }
3840     return data.ret;
3841 }
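
/*
 * Illustrative sketch (not part of the original source): walking the
 * allocation map of an image with the synchronous wrapper above,
 * advancing by *pnum after each query.  The chunk size of 65536 sectors
 * is an arbitrary choice for the example.
 *
 *     int64_t sector = 0;
 *     int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 *
 *     while (sector < total) {
 *         int num;
 *         int64_t status = bdrv_get_block_status(bs, sector,
 *                                                MIN(total - sector, 65536),
 *                                                &num);
 *         if (status < 0 || num == 0) {
 *             break;
 *         }
 *         if (status & BDRV_BLOCK_ZERO) {
 *             // [sector, sector + num) reads as zeroes
 *         }
 *         sector += num;
 *     }
 */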
3842 
3843 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3844                                    int nb_sectors, int *pnum)
3845 {
3846     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3847     if (ret < 0) {
3848         return ret;
3849     }
3850     return
3851         (ret & BDRV_BLOCK_DATA) ||
3852         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3853 }
3854 
3855 /*
3856  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3857  *
3858  * Return true if the given sector is allocated in any image between
3859  * BASE (exclusive) and TOP (inclusive).  BASE can be NULL to check if the
3860  * given sector is allocated in any image of the chain.  Return false otherwise.
3861  *
3862  * 'pnum' is set to the number of sectors (including and immediately following
3863  *  the specified sector) that are known to be in the same
3864  *  allocated/unallocated state.
3865  *
3866  */
3867 int bdrv_is_allocated_above(BlockDriverState *top,
3868                             BlockDriverState *base,
3869                             int64_t sector_num,
3870                             int nb_sectors, int *pnum)
3871 {
3872     BlockDriverState *intermediate;
3873     int ret, n = nb_sectors;
3874 
3875     intermediate = top;
3876     while (intermediate && intermediate != base) {
3877         int pnum_inter;
3878         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3879                                 &pnum_inter);
3880         if (ret < 0) {
3881             return ret;
3882         } else if (ret) {
3883             *pnum = pnum_inter;
3884             return 1;
3885         }
3886 
3887         /*
3888          * [sector_num, sector_num + nb_sectors) is unallocated on top but
3889          * the intermediate image might have
3890          *
3891          * [sector_num + x, sector_num + nb_sectors) allocated.
3892          */
3893         if (n > pnum_inter &&
3894             (intermediate == top ||
3895              sector_num + pnum_inter < intermediate->total_sectors)) {
3896             n = pnum_inter;
3897         }
3898 
3899         intermediate = intermediate->backing_hd;
3900     }
3901 
3902     *pnum = n;
3903     return 0;
3904 }
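
/*
 * Worked example (added for illustration, not in the original source):
 * given the chain base.img <- sn1.qcow2 <- top.qcow2, calling
 *
 *     bdrv_is_allocated_above(top, base, sector_num, nb_sectors, &pnum);
 *
 * returns 1 if the sector is allocated in top.qcow2 or sn1.qcow2, but
 * base.img itself is never examined.  Passing base == NULL extends the
 * search to the whole chain, including base.img.
 */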
3905 
3906 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3907 {
3908     if (bs->backing_hd && bs->backing_hd->encrypted)
3909         return bs->backing_file;
3910     else if (bs->encrypted)
3911         return bs->filename;
3912     else
3913         return NULL;
3914 }
3915 
3916 void bdrv_get_backing_filename(BlockDriverState *bs,
3917                                char *filename, int filename_size)
3918 {
3919     pstrcpy(filename, filename_size, bs->backing_file);
3920 }
3921 
3922 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3923                           const uint8_t *buf, int nb_sectors)
3924 {
3925     BlockDriver *drv = bs->drv;
3926     if (!drv)
3927         return -ENOMEDIUM;
3928     if (!drv->bdrv_write_compressed)
3929         return -ENOTSUP;
3930     if (bdrv_check_request(bs, sector_num, nb_sectors))
3931         return -EIO;
3932 
3933     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3934 
3935     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3936 }
3937 
3938 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3939 {
3940     BlockDriver *drv = bs->drv;
3941     if (!drv)
3942         return -ENOMEDIUM;
3943     if (!drv->bdrv_get_info)
3944         return -ENOTSUP;
3945     memset(bdi, 0, sizeof(*bdi));
3946     return drv->bdrv_get_info(bs, bdi);
3947 }
3948 
3949 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3950 {
3951     BlockDriver *drv = bs->drv;
3952     if (drv && drv->bdrv_get_specific_info) {
3953         return drv->bdrv_get_specific_info(bs);
3954     }
3955     return NULL;
3956 }
3957 
3958 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3959                       int64_t pos, int size)
3960 {
3961     QEMUIOVector qiov;
3962     struct iovec iov = {
3963         .iov_base   = (void *) buf,
3964         .iov_len    = size,
3965     };
3966 
3967     qemu_iovec_init_external(&qiov, &iov, 1);
3968     return bdrv_writev_vmstate(bs, &qiov, pos);
3969 }
3970 
3971 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3972 {
3973     BlockDriver *drv = bs->drv;
3974 
3975     if (!drv) {
3976         return -ENOMEDIUM;
3977     } else if (drv->bdrv_save_vmstate) {
3978         return drv->bdrv_save_vmstate(bs, qiov, pos);
3979     } else if (bs->file) {
3980         return bdrv_writev_vmstate(bs->file, qiov, pos);
3981     }
3982 
3983     return -ENOTSUP;
3984 }
3985 
3986 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3987                       int64_t pos, int size)
3988 {
3989     BlockDriver *drv = bs->drv;
3990     if (!drv)
3991         return -ENOMEDIUM;
3992     if (drv->bdrv_load_vmstate)
3993         return drv->bdrv_load_vmstate(bs, buf, pos, size);
3994     if (bs->file)
3995         return bdrv_load_vmstate(bs->file, buf, pos, size);
3996     return -ENOTSUP;
3997 }
3998 
3999 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4000 {
4001     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4002         return;
4003     }
4004 
4005     bs->drv->bdrv_debug_event(bs, event);
4006 }
4007 
4008 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4009                           const char *tag)
4010 {
4011     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4012         bs = bs->file;
4013     }
4014 
4015     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4016         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4017     }
4018 
4019     return -ENOTSUP;
4020 }
4021 
4022 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4023 {
4024     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4025         bs = bs->file;
4026     }
4027 
4028     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4029         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4030     }
4031 
4032     return -ENOTSUP;
4033 }
4034 
4035 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4036 {
4037     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4038         bs = bs->file;
4039     }
4040 
4041     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4042         return bs->drv->bdrv_debug_resume(bs, tag);
4043     }
4044 
4045     return -ENOTSUP;
4046 }
4047 
4048 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4049 {
4050     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4051         bs = bs->file;
4052     }
4053 
4054     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4055         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4056     }
4057 
4058     return false;
4059 }
4060 
4061 int bdrv_is_snapshot(BlockDriverState *bs)
4062 {
4063     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4064 }
4065 
4066 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4067  * relative, it must be relative to the chain.  So, passing in bs->filename
4068  * from a BDS as backing_file should not be done, as that may be relative to
4069  * the CWD rather than the chain. */
4070 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4071         const char *backing_file)
4072 {
4073     char *filename_full = NULL;
4074     char *backing_file_full = NULL;
4075     char *filename_tmp = NULL;
4076     int is_protocol = 0;
4077     BlockDriverState *curr_bs = NULL;
4078     BlockDriverState *retval = NULL;
4079 
4080     if (!bs || !bs->drv || !backing_file) {
4081         return NULL;
4082     }
4083 
4084     filename_full     = g_malloc(PATH_MAX);
4085     backing_file_full = g_malloc(PATH_MAX);
4086     filename_tmp      = g_malloc(PATH_MAX);
4087 
4088     is_protocol = path_has_protocol(backing_file);
4089 
4090     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4091 
4092         /* If either of the filename paths is actually a protocol, then
4093          * compare unmodified paths; otherwise make paths relative */
4094         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4095             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4096                 retval = curr_bs->backing_hd;
4097                 break;
4098             }
4099         } else {
4100             /* If not an absolute filename path, make it relative to the current
4101              * image's filename path */
4102             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4103                          backing_file);
4104 
4105             /* We are going to compare absolute pathnames */
4106             if (!realpath(filename_tmp, filename_full)) {
4107                 continue;
4108             }
4109 
4110             /* We need to make sure the backing filename we are comparing against
4111              * is relative to the current image filename (or absolute) */
4112             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4113                          curr_bs->backing_file);
4114 
4115             if (!realpath(filename_tmp, backing_file_full)) {
4116                 continue;
4117             }
4118 
4119             if (strcmp(backing_file_full, filename_full) == 0) {
4120                 retval = curr_bs->backing_hd;
4121                 break;
4122             }
4123         }
4124     }
4125 
4126     g_free(filename_full);
4127     g_free(backing_file_full);
4128     g_free(filename_tmp);
4129     return retval;
4130 }
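
/*
 * Worked example (added for illustration, not in the original source):
 * for an image /vm/top.qcow2 whose backing file is recorded as the
 * relative name "mid.qcow2", path_combine() yields /vm/mid.qcow2, which
 * is then canonicalized with realpath() before being compared against
 * the (equally canonicalized) backing_file argument.  A backing file
 * given as a protocol, e.g. "nbd://host/export", is compared verbatim.
 */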
4131 
4132 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4133 {
4134     if (!bs->drv) {
4135         return 0;
4136     }
4137 
4138     if (!bs->backing_hd) {
4139         return 0;
4140     }
4141 
4142     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4143 }
4144 
4145 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4146 {
4147     BlockDriverState *curr_bs = NULL;
4148 
4149     if (!bs) {
4150         return NULL;
4151     }
4152 
4153     curr_bs = bs;
4154 
4155     while (curr_bs->backing_hd) {
4156         curr_bs = curr_bs->backing_hd;
4157     }
4158     return curr_bs;
4159 }
4160 
4161 /**************************************************************/
4162 /* async I/Os */
4163 
4164 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4165                                  QEMUIOVector *qiov, int nb_sectors,
4166                                  BlockDriverCompletionFunc *cb, void *opaque)
4167 {
4168     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4169 
4170     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4171                                  cb, opaque, false);
4172 }
4173 
4174 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4175                                   QEMUIOVector *qiov, int nb_sectors,
4176                                   BlockDriverCompletionFunc *cb, void *opaque)
4177 {
4178     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4179 
4180     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4181                                  cb, opaque, true);
4182 }
4183 
4184 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4185         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4186         BlockDriverCompletionFunc *cb, void *opaque)
4187 {
4188     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4189 
4190     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4191                                  BDRV_REQ_ZERO_WRITE | flags,
4192                                  cb, opaque, true);
4193 }
4194 
4195 
4196 typedef struct MultiwriteCB {
4197     int error;
4198     int num_requests;
4199     int num_callbacks;
4200     struct {
4201         BlockDriverCompletionFunc *cb;
4202         void *opaque;
4203         QEMUIOVector *free_qiov;
4204     } callbacks[];
4205 } MultiwriteCB;
4206 
4207 static void multiwrite_user_cb(MultiwriteCB *mcb)
4208 {
4209     int i;
4210 
4211     for (i = 0; i < mcb->num_callbacks; i++) {
4212         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4213         if (mcb->callbacks[i].free_qiov) {
4214             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4215         }
4216         g_free(mcb->callbacks[i].free_qiov);
4217     }
4218 }
4219 
4220 static void multiwrite_cb(void *opaque, int ret)
4221 {
4222     MultiwriteCB *mcb = opaque;
4223 
4224     trace_multiwrite_cb(mcb, ret);
4225 
4226     if (ret < 0 && !mcb->error) {
4227         mcb->error = ret;
4228     }
4229 
4230     mcb->num_requests--;
4231     if (mcb->num_requests == 0) {
4232         multiwrite_user_cb(mcb);
4233         g_free(mcb);
4234     }
4235 }
4236 
4237 static int multiwrite_req_compare(const void *a, const void *b)
4238 {
4239     const BlockRequest *req1 = a, *req2 = b;
4240 
4241     /*
4242      * Note that we can't simply subtract req2->sector from req1->sector
4243      * here as that could overflow the return value.
4244      */
4245     if (req1->sector > req2->sector) {
4246         return 1;
4247     } else if (req1->sector < req2->sector) {
4248         return -1;
4249     } else {
4250         return 0;
4251     }
4252 }
4253 
4254 /*
4255  * Takes a bunch of requests and tries to merge them. Returns the number of
4256  * requests that remain after merging.
4257  */
4258 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4259     int num_reqs, MultiwriteCB *mcb)
4260 {
4261     int i, outidx;
4262 
4263     // Sort requests by start sector
4264     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4265 
4266     // Check if adjacent requests are sequential or overlapping. If so,
4267     // combine them; the merge condition ensures there is no gap to fill.
4268     outidx = 0;
4269     for (i = 1; i < num_reqs; i++) {
4270         int merge = 0;
4271         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4272 
4273         // Handle exactly sequential writes and overlapping writes.
4274         if (reqs[i].sector <= oldreq_last) {
4275             merge = 1;
4276         }
4277 
4278         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4279             merge = 0;
4280         }
4281 
4282         if (merge) {
4283             size_t size;
4284             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4285             qemu_iovec_init(qiov,
4286                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4287 
4288             // Add the first request to the merged one. If the requests are
4289             // overlapping, drop the last sectors of the first request.
4290             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4291             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4292 
4293             // We should need to add any zeros between the two requests
4294             assert (reqs[i].sector <= oldreq_last);
4295 
4296             // Add the second request
4297             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4298 
4299             reqs[outidx].nb_sectors = qiov->size >> 9;
4300             reqs[outidx].qiov = qiov;
4301 
4302             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4303         } else {
4304             outidx++;
4305             reqs[outidx].sector     = reqs[i].sector;
4306             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4307             reqs[outidx].qiov       = reqs[i].qiov;
4308         }
4309     }
4310 
4311     return outidx + 1;
4312 }
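
/*
 * Worked example (added for illustration, not in the original source):
 * two write requests covering sectors [0, 8) and [4, 12) are merged as
 * follows: oldreq_last = 8, and reqs[1].sector (4) <= 8, so they merge.
 * The first (4 - 0) = 4 sectors are taken from the first request's qiov
 * (its 4 overlapping tail sectors are dropped), the second request's
 * qiov is appended in full (8 sectors), and the merged request covers
 * sectors [0, 12) with nb_sectors = 12.
 */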
4313 
4314 /*
4315  * Submit multiple AIO write requests at once.
4316  *
4317  * On success, the function returns 0 and all requests in the reqs array have
4318  * been submitted. On error, this function returns -1, and any of the
4319  * requests may or may not have been submitted yet. In particular, this means
4320  * that the callback will be called for some of the requests but not for
4321  * others. The caller must check the error field of each BlockRequest to tell
4322  * which callbacks to wait for (if error != 0, no callback will be called).
4323  *
4324  * The implementation may modify the contents of the reqs array, e.g. to merge
4325  * requests. However, the fields opaque and error are left unmodified as they
4326  * are used to signal failure for a single request to the caller.
4327  */
4328 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4329 {
4330     MultiwriteCB *mcb;
4331     int i;
4332 
4333     /* don't submit writes if we don't have a medium */
4334     if (bs->drv == NULL) {
4335         for (i = 0; i < num_reqs; i++) {
4336             reqs[i].error = -ENOMEDIUM;
4337         }
4338         return -1;
4339     }
4340 
4341     if (num_reqs == 0) {
4342         return 0;
4343     }
4344 
4345     // Create MultiwriteCB structure
4346     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4347     mcb->num_requests = 0;
4348     mcb->num_callbacks = num_reqs;
4349 
4350     for (i = 0; i < num_reqs; i++) {
4351         mcb->callbacks[i].cb = reqs[i].cb;
4352         mcb->callbacks[i].opaque = reqs[i].opaque;
4353     }
4354 
4355     // Check for mergeable requests
4356     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4357 
4358     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4359 
4360     /* Run the aio requests. */
4361     mcb->num_requests = num_reqs;
4362     for (i = 0; i < num_reqs; i++) {
4363         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4364                               reqs[i].nb_sectors, reqs[i].flags,
4365                               multiwrite_cb, mcb,
4366                               true);
4367     }
4368 
4369     return 0;
4370 }
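
/*
 * Illustrative sketch (not part of the original source): submitting two
 * writes in one batch.  The names my_cb, qiov0/qiov1 and ctx0/ctx1 are
 * hypothetical; the sector/error fields are the ones documented above.
 *
 *     BlockRequest reqs[2];
 *
 *     memset(reqs, 0, sizeof(reqs));
 *     reqs[0].sector = 0;   reqs[0].nb_sectors = 8;  reqs[0].qiov = &qiov0;
 *     reqs[0].cb = my_cb;   reqs[0].opaque = ctx0;
 *     reqs[1].sector = 16;  reqs[1].nb_sectors = 8;  reqs[1].qiov = &qiov1;
 *     reqs[1].cb = my_cb;   reqs[1].opaque = ctx1;
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         for (int i = 0; i < 2; i++) {
 *             if (reqs[i].error) {
 *                 // this request failed up front; no callback will come
 *             }
 *         }
 *     }
 */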
4371 
4372 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4373 {
4374     acb->aiocb_info->cancel(acb);
4375 }
4376 
4377 /**************************************************************/
4378 /* async block device emulation */
4379 
4380 typedef struct BlockDriverAIOCBSync {
4381     BlockDriverAIOCB common;
4382     QEMUBH *bh;
4383     int ret;
4384     /* vector translation state */
4385     QEMUIOVector *qiov;
4386     uint8_t *bounce;
4387     int is_write;
4388 } BlockDriverAIOCBSync;
4389 
4390 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4391 {
4392     BlockDriverAIOCBSync *acb =
4393         container_of(blockacb, BlockDriverAIOCBSync, common);
4394     qemu_bh_delete(acb->bh);
4395     acb->bh = NULL;
4396     qemu_aio_release(acb);
4397 }
4398 
4399 static const AIOCBInfo bdrv_em_aiocb_info = {
4400     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4401     .cancel             = bdrv_aio_cancel_em,
4402 };
4403 
4404 static void bdrv_aio_bh_cb(void *opaque)
4405 {
4406     BlockDriverAIOCBSync *acb = opaque;
4407 
4408     if (!acb->is_write)
4409         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4410     qemu_vfree(acb->bounce);
4411     acb->common.cb(acb->common.opaque, acb->ret);
4412     qemu_bh_delete(acb->bh);
4413     acb->bh = NULL;
4414     qemu_aio_release(acb);
4415 }
4416 
4417 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4418                                             int64_t sector_num,
4419                                             QEMUIOVector *qiov,
4420                                             int nb_sectors,
4421                                             BlockDriverCompletionFunc *cb,
4422                                             void *opaque,
4423                                             int is_write)
4424 
4425 {
4426     BlockDriverAIOCBSync *acb;
4427 
4428     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4429     acb->is_write = is_write;
4430     acb->qiov = qiov;
4431     acb->bounce = qemu_blockalign(bs, qiov->size);
4432     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4433 
4434     if (is_write) {
4435         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4436         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4437     } else {
4438         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4439     }
4440 
4441     qemu_bh_schedule(acb->bh);
4442 
4443     return &acb->common;
4444 }
4445 
4446 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4447         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4448         BlockDriverCompletionFunc *cb, void *opaque)
4449 {
4450     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4451 }
4452 
4453 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4454         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4455         BlockDriverCompletionFunc *cb, void *opaque)
4456 {
4457     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4458 }
4459 
4460 
4461 typedef struct BlockDriverAIOCBCoroutine {
4462     BlockDriverAIOCB common;
4463     BlockRequest req;
4464     bool is_write;
4465     bool *done;
4466     QEMUBH* bh;
4467 } BlockDriverAIOCBCoroutine;
4468 
4469 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4470 {
4471     BlockDriverAIOCBCoroutine *acb =
4472         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4473     bool done = false;
4474 
4475     acb->done = &done;
4476     while (!done) {
4477         qemu_aio_wait();
4478     }
4479 }
4480 
4481 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4482     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4483     .cancel             = bdrv_aio_co_cancel_em,
4484 };
4485 
4486 static void bdrv_co_em_bh(void *opaque)
4487 {
4488     BlockDriverAIOCBCoroutine *acb = opaque;
4489 
4490     acb->common.cb(acb->common.opaque, acb->req.error);
4491 
4492     if (acb->done) {
4493         *acb->done = true;
4494     }
4495 
4496     qemu_bh_delete(acb->bh);
4497     qemu_aio_release(acb);
4498 }
4499 
4500 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4501 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4502 {
4503     BlockDriverAIOCBCoroutine *acb = opaque;
4504     BlockDriverState *bs = acb->common.bs;
4505 
4506     if (!acb->is_write) {
4507         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4508             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4509     } else {
4510         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4511             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4512     }
4513 
4514     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4515     qemu_bh_schedule(acb->bh);
4516 }
4517 
4518 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4519                                                int64_t sector_num,
4520                                                QEMUIOVector *qiov,
4521                                                int nb_sectors,
4522                                                BdrvRequestFlags flags,
4523                                                BlockDriverCompletionFunc *cb,
4524                                                void *opaque,
4525                                                bool is_write)
4526 {
4527     Coroutine *co;
4528     BlockDriverAIOCBCoroutine *acb;
4529 
4530     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4531     acb->req.sector = sector_num;
4532     acb->req.nb_sectors = nb_sectors;
4533     acb->req.qiov = qiov;
4534     acb->req.flags = flags;
4535     acb->is_write = is_write;
4536     acb->done = NULL;
4537 
4538     co = qemu_coroutine_create(bdrv_co_do_rw);
4539     qemu_coroutine_enter(co, acb);
4540 
4541     return &acb->common;
4542 }
4543 
4544 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4545 {
4546     BlockDriverAIOCBCoroutine *acb = opaque;
4547     BlockDriverState *bs = acb->common.bs;
4548 
4549     acb->req.error = bdrv_co_flush(bs);
4550     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4551     qemu_bh_schedule(acb->bh);
4552 }
4553 
4554 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4555         BlockDriverCompletionFunc *cb, void *opaque)
4556 {
4557     trace_bdrv_aio_flush(bs, opaque);
4558 
4559     Coroutine *co;
4560     BlockDriverAIOCBCoroutine *acb;
4561 
4562     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4563     acb->done = NULL;
4564 
4565     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4566     qemu_coroutine_enter(co, acb);
4567 
4568     return &acb->common;
4569 }
4570 
4571 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4572 {
4573     BlockDriverAIOCBCoroutine *acb = opaque;
4574     BlockDriverState *bs = acb->common.bs;
4575 
4576     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4577     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4578     qemu_bh_schedule(acb->bh);
4579 }
4580 
4581 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4582         int64_t sector_num, int nb_sectors,
4583         BlockDriverCompletionFunc *cb, void *opaque)
4584 {
4585     Coroutine *co;
4586     BlockDriverAIOCBCoroutine *acb;
4587 
4588     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4589 
4590     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4591     acb->req.sector = sector_num;
4592     acb->req.nb_sectors = nb_sectors;
4593     acb->done = NULL;
4594     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4595     qemu_coroutine_enter(co, acb);
4596 
4597     return &acb->common;
4598 }
4599 
4600 void bdrv_init(void)
4601 {
4602     module_call_init(MODULE_INIT_BLOCK);
4603 }
4604 
4605 void bdrv_init_with_whitelist(void)
4606 {
4607     use_bdrv_whitelist = 1;
4608     bdrv_init();
4609 }
4610 
4611 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4612                    BlockDriverCompletionFunc *cb, void *opaque)
4613 {
4614     BlockDriverAIOCB *acb;
4615 
4616     acb = g_slice_alloc(aiocb_info->aiocb_size);
4617     acb->aiocb_info = aiocb_info;
4618     acb->bs = bs;
4619     acb->cb = cb;
4620     acb->opaque = opaque;
4621     return acb;
4622 }
4623 
4624 void qemu_aio_release(void *p)
4625 {
4626     BlockDriverAIOCB *acb = p;
4627     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4628 }
4629 
4630 /**************************************************************/
4631 /* Coroutine block device emulation */
4632 
4633 typedef struct CoroutineIOCompletion {
4634     Coroutine *coroutine;
4635     int ret;
4636 } CoroutineIOCompletion;
4637 
4638 static void bdrv_co_io_em_complete(void *opaque, int ret)
4639 {
4640     CoroutineIOCompletion *co = opaque;
4641 
4642     co->ret = ret;
4643     qemu_coroutine_enter(co->coroutine, NULL);
4644 }
4645 
4646 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4647                                       int nb_sectors, QEMUIOVector *iov,
4648                                       bool is_write)
4649 {
4650     CoroutineIOCompletion co = {
4651         .coroutine = qemu_coroutine_self(),
4652     };
4653     BlockDriverAIOCB *acb;
4654 
4655     if (is_write) {
4656         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4657                                        bdrv_co_io_em_complete, &co);
4658     } else {
4659         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4660                                       bdrv_co_io_em_complete, &co);
4661     }
4662 
4663     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4664     if (!acb) {
4665         return -EIO;
4666     }
4667     qemu_coroutine_yield();
4668 
4669     return co.ret;
4670 }
4671 
4672 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4673                                          int64_t sector_num, int nb_sectors,
4674                                          QEMUIOVector *iov)
4675 {
4676     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4677 }
4678 
4679 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4680                                          int64_t sector_num, int nb_sectors,
4681                                          QEMUIOVector *iov)
4682 {
4683     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4684 }
4685 
4686 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4687 {
4688     RwCo *rwco = opaque;
4689 
4690     rwco->ret = bdrv_co_flush(rwco->bs);
4691 }
4692 
4693 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4694 {
4695     int ret;
4696 
4697     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4698         return 0;
4699     }
4700 
4701     /* Write back cached data to the OS even with cache=unsafe */
4702     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4703     if (bs->drv->bdrv_co_flush_to_os) {
4704         ret = bs->drv->bdrv_co_flush_to_os(bs);
4705         if (ret < 0) {
4706             return ret;
4707         }
4708     }
4709 
4710     /* But don't actually force it to the disk with cache=unsafe */
4711     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4712         goto flush_parent;
4713     }
4714 
4715     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4716     if (bs->drv->bdrv_co_flush_to_disk) {
4717         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4718     } else if (bs->drv->bdrv_aio_flush) {
4719         BlockDriverAIOCB *acb;
4720         CoroutineIOCompletion co = {
4721             .coroutine = qemu_coroutine_self(),
4722         };
4723 
4724         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4725         if (acb == NULL) {
4726             ret = -EIO;
4727         } else {
4728             qemu_coroutine_yield();
4729             ret = co.ret;
4730         }
4731     } else {
4732         /*
4733          * Some block drivers always operate in either writethrough or unsafe
4734          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4735          * know how the server works (because the behaviour is hardcoded or
4736          * depends on server-side configuration), so we can't ensure that
4737          * everything is safe on disk. Returning an error doesn't work because
4738          * that would break guests even if the server operates in writethrough
4739          * mode.
4740          *
4741          * Let's hope the user knows what he's doing.
4742          */
4743         ret = 0;
4744     }
4745     if (ret < 0) {
4746         return ret;
4747     }
4748 
4749     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4750      * in the case of cache=unsafe, so there are no useless flushes.
4751      */
4752 flush_parent:
4753     return bdrv_co_flush(bs->file);
4754 }
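
/*
 * Worked example (added for illustration, not in the original source):
 * for a qcow2 image on a local file, bdrv_co_flush(qcow2_bs) first lets
 * the qcow2 driver write back cached metadata to the file layer
 * (bdrv_co_flush_to_os), and then recurses into bs->file, where the
 * protocol driver forces the data to stable storage (for raw-posix this
 * typically ends up as an fdatasync() on the host file).  With
 * cache=unsafe, BDRV_O_NO_FLUSH short-circuits the disk flush at every
 * layer.
 */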
4755 
4756 void bdrv_invalidate_cache(BlockDriverState *bs)
4757 {
4758     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4759         bs->drv->bdrv_invalidate_cache(bs);
4760     }
4761 }
4762 
4763 void bdrv_invalidate_cache_all(void)
4764 {
4765     BlockDriverState *bs;
4766 
4767     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4768         bdrv_invalidate_cache(bs);
4769     }
4770 }
4771 
4772 void bdrv_clear_incoming_migration_all(void)
4773 {
4774     BlockDriverState *bs;
4775 
4776     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4777         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4778     }
4779 }
4780 
4781 int bdrv_flush(BlockDriverState *bs)
4782 {
4783     Coroutine *co;
4784     RwCo rwco = {
4785         .bs = bs,
4786         .ret = NOT_DONE,
4787     };
4788 
4789     if (qemu_in_coroutine()) {
4790         /* Fast-path if already in coroutine context */
4791         bdrv_flush_co_entry(&rwco);
4792     } else {
4793         co = qemu_coroutine_create(bdrv_flush_co_entry);
4794         qemu_coroutine_enter(co, &rwco);
4795         while (rwco.ret == NOT_DONE) {
4796             qemu_aio_wait();
4797         }
4798     }
4799 
4800     return rwco.ret;
4801 }
4802 
4803 typedef struct DiscardCo {
4804     BlockDriverState *bs;
4805     int64_t sector_num;
4806     int nb_sectors;
4807     int ret;
4808 } DiscardCo;
4809 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4810 {
4811     DiscardCo *rwco = opaque;
4812 
4813     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4814 }
4815 
4816 /* If no limit is specified in the BlockLimits, use a default
4817  * of 32768 512-byte sectors (16 MiB) per request.
4818  */
4819 #define MAX_DISCARD_DEFAULT 32768
4820 
4821 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4822                                  int nb_sectors)
4823 {
4824     int max_discard;
4825 
4826     if (!bs->drv) {
4827         return -ENOMEDIUM;
4828     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4829         return -EIO;
4830     } else if (bs->read_only) {
4831         return -EROFS;
4832     }
4833 
4834     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4835 
4836     /* Do nothing if disabled.  */
4837     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4838         return 0;
4839     }
4840 
4841     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4842         return 0;
4843     }
4844 
4845     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4846     while (nb_sectors > 0) {
4847         int ret;
4848         int num = nb_sectors;
4849 
4850         /* align request */
4851         if (bs->bl.discard_alignment &&
4852             num >= bs->bl.discard_alignment &&
4853             sector_num % bs->bl.discard_alignment) {
4854             if (num > bs->bl.discard_alignment) {
4855                 num = bs->bl.discard_alignment;
4856             }
4857             num -= sector_num % bs->bl.discard_alignment;
4858         }
4859 
4860         /* limit request size */
4861         if (num > max_discard) {
4862             num = max_discard;
4863         }
4864 
4865         if (bs->drv->bdrv_co_discard) {
4866             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4867         } else {
4868             BlockDriverAIOCB *acb;
4869             CoroutineIOCompletion co = {
4870                 .coroutine = qemu_coroutine_self(),
4871             };
4872 
4873             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4874                                             bdrv_co_io_em_complete, &co);
4875             if (acb == NULL) {
4876                 return -EIO;
4877             } else {
4878                 qemu_coroutine_yield();
4879                 ret = co.ret;
4880             }
4881         }
4882         if (ret && ret != -ENOTSUP) {
4883             return ret;
4884         }
4885 
4886         sector_num += num;
4887         nb_sectors -= num;
4888     }
4889     return 0;
4890 }
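
/*
 * Worked example (added for illustration, not in the original source):
 * with bs->bl.discard_alignment = 8 and a request for sectors [5, 105),
 * the first pass clamps num from 100 to 8 and then subtracts
 * 5 % 8 = 5, so only sectors [5, 8) are discarded.  sector_num is then
 * aligned, and the remaining 97 sectors are issued in chunks of at most
 * max_discard sectors each.
 */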
4891 
4892 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4893 {
4894     Coroutine *co;
4895     DiscardCo rwco = {
4896         .bs = bs,
4897         .sector_num = sector_num,
4898         .nb_sectors = nb_sectors,
4899         .ret = NOT_DONE,
4900     };
4901 
4902     if (qemu_in_coroutine()) {
4903         /* Fast-path if already in coroutine context */
4904         bdrv_discard_co_entry(&rwco);
4905     } else {
4906         co = qemu_coroutine_create(bdrv_discard_co_entry);
4907         qemu_coroutine_enter(co, &rwco);
4908         while (rwco.ret == NOT_DONE) {
4909             qemu_aio_wait();
4910         }
4911     }
4912 
4913     return rwco.ret;
4914 }
4915 
4916 /**************************************************************/
4917 /* removable device support */
4918 
4919 /**
4920  * Return TRUE if the media is present
4921  */
4922 int bdrv_is_inserted(BlockDriverState *bs)
4923 {
4924     BlockDriver *drv = bs->drv;
4925 
4926     if (!drv)
4927         return 0;
4928     if (!drv->bdrv_is_inserted)
4929         return 1;
4930     return drv->bdrv_is_inserted(bs);
4931 }
4932 
4933 /**
4934  * Return whether the media changed since the last call to this
4935  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4936  */
4937 int bdrv_media_changed(BlockDriverState *bs)
4938 {
4939     BlockDriver *drv = bs->drv;
4940 
4941     if (drv && drv->bdrv_media_changed) {
4942         return drv->bdrv_media_changed(bs);
4943     }
4944     return -ENOTSUP;
4945 }
4946 
4947 /**
4948  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4949  */
4950 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4951 {
4952     BlockDriver *drv = bs->drv;
4953 
4954     if (drv && drv->bdrv_eject) {
4955         drv->bdrv_eject(bs, eject_flag);
4956     }
4957 
4958     if (bs->device_name[0] != '\0') {
4959         bdrv_emit_qmp_eject_event(bs, eject_flag);
4960     }
4961 }
4962 
4963 /**
4964  * Lock or unlock the media (if it is locked, the user won't be able
4965  * to eject it manually).
4966  */
4967 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4968 {
4969     BlockDriver *drv = bs->drv;
4970 
4971     trace_bdrv_lock_medium(bs, locked);
4972 
4973     if (drv && drv->bdrv_lock_medium) {
4974         drv->bdrv_lock_medium(bs, locked);
4975     }
4976 }
4977 
4978 /* needed for generic scsi interface */
4979 
4980 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4981 {
4982     BlockDriver *drv = bs->drv;
4983 
4984     if (drv && drv->bdrv_ioctl)
4985         return drv->bdrv_ioctl(bs, req, buf);
4986     return -ENOTSUP;
4987 }
4988 
4989 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4990         unsigned long int req, void *buf,
4991         BlockDriverCompletionFunc *cb, void *opaque)
4992 {
4993     BlockDriver *drv = bs->drv;
4994 
4995     if (drv && drv->bdrv_aio_ioctl)
4996         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4997     return NULL;
4998 }
4999 
5000 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5001 {
5002     bs->guest_block_size = align;
5003 }
5004 
5005 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5006 {
5007     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5008 }
5009 
5010 /*
5011  * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5012  */
5013 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5014 {
5015     int i;
5016     size_t alignment = bdrv_opt_mem_align(bs);
5017 
5018     for (i = 0; i < qiov->niov; i++) {
5019         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5020             return false;
5021         }
5022         if (qiov->iov[i].iov_len % alignment) {
5023             return false;
5024         }
5025     }
5026 
5027     return true;
5028 }
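
/*
 * Illustrative sketch (not part of the original source): allocating a
 * buffer that satisfies the driver's memory alignment and wrapping it in
 * a QEMUIOVector, so that bdrv_qiov_is_aligned() holds and the request
 * can avoid the bounce-buffer path.  The 4096-byte size is arbitrary.
 *
 *     void *buf = qemu_blockalign(bs, 4096);
 *     struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *     QEMUIOVector qiov;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     ...
 *     qemu_vfree(buf);
 */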
5029 
5030 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
5031 {
5032     int64_t bitmap_size;
5033     BdrvDirtyBitmap *bitmap;
5034 
5035     assert((granularity & (granularity - 1)) == 0);
5036 
5037     granularity >>= BDRV_SECTOR_BITS;
5038     assert(granularity);
5039     bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
5040     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5041     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5042     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5043     return bitmap;
5044 }
5045 
5046 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5047 {
5048     BdrvDirtyBitmap *bm, *next;
5049     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5050         if (bm == bitmap) {
5051             QLIST_REMOVE(bitmap, list);
5052             hbitmap_free(bitmap->bitmap);
5053             g_free(bitmap);
5054             return;
5055         }
5056     }
5057 }
5058 
5059 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5060 {
5061     BdrvDirtyBitmap *bm;
5062     BlockDirtyInfoList *list = NULL;
5063     BlockDirtyInfoList **plist = &list;
5064 
5065     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5066         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5067         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5068         info->count = bdrv_get_dirty_count(bs, bm);
5069         info->granularity =
5070             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5071         entry->value = info;
5072         *plist = entry;
5073         plist = &entry->next;
5074     }
5075 
5076     return list;
5077 }
5078 
5079 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5080 {
5081     if (bitmap) {
5082         return hbitmap_get(bitmap->bitmap, sector);
5083     } else {
5084         return 0;
5085     }
5086 }
5087 
5088 void bdrv_dirty_iter_init(BlockDriverState *bs,
5089                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5090 {
5091     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5092 }
5093 
5094 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5095                     int nr_sectors)
5096 {
5097     BdrvDirtyBitmap *bitmap;
5098     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5099         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5100     }
5101 }
5102 
5103 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5104 {
5105     BdrvDirtyBitmap *bitmap;
5106     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5107         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5108     }
5109 }
5110 
5111 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5112 {
5113     return hbitmap_count(bitmap->bitmap);
5114 }
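
/*
 * Illustrative sketch (not part of the original source): tracking writes
 * with a dirty bitmap and iterating over the dirty sectors, in the style
 * of the mirror block job.  The 64 KiB granularity is an arbitrary
 * power-of-two choice.
 *
 *     BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536);
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     // ... guest writes mark sectors via bdrv_set_dirty() ...
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) != -1) {
 *         // copy out the data around 'sector', then clear its dirty bit:
 *         bdrv_reset_dirty(bs, sector, 1);
 *     }
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */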
5115 
5116 /* Get a reference to bs */
5117 void bdrv_ref(BlockDriverState *bs)
5118 {
5119     bs->refcnt++;
5120 }
5121 
5122 /* Release a previously grabbed reference to bs.
5123  * If after releasing, reference count is zero, the BlockDriverState is
5124  * deleted. */
5125 void bdrv_unref(BlockDriverState *bs)
5126 {
5127     assert(bs->refcnt > 0);
5128     if (--bs->refcnt == 0) {
5129         bdrv_delete(bs);
5130     }
5131 }
5132 
5133 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5134 {
5135     assert(bs->in_use != in_use);
5136     bs->in_use = in_use;
5137 }
5138 
5139 int bdrv_in_use(BlockDriverState *bs)
5140 {
5141     return bs->in_use;
5142 }
5143 
5144 void bdrv_iostatus_enable(BlockDriverState *bs)
5145 {
5146     bs->iostatus_enabled = true;
5147     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5148 }
5149 
5150 /* The I/O status is only enabled if the drive explicitly
5151  * enables it _and_ the VM is configured to stop on errors */
5152 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5153 {
5154     return (bs->iostatus_enabled &&
5155            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5156             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5157             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5158 }
5159 
5160 void bdrv_iostatus_disable(BlockDriverState *bs)
5161 {
5162     bs->iostatus_enabled = false;
5163 }
5164 
5165 void bdrv_iostatus_reset(BlockDriverState *bs)
5166 {
5167     if (bdrv_iostatus_is_enabled(bs)) {
5168         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5169         if (bs->job) {
5170             block_job_iostatus_reset(bs->job);
5171         }
5172     }
5173 }
5174 
5175 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5176 {
5177     assert(bdrv_iostatus_is_enabled(bs));
5178     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5179         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5180                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5181     }
5182 }
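/*
 * Usage sketch (editor's illustration, not part of the original file): a
 * write-error path records only the first failure; because
 * bdrv_iostatus_set_err() asserts that the iostatus is enabled, callers
 * check bdrv_iostatus_is_enabled() first.
 */
#if 0
static void example_record_error(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        /* ENOSPC maps to NOSPACE, everything else to FAILED */
        bdrv_iostatus_set_err(bs, error);
    }
}
#endif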
5183 
5184 void
5185 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5186         enum BlockAcctType type)
5187 {
5188     assert(type < BDRV_MAX_IOTYPE);
5189 
5190     cookie->bytes = bytes;
5191     cookie->start_time_ns = get_clock();
5192     cookie->type = type;
5193 }
5194 
5195 void
5196 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5197 {
5198     assert(cookie->type < BDRV_MAX_IOTYPE);
5199 
5200     bs->nr_bytes[cookie->type] += cookie->bytes;
5201     bs->nr_ops[cookie->type]++;
5202     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5203 }
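/*
 * Usage sketch (editor's illustration, not part of the original file): a
 * device model brackets each request with a stack-allocated cookie so
 * that bytes, ops and latency land in the right BDRV_ACCT_* bucket.
 */
#if 0
static void example_account_read(BlockDriverState *bs, QEMUIOVector *qiov)
{
    BlockAcctCookie cookie;

    bdrv_acct_start(bs, &cookie, qiov->size, BDRV_ACCT_READ);
    /* ... issue the read and wait for completion ... */
    bdrv_acct_done(bs, &cookie);
}
#endif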
5204 
5205 void bdrv_img_create(const char *filename, const char *fmt,
5206                      const char *base_filename, const char *base_fmt,
5207                      char *options, uint64_t img_size, int flags,
5208                      Error **errp, bool quiet)
5209 {
5210     QEMUOptionParameter *param = NULL, *create_options = NULL;
5211     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5212     BlockDriver *drv, *proto_drv;
5213     BlockDriver *backing_drv = NULL;
5214     Error *local_err = NULL;
5215     int ret = 0;
5216 
5217     /* Find driver and parse its options */
5218     drv = bdrv_find_format(fmt);
5219     if (!drv) {
5220         error_setg(errp, "Unknown file format '%s'", fmt);
5221         return;
5222     }
5223 
5224     proto_drv = bdrv_find_protocol(filename, true);
5225     if (!proto_drv) {
5226         error_setg(errp, "Unknown protocol '%s'", filename);
5227         return;
5228     }
5229 
5230     create_options = append_option_parameters(create_options,
5231                                               drv->create_options);
5232     create_options = append_option_parameters(create_options,
5233                                               proto_drv->create_options);
5234 
5235     /* Create parameter list with default values */
5236     param = parse_option_parameters("", create_options, param);
5237 
5238     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5239 
5240     /* Parse -o options */
5241     if (options) {
5242         param = parse_option_parameters(options, create_options, param);
5243         if (param == NULL) {
5244             error_setg(errp, "Invalid options for file format '%s'", fmt);
5245             goto out;
5246         }
5247     }
5248 
5249     if (base_filename) {
5250         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5251                                  base_filename)) {
5252             error_setg(errp, "Backing file not supported for file format '%s'",
5253                        fmt);
5254             goto out;
5255         }
5256     }
5257 
5258     if (base_fmt) {
5259         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5260             error_setg(errp, "Backing file format not supported for file "
5261                              "format '%s'", fmt);
5262             goto out;
5263         }
5264     }
5265 
5266     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5267     if (backing_file && backing_file->value.s) {
5268         if (!strcmp(filename, backing_file->value.s)) {
5269             error_setg(errp, "Trying to create an image with the same "
5270                              "filename as the backing file");
5271             goto out;
5272         }
5273     }
5274 
5275     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5276     if (backing_fmt && backing_fmt->value.s) {
5277         backing_drv = bdrv_find_format(backing_fmt->value.s);
5278         if (!backing_drv) {
5279             error_setg(errp, "Unknown backing file format '%s'",
5280                        backing_fmt->value.s);
5281             goto out;
5282         }
5283     }
5284 
5285     /* The size for the image must always be specified, with one exception:
5286      * if we are using a backing file, we can obtain the size from there. */
5287     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5288     if (size && size->value.n == -1) {
5289         if (backing_file && backing_file->value.s) {
5290             BlockDriverState *bs;
5291             uint64_t backing_size; /* avoid shadowing the 'size' option */
5292             char buf[32];
5293             int back_flags;
5294 
5295             /* backing files are always opened read-only */
5296             back_flags =
5297                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5298 
5299             bs = NULL;
5300             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5301                             backing_drv, &local_err);
5302             if (ret < 0) {
5303                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5304                                  backing_file->value.s,
5305                                  error_get_pretty(local_err));
5306                 error_free(local_err);
5307                 local_err = NULL;
5308                 goto out;
5309             }
5310             bdrv_get_geometry(bs, &backing_size);
5311             backing_size *= BDRV_SECTOR_SIZE;
5312 
5313             snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
5314             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5315 
5316             bdrv_unref(bs);
5317         } else {
5318             error_setg(errp, "Image creation needs a size parameter");
5319             goto out;
5320         }
5321     }
5322 
5323     if (!quiet) {
5324         printf("Formatting '%s', fmt=%s ", filename, fmt);
5325         print_option_parameters(param);
5326         puts("");
5327     }
5328     ret = bdrv_create(drv, filename, param, &local_err);
5329     if (ret == -EFBIG) {
5330         /* This is generally a better message than whatever the driver
5331          * would deliver (especially because of the cluster_size_hint),
5332          * since the driver's is most probably just "image too large". */
5333         const char *cluster_size_hint = "";
5334         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5335             cluster_size_hint = " (try using a larger cluster size)";
5336         }
5337         error_setg(errp, "The image size is too large for file format '%s'"
5338                    "%s", fmt, cluster_size_hint);
5339         error_free(local_err);
5340         local_err = NULL;
5341     }
5342 
5343 out:
5344     free_option_parameters(create_options);
5345     free_option_parameters(param);
5346 
5347     if (local_err) {
5348         error_propagate(errp, local_err);
5349     }
5350 }
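/*
 * Usage sketch (editor's illustration, not part of the original file):
 * creating a 1 GiB qcow2 image, roughly the way qemu-img drives this
 * function; the filename is made up.
 */
#if 0
static void example_create_image(Error **errp)
{
    bdrv_img_create("test.qcow2", "qcow2",
                    NULL, NULL,             /* no backing file or format */
                    NULL,                   /* no extra -o options */
                    1024 * 1024 * 1024,     /* img_size in bytes */
                    0, errp, false);
}
#endif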
5351 
5352 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5353 {
5354     /* Currently BlockDriverState always uses the main loop AioContext */
5355     return qemu_get_aio_context();
5356 }
5357 
5358 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5359                                     NotifierWithReturn *notifier)
5360 {
5361     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5362 }
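/*
 * Usage sketch (editor's illustration, not part of the original file):
 * the backup job registers a notifier of this shape; it fires with the
 * BdrvTrackedRequest of every guest write before the write reaches the
 * image, and a negative return value fails the write.
 */
#if 0
static int coroutine_fn example_before_write(NotifierWithReturn *notifier,
                                             void *opaque)
{
    BdrvTrackedRequest *req = opaque;

    /* copy out the old contents of req's range here (copy-on-write) */
    (void)req;
    return 0;
}
#endif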
5363 
5364 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5365 {
5366     if (bs->drv->bdrv_amend_options == NULL) {
5367         return -ENOTSUP;
5368     }
5369     return bs->drv->bdrv_amend_options(bs, options);
5370 }
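/*
 * Usage sketch (editor's illustration, not part of the original file):
 * qemu-img amend style usage, parsing an option string against the
 * driver's create options; "lazy_refcounts=on" is just an example value
 * and the helper name is hypothetical.
 */
#if 0
static int example_amend(BlockDriverState *bs)
{
    QEMUOptionParameter *opts;
    int ret;

    opts = parse_option_parameters("lazy_refcounts=on",
                                   bs->drv->create_options, NULL);
    if (opts == NULL) {
        return -EINVAL;
    }
    ret = bdrv_amend_options(bs, opts);
    free_option_parameters(opts);
    return ret;
}
#endif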
5371 
5372 /* Used to recurse on single-child block filters.
5373  * Single-child block filters store their child in bs->file.
5374  */
5375 bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5376                                       BlockDriverState *candidate)
5377 {
5378     if (!bs->drv) {
5379         return false;
5380     }
5381 
5382     if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5383         /* bs is not a filter: it is the first non-filter iff it is candidate */
5384         return bs == candidate;
5385     }
5389 
5390     if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5391         return false;
5392     }
5393 
5394     if (!bs->file) {
5395         return false;
5396     }
5397 
5398     return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5399 }
5400 
5401 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5402                                       BlockDriverState *candidate)
5403 {
5404     if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5405         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5406     }
5407 
5408     return bdrv_generic_is_first_non_filter(bs, candidate);
5409 }
5410 
5411 /* This function checks if the candidate is the first non-filter bs down its
5412  * bs chain. Since we don't have pointers to parents it explores all bs
5413  * chains from the top. Some filters can choose not to pass down the recursion.
5414  */
5415 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5416 {
5417     BlockDriverState *bs;
5418 
5419     /* walk down the bs forest recursively */
5420     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5421         bool perm;
5422 
5423         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5424 
5425         /* candidate is the first non-filter */
5426         if (perm) {
5427             return true;
5428         }
5429     }
5430 
5431     return false;
5432 }
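/*
 * Sketch (editor's illustration, not part of the original file): a
 * pass-through filter driver would advertise itself roughly like this so
 * that the recursion above can look through it; the driver is hypothetical
 * and the initializer assumes the BS_IS_A_FILTER/BS_FILTER_PASS_DOWN order
 * of the authorizations array.
 */
#if 0
static BlockDriver bdrv_example_filter = {
    .format_name    = "example-filter",
    .authorizations = { true, true },  /* is a filter, passes recursion down */
};
#endif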
5433