xref: /openbmc/qemu/block.c (revision e0c47b6cb1de430fbc6f828f7acffa851c580840)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "sysemu/qtest.h"
34 #include "qemu/notify.h"
35 #include "block/coroutine.h"
36 #include "block/qapi.h"
37 #include "qmp-commands.h"
38 #include "qemu/timer.h"
39 #include "qapi-event.h"
40 
41 #ifdef CONFIG_BSD
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <sys/ioctl.h>
45 #include <sys/queue.h>
46 #ifndef __DragonFly__
47 #include <sys/disk.h>
48 #endif
49 #endif
50 
51 #ifdef _WIN32
52 #include <windows.h>
53 #endif
54 
55 /**
56  * A BdrvDirtyBitmap can be in three possible states:
57  * (1) successor is NULL and disabled is false: full r/w mode
58  * (2) successor is NULL and disabled is true: read only mode ("disabled")
59  * (3) successor is set: frozen mode.
60  *     A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set,
61  *     or enabled. A frozen bitmap can only abdicate() or reclaim().
62  */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;            /* Dirty sector bitmap implementation */
    BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
    char *name;                 /* Optional non-empty unique ID */
    int64_t size;               /* Size of the bitmap (Number of sectors) */
    bool disabled;              /* Bitmap is read-only */
    /* List linkage; presumably chains the bitmap into
     * BlockDriverState.dirty_bitmaps (initialized in bdrv_new()) */
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};
71 
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/*
 * Forward declarations of static helpers that are referenced before they
 * are defined (e.g. bdrv_register() installs the *_em emulation hooks).
 */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
103 
/* BlockDriverStates created by bdrv_new_root(), linked via device_list */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* BlockDriverStates that were assigned a node name, linked via node_list */
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

/* Every block driver passed to bdrv_register() */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
116 
117 #ifdef _WIN32
/*
 * Return 1 if @filename starts with an ASCII letter immediately followed
 * by ':' (a drive prefix such as "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_lower = letter >= 'a' && letter <= 'z';
    const int is_upper = letter >= 'A' && letter <= 'Z';

    if (!is_lower && !is_upper) {
        /* Do not look at filename[1]: the string may be empty */
        return 0;
    }
    return filename[1] == ':';
}
124 
/*
 * Return 1 if @filename is exactly a drive specification ("d:") or starts
 * with the device prefix "\\.\" (or its "//./" spelling), 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* Bare drive letter with nothing after the colon */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
135 #endif
136 
137 /* throttling disk I/O limits */
138 void bdrv_set_io_limits(BlockDriverState *bs,
139                         ThrottleConfig *cfg)
140 {
141     int i;
142 
143     throttle_config(&bs->throttle_state, cfg);
144 
145     for (i = 0; i < 2; i++) {
146         qemu_co_enter_next(&bs->throttled_reqs[i]);
147     }
148 }
149 
150 /* this function drain all the throttled IOs */
151 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
152 {
153     bool drained = false;
154     bool enabled = bs->io_limits_enabled;
155     int i;
156 
157     bs->io_limits_enabled = false;
158 
159     for (i = 0; i < 2; i++) {
160         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
161             drained = true;
162         }
163     }
164 
165     bs->io_limits_enabled = enabled;
166 
167     return drained;
168 }
169 
170 void bdrv_io_limits_disable(BlockDriverState *bs)
171 {
172     bs->io_limits_enabled = false;
173 
174     bdrv_start_throttled_reqs(bs);
175 
176     throttle_destroy(&bs->throttle_state);
177 }
178 
179 static void bdrv_throttle_read_timer_cb(void *opaque)
180 {
181     BlockDriverState *bs = opaque;
182     qemu_co_enter_next(&bs->throttled_reqs[0]);
183 }
184 
185 static void bdrv_throttle_write_timer_cb(void *opaque)
186 {
187     BlockDriverState *bs = opaque;
188     qemu_co_enter_next(&bs->throttled_reqs[1]);
189 }
190 
191 /* should be called before bdrv_set_io_limits if a limit is set */
192 void bdrv_io_limits_enable(BlockDriverState *bs)
193 {
194     int clock_type = QEMU_CLOCK_REALTIME;
195 
196     if (qtest_enabled()) {
197         /* For testing block IO throttling only */
198         clock_type = QEMU_CLOCK_VIRTUAL;
199     }
200     assert(!bs->io_limits_enabled);
201     throttle_init(&bs->throttle_state,
202                   bdrv_get_aio_context(bs),
203                   clock_type,
204                   bdrv_throttle_read_timer_cb,
205                   bdrv_throttle_write_timer_cb,
206                   bs);
207     bs->io_limits_enabled = true;
208 }
209 
210 /* This function makes an IO wait if needed
211  *
212  * @nb_sectors: the number of sectors of the IO
213  * @is_write:   is the IO a write
214  */
215 static void bdrv_io_limits_intercept(BlockDriverState *bs,
216                                      unsigned int bytes,
217                                      bool is_write)
218 {
219     /* does this io must wait */
220     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
221 
222     /* if must wait or any request of this type throttled queue the IO */
223     if (must_wait ||
224         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
225         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
226     }
227 
228     /* the IO will be executed, do the accounting */
229     throttle_account(&bs->throttle_state, is_write, bytes);
230 
231 
232     /* if the next request must wait -> do nothing */
233     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
234         return;
235     }
236 
237     /* else queue next request for execution */
238     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
239 }
240 
241 size_t bdrv_opt_mem_align(BlockDriverState *bs)
242 {
243     if (!bs || !bs->drv) {
244         /* 4k should be on the safe side */
245         return 4096;
246     }
247 
248     return bs->bl.opt_mem_alignment;
249 }
250 
/*
 * Check if the path starts with "<protocol>:".
 *
 * The first ':' counts only if it appears before any path separator. On
 * Windows, drive specifications are never treated as protocols.
 */
int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* Index of the first stop character, or of the terminating NUL */
    return path[strcspn(path, stop_chars)] == ':';
}
268 
/*
 * Return non-zero if @path is absolute: it starts with '/', or on Windows
 * also with '\\' or a drive specification.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
281 
/*
 * Build the path of @filename in @dest (at most @dest_size bytes including
 * the terminating NUL).
 *
 * If @filename is absolute it is copied as is. Otherwise it is taken to be
 * relative to @base_path: the part of @base_path up to and including its
 * last path separator (or its first ':', whichever lies further right) is
 * kept and @filename is appended to it. URLs are supported.
 *
 * Nothing is written when @dest_size is not positive.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Position just past the first ':' (protocol), or the very start */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* Position just past the last directory separator, or the very start */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *last_backslash = strrchr(base_path, '\\');

        if (!after_sep || last_backslash > after_sep) {
            after_sep = last_backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* Keep whichever prefix reaches further into base_path */
    prefix_end = after_sep > after_colon ? after_sep : after_colon;

    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';

    pstrcat(dest, dest_size, filename);
}
325 
326 void bdrv_get_full_backing_filename_from_filename(const char *backed,
327                                                   const char *backing,
328                                                   char *dest, size_t sz,
329                                                   Error **errp)
330 {
331     if (backing[0] == '\0' || path_has_protocol(backing) ||
332         path_is_absolute(backing))
333     {
334         pstrcpy(dest, sz, backing);
335     } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
336         error_setg(errp, "Cannot use relative backing file names for '%s'",
337                    backed);
338     } else {
339         path_combine(dest, sz, backed, backing);
340     }
341 }
342 
343 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
344                                     Error **errp)
345 {
346     char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;
347 
348     bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
349                                                  dest, sz, errp);
350 }
351 
352 void bdrv_register(BlockDriver *bdrv)
353 {
354     /* Block drivers without coroutine functions need emulation */
355     if (!bdrv->bdrv_co_readv) {
356         bdrv->bdrv_co_readv = bdrv_co_readv_em;
357         bdrv->bdrv_co_writev = bdrv_co_writev_em;
358 
359         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
360          * the block driver lacks aio we need to emulate that too.
361          */
362         if (!bdrv->bdrv_aio_readv) {
363             /* add AIO emulation layer */
364             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
365             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
366         }
367     }
368 
369     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
370 }
371 
372 BlockDriverState *bdrv_new_root(void)
373 {
374     BlockDriverState *bs = bdrv_new();
375 
376     QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
377     return bs;
378 }
379 
380 BlockDriverState *bdrv_new(void)
381 {
382     BlockDriverState *bs;
383     int i;
384 
385     bs = g_new0(BlockDriverState, 1);
386     QLIST_INIT(&bs->dirty_bitmaps);
387     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
388         QLIST_INIT(&bs->op_blockers[i]);
389     }
390     bdrv_iostatus_disable(bs);
391     notifier_list_init(&bs->close_notifiers);
392     notifier_with_return_list_init(&bs->before_write_notifiers);
393     qemu_co_queue_init(&bs->throttled_reqs[0]);
394     qemu_co_queue_init(&bs->throttled_reqs[1]);
395     bs->refcnt = 1;
396     bs->aio_context = qemu_get_aio_context();
397 
398     return bs;
399 }
400 
401 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
402 {
403     notifier_list_add(&bs->close_notifiers, notify);
404 }
405 
406 BlockDriver *bdrv_find_format(const char *format_name)
407 {
408     BlockDriver *drv1;
409     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
410         if (!strcmp(drv1->format_name, format_name)) {
411             return drv1;
412         }
413     }
414     return NULL;
415 }
416 
417 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
418 {
419     static const char *whitelist_rw[] = {
420         CONFIG_BDRV_RW_WHITELIST
421     };
422     static const char *whitelist_ro[] = {
423         CONFIG_BDRV_RO_WHITELIST
424     };
425     const char **p;
426 
427     if (!whitelist_rw[0] && !whitelist_ro[0]) {
428         return 1;               /* no whitelist, anything goes */
429     }
430 
431     for (p = whitelist_rw; *p; p++) {
432         if (!strcmp(drv->format_name, *p)) {
433             return 1;
434         }
435     }
436     if (read_only) {
437         for (p = whitelist_ro; *p; p++) {
438             if (!strcmp(drv->format_name, *p)) {
439                 return 1;
440             }
441         }
442     }
443     return 0;
444 }
445 
446 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
447                                           bool read_only)
448 {
449     BlockDriver *drv = bdrv_find_format(format_name);
450     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
451 }
452 
/* Arguments and results handed to bdrv_create_co_entry() by bdrv_create() */
typedef struct CreateCo {
    BlockDriver *drv;   /* Driver whose bdrv_create callback is invoked */
    char *filename;     /* Copy of the image file name; freed by bdrv_create() */
    QemuOpts *opts;     /* Creation options passed through to the driver */
    int ret;            /* NOT_DONE until the entry function stores its result */
    Error *err;         /* Error reported by the driver callback, if any */
} CreateCo;
460 
461 static void coroutine_fn bdrv_create_co_entry(void *opaque)
462 {
463     Error *local_err = NULL;
464     int ret;
465 
466     CreateCo *cco = opaque;
467     assert(cco->drv);
468 
469     ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
470     if (local_err) {
471         error_propagate(&cco->err, local_err);
472     }
473     cco->ret = ret;
474 }
475 
476 int bdrv_create(BlockDriver *drv, const char* filename,
477                 QemuOpts *opts, Error **errp)
478 {
479     int ret;
480 
481     Coroutine *co;
482     CreateCo cco = {
483         .drv = drv,
484         .filename = g_strdup(filename),
485         .opts = opts,
486         .ret = NOT_DONE,
487         .err = NULL,
488     };
489 
490     if (!drv->bdrv_create) {
491         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
492         ret = -ENOTSUP;
493         goto out;
494     }
495 
496     if (qemu_in_coroutine()) {
497         /* Fast-path if already in coroutine context */
498         bdrv_create_co_entry(&cco);
499     } else {
500         co = qemu_coroutine_create(bdrv_create_co_entry);
501         qemu_coroutine_enter(co, &cco);
502         while (cco.ret == NOT_DONE) {
503             aio_poll(qemu_get_aio_context(), true);
504         }
505     }
506 
507     ret = cco.ret;
508     if (ret < 0) {
509         if (cco.err) {
510             error_propagate(errp, cco.err);
511         } else {
512             error_setg_errno(errp, -ret, "Could not create image");
513         }
514     }
515 
516 out:
517     g_free(cco.filename);
518     return ret;
519 }
520 
521 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
522 {
523     BlockDriver *drv;
524     Error *local_err = NULL;
525     int ret;
526 
527     drv = bdrv_find_protocol(filename, true, errp);
528     if (drv == NULL) {
529         return -ENOENT;
530     }
531 
532     ret = bdrv_create(drv, filename, opts, &local_err);
533     if (local_err) {
534         error_propagate(errp, local_err);
535     }
536     return ret;
537 }
538 
539 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
540 {
541     BlockDriver *drv = bs->drv;
542     Error *local_err = NULL;
543 
544     memset(&bs->bl, 0, sizeof(bs->bl));
545 
546     if (!drv) {
547         return;
548     }
549 
550     /* Take some limits from the children as a default */
551     if (bs->file) {
552         bdrv_refresh_limits(bs->file, &local_err);
553         if (local_err) {
554             error_propagate(errp, local_err);
555             return;
556         }
557         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
558         bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
559         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
560     } else {
561         bs->bl.opt_mem_alignment = 512;
562     }
563 
564     if (bs->backing_hd) {
565         bdrv_refresh_limits(bs->backing_hd, &local_err);
566         if (local_err) {
567             error_propagate(errp, local_err);
568             return;
569         }
570         bs->bl.opt_transfer_length =
571             MAX(bs->bl.opt_transfer_length,
572                 bs->backing_hd->bl.opt_transfer_length);
573         bs->bl.max_transfer_length =
574             MIN_NON_ZERO(bs->bl.max_transfer_length,
575                          bs->backing_hd->bl.max_transfer_length);
576         bs->bl.opt_mem_alignment =
577             MAX(bs->bl.opt_mem_alignment,
578                 bs->backing_hd->bl.opt_mem_alignment);
579     }
580 
581     /* Then let the driver override it */
582     if (drv->bdrv_refresh_limits) {
583         drv->bdrv_refresh_limits(bs, errp);
584     }
585 }
586 
587 /**
588  * Try to get @bs's logical and physical block size.
589  * On success, store them in @bsz struct and return 0.
590  * On failure return -errno.
591  * @bs must not be empty.
592  */
593 int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
594 {
595     BlockDriver *drv = bs->drv;
596 
597     if (drv && drv->bdrv_probe_blocksizes) {
598         return drv->bdrv_probe_blocksizes(bs, bsz);
599     }
600 
601     return -ENOTSUP;
602 }
603 
604 /**
605  * Try to get @bs's geometry (cyls, heads, sectors).
606  * On success, store them in @geo struct and return 0.
607  * On failure return -errno.
608  * @bs must not be empty.
609  */
610 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
611 {
612     BlockDriver *drv = bs->drv;
613 
614     if (drv && drv->bdrv_probe_geometry) {
615         return drv->bdrv_probe_geometry(bs, geo);
616     }
617 
618     return -ENOTSUP;
619 }
620 
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer that receives the name of the created file
 * @size:     size of @filename in bytes (on Windows it must be at least
 *            MAX_PATH)
 *
 * On POSIX hosts the file is created in $TMPDIR, or in /var/tmp if that
 * variable is unset.
 *
 * Return 0 upon success, otherwise a negative errno value: -EOVERFLOW if
 * the name does not fit into @filename, else the error of the failing
 * call. (The Windows branch returns the negated GetLastError() code.)
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;

    /* A negative size would be converted to a huge size_t by snprintf() */
    if (size <= 0) {
        return -EOVERFLOW;
    }

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so remember close()'s error first */
        int saved_errno = errno;

        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
656 
657 /*
658  * Detect host devices. By convention, /dev/cdrom[N] is always
659  * recognized as a host CDROM.
660  */
661 static BlockDriver *find_hdev_driver(const char *filename)
662 {
663     int score_max = 0, score;
664     BlockDriver *drv = NULL, *d;
665 
666     QLIST_FOREACH(d, &bdrv_drivers, list) {
667         if (d->bdrv_probe_device) {
668             score = d->bdrv_probe_device(filename);
669             if (score > score_max) {
670                 score_max = score;
671                 drv = d;
672             }
673         }
674     }
675 
676     return drv;
677 }
678 
679 BlockDriver *bdrv_find_protocol(const char *filename,
680                                 bool allow_protocol_prefix,
681                                 Error **errp)
682 {
683     BlockDriver *drv1;
684     char protocol[128];
685     int len;
686     const char *p;
687 
688     /* TODO Drivers without bdrv_file_open must be specified explicitly */
689 
690     /*
691      * XXX(hch): we really should not let host device detection
692      * override an explicit protocol specification, but moving this
693      * later breaks access to device names with colons in them.
694      * Thanks to the brain-dead persistent naming schemes on udev-
695      * based Linux systems those actually are quite common.
696      */
697     drv1 = find_hdev_driver(filename);
698     if (drv1) {
699         return drv1;
700     }
701 
702     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
703         return &bdrv_file;
704     }
705 
706     p = strchr(filename, ':');
707     assert(p != NULL);
708     len = p - filename;
709     if (len > sizeof(protocol) - 1)
710         len = sizeof(protocol) - 1;
711     memcpy(protocol, filename, len);
712     protocol[len] = '\0';
713     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
714         if (drv1->protocol_name &&
715             !strcmp(drv1->protocol_name, protocol)) {
716             return drv1;
717         }
718     }
719 
720     error_setg(errp, "Unknown protocol '%s'", protocol);
721     return NULL;
722 }
723 
724 /*
725  * Guess image format by probing its contents.
726  * This is not a good idea when your image is raw (CVE-2008-2004), but
727  * we do it anyway for backward compatibility.
728  *
729  * @buf         contains the image's first @buf_size bytes.
730  * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
731  *              but can be smaller if the image file is smaller)
732  * @filename    is its filename.
733  *
734  * For all block drivers, call the bdrv_probe() method to get its
735  * probing score.
736  * Return the first block driver with the highest probing score.
737  */
738 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
739                             const char *filename)
740 {
741     int score_max = 0, score;
742     BlockDriver *drv = NULL, *d;
743 
744     QLIST_FOREACH(d, &bdrv_drivers, list) {
745         if (d->bdrv_probe) {
746             score = d->bdrv_probe(buf, buf_size, filename);
747             if (score > score_max) {
748                 score_max = score;
749                 drv = d;
750             }
751         }
752     }
753 
754     return drv;
755 }
756 
757 static int find_image_format(BlockDriverState *bs, const char *filename,
758                              BlockDriver **pdrv, Error **errp)
759 {
760     BlockDriver *drv;
761     uint8_t buf[BLOCK_PROBE_BUF_SIZE];
762     int ret = 0;
763 
764     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
765     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
766         *pdrv = &bdrv_raw;
767         return ret;
768     }
769 
770     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
771     if (ret < 0) {
772         error_setg_errno(errp, -ret, "Could not read image for determining its "
773                          "format");
774         *pdrv = NULL;
775         return ret;
776     }
777 
778     drv = bdrv_probe_all(buf, ret, filename);
779     if (!drv) {
780         error_setg(errp, "Could not determine image format: No compatible "
781                    "driver found");
782         ret = -ENOENT;
783     }
784     *pdrv = drv;
785     return ret;
786 }
787 
788 /**
789  * Set the current 'total_sectors' value
790  * Return 0 on success, -errno on error.
791  */
792 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
793 {
794     BlockDriver *drv = bs->drv;
795 
796     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
797     if (bs->sg)
798         return 0;
799 
800     /* query actual device if possible, otherwise just trust the hint */
801     if (drv->bdrv_getlength) {
802         int64_t length = drv->bdrv_getlength(bs);
803         if (length < 0) {
804             return length;
805         }
806         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
807     }
808 
809     bs->total_sectors = hint;
810     return 0;
811 }
812 
813 /**
814  * Set open flags for a given discard mode
815  *
816  * Return 0 on success, -1 if the discard mode was invalid.
817  */
818 int bdrv_parse_discard_flags(const char *mode, int *flags)
819 {
820     *flags &= ~BDRV_O_UNMAP;
821 
822     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
823         /* do nothing */
824     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
825         *flags |= BDRV_O_UNMAP;
826     } else {
827         return -1;
828     }
829 
830     return 0;
831 }
832 
833 /**
834  * Set open flags for a given cache mode
835  *
836  * Return 0 on success, -1 if the cache mode was invalid.
837  */
838 int bdrv_parse_cache_flags(const char *mode, int *flags)
839 {
840     *flags &= ~BDRV_O_CACHE_MASK;
841 
842     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
843         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
844     } else if (!strcmp(mode, "directsync")) {
845         *flags |= BDRV_O_NOCACHE;
846     } else if (!strcmp(mode, "writeback")) {
847         *flags |= BDRV_O_CACHE_WB;
848     } else if (!strcmp(mode, "unsafe")) {
849         *flags |= BDRV_O_CACHE_WB;
850         *flags |= BDRV_O_NO_FLUSH;
851     } else if (!strcmp(mode, "writethrough")) {
852         /* this is the default */
853     } else {
854         return -1;
855     }
856 
857     return 0;
858 }
859 
860 /**
861  * The copy-on-read flag is actually a reference count so multiple users may
862  * use the feature without worrying about clobbering its previous state.
863  * Copy-on-read stays enabled until all users have called to disable it.
864  */
865 void bdrv_enable_copy_on_read(BlockDriverState *bs)
866 {
867     bs->copy_on_read++;
868 }
869 
870 void bdrv_disable_copy_on_read(BlockDriverState *bs)
871 {
872     assert(bs->copy_on_read > 0);
873     bs->copy_on_read--;
874 }
875 
876 /*
877  * Returns the flags that a temporary snapshot should get, based on the
878  * originally requested flags (the originally requested image will have flags
879  * like a backing file)
880  */
881 static int bdrv_temp_snapshot_flags(int flags)
882 {
883     return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
884 }
885 
886 /*
887  * Returns the flags that bs->file should get, based on the given flags for
888  * the parent BDS
889  */
890 static int bdrv_inherited_flags(int flags)
891 {
892     /* Enable protocol handling, disable format probing for bs->file */
893     flags |= BDRV_O_PROTOCOL;
894 
895     /* Our block drivers take care to send flushes and respect unmap policy,
896      * so we can enable both unconditionally on lower layers. */
897     flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
898 
899     /* Clear flags that only apply to the top layer */
900     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
901 
902     return flags;
903 }
904 
905 /*
906  * Returns the flags that bs->backing_hd should get, based on the given flags
907  * for the parent BDS
908  */
909 static int bdrv_backing_flags(int flags)
910 {
911     /* backing files always opened read-only */
912     flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
913 
914     /* snapshot=on is handled on the top layer */
915     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
916 
917     return flags;
918 }
919 
920 static int bdrv_open_flags(BlockDriverState *bs, int flags)
921 {
922     int open_flags = flags | BDRV_O_CACHE_WB;
923 
924     /*
925      * Clear flags that are internal to the block layer before opening the
926      * image.
927      */
928     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
929 
930     /*
931      * Snapshots should be writable.
932      */
933     if (flags & BDRV_O_TEMPORARY) {
934         open_flags |= BDRV_O_RDWR;
935     }
936 
937     return open_flags;
938 }
939 
940 static void bdrv_assign_node_name(BlockDriverState *bs,
941                                   const char *node_name,
942                                   Error **errp)
943 {
944     if (!node_name) {
945         return;
946     }
947 
948     /* Check for empty string or invalid characters */
949     if (!id_wellformed(node_name)) {
950         error_setg(errp, "Invalid node name");
951         return;
952     }
953 
954     /* takes care of avoiding namespaces collisions */
955     if (blk_by_name(node_name)) {
956         error_setg(errp, "node-name=%s is conflicting with a device id",
957                    node_name);
958         return;
959     }
960 
961     /* takes care of avoiding duplicates node names */
962     if (bdrv_find_node(node_name)) {
963         error_setg(errp, "Duplicate node name");
964         return;
965     }
966 
967     /* copy node name into the bs and insert it into the graph list */
968     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
969     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
970 }
971 
972 /*
973  * Common part for opening disk images and files
974  *
975  * Removes all processed options from *options.
976  */
977 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
978     QDict *options, int flags, BlockDriver *drv, Error **errp)
979 {
980     int ret, open_flags;
981     const char *filename;
982     const char *node_name = NULL;
983     Error *local_err = NULL;
984 
985     assert(drv != NULL);
986     assert(bs->file == NULL);
987     assert(options != NULL && bs->options != options);
988 
989     if (file != NULL) {
990         filename = file->filename;
991     } else {
992         filename = qdict_get_try_str(options, "filename");
993     }
994 
995     if (drv->bdrv_needs_filename && !filename) {
996         error_setg(errp, "The '%s' block driver requires a file name",
997                    drv->format_name);
998         return -EINVAL;
999     }
1000 
1001     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
1002 
1003     node_name = qdict_get_try_str(options, "node-name");
1004     bdrv_assign_node_name(bs, node_name, &local_err);
1005     if (local_err) {
1006         error_propagate(errp, local_err);
1007         return -EINVAL;
1008     }
1009     qdict_del(options, "node-name");
1010 
1011     /* bdrv_open() with directly using a protocol as drv. This layer is already
1012      * opened, so assign it to bs (while file becomes a closed BlockDriverState)
1013      * and return immediately. */
1014     if (file != NULL && drv->bdrv_file_open) {
1015         bdrv_swap(file, bs);
1016         return 0;
1017     }
1018 
1019     bs->open_flags = flags;
1020     bs->guest_block_size = 512;
1021     bs->request_alignment = 512;
1022     bs->zero_beyond_eof = true;
1023     open_flags = bdrv_open_flags(bs, flags);
1024     bs->read_only = !(open_flags & BDRV_O_RDWR);
1025 
1026     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
1027         error_setg(errp,
1028                    !bs->read_only && bdrv_is_whitelisted(drv, true)
1029                         ? "Driver '%s' can only be used for read-only devices"
1030                         : "Driver '%s' is not whitelisted",
1031                    drv->format_name);
1032         return -ENOTSUP;
1033     }
1034 
1035     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
1036     if (flags & BDRV_O_COPY_ON_READ) {
1037         if (!bs->read_only) {
1038             bdrv_enable_copy_on_read(bs);
1039         } else {
1040             error_setg(errp, "Can't use copy-on-read on read-only device");
1041             return -EINVAL;
1042         }
1043     }
1044 
1045     if (filename != NULL) {
1046         pstrcpy(bs->filename, sizeof(bs->filename), filename);
1047     } else {
1048         bs->filename[0] = '\0';
1049     }
1050     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
1051 
1052     bs->drv = drv;
1053     bs->opaque = g_malloc0(drv->instance_size);
1054 
1055     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
1056 
1057     /* Open the image, either directly or using a protocol */
1058     if (drv->bdrv_file_open) {
1059         assert(file == NULL);
1060         assert(!drv->bdrv_needs_filename || filename != NULL);
1061         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
1062     } else {
1063         if (file == NULL) {
1064             error_setg(errp, "Can't use '%s' as a block driver for the "
1065                        "protocol level", drv->format_name);
1066             ret = -EINVAL;
1067             goto free_and_fail;
1068         }
1069         bs->file = file;
1070         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
1071     }
1072 
1073     if (ret < 0) {
1074         if (local_err) {
1075             error_propagate(errp, local_err);
1076         } else if (bs->filename[0]) {
1077             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1078         } else {
1079             error_setg_errno(errp, -ret, "Could not open image");
1080         }
1081         goto free_and_fail;
1082     }
1083 
1084     if (bs->encrypted) {
1085         error_report("Encrypted images are deprecated");
1086         error_printf("Support for them will be removed in a future release.\n"
1087                      "You can use 'qemu-img convert' to convert your image"
1088                      " to an unencrypted one.\n");
1089     }
1090 
1091     ret = refresh_total_sectors(bs, bs->total_sectors);
1092     if (ret < 0) {
1093         error_setg_errno(errp, -ret, "Could not refresh total sector count");
1094         goto free_and_fail;
1095     }
1096 
1097     bdrv_refresh_limits(bs, &local_err);
1098     if (local_err) {
1099         error_propagate(errp, local_err);
1100         ret = -EINVAL;
1101         goto free_and_fail;
1102     }
1103 
1104     assert(bdrv_opt_mem_align(bs) != 0);
1105     assert((bs->request_alignment != 0) || bs->sg);
1106     return 0;
1107 
1108 free_and_fail:
1109     bs->file = NULL;
1110     g_free(bs->opaque);
1111     bs->opaque = NULL;
1112     bs->drv = NULL;
1113     return ret;
1114 }
1115 
1116 static QDict *parse_json_filename(const char *filename, Error **errp)
1117 {
1118     QObject *options_obj;
1119     QDict *options;
1120     int ret;
1121 
1122     ret = strstart(filename, "json:", &filename);
1123     assert(ret);
1124 
1125     options_obj = qobject_from_json(filename);
1126     if (!options_obj) {
1127         error_setg(errp, "Could not parse the JSON options");
1128         return NULL;
1129     }
1130 
1131     if (qobject_type(options_obj) != QTYPE_QDICT) {
1132         qobject_decref(options_obj);
1133         error_setg(errp, "Invalid JSON object given");
1134         return NULL;
1135     }
1136 
1137     options = qobject_to_qdict(options_obj);
1138     qdict_flatten(options);
1139 
1140     return options;
1141 }
1142 
1143 /*
1144  * Fills in default options for opening images and converts the legacy
1145  * filename/flags pair to option QDict entries.
1146  */
1147 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1148                              BlockDriver *drv, Error **errp)
1149 {
1150     const char *filename = *pfilename;
1151     const char *drvname;
1152     bool protocol = flags & BDRV_O_PROTOCOL;
1153     bool parse_filename = false;
1154     Error *local_err = NULL;
1155 
1156     /* Parse json: pseudo-protocol */
1157     if (filename && g_str_has_prefix(filename, "json:")) {
1158         QDict *json_options = parse_json_filename(filename, &local_err);
1159         if (local_err) {
1160             error_propagate(errp, local_err);
1161             return -EINVAL;
1162         }
1163 
1164         /* Options given in the filename have lower priority than options
1165          * specified directly */
1166         qdict_join(*options, json_options, false);
1167         QDECREF(json_options);
1168         *pfilename = filename = NULL;
1169     }
1170 
1171     /* Fetch the file name from the options QDict if necessary */
1172     if (protocol && filename) {
1173         if (!qdict_haskey(*options, "filename")) {
1174             qdict_put(*options, "filename", qstring_from_str(filename));
1175             parse_filename = true;
1176         } else {
1177             error_setg(errp, "Can't specify 'file' and 'filename' options at "
1178                              "the same time");
1179             return -EINVAL;
1180         }
1181     }
1182 
1183     /* Find the right block driver */
1184     filename = qdict_get_try_str(*options, "filename");
1185     drvname = qdict_get_try_str(*options, "driver");
1186 
1187     if (drv) {
1188         if (drvname) {
1189             error_setg(errp, "Driver specified twice");
1190             return -EINVAL;
1191         }
1192         drvname = drv->format_name;
1193         qdict_put(*options, "driver", qstring_from_str(drvname));
1194     } else {
1195         if (!drvname && protocol) {
1196             if (filename) {
1197                 drv = bdrv_find_protocol(filename, parse_filename, errp);
1198                 if (!drv) {
1199                     return -EINVAL;
1200                 }
1201 
1202                 drvname = drv->format_name;
1203                 qdict_put(*options, "driver", qstring_from_str(drvname));
1204             } else {
1205                 error_setg(errp, "Must specify either driver or file");
1206                 return -EINVAL;
1207             }
1208         } else if (drvname) {
1209             drv = bdrv_find_format(drvname);
1210             if (!drv) {
1211                 error_setg(errp, "Unknown driver '%s'", drvname);
1212                 return -ENOENT;
1213             }
1214         }
1215     }
1216 
1217     assert(drv || !protocol);
1218 
1219     /* Driver-specific filename parsing */
1220     if (drv && drv->bdrv_parse_filename && parse_filename) {
1221         drv->bdrv_parse_filename(filename, *options, &local_err);
1222         if (local_err) {
1223             error_propagate(errp, local_err);
1224             return -EINVAL;
1225         }
1226 
1227         if (!drv->bdrv_needs_filename) {
1228             qdict_del(*options, "filename");
1229         }
1230     }
1231 
1232     return 0;
1233 }
1234 
1235 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1236 {
1237 
1238     if (bs->backing_hd) {
1239         assert(bs->backing_blocker);
1240         bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1241     } else if (backing_hd) {
1242         error_setg(&bs->backing_blocker,
1243                    "node is used as backing hd of '%s'",
1244                    bdrv_get_device_or_node_name(bs));
1245     }
1246 
1247     bs->backing_hd = backing_hd;
1248     if (!backing_hd) {
1249         error_free(bs->backing_blocker);
1250         bs->backing_blocker = NULL;
1251         goto out;
1252     }
1253     bs->open_flags &= ~BDRV_O_NO_BACKING;
1254     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1255     pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1256             backing_hd->drv ? backing_hd->drv->format_name : "");
1257 
1258     bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1259     /* Otherwise we won't be able to commit due to check in bdrv_commit */
1260     bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1261                     bs->backing_blocker);
1262 out:
1263     bdrv_refresh_limits(bs, NULL);
1264 }
1265 
1266 /*
1267  * Opens the backing file for a BlockDriverState if not yet open
1268  *
1269  * options is a QDict of options to pass to the block drivers, or NULL for an
1270  * empty set of options. The reference to the QDict is transferred to this
1271  * function (even on failure), so if the caller intends to reuse the dictionary,
1272  * it needs to use QINCREF() before calling bdrv_file_open.
1273  */
1274 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1275 {
1276     char *backing_filename = g_malloc0(PATH_MAX);
1277     int ret = 0;
1278     BlockDriverState *backing_hd;
1279     Error *local_err = NULL;
1280 
1281     if (bs->backing_hd != NULL) {
1282         QDECREF(options);
1283         goto free_exit;
1284     }
1285 
1286     /* NULL means an empty set of options */
1287     if (options == NULL) {
1288         options = qdict_new();
1289     }
1290 
1291     bs->open_flags &= ~BDRV_O_NO_BACKING;
1292     if (qdict_haskey(options, "file.filename")) {
1293         backing_filename[0] = '\0';
1294     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1295         QDECREF(options);
1296         goto free_exit;
1297     } else {
1298         bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
1299                                        &local_err);
1300         if (local_err) {
1301             ret = -EINVAL;
1302             error_propagate(errp, local_err);
1303             QDECREF(options);
1304             goto free_exit;
1305         }
1306     }
1307 
1308     if (!bs->drv || !bs->drv->supports_backing) {
1309         ret = -EINVAL;
1310         error_setg(errp, "Driver doesn't support backing files");
1311         QDECREF(options);
1312         goto free_exit;
1313     }
1314 
1315     backing_hd = bdrv_new();
1316 
1317     if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
1318         qdict_put(options, "driver", qstring_from_str(bs->backing_format));
1319     }
1320 
1321     assert(bs->backing_hd == NULL);
1322     ret = bdrv_open(&backing_hd,
1323                     *backing_filename ? backing_filename : NULL, NULL, options,
1324                     bdrv_backing_flags(bs->open_flags), NULL, &local_err);
1325     if (ret < 0) {
1326         bdrv_unref(backing_hd);
1327         backing_hd = NULL;
1328         bs->open_flags |= BDRV_O_NO_BACKING;
1329         error_setg(errp, "Could not open backing file: %s",
1330                    error_get_pretty(local_err));
1331         error_free(local_err);
1332         goto free_exit;
1333     }
1334     bdrv_set_backing_hd(bs, backing_hd);
1335 
1336 free_exit:
1337     g_free(backing_filename);
1338     return ret;
1339 }
1340 
1341 /*
1342  * Opens a disk image whose options are given as BlockdevRef in another block
1343  * device's options.
1344  *
1345  * If allow_none is true, no image will be opened if filename is false and no
1346  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1347  *
1348  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1349  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1350  * itself, all options starting with "${bdref_key}." are considered part of the
1351  * BlockdevRef.
1352  *
1353  * The BlockdevRef will be removed from the options QDict.
1354  *
1355  * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1356  */
1357 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1358                     QDict *options, const char *bdref_key, int flags,
1359                     bool allow_none, Error **errp)
1360 {
1361     QDict *image_options;
1362     int ret;
1363     char *bdref_key_dot;
1364     const char *reference;
1365 
1366     assert(pbs);
1367     assert(*pbs == NULL);
1368 
1369     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1370     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1371     g_free(bdref_key_dot);
1372 
1373     reference = qdict_get_try_str(options, bdref_key);
1374     if (!filename && !reference && !qdict_size(image_options)) {
1375         if (allow_none) {
1376             ret = 0;
1377         } else {
1378             error_setg(errp, "A block device must be specified for \"%s\"",
1379                        bdref_key);
1380             ret = -EINVAL;
1381         }
1382         QDECREF(image_options);
1383         goto done;
1384     }
1385 
1386     ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1387 
1388 done:
1389     qdict_del(options, bdref_key);
1390     return ret;
1391 }
1392 
1393 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1394 {
1395     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1396     char *tmp_filename = g_malloc0(PATH_MAX + 1);
1397     int64_t total_size;
1398     QemuOpts *opts = NULL;
1399     QDict *snapshot_options;
1400     BlockDriverState *bs_snapshot;
1401     Error *local_err;
1402     int ret;
1403 
1404     /* if snapshot, we create a temporary backing file and open it
1405        instead of opening 'filename' directly */
1406 
1407     /* Get the required size from the image */
1408     total_size = bdrv_getlength(bs);
1409     if (total_size < 0) {
1410         ret = total_size;
1411         error_setg_errno(errp, -total_size, "Could not get image size");
1412         goto out;
1413     }
1414 
1415     /* Create the temporary image */
1416     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1417     if (ret < 0) {
1418         error_setg_errno(errp, -ret, "Could not get temporary filename");
1419         goto out;
1420     }
1421 
1422     opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
1423                             &error_abort);
1424     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
1425     ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
1426     qemu_opts_del(opts);
1427     if (ret < 0) {
1428         error_setg_errno(errp, -ret, "Could not create temporary overlay "
1429                          "'%s': %s", tmp_filename,
1430                          error_get_pretty(local_err));
1431         error_free(local_err);
1432         goto out;
1433     }
1434 
1435     /* Prepare a new options QDict for the temporary file */
1436     snapshot_options = qdict_new();
1437     qdict_put(snapshot_options, "file.driver",
1438               qstring_from_str("file"));
1439     qdict_put(snapshot_options, "file.filename",
1440               qstring_from_str(tmp_filename));
1441 
1442     bs_snapshot = bdrv_new();
1443 
1444     ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1445                     flags, &bdrv_qcow2, &local_err);
1446     if (ret < 0) {
1447         error_propagate(errp, local_err);
1448         goto out;
1449     }
1450 
1451     bdrv_append(bs_snapshot, bs);
1452 
1453 out:
1454     g_free(tmp_filename);
1455     return ret;
1456 }
1457 
1458 /*
1459  * Opens a disk image (raw, qcow2, vmdk, ...)
1460  *
1461  * options is a QDict of options to pass to the block drivers, or NULL for an
1462  * empty set of options. The reference to the QDict belongs to the block layer
1463  * after the call (even on failure), so if the caller intends to reuse the
1464  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1465  *
1466  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1467  * If it is not NULL, the referenced BDS will be reused.
1468  *
1469  * The reference parameter may be used to specify an existing block device which
1470  * should be opened. If specified, neither options nor a filename may be given,
1471  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1472  */
1473 int bdrv_open(BlockDriverState **pbs, const char *filename,
1474               const char *reference, QDict *options, int flags,
1475               BlockDriver *drv, Error **errp)
1476 {
1477     int ret;
1478     BlockDriverState *file = NULL, *bs;
1479     const char *drvname;
1480     Error *local_err = NULL;
1481     int snapshot_flags = 0;
1482 
1483     assert(pbs);
1484 
1485     if (reference) {
1486         bool options_non_empty = options ? qdict_size(options) : false;
1487         QDECREF(options);
1488 
1489         if (*pbs) {
1490             error_setg(errp, "Cannot reuse an existing BDS when referencing "
1491                        "another block device");
1492             return -EINVAL;
1493         }
1494 
1495         if (filename || options_non_empty) {
1496             error_setg(errp, "Cannot reference an existing block device with "
1497                        "additional options or a new filename");
1498             return -EINVAL;
1499         }
1500 
1501         bs = bdrv_lookup_bs(reference, reference, errp);
1502         if (!bs) {
1503             return -ENODEV;
1504         }
1505         bdrv_ref(bs);
1506         *pbs = bs;
1507         return 0;
1508     }
1509 
1510     if (*pbs) {
1511         bs = *pbs;
1512     } else {
1513         bs = bdrv_new();
1514     }
1515 
1516     /* NULL means an empty set of options */
1517     if (options == NULL) {
1518         options = qdict_new();
1519     }
1520 
1521     ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1522     if (local_err) {
1523         goto fail;
1524     }
1525 
1526     /* Find the right image format driver */
1527     drv = NULL;
1528     drvname = qdict_get_try_str(options, "driver");
1529     if (drvname) {
1530         drv = bdrv_find_format(drvname);
1531         qdict_del(options, "driver");
1532         if (!drv) {
1533             error_setg(errp, "Unknown driver: '%s'", drvname);
1534             ret = -EINVAL;
1535             goto fail;
1536         }
1537     }
1538 
1539     assert(drvname || !(flags & BDRV_O_PROTOCOL));
1540     if (drv && !drv->bdrv_file_open) {
1541         /* If the user explicitly wants a format driver here, we'll need to add
1542          * another layer for the protocol in bs->file */
1543         flags &= ~BDRV_O_PROTOCOL;
1544     }
1545 
1546     bs->options = options;
1547     options = qdict_clone_shallow(options);
1548 
1549     /* Open image file without format layer */
1550     if ((flags & BDRV_O_PROTOCOL) == 0) {
1551         if (flags & BDRV_O_RDWR) {
1552             flags |= BDRV_O_ALLOW_RDWR;
1553         }
1554         if (flags & BDRV_O_SNAPSHOT) {
1555             snapshot_flags = bdrv_temp_snapshot_flags(flags);
1556             flags = bdrv_backing_flags(flags);
1557         }
1558 
1559         assert(file == NULL);
1560         ret = bdrv_open_image(&file, filename, options, "file",
1561                               bdrv_inherited_flags(flags),
1562                               true, &local_err);
1563         if (ret < 0) {
1564             goto fail;
1565         }
1566     }
1567 
1568     /* Image format probing */
1569     bs->probed = !drv;
1570     if (!drv && file) {
1571         ret = find_image_format(file, filename, &drv, &local_err);
1572         if (ret < 0) {
1573             goto fail;
1574         }
1575     } else if (!drv) {
1576         error_setg(errp, "Must specify either driver or file");
1577         ret = -EINVAL;
1578         goto fail;
1579     }
1580 
1581     /* Open the image */
1582     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1583     if (ret < 0) {
1584         goto fail;
1585     }
1586 
1587     if (file && (bs->file != file)) {
1588         bdrv_unref(file);
1589         file = NULL;
1590     }
1591 
1592     /* If there is a backing file, use it */
1593     if ((flags & BDRV_O_NO_BACKING) == 0) {
1594         QDict *backing_options;
1595 
1596         qdict_extract_subqdict(options, &backing_options, "backing.");
1597         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1598         if (ret < 0) {
1599             goto close_and_fail;
1600         }
1601     }
1602 
1603     bdrv_refresh_filename(bs);
1604 
1605     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1606      * temporary snapshot afterwards. */
1607     if (snapshot_flags) {
1608         ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1609         if (local_err) {
1610             goto close_and_fail;
1611         }
1612     }
1613 
1614     /* Check if any unknown options were used */
1615     if (options && (qdict_size(options) != 0)) {
1616         const QDictEntry *entry = qdict_first(options);
1617         if (flags & BDRV_O_PROTOCOL) {
1618             error_setg(errp, "Block protocol '%s' doesn't support the option "
1619                        "'%s'", drv->format_name, entry->key);
1620         } else {
1621             error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1622                        "support the option '%s'", drv->format_name,
1623                        bdrv_get_device_name(bs), entry->key);
1624         }
1625 
1626         ret = -EINVAL;
1627         goto close_and_fail;
1628     }
1629 
1630     if (!bdrv_key_required(bs)) {
1631         if (bs->blk) {
1632             blk_dev_change_media_cb(bs->blk, true);
1633         }
1634     } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1635                && !runstate_check(RUN_STATE_INMIGRATE)
1636                && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1637         error_setg(errp,
1638                    "Guest must be stopped for opening of encrypted image");
1639         ret = -EBUSY;
1640         goto close_and_fail;
1641     }
1642 
1643     QDECREF(options);
1644     *pbs = bs;
1645     return 0;
1646 
1647 fail:
1648     if (file != NULL) {
1649         bdrv_unref(file);
1650     }
1651     QDECREF(bs->options);
1652     QDECREF(options);
1653     bs->options = NULL;
1654     if (!*pbs) {
1655         /* If *pbs is NULL, a new BDS has been created in this function and
1656            needs to be freed now. Otherwise, it does not need to be closed,
1657            since it has not really been opened yet. */
1658         bdrv_unref(bs);
1659     }
1660     if (local_err) {
1661         error_propagate(errp, local_err);
1662     }
1663     return ret;
1664 
1665 close_and_fail:
1666     /* See fail path, but now the BDS has to be always closed */
1667     if (*pbs) {
1668         bdrv_close(bs);
1669     } else {
1670         bdrv_unref(bs);
1671     }
1672     QDECREF(options);
1673     if (local_err) {
1674         error_propagate(errp, local_err);
1675     }
1676     return ret;
1677 }
1678 
/*
 * One element of a BlockReopenQueue: the pending reopen of a single
 * BlockDriverState.
 */
typedef struct BlockReopenQueueEntry {
     /* set by bdrv_reopen_multiple() once bdrv_reopen_prepare() succeeded
      * for this entry; decides whether bdrv_reopen_abort() is needed */
     bool prepared;
     /* the BDS to reopen and its new flags, filled in by bdrv_reopen_queue() */
     BDRVReopenState state;
     /* linkage within the queue */
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1684 
1685 /*
1686  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1687  * reopen of multiple devices.
1688  *
1689  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1690  * already performed, or alternatively may be NULL a new BlockReopenQueue will
1691  * be created and initialized. This newly created BlockReopenQueue should be
1692  * passed back in for subsequent calls that are intended to be of the same
1693  * atomic 'set'.
1694  *
1695  * bs is the BlockDriverState to add to the reopen queue.
1696  *
1697  * flags contains the open flags for the associated bs
1698  *
1699  * returns a pointer to bs_queue, which is either the newly allocated
1700  * bs_queue, or the existing bs_queue being used.
1701  *
1702  */
1703 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1704                                     BlockDriverState *bs, int flags)
1705 {
1706     assert(bs != NULL);
1707 
1708     BlockReopenQueueEntry *bs_entry;
1709     if (bs_queue == NULL) {
1710         bs_queue = g_new0(BlockReopenQueue, 1);
1711         QSIMPLEQ_INIT(bs_queue);
1712     }
1713 
1714     /* bdrv_open() masks this flag out */
1715     flags &= ~BDRV_O_PROTOCOL;
1716 
1717     if (bs->file) {
1718         bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1719     }
1720 
1721     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1722     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1723 
1724     bs_entry->state.bs = bs;
1725     bs_entry->state.flags = flags;
1726 
1727     return bs_queue;
1728 }
1729 
1730 /*
1731  * Reopen multiple BlockDriverStates atomically & transactionally.
1732  *
1733  * The queue passed in (bs_queue) must have been built up previous
1734  * via bdrv_reopen_queue().
1735  *
1736  * Reopens all BDS specified in the queue, with the appropriate
1737  * flags.  All devices are prepared for reopen, and failure of any
1738  * device will cause all device changes to be abandonded, and intermediate
1739  * data cleaned up.
1740  *
1741  * If all devices prepare successfully, then the changes are committed
1742  * to all devices.
1743  *
1744  */
1745 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1746 {
1747     int ret = -1;
1748     BlockReopenQueueEntry *bs_entry, *next;
1749     Error *local_err = NULL;
1750 
1751     assert(bs_queue != NULL);
1752 
1753     bdrv_drain_all();
1754 
1755     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1756         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1757             error_propagate(errp, local_err);
1758             goto cleanup;
1759         }
1760         bs_entry->prepared = true;
1761     }
1762 
1763     /* If we reach this point, we have success and just need to apply the
1764      * changes
1765      */
1766     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1767         bdrv_reopen_commit(&bs_entry->state);
1768     }
1769 
1770     ret = 0;
1771 
1772 cleanup:
1773     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1774         if (ret && bs_entry->prepared) {
1775             bdrv_reopen_abort(&bs_entry->state);
1776         }
1777         g_free(bs_entry);
1778     }
1779     g_free(bs_queue);
1780     return ret;
1781 }
1782 
1783 
1784 /* Reopen a single BlockDriverState with the specified flags. */
1785 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1786 {
1787     int ret = -1;
1788     Error *local_err = NULL;
1789     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1790 
1791     ret = bdrv_reopen_multiple(queue, &local_err);
1792     if (local_err != NULL) {
1793         error_propagate(errp, local_err);
1794     }
1795     return ret;
1796 }
1797 
1798 
1799 /*
1800  * Prepares a BlockDriverState for reopen. All changes are staged in the
1801  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1802  * the block driver layer .bdrv_reopen_prepare()
1803  *
1804  * bs is the BlockDriverState to reopen
1805  * flags are the new open flags
1806  * queue is the reopen queue
1807  *
1808  * Returns 0 on success, non-zero on error.  On error errp will be set
1809  * as well.
1810  *
1811  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1812  * It is the responsibility of the caller to then call the abort() or
1813  * commit() for any other BDS that have been left in a prepare() state
1814  *
1815  */
1816 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1817                         Error **errp)
1818 {
1819     int ret = -1;
1820     Error *local_err = NULL;
1821     BlockDriver *drv;
1822 
1823     assert(reopen_state != NULL);
1824     assert(reopen_state->bs->drv != NULL);
1825     drv = reopen_state->bs->drv;
1826 
1827     /* if we are to stay read-only, do not allow permission change
1828      * to r/w */
1829     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1830         reopen_state->flags & BDRV_O_RDWR) {
1831         error_setg(errp, "Node '%s' is read only",
1832                    bdrv_get_device_or_node_name(reopen_state->bs));
1833         goto error;
1834     }
1835 
1836 
1837     ret = bdrv_flush(reopen_state->bs);
1838     if (ret) {
1839         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1840                   strerror(-ret));
1841         goto error;
1842     }
1843 
1844     if (drv->bdrv_reopen_prepare) {
1845         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1846         if (ret) {
1847             if (local_err != NULL) {
1848                 error_propagate(errp, local_err);
1849             } else {
1850                 error_setg(errp, "failed while preparing to reopen image '%s'",
1851                            reopen_state->bs->filename);
1852             }
1853             goto error;
1854         }
1855     } else {
1856         /* It is currently mandatory to have a bdrv_reopen_prepare()
1857          * handler for each supported drv. */
1858         error_setg(errp, "Block format '%s' used by node '%s' "
1859                    "does not support reopening files", drv->format_name,
1860                    bdrv_get_device_or_node_name(reopen_state->bs));
1861         ret = -1;
1862         goto error;
1863     }
1864 
1865     ret = 0;
1866 
1867 error:
1868     return ret;
1869 }
1870 
1871 /*
1872  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1873  * makes them final by swapping the staging BlockDriverState contents into
1874  * the active BlockDriverState contents.
1875  */
1876 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1877 {
1878     BlockDriver *drv;
1879 
1880     assert(reopen_state != NULL);
1881     drv = reopen_state->bs->drv;
1882     assert(drv != NULL);
1883 
1884     /* If there are any driver level actions to take */
1885     if (drv->bdrv_reopen_commit) {
1886         drv->bdrv_reopen_commit(reopen_state);
1887     }
1888 
1889     /* set BDS specific flags now */
1890     reopen_state->bs->open_flags         = reopen_state->flags;
1891     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1892                                               BDRV_O_CACHE_WB);
1893     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1894 
1895     bdrv_refresh_limits(reopen_state->bs, NULL);
1896 }
1897 
1898 /*
1899  * Abort the reopen, and delete and free the staged changes in
1900  * reopen_state
1901  */
1902 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1903 {
1904     BlockDriver *drv;
1905 
1906     assert(reopen_state != NULL);
1907     drv = reopen_state->bs->drv;
1908     assert(drv != NULL);
1909 
1910     if (drv->bdrv_reopen_abort) {
1911         drv->bdrv_reopen_abort(reopen_state);
1912     }
1913 }
1914 
1915 
1916 void bdrv_close(BlockDriverState *bs)
1917 {
1918     BdrvAioNotifier *ban, *ban_next;
1919 
1920     if (bs->job) {
1921         block_job_cancel_sync(bs->job);
1922     }
1923     bdrv_drain_all(); /* complete I/O */
1924     bdrv_flush(bs);
1925     bdrv_drain_all(); /* in case flush left pending I/O */
1926     notifier_list_notify(&bs->close_notifiers, bs);
1927 
1928     if (bs->drv) {
1929         if (bs->backing_hd) {
1930             BlockDriverState *backing_hd = bs->backing_hd;
1931             bdrv_set_backing_hd(bs, NULL);
1932             bdrv_unref(backing_hd);
1933         }
1934         bs->drv->bdrv_close(bs);
1935         g_free(bs->opaque);
1936         bs->opaque = NULL;
1937         bs->drv = NULL;
1938         bs->copy_on_read = 0;
1939         bs->backing_file[0] = '\0';
1940         bs->backing_format[0] = '\0';
1941         bs->total_sectors = 0;
1942         bs->encrypted = 0;
1943         bs->valid_key = 0;
1944         bs->sg = 0;
1945         bs->zero_beyond_eof = false;
1946         QDECREF(bs->options);
1947         bs->options = NULL;
1948         QDECREF(bs->full_open_options);
1949         bs->full_open_options = NULL;
1950 
1951         if (bs->file != NULL) {
1952             bdrv_unref(bs->file);
1953             bs->file = NULL;
1954         }
1955     }
1956 
1957     if (bs->blk) {
1958         blk_dev_change_media_cb(bs->blk, false);
1959     }
1960 
1961     /*throttling disk I/O limits*/
1962     if (bs->io_limits_enabled) {
1963         bdrv_io_limits_disable(bs);
1964     }
1965 
1966     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1967         g_free(ban);
1968     }
1969     QLIST_INIT(&bs->aio_notifiers);
1970 }
1971 
1972 void bdrv_close_all(void)
1973 {
1974     BlockDriverState *bs;
1975 
1976     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1977         AioContext *aio_context = bdrv_get_aio_context(bs);
1978 
1979         aio_context_acquire(aio_context);
1980         bdrv_close(bs);
1981         aio_context_release(aio_context);
1982     }
1983 }
1984 
1985 /* Check if any requests are in-flight (including throttled requests) */
1986 static bool bdrv_requests_pending(BlockDriverState *bs)
1987 {
1988     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1989         return true;
1990     }
1991     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1992         return true;
1993     }
1994     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1995         return true;
1996     }
1997     if (bs->file && bdrv_requests_pending(bs->file)) {
1998         return true;
1999     }
2000     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
2001         return true;
2002     }
2003     return false;
2004 }
2005 
2006 static bool bdrv_drain_one(BlockDriverState *bs)
2007 {
2008     bool bs_busy;
2009 
2010     bdrv_flush_io_queue(bs);
2011     bdrv_start_throttled_reqs(bs);
2012     bs_busy = bdrv_requests_pending(bs);
2013     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
2014     return bs_busy;
2015 }
2016 
2017 /*
2018  * Wait for pending requests to complete on a single BlockDriverState subtree
2019  *
2020  * See the warning in bdrv_drain_all().  This function can only be called if
2021  * you are sure nothing can generate I/O because you have op blockers
2022  * installed.
2023  *
2024  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
2025  * AioContext.
2026  */
2027 void bdrv_drain(BlockDriverState *bs)
2028 {
2029     while (bdrv_drain_one(bs)) {
2030         /* Keep iterating */
2031     }
2032 }
2033 
2034 /*
2035  * Wait for pending requests to complete across all BlockDriverStates
2036  *
2037  * This function does not flush data to disk, use bdrv_flush_all() for that
2038  * after calling this function.
2039  *
2040  * Note that completion of an asynchronous I/O operation can trigger any
2041  * number of other I/O operations on other devices---for example a coroutine
2042  * can be arbitrarily complex and a constant flow of I/O can come until the
2043  * coroutine is complete.  Because of this, it is not possible to have a
2044  * function to drain a single device's I/O queue.
2045  */
2046 void bdrv_drain_all(void)
2047 {
2048     /* Always run first iteration so any pending completion BHs run */
2049     bool busy = true;
2050     BlockDriverState *bs = NULL;
2051 
2052     while ((bs = bdrv_next(bs))) {
2053         AioContext *aio_context = bdrv_get_aio_context(bs);
2054 
2055         aio_context_acquire(aio_context);
2056         if (bs->job) {
2057             block_job_pause(bs->job);
2058         }
2059         aio_context_release(aio_context);
2060     }
2061 
2062     while (busy) {
2063         busy = false;
2064         bs = NULL;
2065 
2066         while ((bs = bdrv_next(bs))) {
2067             AioContext *aio_context = bdrv_get_aio_context(bs);
2068 
2069             aio_context_acquire(aio_context);
2070             busy |= bdrv_drain_one(bs);
2071             aio_context_release(aio_context);
2072         }
2073     }
2074 
2075     bs = NULL;
2076     while ((bs = bdrv_next(bs))) {
2077         AioContext *aio_context = bdrv_get_aio_context(bs);
2078 
2079         aio_context_acquire(aio_context);
2080         if (bs->job) {
2081             block_job_resume(bs->job);
2082         }
2083         aio_context_release(aio_context);
2084     }
2085 }
2086 
2087 /* make a BlockDriverState anonymous by removing from bdrv_state and
2088  * graph_bdrv_state list.
2089    Also, NULL terminate the device_name to prevent double remove */
2090 void bdrv_make_anon(BlockDriverState *bs)
2091 {
2092     /*
2093      * Take care to remove bs from bdrv_states only when it's actually
2094      * in it.  Note that bs->device_list.tqe_prev is initially null,
2095      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
2096      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
2097      * resetting it to null on remove.
2098      */
2099     if (bs->device_list.tqe_prev) {
2100         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
2101         bs->device_list.tqe_prev = NULL;
2102     }
2103     if (bs->node_name[0] != '\0') {
2104         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2105     }
2106     bs->node_name[0] = '\0';
2107 }
2108 
2109 static void bdrv_rebind(BlockDriverState *bs)
2110 {
2111     if (bs->drv && bs->drv->bdrv_rebind) {
2112         bs->drv->bdrv_rebind(bs);
2113     }
2114 }
2115 
2116 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2117                                      BlockDriverState *bs_src)
2118 {
2119     /* move some fields that need to stay attached to the device */
2120 
2121     /* dev info */
2122     bs_dest->guest_block_size   = bs_src->guest_block_size;
2123     bs_dest->copy_on_read       = bs_src->copy_on_read;
2124 
2125     bs_dest->enable_write_cache = bs_src->enable_write_cache;
2126 
2127     /* i/o throttled req */
2128     memcpy(&bs_dest->throttle_state,
2129            &bs_src->throttle_state,
2130            sizeof(ThrottleState));
2131     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
2132     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
2133     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
2134 
2135     /* r/w error */
2136     bs_dest->on_read_error      = bs_src->on_read_error;
2137     bs_dest->on_write_error     = bs_src->on_write_error;
2138 
2139     /* i/o status */
2140     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
2141     bs_dest->iostatus           = bs_src->iostatus;
2142 
2143     /* dirty bitmap */
2144     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2145 
2146     /* reference count */
2147     bs_dest->refcnt             = bs_src->refcnt;
2148 
2149     /* job */
2150     bs_dest->job                = bs_src->job;
2151 
2152     /* keep the same entry in bdrv_states */
2153     bs_dest->device_list = bs_src->device_list;
2154     bs_dest->blk = bs_src->blk;
2155 
2156     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2157            sizeof(bs_dest->op_blockers));
2158 }
2159 
2160 /*
2161  * Swap bs contents for two image chains while they are live,
2162  * while keeping required fields on the BlockDriverState that is
2163  * actually attached to a device.
2164  *
2165  * This will modify the BlockDriverState fields, and swap contents
2166  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2167  *
2168  * bs_new must not be attached to a BlockBackend.
2169  *
2170  * This function does not create any image files.
2171  */
2172 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2173 {
2174     BlockDriverState tmp;
2175 
2176     /* The code needs to swap the node_name but simply swapping node_list won't
2177      * work so first remove the nodes from the graph list, do the swap then
2178      * insert them back if needed.
2179      */
2180     if (bs_new->node_name[0] != '\0') {
2181         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2182     }
2183     if (bs_old->node_name[0] != '\0') {
2184         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2185     }
2186 
2187     /* bs_new must be unattached and shouldn't have anything fancy enabled */
2188     assert(!bs_new->blk);
2189     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2190     assert(bs_new->job == NULL);
2191     assert(bs_new->io_limits_enabled == false);
2192     assert(!throttle_have_timer(&bs_new->throttle_state));
2193 
2194     tmp = *bs_new;
2195     *bs_new = *bs_old;
2196     *bs_old = tmp;
2197 
2198     /* there are some fields that should not be swapped, move them back */
2199     bdrv_move_feature_fields(&tmp, bs_old);
2200     bdrv_move_feature_fields(bs_old, bs_new);
2201     bdrv_move_feature_fields(bs_new, &tmp);
2202 
2203     /* bs_new must remain unattached */
2204     assert(!bs_new->blk);
2205 
2206     /* Check a few fields that should remain attached to the device */
2207     assert(bs_new->job == NULL);
2208     assert(bs_new->io_limits_enabled == false);
2209     assert(!throttle_have_timer(&bs_new->throttle_state));
2210 
2211     /* insert the nodes back into the graph node list if needed */
2212     if (bs_new->node_name[0] != '\0') {
2213         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2214     }
2215     if (bs_old->node_name[0] != '\0') {
2216         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2217     }
2218 
2219     bdrv_rebind(bs_new);
2220     bdrv_rebind(bs_old);
2221 }
2222 
2223 /*
2224  * Add new bs contents at the top of an image chain while the chain is
2225  * live, while keeping required fields on the top layer.
2226  *
2227  * This will modify the BlockDriverState fields, and swap contents
2228  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2229  *
2230  * bs_new must not be attached to a BlockBackend.
2231  *
2232  * This function does not create any image files.
2233  */
2234 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2235 {
2236     bdrv_swap(bs_new, bs_top);
2237 
2238     /* The contents of 'tmp' will become bs_top, as we are
2239      * swapping bs_new and bs_top contents. */
2240     bdrv_set_backing_hd(bs_top, bs_new);
2241 }
2242 
2243 static void bdrv_delete(BlockDriverState *bs)
2244 {
2245     assert(!bs->job);
2246     assert(bdrv_op_blocker_is_empty(bs));
2247     assert(!bs->refcnt);
2248     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2249 
2250     bdrv_close(bs);
2251 
2252     /* remove from list, if necessary */
2253     bdrv_make_anon(bs);
2254 
2255     g_free(bs);
2256 }
2257 
2258 /*
2259  * Run consistency checks on an image
2260  *
2261  * Returns 0 if the check could be completed (it doesn't mean that the image is
2262  * free of errors) or -errno when an internal error occurred. The results of the
2263  * check are stored in res.
2264  */
2265 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2266 {
2267     if (bs->drv == NULL) {
2268         return -ENOMEDIUM;
2269     }
2270     if (bs->drv->bdrv_check == NULL) {
2271         return -ENOTSUP;
2272     }
2273 
2274     memset(res, 0, sizeof(*res));
2275     return bs->drv->bdrv_check(bs, res, fix);
2276 }
2277 
2278 #define COMMIT_BUF_SECTORS 2048
2279 
2280 /* commit COW file into the raw image */
2281 int bdrv_commit(BlockDriverState *bs)
2282 {
2283     BlockDriver *drv = bs->drv;
2284     int64_t sector, total_sectors, length, backing_length;
2285     int n, ro, open_flags;
2286     int ret = 0;
2287     uint8_t *buf = NULL;
2288 
2289     if (!drv)
2290         return -ENOMEDIUM;
2291 
2292     if (!bs->backing_hd) {
2293         return -ENOTSUP;
2294     }
2295 
2296     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
2297         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
2298         return -EBUSY;
2299     }
2300 
2301     ro = bs->backing_hd->read_only;
2302     open_flags =  bs->backing_hd->open_flags;
2303 
2304     if (ro) {
2305         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2306             return -EACCES;
2307         }
2308     }
2309 
2310     length = bdrv_getlength(bs);
2311     if (length < 0) {
2312         ret = length;
2313         goto ro_cleanup;
2314     }
2315 
2316     backing_length = bdrv_getlength(bs->backing_hd);
2317     if (backing_length < 0) {
2318         ret = backing_length;
2319         goto ro_cleanup;
2320     }
2321 
2322     /* If our top snapshot is larger than the backing file image,
2323      * grow the backing file image if possible.  If not possible,
2324      * we must return an error */
2325     if (length > backing_length) {
2326         ret = bdrv_truncate(bs->backing_hd, length);
2327         if (ret < 0) {
2328             goto ro_cleanup;
2329         }
2330     }
2331 
2332     total_sectors = length >> BDRV_SECTOR_BITS;
2333 
2334     /* qemu_try_blockalign() for bs will choose an alignment that works for
2335      * bs->backing_hd as well, so no need to compare the alignment manually. */
2336     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2337     if (buf == NULL) {
2338         ret = -ENOMEM;
2339         goto ro_cleanup;
2340     }
2341 
2342     for (sector = 0; sector < total_sectors; sector += n) {
2343         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2344         if (ret < 0) {
2345             goto ro_cleanup;
2346         }
2347         if (ret) {
2348             ret = bdrv_read(bs, sector, buf, n);
2349             if (ret < 0) {
2350                 goto ro_cleanup;
2351             }
2352 
2353             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2354             if (ret < 0) {
2355                 goto ro_cleanup;
2356             }
2357         }
2358     }
2359 
2360     if (drv->bdrv_make_empty) {
2361         ret = drv->bdrv_make_empty(bs);
2362         if (ret < 0) {
2363             goto ro_cleanup;
2364         }
2365         bdrv_flush(bs);
2366     }
2367 
2368     /*
2369      * Make sure all data we wrote to the backing device is actually
2370      * stable on disk.
2371      */
2372     if (bs->backing_hd) {
2373         bdrv_flush(bs->backing_hd);
2374     }
2375 
2376     ret = 0;
2377 ro_cleanup:
2378     qemu_vfree(buf);
2379 
2380     if (ro) {
2381         /* ignoring error return here */
2382         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2383     }
2384 
2385     return ret;
2386 }
2387 
2388 int bdrv_commit_all(void)
2389 {
2390     BlockDriverState *bs;
2391 
2392     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2393         AioContext *aio_context = bdrv_get_aio_context(bs);
2394 
2395         aio_context_acquire(aio_context);
2396         if (bs->drv && bs->backing_hd) {
2397             int ret = bdrv_commit(bs);
2398             if (ret < 0) {
2399                 aio_context_release(aio_context);
2400                 return ret;
2401             }
2402         }
2403         aio_context_release(aio_context);
2404     }
2405     return 0;
2406 }
2407 
2408 /**
2409  * Remove an active request from the tracked requests list
2410  *
2411  * This function should be called when a tracked request is completing.
2412  */
2413 static void tracked_request_end(BdrvTrackedRequest *req)
2414 {
2415     if (req->serialising) {
2416         req->bs->serialising_in_flight--;
2417     }
2418 
2419     QLIST_REMOVE(req, list);
2420     qemu_co_queue_restart_all(&req->wait_queue);
2421 }
2422 
2423 /**
2424  * Add an active request to the tracked requests list
2425  */
2426 static void tracked_request_begin(BdrvTrackedRequest *req,
2427                                   BlockDriverState *bs,
2428                                   int64_t offset,
2429                                   unsigned int bytes, bool is_write)
2430 {
2431     *req = (BdrvTrackedRequest){
2432         .bs = bs,
2433         .offset         = offset,
2434         .bytes          = bytes,
2435         .is_write       = is_write,
2436         .co             = qemu_coroutine_self(),
2437         .serialising    = false,
2438         .overlap_offset = offset,
2439         .overlap_bytes  = bytes,
2440     };
2441 
2442     qemu_co_queue_init(&req->wait_queue);
2443 
2444     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2445 }
2446 
2447 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2448 {
2449     int64_t overlap_offset = req->offset & ~(align - 1);
2450     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2451                                - overlap_offset;
2452 
2453     if (!req->serialising) {
2454         req->bs->serialising_in_flight++;
2455         req->serialising = true;
2456     }
2457 
2458     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2459     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2460 }
2461 
2462 /**
2463  * Round a region to cluster boundaries
2464  */
2465 void bdrv_round_to_clusters(BlockDriverState *bs,
2466                             int64_t sector_num, int nb_sectors,
2467                             int64_t *cluster_sector_num,
2468                             int *cluster_nb_sectors)
2469 {
2470     BlockDriverInfo bdi;
2471 
2472     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2473         *cluster_sector_num = sector_num;
2474         *cluster_nb_sectors = nb_sectors;
2475     } else {
2476         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2477         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2478         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2479                                             nb_sectors, c);
2480     }
2481 }
2482 
2483 static int bdrv_get_cluster_size(BlockDriverState *bs)
2484 {
2485     BlockDriverInfo bdi;
2486     int ret;
2487 
2488     ret = bdrv_get_info(bs, &bdi);
2489     if (ret < 0 || bdi.cluster_size == 0) {
2490         return bs->request_alignment;
2491     } else {
2492         return bdi.cluster_size;
2493     }
2494 }
2495 
2496 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2497                                      int64_t offset, unsigned int bytes)
2498 {
2499     /*        aaaa   bbbb */
2500     if (offset >= req->overlap_offset + req->overlap_bytes) {
2501         return false;
2502     }
2503     /* bbbb   aaaa        */
2504     if (req->overlap_offset >= offset + bytes) {
2505         return false;
2506     }
2507     return true;
2508 }
2509 
2510 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2511 {
2512     BlockDriverState *bs = self->bs;
2513     BdrvTrackedRequest *req;
2514     bool retry;
2515     bool waited = false;
2516 
2517     if (!bs->serialising_in_flight) {
2518         return false;
2519     }
2520 
2521     do {
2522         retry = false;
2523         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2524             if (req == self || (!req->serialising && !self->serialising)) {
2525                 continue;
2526             }
2527             if (tracked_request_overlaps(req, self->overlap_offset,
2528                                          self->overlap_bytes))
2529             {
2530                 /* Hitting this means there was a reentrant request, for
2531                  * example, a block driver issuing nested requests.  This must
2532                  * never happen since it means deadlock.
2533                  */
2534                 assert(qemu_coroutine_self() != req->co);
2535 
2536                 /* If the request is already (indirectly) waiting for us, or
2537                  * will wait for us as soon as it wakes up, then just go on
2538                  * (instead of producing a deadlock in the former case). */
2539                 if (!req->waiting_for) {
2540                     self->waiting_for = req;
2541                     qemu_co_queue_wait(&req->wait_queue);
2542                     self->waiting_for = NULL;
2543                     retry = true;
2544                     waited = true;
2545                     break;
2546                 }
2547             }
2548         }
2549     } while (retry);
2550 
2551     return waited;
2552 }
2553 
2554 /*
2555  * Return values:
2556  * 0        - success
2557  * -EINVAL  - backing format specified, but no file
2558  * -ENOSPC  - can't update the backing file because no space is left in the
2559  *            image file header
2560  * -ENOTSUP - format driver doesn't support changing the backing file
2561  */
2562 int bdrv_change_backing_file(BlockDriverState *bs,
2563     const char *backing_file, const char *backing_fmt)
2564 {
2565     BlockDriver *drv = bs->drv;
2566     int ret;
2567 
2568     /* Backing file format doesn't make sense without a backing file */
2569     if (backing_fmt && !backing_file) {
2570         return -EINVAL;
2571     }
2572 
2573     if (drv->bdrv_change_backing_file != NULL) {
2574         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2575     } else {
2576         ret = -ENOTSUP;
2577     }
2578 
2579     if (ret == 0) {
2580         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2581         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2582     }
2583     return ret;
2584 }
2585 
2586 /*
2587  * Finds the image layer in the chain that has 'bs' as its backing file.
2588  *
2589  * active is the current topmost image.
2590  *
2591  * Returns NULL if bs is not found in active's image chain,
2592  * or if active == bs.
2593  *
2594  * Returns the bottommost base image if bs == NULL.
2595  */
2596 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2597                                     BlockDriverState *bs)
2598 {
2599     while (active && bs != active->backing_hd) {
2600         active = active->backing_hd;
2601     }
2602 
2603     return active;
2604 }
2605 
2606 /* Given a BDS, searches for the base layer. */
2607 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2608 {
2609     return bdrv_find_overlay(bs, NULL);
2610 }
2611 
2612 typedef struct BlkIntermediateStates {
2613     BlockDriverState *bs;
2614     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2615 } BlkIntermediateStates;
2616 
2617 
2618 /*
2619  * Drops images above 'base' up to and including 'top', and sets the image
2620  * above 'top' to have base as its backing file.
2621  *
2622  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2623  * information in 'bs' can be properly updated.
2624  *
2625  * E.g., this will convert the following chain:
2626  * bottom <- base <- intermediate <- top <- active
2627  *
2628  * to
2629  *
2630  * bottom <- base <- active
2631  *
2632  * It is allowed for bottom==base, in which case it converts:
2633  *
2634  * base <- intermediate <- top <- active
2635  *
2636  * to
2637  *
2638  * base <- active
2639  *
2640  * If backing_file_str is non-NULL, it will be used when modifying top's
2641  * overlay image metadata.
2642  *
2643  * Error conditions:
2644  *  if active == top, that is considered an error
2645  *
2646  */
2647 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2648                            BlockDriverState *base, const char *backing_file_str)
2649 {
2650     BlockDriverState *intermediate;
2651     BlockDriverState *base_bs = NULL;
2652     BlockDriverState *new_top_bs = NULL;
2653     BlkIntermediateStates *intermediate_state, *next;
2654     int ret = -EIO;
2655 
2656     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2657     QSIMPLEQ_INIT(&states_to_delete);
2658 
2659     if (!top->drv || !base->drv) {
2660         goto exit;
2661     }
2662 
2663     new_top_bs = bdrv_find_overlay(active, top);
2664 
2665     if (new_top_bs == NULL) {
2666         /* we could not find the image above 'top', this is an error */
2667         goto exit;
2668     }
2669 
2670     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2671      * to do, no intermediate images */
2672     if (new_top_bs->backing_hd == base) {
2673         ret = 0;
2674         goto exit;
2675     }
2676 
2677     intermediate = top;
2678 
2679     /* now we will go down through the list, and add each BDS we find
2680      * into our deletion queue, until we hit the 'base'
2681      */
2682     while (intermediate) {
2683         intermediate_state = g_new0(BlkIntermediateStates, 1);
2684         intermediate_state->bs = intermediate;
2685         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2686 
2687         if (intermediate->backing_hd == base) {
2688             base_bs = intermediate->backing_hd;
2689             break;
2690         }
2691         intermediate = intermediate->backing_hd;
2692     }
2693     if (base_bs == NULL) {
2694         /* something went wrong, we did not end at the base. safely
2695          * unravel everything, and exit with error */
2696         goto exit;
2697     }
2698 
2699     /* success - we can delete the intermediate states, and link top->base */
2700     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2701     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2702                                    base_bs->drv ? base_bs->drv->format_name : "");
2703     if (ret) {
2704         goto exit;
2705     }
2706     bdrv_set_backing_hd(new_top_bs, base_bs);
2707 
2708     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2709         /* so that bdrv_close() does not recursively close the chain */
2710         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2711         bdrv_unref(intermediate_state->bs);
2712     }
2713     ret = 0;
2714 
2715 exit:
2716     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2717         g_free(intermediate_state);
2718     }
2719     return ret;
2720 }
2721 
2722 
2723 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2724                                    size_t size)
2725 {
2726     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
2727         return -EIO;
2728     }
2729 
2730     if (!bdrv_is_inserted(bs)) {
2731         return -ENOMEDIUM;
2732     }
2733 
2734     if (offset < 0) {
2735         return -EIO;
2736     }
2737 
2738     return 0;
2739 }
2740 
2741 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2742                               int nb_sectors)
2743 {
2744     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2745         return -EIO;
2746     }
2747 
2748     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2749                                    nb_sectors * BDRV_SECTOR_SIZE);
2750 }
2751 
2752 typedef struct RwCo {
2753     BlockDriverState *bs;
2754     int64_t offset;
2755     QEMUIOVector *qiov;
2756     bool is_write;
2757     int ret;
2758     BdrvRequestFlags flags;
2759 } RwCo;
2760 
2761 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2762 {
2763     RwCo *rwco = opaque;
2764 
2765     if (!rwco->is_write) {
2766         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2767                                       rwco->qiov->size, rwco->qiov,
2768                                       rwco->flags);
2769     } else {
2770         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2771                                        rwco->qiov->size, rwco->qiov,
2772                                        rwco->flags);
2773     }
2774 }
2775 
2776 /*
2777  * Process a vectored synchronous request using coroutines
2778  */
2779 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2780                         QEMUIOVector *qiov, bool is_write,
2781                         BdrvRequestFlags flags)
2782 {
2783     Coroutine *co;
2784     RwCo rwco = {
2785         .bs = bs,
2786         .offset = offset,
2787         .qiov = qiov,
2788         .is_write = is_write,
2789         .ret = NOT_DONE,
2790         .flags = flags,
2791     };
2792 
2793     /**
2794      * In sync call context, when the vcpu is blocked, this throttling timer
2795      * will not fire; so the I/O throttling function has to be disabled here
2796      * if it has been enabled.
2797      */
2798     if (bs->io_limits_enabled) {
2799         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2800                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2801         bdrv_io_limits_disable(bs);
2802     }
2803 
2804     if (qemu_in_coroutine()) {
2805         /* Fast-path if already in coroutine context */
2806         bdrv_rw_co_entry(&rwco);
2807     } else {
2808         AioContext *aio_context = bdrv_get_aio_context(bs);
2809 
2810         co = qemu_coroutine_create(bdrv_rw_co_entry);
2811         qemu_coroutine_enter(co, &rwco);
2812         while (rwco.ret == NOT_DONE) {
2813             aio_poll(aio_context, true);
2814         }
2815     }
2816     return rwco.ret;
2817 }
2818 
2819 /*
2820  * Process a synchronous request using coroutines
2821  */
2822 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2823                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2824 {
2825     QEMUIOVector qiov;
2826     struct iovec iov = {
2827         .iov_base = (void *)buf,
2828         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2829     };
2830 
2831     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2832         return -EINVAL;
2833     }
2834 
2835     qemu_iovec_init_external(&qiov, &iov, 1);
2836     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2837                         &qiov, is_write, flags);
2838 }
2839 
2840 /* return < 0 if error. See bdrv_write() for the return codes */
2841 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2842               uint8_t *buf, int nb_sectors)
2843 {
2844     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2845 }
2846 
2847 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2848 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2849                           uint8_t *buf, int nb_sectors)
2850 {
2851     bool enabled;
2852     int ret;
2853 
2854     enabled = bs->io_limits_enabled;
2855     bs->io_limits_enabled = false;
2856     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2857     bs->io_limits_enabled = enabled;
2858     return ret;
2859 }
2860 
2861 /* Return < 0 if error. Important errors are:
2862   -EIO         generic I/O error (may happen for all errors)
2863   -ENOMEDIUM   No media inserted.
2864   -EINVAL      Invalid sector number or nb_sectors
2865   -EACCES      Trying to write a read-only device
2866 */
2867 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2868                const uint8_t *buf, int nb_sectors)
2869 {
2870     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2871 }
2872 
2873 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2874                       int nb_sectors, BdrvRequestFlags flags)
2875 {
2876     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2877                       BDRV_REQ_ZERO_WRITE | flags);
2878 }
2879 
2880 /*
2881  * Completely zero out a block device with the help of bdrv_write_zeroes.
2882  * The operation is sped up by checking the block status and only writing
2883  * zeroes to the device if they currently do not return zeroes. Optional
2884  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2885  *
2886  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2887  */
2888 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2889 {
2890     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2891     int n;
2892 
2893     target_sectors = bdrv_nb_sectors(bs);
2894     if (target_sectors < 0) {
2895         return target_sectors;
2896     }
2897 
2898     for (;;) {
2899         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
2900         if (nb_sectors <= 0) {
2901             return 0;
2902         }
2903         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2904         if (ret < 0) {
2905             error_report("error getting block status at sector %" PRId64 ": %s",
2906                          sector_num, strerror(-ret));
2907             return ret;
2908         }
2909         if (ret & BDRV_BLOCK_ZERO) {
2910             sector_num += n;
2911             continue;
2912         }
2913         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2914         if (ret < 0) {
2915             error_report("error writing zeroes at sector %" PRId64 ": %s",
2916                          sector_num, strerror(-ret));
2917             return ret;
2918         }
2919         sector_num += n;
2920     }
2921 }
2922 
2923 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2924 {
2925     QEMUIOVector qiov;
2926     struct iovec iov = {
2927         .iov_base = (void *)buf,
2928         .iov_len = bytes,
2929     };
2930     int ret;
2931 
2932     if (bytes < 0) {
2933         return -EINVAL;
2934     }
2935 
2936     qemu_iovec_init_external(&qiov, &iov, 1);
2937     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2938     if (ret < 0) {
2939         return ret;
2940     }
2941 
2942     return bytes;
2943 }
2944 
2945 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2946 {
2947     int ret;
2948 
2949     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2950     if (ret < 0) {
2951         return ret;
2952     }
2953 
2954     return qiov->size;
2955 }
2956 
2957 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2958                 const void *buf, int bytes)
2959 {
2960     QEMUIOVector qiov;
2961     struct iovec iov = {
2962         .iov_base   = (void *) buf,
2963         .iov_len    = bytes,
2964     };
2965 
2966     if (bytes < 0) {
2967         return -EINVAL;
2968     }
2969 
2970     qemu_iovec_init_external(&qiov, &iov, 1);
2971     return bdrv_pwritev(bs, offset, &qiov);
2972 }
2973 
2974 /*
2975  * Writes to the file and ensures that no writes are reordered across this
2976  * request (acts as a barrier)
2977  *
2978  * Returns 0 on success, -errno in error cases.
2979  */
2980 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2981     const void *buf, int count)
2982 {
2983     int ret;
2984 
2985     ret = bdrv_pwrite(bs, offset, buf, count);
2986     if (ret < 0) {
2987         return ret;
2988     }
2989 
2990     /* No flush needed for cache modes that already do it */
2991     if (bs->enable_write_cache) {
2992         bdrv_flush(bs);
2993     }
2994 
2995     return 0;
2996 }
2997 
2998 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2999         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3000 {
3001     /* Perform I/O through a temporary buffer so that users who scribble over
3002      * their read buffer while the operation is in progress do not end up
3003      * modifying the image file.  This is critical for zero-copy guest I/O
3004      * where anything might happen inside guest memory.
3005      */
3006     void *bounce_buffer;
3007 
3008     BlockDriver *drv = bs->drv;
3009     struct iovec iov;
3010     QEMUIOVector bounce_qiov;
3011     int64_t cluster_sector_num;
3012     int cluster_nb_sectors;
3013     size_t skip_bytes;
3014     int ret;
3015 
3016     /* Cover entire cluster so no additional backing file I/O is required when
3017      * allocating cluster in the image file.
3018      */
3019     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
3020                            &cluster_sector_num, &cluster_nb_sectors);
3021 
3022     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
3023                                    cluster_sector_num, cluster_nb_sectors);
3024 
3025     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
3026     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
3027     if (bounce_buffer == NULL) {
3028         ret = -ENOMEM;
3029         goto err;
3030     }
3031 
3032     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3033 
3034     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3035                              &bounce_qiov);
3036     if (ret < 0) {
3037         goto err;
3038     }
3039 
3040     if (drv->bdrv_co_write_zeroes &&
3041         buffer_is_zero(bounce_buffer, iov.iov_len)) {
3042         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3043                                       cluster_nb_sectors, 0);
3044     } else {
3045         /* This does not change the data on the disk, it is not necessary
3046          * to flush even in cache=writethrough mode.
3047          */
3048         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3049                                   &bounce_qiov);
3050     }
3051 
3052     if (ret < 0) {
3053         /* It might be okay to ignore write errors for guest requests.  If this
3054          * is a deliberate copy-on-read then we don't want to ignore the error.
3055          * Simply report it in all cases.
3056          */
3057         goto err;
3058     }
3059 
3060     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3061     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3062                         nb_sectors * BDRV_SECTOR_SIZE);
3063 
3064 err:
3065     qemu_vfree(bounce_buffer);
3066     return ret;
3067 }
3068 
3069 /*
3070  * Forwards an already correctly aligned request to the BlockDriver. This
3071  * handles copy on read and zeroing after EOF; any other features must be
3072  * implemented by the caller.
3073  */
3074 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3075     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3076     int64_t align, QEMUIOVector *qiov, int flags)
3077 {
3078     BlockDriver *drv = bs->drv;
3079     int ret;
3080 
3081     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3082     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3083 
3084     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3085     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3086     assert(!qiov || bytes == qiov->size);
3087 
3088     /* Handle Copy on Read and associated serialisation */
3089     if (flags & BDRV_REQ_COPY_ON_READ) {
3090         /* If we touch the same cluster it counts as an overlap.  This
3091          * guarantees that allocating writes will be serialized and not race
3092          * with each other for the same cluster.  For example, in copy-on-read
3093          * it ensures that the CoR read and write operations are atomic and
3094          * guest writes cannot interleave between them. */
3095         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3096     }
3097 
3098     wait_serialising_requests(req);
3099 
3100     if (flags & BDRV_REQ_COPY_ON_READ) {
3101         int pnum;
3102 
3103         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3104         if (ret < 0) {
3105             goto out;
3106         }
3107 
3108         if (!ret || pnum != nb_sectors) {
3109             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3110             goto out;
3111         }
3112     }
3113 
3114     /* Forward the request to the BlockDriver */
3115     if (!bs->zero_beyond_eof) {
3116         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3117     } else {
3118         /* Read zeros after EOF */
3119         int64_t total_sectors, max_nb_sectors;
3120 
3121         total_sectors = bdrv_nb_sectors(bs);
3122         if (total_sectors < 0) {
3123             ret = total_sectors;
3124             goto out;
3125         }
3126 
3127         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3128                                   align >> BDRV_SECTOR_BITS);
3129         if (nb_sectors < max_nb_sectors) {
3130             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3131         } else if (max_nb_sectors > 0) {
3132             QEMUIOVector local_qiov;
3133 
3134             qemu_iovec_init(&local_qiov, qiov->niov);
3135             qemu_iovec_concat(&local_qiov, qiov, 0,
3136                               max_nb_sectors * BDRV_SECTOR_SIZE);
3137 
3138             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3139                                      &local_qiov);
3140 
3141             qemu_iovec_destroy(&local_qiov);
3142         } else {
3143             ret = 0;
3144         }
3145 
3146         /* Reading beyond end of file is supposed to produce zeroes */
3147         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3148             uint64_t offset = MAX(0, total_sectors - sector_num);
3149             uint64_t bytes = (sector_num + nb_sectors - offset) *
3150                               BDRV_SECTOR_SIZE;
3151             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3152         }
3153     }
3154 
3155 out:
3156     return ret;
3157 }
3158 
3159 static inline uint64_t bdrv_get_align(BlockDriverState *bs)
3160 {
3161     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3162     return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3163 }
3164 
3165 static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
3166                                        int64_t offset, size_t bytes)
3167 {
3168     int64_t align = bdrv_get_align(bs);
3169     return !(offset & (align - 1) || (bytes & (align - 1)));
3170 }
3171 
3172 /*
3173  * Handle a read request in coroutine context
3174  */
3175 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3176     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3177     BdrvRequestFlags flags)
3178 {
3179     BlockDriver *drv = bs->drv;
3180     BdrvTrackedRequest req;
3181 
3182     uint64_t align = bdrv_get_align(bs);
3183     uint8_t *head_buf = NULL;
3184     uint8_t *tail_buf = NULL;
3185     QEMUIOVector local_qiov;
3186     bool use_local_qiov = false;
3187     int ret;
3188 
3189     if (!drv) {
3190         return -ENOMEDIUM;
3191     }
3192 
3193     ret = bdrv_check_byte_request(bs, offset, bytes);
3194     if (ret < 0) {
3195         return ret;
3196     }
3197 
3198     if (bs->copy_on_read) {
3199         flags |= BDRV_REQ_COPY_ON_READ;
3200     }
3201 
3202     /* throttling disk I/O */
3203     if (bs->io_limits_enabled) {
3204         bdrv_io_limits_intercept(bs, bytes, false);
3205     }
3206 
3207     /* Align read if necessary by padding qiov */
3208     if (offset & (align - 1)) {
3209         head_buf = qemu_blockalign(bs, align);
3210         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3211         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3212         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3213         use_local_qiov = true;
3214 
3215         bytes += offset & (align - 1);
3216         offset = offset & ~(align - 1);
3217     }
3218 
3219     if ((offset + bytes) & (align - 1)) {
3220         if (!use_local_qiov) {
3221             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3222             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3223             use_local_qiov = true;
3224         }
3225         tail_buf = qemu_blockalign(bs, align);
3226         qemu_iovec_add(&local_qiov, tail_buf,
3227                        align - ((offset + bytes) & (align - 1)));
3228 
3229         bytes = ROUND_UP(bytes, align);
3230     }
3231 
3232     tracked_request_begin(&req, bs, offset, bytes, false);
3233     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3234                               use_local_qiov ? &local_qiov : qiov,
3235                               flags);
3236     tracked_request_end(&req);
3237 
3238     if (use_local_qiov) {
3239         qemu_iovec_destroy(&local_qiov);
3240         qemu_vfree(head_buf);
3241         qemu_vfree(tail_buf);
3242     }
3243 
3244     return ret;
3245 }
3246 
3247 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3248     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3249     BdrvRequestFlags flags)
3250 {
3251     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3252         return -EINVAL;
3253     }
3254 
3255     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3256                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3257 }
3258 
3259 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3260     int nb_sectors, QEMUIOVector *qiov)
3261 {
3262     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3263 
3264     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3265 }
3266 
3267 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3268     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3269 {
3270     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3271 
3272     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3273                             BDRV_REQ_COPY_ON_READ);
3274 }
3275 
3276 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
3277 
3278 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3279     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3280 {
3281     BlockDriver *drv = bs->drv;
3282     QEMUIOVector qiov;
3283     struct iovec iov = {0};
3284     int ret = 0;
3285 
3286     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
3287                                         BDRV_REQUEST_MAX_SECTORS);
3288 
3289     while (nb_sectors > 0 && !ret) {
3290         int num = nb_sectors;
3291 
3292         /* Align request.  Block drivers can expect the "bulk" of the request
3293          * to be aligned.
3294          */
3295         if (bs->bl.write_zeroes_alignment
3296             && num > bs->bl.write_zeroes_alignment) {
3297             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3298                 /* Make a small request up to the first aligned sector.  */
3299                 num = bs->bl.write_zeroes_alignment;
3300                 num -= sector_num % bs->bl.write_zeroes_alignment;
3301             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3302                 /* Shorten the request to the last aligned sector.  num cannot
3303                  * underflow because num > bs->bl.write_zeroes_alignment.
3304                  */
3305                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3306             }
3307         }
3308 
3309         /* limit request size */
3310         if (num > max_write_zeroes) {
3311             num = max_write_zeroes;
3312         }
3313 
3314         ret = -ENOTSUP;
3315         /* First try the efficient write zeroes operation */
3316         if (drv->bdrv_co_write_zeroes) {
3317             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3318         }
3319 
3320         if (ret == -ENOTSUP) {
3321             /* Fall back to bounce buffer if write zeroes is unsupported */
3322             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
3323                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
3324             num = MIN(num, max_xfer_len);
3325             iov.iov_len = num * BDRV_SECTOR_SIZE;
3326             if (iov.iov_base == NULL) {
3327                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3328                 if (iov.iov_base == NULL) {
3329                     ret = -ENOMEM;
3330                     goto fail;
3331                 }
3332                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3333             }
3334             qemu_iovec_init_external(&qiov, &iov, 1);
3335 
3336             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3337 
3338             /* Keep bounce buffer around if it is big enough for all
3339              * all future requests.
3340              */
3341             if (num < max_xfer_len) {
3342                 qemu_vfree(iov.iov_base);
3343                 iov.iov_base = NULL;
3344             }
3345         }
3346 
3347         sector_num += num;
3348         nb_sectors -= num;
3349     }
3350 
3351 fail:
3352     qemu_vfree(iov.iov_base);
3353     return ret;
3354 }
3355 
3356 /*
3357  * Forwards an already correctly aligned write request to the BlockDriver.
3358  */
3359 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3360     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3361     QEMUIOVector *qiov, int flags)
3362 {
3363     BlockDriver *drv = bs->drv;
3364     bool waited;
3365     int ret;
3366 
3367     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3368     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3369 
3370     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3371     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3372     assert(!qiov || bytes == qiov->size);
3373 
3374     waited = wait_serialising_requests(req);
3375     assert(!waited || !req->serialising);
3376     assert(req->overlap_offset <= offset);
3377     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3378 
3379     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3380 
3381     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3382         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3383         qemu_iovec_is_zero(qiov)) {
3384         flags |= BDRV_REQ_ZERO_WRITE;
3385         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3386             flags |= BDRV_REQ_MAY_UNMAP;
3387         }
3388     }
3389 
3390     if (ret < 0) {
3391         /* Do nothing, write notifier decided to fail this request */
3392     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3393         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3394         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3395     } else {
3396         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3397         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3398     }
3399     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3400 
3401     if (ret == 0 && !bs->enable_write_cache) {
3402         ret = bdrv_co_flush(bs);
3403     }
3404 
3405     bdrv_set_dirty(bs, sector_num, nb_sectors);
3406 
3407     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3408 
3409     if (ret >= 0) {
3410         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3411     }
3412 
3413     return ret;
3414 }
3415 
3416 /*
3417  * Handle a write request in coroutine context
3418  */
3419 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3420     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3421     BdrvRequestFlags flags)
3422 {
3423     BdrvTrackedRequest req;
3424     uint64_t align = bdrv_get_align(bs);
3425     uint8_t *head_buf = NULL;
3426     uint8_t *tail_buf = NULL;
3427     QEMUIOVector local_qiov;
3428     bool use_local_qiov = false;
3429     int ret;
3430 
3431     if (!bs->drv) {
3432         return -ENOMEDIUM;
3433     }
3434     if (bs->read_only) {
3435         return -EACCES;
3436     }
3437 
3438     ret = bdrv_check_byte_request(bs, offset, bytes);
3439     if (ret < 0) {
3440         return ret;
3441     }
3442 
3443     /* throttling disk I/O */
3444     if (bs->io_limits_enabled) {
3445         bdrv_io_limits_intercept(bs, bytes, true);
3446     }
3447 
3448     /*
3449      * Align write if necessary by performing a read-modify-write cycle.
3450      * Pad qiov with the read parts and be sure to have a tracked request not
3451      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3452      */
3453     tracked_request_begin(&req, bs, offset, bytes, true);
3454 
3455     if (offset & (align - 1)) {
3456         QEMUIOVector head_qiov;
3457         struct iovec head_iov;
3458 
3459         mark_request_serialising(&req, align);
3460         wait_serialising_requests(&req);
3461 
3462         head_buf = qemu_blockalign(bs, align);
3463         head_iov = (struct iovec) {
3464             .iov_base   = head_buf,
3465             .iov_len    = align,
3466         };
3467         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3468 
3469         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3470         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3471                                   align, &head_qiov, 0);
3472         if (ret < 0) {
3473             goto fail;
3474         }
3475         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3476 
3477         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3478         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3479         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3480         use_local_qiov = true;
3481 
3482         bytes += offset & (align - 1);
3483         offset = offset & ~(align - 1);
3484     }
3485 
3486     if ((offset + bytes) & (align - 1)) {
3487         QEMUIOVector tail_qiov;
3488         struct iovec tail_iov;
3489         size_t tail_bytes;
3490         bool waited;
3491 
3492         mark_request_serialising(&req, align);
3493         waited = wait_serialising_requests(&req);
3494         assert(!waited || !use_local_qiov);
3495 
3496         tail_buf = qemu_blockalign(bs, align);
3497         tail_iov = (struct iovec) {
3498             .iov_base   = tail_buf,
3499             .iov_len    = align,
3500         };
3501         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3502 
3503         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3504         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3505                                   align, &tail_qiov, 0);
3506         if (ret < 0) {
3507             goto fail;
3508         }
3509         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3510 
3511         if (!use_local_qiov) {
3512             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3513             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3514             use_local_qiov = true;
3515         }
3516 
3517         tail_bytes = (offset + bytes) & (align - 1);
3518         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3519 
3520         bytes = ROUND_UP(bytes, align);
3521     }
3522 
3523     if (use_local_qiov) {
3524         /* Local buffer may have non-zero data. */
3525         flags &= ~BDRV_REQ_ZERO_WRITE;
3526     }
3527     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3528                                use_local_qiov ? &local_qiov : qiov,
3529                                flags);
3530 
3531 fail:
3532     tracked_request_end(&req);
3533 
3534     if (use_local_qiov) {
3535         qemu_iovec_destroy(&local_qiov);
3536     }
3537     qemu_vfree(head_buf);
3538     qemu_vfree(tail_buf);
3539 
3540     return ret;
3541 }
3542 
3543 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3544     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3545     BdrvRequestFlags flags)
3546 {
3547     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3548         return -EINVAL;
3549     }
3550 
3551     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3552                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3553 }
3554 
3555 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3556     int nb_sectors, QEMUIOVector *qiov)
3557 {
3558     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3559 
3560     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3561 }
3562 
3563 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3564                                       int64_t sector_num, int nb_sectors,
3565                                       BdrvRequestFlags flags)
3566 {
3567     int ret;
3568 
3569     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3570 
3571     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3572         flags &= ~BDRV_REQ_MAY_UNMAP;
3573     }
3574     if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
3575                             nb_sectors << BDRV_SECTOR_BITS)) {
3576         ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3577                                 BDRV_REQ_ZERO_WRITE | flags);
3578     } else {
3579         uint8_t *buf;
3580         QEMUIOVector local_qiov;
3581         size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
3582 
3583         buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
3584         memset(buf, 0, bytes);
3585         qemu_iovec_init(&local_qiov, 1);
3586         qemu_iovec_add(&local_qiov, buf, bytes);
3587 
3588         ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
3589                                 BDRV_REQ_ZERO_WRITE | flags);
3590         qemu_vfree(buf);
3591     }
3592     return ret;
3593 }
3594 
3595 /**
3596  * Truncate file to 'offset' bytes (needed only for file protocols)
3597  */
3598 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3599 {
3600     BlockDriver *drv = bs->drv;
3601     int ret;
3602     if (!drv)
3603         return -ENOMEDIUM;
3604     if (!drv->bdrv_truncate)
3605         return -ENOTSUP;
3606     if (bs->read_only)
3607         return -EACCES;
3608 
3609     ret = drv->bdrv_truncate(bs, offset);
3610     if (ret == 0) {
3611         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3612         bdrv_dirty_bitmap_truncate(bs);
3613         if (bs->blk) {
3614             blk_dev_resize_cb(bs->blk);
3615         }
3616     }
3617     return ret;
3618 }
3619 
3620 /**
3621  * Length of a allocated file in bytes. Sparse files are counted by actual
3622  * allocated space. Return < 0 if error or unknown.
3623  */
3624 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3625 {
3626     BlockDriver *drv = bs->drv;
3627     if (!drv) {
3628         return -ENOMEDIUM;
3629     }
3630     if (drv->bdrv_get_allocated_file_size) {
3631         return drv->bdrv_get_allocated_file_size(bs);
3632     }
3633     if (bs->file) {
3634         return bdrv_get_allocated_file_size(bs->file);
3635     }
3636     return -ENOTSUP;
3637 }
3638 
3639 /**
3640  * Return number of sectors on success, -errno on error.
3641  */
3642 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3643 {
3644     BlockDriver *drv = bs->drv;
3645 
3646     if (!drv)
3647         return -ENOMEDIUM;
3648 
3649     if (drv->has_variable_length) {
3650         int ret = refresh_total_sectors(bs, bs->total_sectors);
3651         if (ret < 0) {
3652             return ret;
3653         }
3654     }
3655     return bs->total_sectors;
3656 }
3657 
3658 /**
3659  * Return length in bytes on success, -errno on error.
3660  * The length is always a multiple of BDRV_SECTOR_SIZE.
3661  */
3662 int64_t bdrv_getlength(BlockDriverState *bs)
3663 {
3664     int64_t ret = bdrv_nb_sectors(bs);
3665 
3666     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3667 }
3668 
3669 /* return 0 as number of sectors if no device present or error */
3670 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3671 {
3672     int64_t nb_sectors = bdrv_nb_sectors(bs);
3673 
3674     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3675 }
3676 
3677 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3678                        BlockdevOnError on_write_error)
3679 {
3680     bs->on_read_error = on_read_error;
3681     bs->on_write_error = on_write_error;
3682 }
3683 
3684 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3685 {
3686     return is_read ? bs->on_read_error : bs->on_write_error;
3687 }
3688 
3689 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3690 {
3691     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3692 
3693     switch (on_err) {
3694     case BLOCKDEV_ON_ERROR_ENOSPC:
3695         return (error == ENOSPC) ?
3696                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3697     case BLOCKDEV_ON_ERROR_STOP:
3698         return BLOCK_ERROR_ACTION_STOP;
3699     case BLOCKDEV_ON_ERROR_REPORT:
3700         return BLOCK_ERROR_ACTION_REPORT;
3701     case BLOCKDEV_ON_ERROR_IGNORE:
3702         return BLOCK_ERROR_ACTION_IGNORE;
3703     default:
3704         abort();
3705     }
3706 }
3707 
3708 static void send_qmp_error_event(BlockDriverState *bs,
3709                                  BlockErrorAction action,
3710                                  bool is_read, int error)
3711 {
3712     IoOperationType optype;
3713 
3714     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3715     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3716                                    bdrv_iostatus_is_enabled(bs),
3717                                    error == ENOSPC, strerror(error),
3718                                    &error_abort);
3719 }
3720 
3721 /* This is done by device models because, while the block layer knows
3722  * about the error, it does not know whether an operation comes from
3723  * the device or the block layer (from a job, for example).
3724  */
3725 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3726                        bool is_read, int error)
3727 {
3728     assert(error >= 0);
3729 
3730     if (action == BLOCK_ERROR_ACTION_STOP) {
3731         /* First set the iostatus, so that "info block" returns an iostatus
3732          * that matches the events raised so far (an additional error iostatus
3733          * is fine, but not a lost one).
3734          */
3735         bdrv_iostatus_set_err(bs, error);
3736 
3737         /* Then raise the request to stop the VM and the event.
3738          * qemu_system_vmstop_request_prepare has two effects.  First,
3739          * it ensures that the STOP event always comes after the
3740          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3741          * can observe the STOP event and do a "cont" before the STOP
3742          * event is issued, the VM will not stop.  In this case, vm_start()
3743          * also ensures that the STOP/RESUME pair of events is emitted.
3744          */
3745         qemu_system_vmstop_request_prepare();
3746         send_qmp_error_event(bs, action, is_read, error);
3747         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3748     } else {
3749         send_qmp_error_event(bs, action, is_read, error);
3750     }
3751 }
3752 
3753 int bdrv_is_read_only(BlockDriverState *bs)
3754 {
3755     return bs->read_only;
3756 }
3757 
3758 int bdrv_is_sg(BlockDriverState *bs)
3759 {
3760     return bs->sg;
3761 }
3762 
3763 int bdrv_enable_write_cache(BlockDriverState *bs)
3764 {
3765     return bs->enable_write_cache;
3766 }
3767 
3768 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3769 {
3770     bs->enable_write_cache = wce;
3771 
3772     /* so a reopen() will preserve wce */
3773     if (wce) {
3774         bs->open_flags |= BDRV_O_CACHE_WB;
3775     } else {
3776         bs->open_flags &= ~BDRV_O_CACHE_WB;
3777     }
3778 }
3779 
3780 int bdrv_is_encrypted(BlockDriverState *bs)
3781 {
3782     if (bs->backing_hd && bs->backing_hd->encrypted)
3783         return 1;
3784     return bs->encrypted;
3785 }
3786 
3787 int bdrv_key_required(BlockDriverState *bs)
3788 {
3789     BlockDriverState *backing_hd = bs->backing_hd;
3790 
3791     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3792         return 1;
3793     return (bs->encrypted && !bs->valid_key);
3794 }
3795 
3796 int bdrv_set_key(BlockDriverState *bs, const char *key)
3797 {
3798     int ret;
3799     if (bs->backing_hd && bs->backing_hd->encrypted) {
3800         ret = bdrv_set_key(bs->backing_hd, key);
3801         if (ret < 0)
3802             return ret;
3803         if (!bs->encrypted)
3804             return 0;
3805     }
3806     if (!bs->encrypted) {
3807         return -EINVAL;
3808     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3809         return -ENOMEDIUM;
3810     }
3811     ret = bs->drv->bdrv_set_key(bs, key);
3812     if (ret < 0) {
3813         bs->valid_key = 0;
3814     } else if (!bs->valid_key) {
3815         bs->valid_key = 1;
3816         if (bs->blk) {
3817             /* call the change callback now, we skipped it on open */
3818             blk_dev_change_media_cb(bs->blk, true);
3819         }
3820     }
3821     return ret;
3822 }
3823 
3824 /*
3825  * Provide an encryption key for @bs.
3826  * If @key is non-null:
3827  *     If @bs is not encrypted, fail.
3828  *     Else if the key is invalid, fail.
3829  *     Else set @bs's key to @key, replacing the existing key, if any.
3830  * If @key is null:
3831  *     If @bs is encrypted and still lacks a key, fail.
3832  *     Else do nothing.
3833  * On failure, store an error object through @errp if non-null.
3834  */
3835 void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3836 {
3837     if (key) {
3838         if (!bdrv_is_encrypted(bs)) {
3839             error_setg(errp, "Node '%s' is not encrypted",
3840                       bdrv_get_device_or_node_name(bs));
3841         } else if (bdrv_set_key(bs, key) < 0) {
3842             error_set(errp, QERR_INVALID_PASSWORD);
3843         }
3844     } else {
3845         if (bdrv_key_required(bs)) {
3846             error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3847                       "'%s' (%s) is encrypted",
3848                       bdrv_get_device_or_node_name(bs),
3849                       bdrv_get_encrypted_filename(bs));
3850         }
3851     }
3852 }
3853 
3854 const char *bdrv_get_format_name(BlockDriverState *bs)
3855 {
3856     return bs->drv ? bs->drv->format_name : NULL;
3857 }
3858 
/*
 * qsort() comparator for an array of C strings (array of char pointers).
 *
 * qsort() hands the comparator pointers to the array *elements*, i.e.
 * pointers to the string pointers, so each argument has to be dereferenced
 * once before it can be passed to strcmp().
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *lhs = *(const char *const *)a;
    const char *rhs = *(const char *const *)b;

    return strcmp(lhs, rhs);
}
3863 
3864 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3865                          void *opaque)
3866 {
3867     BlockDriver *drv;
3868     int count = 0;
3869     int i;
3870     const char **formats = NULL;
3871 
3872     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3873         if (drv->format_name) {
3874             bool found = false;
3875             int i = count;
3876             while (formats && i && !found) {
3877                 found = !strcmp(formats[--i], drv->format_name);
3878             }
3879 
3880             if (!found) {
3881                 formats = g_renew(const char *, formats, count + 1);
3882                 formats[count++] = drv->format_name;
3883             }
3884         }
3885     }
3886 
3887     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3888 
3889     for (i = 0; i < count; i++) {
3890         it(opaque, formats[i]);
3891     }
3892 
3893     g_free(formats);
3894 }
3895 
3896 /* This function is to find a node in the bs graph */
3897 BlockDriverState *bdrv_find_node(const char *node_name)
3898 {
3899     BlockDriverState *bs;
3900 
3901     assert(node_name);
3902 
3903     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3904         if (!strcmp(node_name, bs->node_name)) {
3905             return bs;
3906         }
3907     }
3908     return NULL;
3909 }
3910 
3911 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3912 BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
3913 {
3914     BlockDeviceInfoList *list, *entry;
3915     BlockDriverState *bs;
3916 
3917     list = NULL;
3918     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3919         BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
3920         if (!info) {
3921             qapi_free_BlockDeviceInfoList(list);
3922             return NULL;
3923         }
3924         entry = g_malloc0(sizeof(*entry));
3925         entry->value = info;
3926         entry->next = list;
3927         list = entry;
3928     }
3929 
3930     return list;
3931 }
3932 
3933 BlockDriverState *bdrv_lookup_bs(const char *device,
3934                                  const char *node_name,
3935                                  Error **errp)
3936 {
3937     BlockBackend *blk;
3938     BlockDriverState *bs;
3939 
3940     if (device) {
3941         blk = blk_by_name(device);
3942 
3943         if (blk) {
3944             return blk_bs(blk);
3945         }
3946     }
3947 
3948     if (node_name) {
3949         bs = bdrv_find_node(node_name);
3950 
3951         if (bs) {
3952             return bs;
3953         }
3954     }
3955 
3956     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3957                      device ? device : "",
3958                      node_name ? node_name : "");
3959     return NULL;
3960 }
3961 
3962 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3963  * return false.  If either argument is NULL, return false. */
3964 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3965 {
3966     while (top && top != base) {
3967         top = top->backing_hd;
3968     }
3969 
3970     return top != NULL;
3971 }
3972 
3973 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3974 {
3975     if (!bs) {
3976         return QTAILQ_FIRST(&graph_bdrv_states);
3977     }
3978     return QTAILQ_NEXT(bs, node_list);
3979 }
3980 
3981 BlockDriverState *bdrv_next(BlockDriverState *bs)
3982 {
3983     if (!bs) {
3984         return QTAILQ_FIRST(&bdrv_states);
3985     }
3986     return QTAILQ_NEXT(bs, device_list);
3987 }
3988 
3989 const char *bdrv_get_node_name(const BlockDriverState *bs)
3990 {
3991     return bs->node_name;
3992 }
3993 
3994 /* TODO check what callers really want: bs->node_name or blk_name() */
3995 const char *bdrv_get_device_name(const BlockDriverState *bs)
3996 {
3997     return bs->blk ? blk_name(bs->blk) : "";
3998 }
3999 
4000 /* This can be used to identify nodes that might not have a device
4001  * name associated. Since node and device names live in the same
4002  * namespace, the result is unambiguous. The exception is if both are
4003  * absent, then this returns an empty (non-null) string. */
4004 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
4005 {
4006     return bs->blk ? blk_name(bs->blk) : bs->node_name;
4007 }
4008 
4009 int bdrv_get_flags(BlockDriverState *bs)
4010 {
4011     return bs->open_flags;
4012 }
4013 
4014 int bdrv_flush_all(void)
4015 {
4016     BlockDriverState *bs = NULL;
4017     int result = 0;
4018 
4019     while ((bs = bdrv_next(bs))) {
4020         AioContext *aio_context = bdrv_get_aio_context(bs);
4021         int ret;
4022 
4023         aio_context_acquire(aio_context);
4024         ret = bdrv_flush(bs);
4025         if (ret < 0 && !result) {
4026             result = ret;
4027         }
4028         aio_context_release(aio_context);
4029     }
4030 
4031     return result;
4032 }
4033 
4034 int bdrv_has_zero_init_1(BlockDriverState *bs)
4035 {
4036     return 1;
4037 }
4038 
4039 int bdrv_has_zero_init(BlockDriverState *bs)
4040 {
4041     assert(bs->drv);
4042 
4043     /* If BS is a copy on write image, it is initialized to
4044        the contents of the base image, which may not be zeroes.  */
4045     if (bs->backing_hd) {
4046         return 0;
4047     }
4048     if (bs->drv->bdrv_has_zero_init) {
4049         return bs->drv->bdrv_has_zero_init(bs);
4050     }
4051 
4052     /* safe default */
4053     return 0;
4054 }
4055 
4056 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
4057 {
4058     BlockDriverInfo bdi;
4059 
4060     if (bs->backing_hd) {
4061         return false;
4062     }
4063 
4064     if (bdrv_get_info(bs, &bdi) == 0) {
4065         return bdi.unallocated_blocks_are_zero;
4066     }
4067 
4068     return false;
4069 }
4070 
4071 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
4072 {
4073     BlockDriverInfo bdi;
4074 
4075     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
4076         return false;
4077     }
4078 
4079     if (bdrv_get_info(bs, &bdi) == 0) {
4080         return bdi.can_write_zeroes_with_unmap;
4081     }
4082 
4083     return false;
4084 }
4085 
/*
 * Argument/result bundle handed to bdrv_get_block_status_co_entry() by
 * bdrv_get_block_status().
 */
typedef struct BdrvCoGetBlockStatusData {
    BlockDriverState *bs;       /* node to query */
    BlockDriverState *base;     /* NOTE(review): neither set nor read by the
                                 * code visible in this section - confirm
                                 * whether it is still needed */
    int64_t sector_num;         /* first sector of the query */
    int nb_sectors;             /* maximum number of sectors to examine */
    int *pnum;                  /* out: forwarded to
                                 * bdrv_co_get_block_status() */
    int64_t ret;                /* out: value returned by
                                 * bdrv_co_get_block_status() */
    bool done;                  /* set to true once ret has been stored */
} BdrvCoGetBlockStatusData;
4095 
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * On success the return value is a combination of BDRV_BLOCK_* flags,
 * possibly together with an offset when BDRV_BLOCK_OFFSET_VALID is set.
 * A negative value is returned if bdrv_nb_sectors() or the driver callback
 * fails; in the latter case 'pnum' is set to 0.
 */
static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
                                                     int64_t sector_num,
                                                     int nb_sectors, int *pnum)
{
    int64_t total_sectors;
    int64_t n;
    int64_t ret, ret2;

    total_sectors = bdrv_nb_sectors(bs);
    if (total_sectors < 0) {
        /* Propagate the failure; *pnum is left untouched on this path */
        return total_sectors;
    }

    if (sector_num >= total_sectors) {
        *pnum = 0;
        return 0;
    }

    /* Clamp the request to the end of the image */
    n = total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_get_block_status) {
        /* No driver callback: report the whole (clamped) range as allocated
         * data.  Drivers with a protocol name additionally report the byte
         * offset of sector_num as a valid offset. */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
        }
        return ret;
    }

    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
    if (ret < 0) {
        *pnum = 0;
        return ret;
    }

    if (ret & BDRV_BLOCK_RAW) {
        /* The driver defers to bs->file: repeat the query there, at the
         * sector derived from the returned offset and limited to the *pnum
         * the driver just reported. */
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                     *pnum, pnum);
    }

    /* Anything reported as data or zero counts as allocated */
    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    }

    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
        /* Neither data nor zero: the range is still marked as zero if
         * unallocated blocks are zero for this node, or if sector_num is at
         * or past the end of the backing file. */
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing_hd) {
            BlockDriverState *bs2 = bs->backing_hd;
            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    /* For data with a valid offset, also query bs->file at that offset to
     * refine the result */
    if (bs->file &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int file_pnum;

        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
                                        *pnum, &file_pnum);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (!file_pnum) {
                /* !file_pnum indicates an offset at or beyond the EOF; it is
                 * perfectly valid for the format block driver to point to such
                 * offsets, so catch it and mark everything as zero */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

    return ret;
}
4197 
4198 /* Coroutine wrapper for bdrv_get_block_status() */
4199 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4200 {
4201     BdrvCoGetBlockStatusData *data = opaque;
4202     BlockDriverState *bs = data->bs;
4203 
4204     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4205                                          data->pnum);
4206     data->done = true;
4207 }
4208 
4209 /*
4210  * Synchronous wrapper around bdrv_co_get_block_status().
4211  *
4212  * See bdrv_co_get_block_status() for details.
4213  */
4214 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4215                               int nb_sectors, int *pnum)
4216 {
4217     Coroutine *co;
4218     BdrvCoGetBlockStatusData data = {
4219         .bs = bs,
4220         .sector_num = sector_num,
4221         .nb_sectors = nb_sectors,
4222         .pnum = pnum,
4223         .done = false,
4224     };
4225 
4226     if (qemu_in_coroutine()) {
4227         /* Fast-path if already in coroutine context */
4228         bdrv_get_block_status_co_entry(&data);
4229     } else {
4230         AioContext *aio_context = bdrv_get_aio_context(bs);
4231 
4232         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4233         qemu_coroutine_enter(co, &data);
4234         while (!data.done) {
4235             aio_poll(aio_context, true);
4236         }
4237     }
4238     return data.ret;
4239 }
4240 
4241 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4242                                    int nb_sectors, int *pnum)
4243 {
4244     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4245     if (ret < 0) {
4246         return ret;
4247     }
4248     return !!(ret & BDRV_BLOCK_ALLOCATED);
4249 }
4250 
4251 /*
4252  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4253  *
4254  * Return true if the given sector is allocated in any image between
4255  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4256  * sector is allocated in any image of the chain.  Return false otherwise.
4257  *
4258  * 'pnum' is set to the number of sectors (including and immediately following
4259  *  the specified sector) that are known to be in the same
4260  *  allocated/unallocated state.
4261  *
4262  */
4263 int bdrv_is_allocated_above(BlockDriverState *top,
4264                             BlockDriverState *base,
4265                             int64_t sector_num,
4266                             int nb_sectors, int *pnum)
4267 {
4268     BlockDriverState *intermediate;
4269     int ret, n = nb_sectors;
4270 
4271     intermediate = top;
4272     while (intermediate && intermediate != base) {
4273         int pnum_inter;
4274         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4275                                 &pnum_inter);
4276         if (ret < 0) {
4277             return ret;
4278         } else if (ret) {
4279             *pnum = pnum_inter;
4280             return 1;
4281         }
4282 
4283         /*
4284          * [sector_num, nb_sectors] is unallocated on top but intermediate
4285          * might have
4286          *
4287          * [sector_num+x, nr_sectors] allocated.
4288          */
4289         if (n > pnum_inter &&
4290             (intermediate == top ||
4291              sector_num + pnum_inter < intermediate->total_sectors)) {
4292             n = pnum_inter;
4293         }
4294 
4295         intermediate = intermediate->backing_hd;
4296     }
4297 
4298     *pnum = n;
4299     return 0;
4300 }
4301 
4302 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4303 {
4304     if (bs->backing_hd && bs->backing_hd->encrypted)
4305         return bs->backing_file;
4306     else if (bs->encrypted)
4307         return bs->filename;
4308     else
4309         return NULL;
4310 }
4311 
4312 void bdrv_get_backing_filename(BlockDriverState *bs,
4313                                char *filename, int filename_size)
4314 {
4315     pstrcpy(filename, filename_size, bs->backing_file);
4316 }
4317 
4318 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4319                           const uint8_t *buf, int nb_sectors)
4320 {
4321     BlockDriver *drv = bs->drv;
4322     int ret;
4323 
4324     if (!drv) {
4325         return -ENOMEDIUM;
4326     }
4327     if (!drv->bdrv_write_compressed) {
4328         return -ENOTSUP;
4329     }
4330     ret = bdrv_check_request(bs, sector_num, nb_sectors);
4331     if (ret < 0) {
4332         return ret;
4333     }
4334 
4335     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4336 
4337     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4338 }
4339 
4340 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4341 {
4342     BlockDriver *drv = bs->drv;
4343     if (!drv)
4344         return -ENOMEDIUM;
4345     if (!drv->bdrv_get_info)
4346         return -ENOTSUP;
4347     memset(bdi, 0, sizeof(*bdi));
4348     return drv->bdrv_get_info(bs, bdi);
4349 }
4350 
4351 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4352 {
4353     BlockDriver *drv = bs->drv;
4354     if (drv && drv->bdrv_get_specific_info) {
4355         return drv->bdrv_get_specific_info(bs);
4356     }
4357     return NULL;
4358 }
4359 
4360 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4361                       int64_t pos, int size)
4362 {
4363     QEMUIOVector qiov;
4364     struct iovec iov = {
4365         .iov_base   = (void *) buf,
4366         .iov_len    = size,
4367     };
4368 
4369     qemu_iovec_init_external(&qiov, &iov, 1);
4370     return bdrv_writev_vmstate(bs, &qiov, pos);
4371 }
4372 
4373 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4374 {
4375     BlockDriver *drv = bs->drv;
4376 
4377     if (!drv) {
4378         return -ENOMEDIUM;
4379     } else if (drv->bdrv_save_vmstate) {
4380         return drv->bdrv_save_vmstate(bs, qiov, pos);
4381     } else if (bs->file) {
4382         return bdrv_writev_vmstate(bs->file, qiov, pos);
4383     }
4384 
4385     return -ENOTSUP;
4386 }
4387 
4388 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4389                       int64_t pos, int size)
4390 {
4391     BlockDriver *drv = bs->drv;
4392     if (!drv)
4393         return -ENOMEDIUM;
4394     if (drv->bdrv_load_vmstate)
4395         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4396     if (bs->file)
4397         return bdrv_load_vmstate(bs->file, buf, pos, size);
4398     return -ENOTSUP;
4399 }
4400 
4401 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4402 {
4403     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4404         return;
4405     }
4406 
4407     bs->drv->bdrv_debug_event(bs, event);
4408 }
4409 
4410 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4411                           const char *tag)
4412 {
4413     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4414         bs = bs->file;
4415     }
4416 
4417     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4418         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4419     }
4420 
4421     return -ENOTSUP;
4422 }
4423 
4424 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4425 {
4426     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4427         bs = bs->file;
4428     }
4429 
4430     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4431         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4432     }
4433 
4434     return -ENOTSUP;
4435 }
4436 
4437 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4438 {
4439     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4440         bs = bs->file;
4441     }
4442 
4443     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4444         return bs->drv->bdrv_debug_resume(bs, tag);
4445     }
4446 
4447     return -ENOTSUP;
4448 }
4449 
4450 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4451 {
4452     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4453         bs = bs->file;
4454     }
4455 
4456     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4457         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4458     }
4459 
4460     return false;
4461 }
4462 
4463 int bdrv_is_snapshot(BlockDriverState *bs)
4464 {
4465     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4466 }
4467 
4468 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4469  * relative, it must be relative to the chain.  So, passing in bs->filename
4470  * from a BDS as backing_file should not be done, as that may be relative to
4471  * the CWD rather than the chain. */
4472 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4473         const char *backing_file)
4474 {
4475     char *filename_full = NULL;
4476     char *backing_file_full = NULL;
4477     char *filename_tmp = NULL;
4478     int is_protocol = 0;
4479     BlockDriverState *curr_bs = NULL;
4480     BlockDriverState *retval = NULL;
4481 
4482     if (!bs || !bs->drv || !backing_file) {
4483         return NULL;
4484     }
4485 
4486     filename_full     = g_malloc(PATH_MAX);
4487     backing_file_full = g_malloc(PATH_MAX);
4488     filename_tmp      = g_malloc(PATH_MAX);
4489 
4490     is_protocol = path_has_protocol(backing_file);
4491 
4492     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4493 
4494         /* If either of the filename paths is actually a protocol, then
4495          * compare unmodified paths; otherwise make paths relative */
4496         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4497             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4498                 retval = curr_bs->backing_hd;
4499                 break;
4500             }
4501         } else {
4502             /* If not an absolute filename path, make it relative to the current
4503              * image's filename path */
4504             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4505                          backing_file);
4506 
4507             /* We are going to compare absolute pathnames */
4508             if (!realpath(filename_tmp, filename_full)) {
4509                 continue;
4510             }
4511 
4512             /* We need to make sure the backing filename we are comparing against
4513              * is relative to the current image filename (or absolute) */
4514             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4515                          curr_bs->backing_file);
4516 
4517             if (!realpath(filename_tmp, backing_file_full)) {
4518                 continue;
4519             }
4520 
4521             if (strcmp(backing_file_full, filename_full) == 0) {
4522                 retval = curr_bs->backing_hd;
4523                 break;
4524             }
4525         }
4526     }
4527 
4528     g_free(filename_full);
4529     g_free(backing_file_full);
4530     g_free(filename_tmp);
4531     return retval;
4532 }
4533 
4534 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4535 {
4536     if (!bs->drv) {
4537         return 0;
4538     }
4539 
4540     if (!bs->backing_hd) {
4541         return 0;
4542     }
4543 
4544     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4545 }
4546 
4547 /**************************************************************/
4548 /* async I/Os */
4549 
4550 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4551                            QEMUIOVector *qiov, int nb_sectors,
4552                            BlockCompletionFunc *cb, void *opaque)
4553 {
4554     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4555 
4556     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4557                                  cb, opaque, false);
4558 }
4559 
4560 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4561                             QEMUIOVector *qiov, int nb_sectors,
4562                             BlockCompletionFunc *cb, void *opaque)
4563 {
4564     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4565 
4566     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4567                                  cb, opaque, true);
4568 }
4569 
4570 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4571         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4572         BlockCompletionFunc *cb, void *opaque)
4573 {
4574     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4575 
4576     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4577                                  BDRV_REQ_ZERO_WRITE | flags,
4578                                  cb, opaque, true);
4579 }
4580 
4581 
/*
 * Shared completion state for one bdrv_aio_multiwrite() call.  It is
 * allocated with room for one callbacks[] entry per original request and
 * freed by multiwrite_cb() once the last submitted request has completed.
 */
typedef struct MultiwriteCB {
    int error;          /* first negative result seen by multiwrite_cb(),
                         * 0 if there was none */
    int num_requests;   /* submitted requests that have not completed yet */
    int num_callbacks;  /* number of valid entries in callbacks[] */
    struct {
        BlockCompletionFunc *cb;    /* caller's completion callback */
        void *opaque;               /* argument passed to cb */
        QEMUIOVector *free_qiov;    /* merged qiov created by
                                     * multiwrite_merge(), destroyed and
                                     * freed after cb has run; may be NULL */
    } callbacks[];
} MultiwriteCB;
4592 
4593 static void multiwrite_user_cb(MultiwriteCB *mcb)
4594 {
4595     int i;
4596 
4597     for (i = 0; i < mcb->num_callbacks; i++) {
4598         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4599         if (mcb->callbacks[i].free_qiov) {
4600             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4601         }
4602         g_free(mcb->callbacks[i].free_qiov);
4603     }
4604 }
4605 
4606 static void multiwrite_cb(void *opaque, int ret)
4607 {
4608     MultiwriteCB *mcb = opaque;
4609 
4610     trace_multiwrite_cb(mcb, ret);
4611 
4612     if (ret < 0 && !mcb->error) {
4613         mcb->error = ret;
4614     }
4615 
4616     mcb->num_requests--;
4617     if (mcb->num_requests == 0) {
4618         multiwrite_user_cb(mcb);
4619         g_free(mcb);
4620     }
4621 }
4622 
4623 static int multiwrite_req_compare(const void *a, const void *b)
4624 {
4625     const BlockRequest *req1 = a, *req2 = b;
4626 
4627     /*
4628      * Note that we can't simply subtract req2->sector from req1->sector
4629      * here as that could overflow the return value.
4630      */
4631     if (req1->sector > req2->sector) {
4632         return 1;
4633     } else if (req1->sector < req2->sector) {
4634         return -1;
4635     } else {
4636         return 0;
4637     }
4638 }
4639 
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 *
 * @reqs is sorted by start sector and then compacted in place: the
 * surviving requests end up in reqs[0 .. return value - 1].  Every merged
 * qiov allocated here is recorded in @mcb->callbacks[].free_qiov so that it
 * is released once the callbacks have run.  The number of merges is reported
 * through block_acct_merge_done().
 *
 * NOTE(review): only sector, nb_sectors and qiov are copied when a request
 * is moved down in the array; reqs[].flags is not, although
 * bdrv_aio_multiwrite() passes reqs[i].flags on submission - confirm that
 * all requests of a batch carry the same flags.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests are exactly sequential or overlapping. If
    // so, combine them. Requests separated by a gap are never merged.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Do not merge if the combined vector (plus one spare entry) would
        // exceed IOV_MAX
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        // Do not merge beyond the maximum transfer length, if one is set
        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
            reqs[i].nb_sectors > bs->bl.max_transfer_length) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // We should not need to add any zeros between the two requests
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            // Add tail of first request, if necessary
            if (qiov->size < reqs[outidx].qiov->size) {
                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
                                  reqs[outidx].qiov->size - qiov->size);
            }

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged qiov so it is freed after completion
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            // No merge: move request i into the next output slot
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);

    return outidx + 1;
}
4712 
4713 /*
4714  * Submit multiple AIO write requests at once.
4715  *
4716  * On success, the function returns 0 and all requests in the reqs array have
4717  * been submitted. In error case this function returns -1, and any of the
4718  * requests may or may not be submitted yet. In particular, this means that the
4719  * callback will be called for some of the requests, for others it won't. The
4720  * caller must check the error field of the BlockRequest to wait for the right
4721  * callbacks (if error != 0, no callback will be called).
4722  *
4723  * The implementation may modify the contents of the reqs array, e.g. to merge
4724  * requests. However, the fields opaque and error are left unmodified as they
4725  * are used to signal failure for a single request to the caller.
4726  */
4727 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4728 {
4729     MultiwriteCB *mcb;
4730     int i;
4731 
4732     /* don't submit writes if we don't have a medium */
4733     if (bs->drv == NULL) {
4734         for (i = 0; i < num_reqs; i++) {
4735             reqs[i].error = -ENOMEDIUM;
4736         }
4737         return -1;
4738     }
4739 
4740     if (num_reqs == 0) {
4741         return 0;
4742     }
4743 
4744     // Create MultiwriteCB structure
4745     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4746     mcb->num_requests = 0;
4747     mcb->num_callbacks = num_reqs;
4748 
4749     for (i = 0; i < num_reqs; i++) {
4750         mcb->callbacks[i].cb = reqs[i].cb;
4751         mcb->callbacks[i].opaque = reqs[i].opaque;
4752     }
4753 
4754     // Check for mergable requests
4755     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4756 
4757     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4758 
4759     /* Run the aio requests. */
4760     mcb->num_requests = num_reqs;
4761     for (i = 0; i < num_reqs; i++) {
4762         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4763                               reqs[i].nb_sectors, reqs[i].flags,
4764                               multiwrite_cb, mcb,
4765                               true);
4766     }
4767 
4768     return 0;
4769 }
4770 
4771 void bdrv_aio_cancel(BlockAIOCB *acb)
4772 {
4773     qemu_aio_ref(acb);
4774     bdrv_aio_cancel_async(acb);
4775     while (acb->refcnt > 1) {
4776         if (acb->aiocb_info->get_aio_context) {
4777             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4778         } else if (acb->bs) {
4779             aio_poll(bdrv_get_aio_context(acb->bs), true);
4780         } else {
4781             abort();
4782         }
4783     }
4784     qemu_aio_unref(acb);
4785 }
4786 
4787 /* Async version of aio cancel. The caller is not blocked if the acb implements
4788  * cancel_async, otherwise we do nothing and let the request normally complete.
4789  * In either case the completion callback must be called. */
4790 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4791 {
4792     if (acb->aiocb_info->cancel_async) {
4793         acb->aiocb_info->cancel_async(acb);
4794     }
4795 }
4796 
4797 /**************************************************************/
4798 /* async block device emulation */
4799 
/*
 * AIOCB used to emulate asynchronous I/O on top of a driver's synchronous
 * bdrv_read/bdrv_write callbacks (see bdrv_aio_rw_vector).
 */
typedef struct BlockAIOCBSync {
    BlockAIOCB common;      /* generic AIOCB header */
    QEMUBH *bh;             /* bottom half that runs bdrv_aio_bh_cb */
    int ret;                /* result of the synchronous call, or -ENOMEM */
    /* vector translation state */
    QEMUIOVector *qiov;     /* caller's I/O vector */
    uint8_t *bounce;        /* linear buffer from qemu_try_blockalign() */
    int is_write;           /* non-zero for writes, zero for reads */
} BlockAIOCBSync;
4809 
/* AIOCB descriptor for BlockAIOCBSync; only the allocation size is set. */
static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBSync),
};
4813 
4814 static void bdrv_aio_bh_cb(void *opaque)
4815 {
4816     BlockAIOCBSync *acb = opaque;
4817 
4818     if (!acb->is_write && acb->ret >= 0) {
4819         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4820     }
4821     qemu_vfree(acb->bounce);
4822     acb->common.cb(acb->common.opaque, acb->ret);
4823     qemu_bh_delete(acb->bh);
4824     acb->bh = NULL;
4825     qemu_aio_unref(acb);
4826 }
4827 
4828 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4829                                       int64_t sector_num,
4830                                       QEMUIOVector *qiov,
4831                                       int nb_sectors,
4832                                       BlockCompletionFunc *cb,
4833                                       void *opaque,
4834                                       int is_write)
4835 
4836 {
4837     BlockAIOCBSync *acb;
4838 
4839     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4840     acb->is_write = is_write;
4841     acb->qiov = qiov;
4842     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4843     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4844 
4845     if (acb->bounce == NULL) {
4846         acb->ret = -ENOMEM;
4847     } else if (is_write) {
4848         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4849         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4850     } else {
4851         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4852     }
4853 
4854     qemu_bh_schedule(acb->bh);
4855 
4856     return &acb->common;
4857 }
4858 
4859 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4860         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4861         BlockCompletionFunc *cb, void *opaque)
4862 {
4863     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4864 }
4865 
4866 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4867         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4868         BlockCompletionFunc *cb, void *opaque)
4869 {
4870     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4871 }
4872 
4873 
/*
 * AIOCB for requests that are carried out by a coroutine (read/write, flush
 * and discard helpers below).
 */
typedef struct BlockAIOCBCoroutine {
    BlockAIOCB common;  /* generic AIOCB header */
    BlockRequest req;   /* request parameters; req.error holds the result */
    bool is_write;      /* selects write vs. read in bdrv_co_do_rw() */
    bool need_bh;       /* while true, bdrv_co_complete() does nothing */
    bool *done;         /* NOTE(review): not referenced in the code visible
                         * in this part of the file */
    QEMUBH* bh;         /* bottom half created by bdrv_co_maybe_schedule_bh() */
} BlockAIOCBCoroutine;
4882 
/* AIOCB descriptor for BlockAIOCBCoroutine; only the allocation size is set. */
static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
};
4886 
4887 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
4888 {
4889     if (!acb->need_bh) {
4890         acb->common.cb(acb->common.opaque, acb->req.error);
4891         qemu_aio_unref(acb);
4892     }
4893 }
4894 
4895 static void bdrv_co_em_bh(void *opaque)
4896 {
4897     BlockAIOCBCoroutine *acb = opaque;
4898 
4899     assert(!acb->need_bh);
4900     qemu_bh_delete(acb->bh);
4901     bdrv_co_complete(acb);
4902 }
4903 
4904 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
4905 {
4906     acb->need_bh = false;
4907     if (acb->req.error != -EINPROGRESS) {
4908         BlockDriverState *bs = acb->common.bs;
4909 
4910         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4911         qemu_bh_schedule(acb->bh);
4912     }
4913 }
4914 
4915 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4916 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4917 {
4918     BlockAIOCBCoroutine *acb = opaque;
4919     BlockDriverState *bs = acb->common.bs;
4920 
4921     if (!acb->is_write) {
4922         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4923             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4924     } else {
4925         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4926             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4927     }
4928 
4929     bdrv_co_complete(acb);
4930 }
4931 
4932 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4933                                          int64_t sector_num,
4934                                          QEMUIOVector *qiov,
4935                                          int nb_sectors,
4936                                          BdrvRequestFlags flags,
4937                                          BlockCompletionFunc *cb,
4938                                          void *opaque,
4939                                          bool is_write)
4940 {
4941     Coroutine *co;
4942     BlockAIOCBCoroutine *acb;
4943 
4944     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4945     acb->need_bh = true;
4946     acb->req.error = -EINPROGRESS;
4947     acb->req.sector = sector_num;
4948     acb->req.nb_sectors = nb_sectors;
4949     acb->req.qiov = qiov;
4950     acb->req.flags = flags;
4951     acb->is_write = is_write;
4952 
4953     co = qemu_coroutine_create(bdrv_co_do_rw);
4954     qemu_coroutine_enter(co, acb);
4955 
4956     bdrv_co_maybe_schedule_bh(acb);
4957     return &acb->common;
4958 }
4959 
4960 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4961 {
4962     BlockAIOCBCoroutine *acb = opaque;
4963     BlockDriverState *bs = acb->common.bs;
4964 
4965     acb->req.error = bdrv_co_flush(bs);
4966     bdrv_co_complete(acb);
4967 }
4968 
4969 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4970         BlockCompletionFunc *cb, void *opaque)
4971 {
4972     trace_bdrv_aio_flush(bs, opaque);
4973 
4974     Coroutine *co;
4975     BlockAIOCBCoroutine *acb;
4976 
4977     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4978     acb->need_bh = true;
4979     acb->req.error = -EINPROGRESS;
4980 
4981     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4982     qemu_coroutine_enter(co, acb);
4983 
4984     bdrv_co_maybe_schedule_bh(acb);
4985     return &acb->common;
4986 }
4987 
4988 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4989 {
4990     BlockAIOCBCoroutine *acb = opaque;
4991     BlockDriverState *bs = acb->common.bs;
4992 
4993     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4994     bdrv_co_complete(acb);
4995 }
4996 
4997 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4998         int64_t sector_num, int nb_sectors,
4999         BlockCompletionFunc *cb, void *opaque)
5000 {
5001     Coroutine *co;
5002     BlockAIOCBCoroutine *acb;
5003 
5004     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
5005 
5006     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
5007     acb->need_bh = true;
5008     acb->req.error = -EINPROGRESS;
5009     acb->req.sector = sector_num;
5010     acb->req.nb_sectors = nb_sectors;
5011     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
5012     qemu_coroutine_enter(co, acb);
5013 
5014     bdrv_co_maybe_schedule_bh(acb);
5015     return &acb->common;
5016 }
5017 
5018 void bdrv_init(void)
5019 {
5020     module_call_init(MODULE_INIT_BLOCK);
5021 }
5022 
5023 void bdrv_init_with_whitelist(void)
5024 {
5025     use_bdrv_whitelist = 1;
5026     bdrv_init();
5027 }
5028 
5029 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
5030                    BlockCompletionFunc *cb, void *opaque)
5031 {
5032     BlockAIOCB *acb;
5033 
5034     acb = g_slice_alloc(aiocb_info->aiocb_size);
5035     acb->aiocb_info = aiocb_info;
5036     acb->bs = bs;
5037     acb->cb = cb;
5038     acb->opaque = opaque;
5039     acb->refcnt = 1;
5040     return acb;
5041 }
5042 
5043 void qemu_aio_ref(void *p)
5044 {
5045     BlockAIOCB *acb = p;
5046     acb->refcnt++;
5047 }
5048 
5049 void qemu_aio_unref(void *p)
5050 {
5051     BlockAIOCB *acb = p;
5052     assert(acb->refcnt > 0);
5053     if (--acb->refcnt == 0) {
5054         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
5055     }
5056 }
5057 
5058 /**************************************************************/
5059 /* Coroutine block device emulation */
5060 
/*
 * Hand-off record between a coroutine waiting for an AIO request and the
 * completion callback bdrv_co_io_em_complete().
 */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* result passed to the completion callback */
} CoroutineIOCompletion;
5065 
5066 static void bdrv_co_io_em_complete(void *opaque, int ret)
5067 {
5068     CoroutineIOCompletion *co = opaque;
5069 
5070     co->ret = ret;
5071     qemu_coroutine_enter(co->coroutine, NULL);
5072 }
5073 
5074 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
5075                                       int nb_sectors, QEMUIOVector *iov,
5076                                       bool is_write)
5077 {
5078     CoroutineIOCompletion co = {
5079         .coroutine = qemu_coroutine_self(),
5080     };
5081     BlockAIOCB *acb;
5082 
5083     if (is_write) {
5084         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
5085                                        bdrv_co_io_em_complete, &co);
5086     } else {
5087         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
5088                                       bdrv_co_io_em_complete, &co);
5089     }
5090 
5091     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
5092     if (!acb) {
5093         return -EIO;
5094     }
5095     qemu_coroutine_yield();
5096 
5097     return co.ret;
5098 }
5099 
5100 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
5101                                          int64_t sector_num, int nb_sectors,
5102                                          QEMUIOVector *iov)
5103 {
5104     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
5105 }
5106 
5107 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
5108                                          int64_t sector_num, int nb_sectors,
5109                                          QEMUIOVector *iov)
5110 {
5111     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
5112 }
5113 
5114 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
5115 {
5116     RwCo *rwco = opaque;
5117 
5118     rwco->ret = bdrv_co_flush(rwco->bs);
5119 }
5120 
5121 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
5122 {
5123     int ret;
5124 
5125     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
5126         return 0;
5127     }
5128 
5129     /* Write back cached data to the OS even with cache=unsafe */
5130     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
5131     if (bs->drv->bdrv_co_flush_to_os) {
5132         ret = bs->drv->bdrv_co_flush_to_os(bs);
5133         if (ret < 0) {
5134             return ret;
5135         }
5136     }
5137 
5138     /* But don't actually force it to the disk with cache=unsafe */
5139     if (bs->open_flags & BDRV_O_NO_FLUSH) {
5140         goto flush_parent;
5141     }
5142 
5143     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
5144     if (bs->drv->bdrv_co_flush_to_disk) {
5145         ret = bs->drv->bdrv_co_flush_to_disk(bs);
5146     } else if (bs->drv->bdrv_aio_flush) {
5147         BlockAIOCB *acb;
5148         CoroutineIOCompletion co = {
5149             .coroutine = qemu_coroutine_self(),
5150         };
5151 
5152         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5153         if (acb == NULL) {
5154             ret = -EIO;
5155         } else {
5156             qemu_coroutine_yield();
5157             ret = co.ret;
5158         }
5159     } else {
5160         /*
5161          * Some block drivers always operate in either writethrough or unsafe
5162          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
5163          * know how the server works (because the behaviour is hardcoded or
5164          * depends on server-side configuration), so we can't ensure that
5165          * everything is safe on disk. Returning an error doesn't work because
5166          * that would break guests even if the server operates in writethrough
5167          * mode.
5168          *
5169          * Let's hope the user knows what he's doing.
5170          */
5171         ret = 0;
5172     }
5173     if (ret < 0) {
5174         return ret;
5175     }
5176 
5177     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
5178      * in the case of cache=unsafe, so there are no useless flushes.
5179      */
5180 flush_parent:
5181     return bdrv_co_flush(bs->file);
5182 }
5183 
5184 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5185 {
5186     Error *local_err = NULL;
5187     int ret;
5188 
5189     if (!bs->drv)  {
5190         return;
5191     }
5192 
5193     if (!(bs->open_flags & BDRV_O_INCOMING)) {
5194         return;
5195     }
5196     bs->open_flags &= ~BDRV_O_INCOMING;
5197 
5198     if (bs->drv->bdrv_invalidate_cache) {
5199         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5200     } else if (bs->file) {
5201         bdrv_invalidate_cache(bs->file, &local_err);
5202     }
5203     if (local_err) {
5204         error_propagate(errp, local_err);
5205         return;
5206     }
5207 
5208     ret = refresh_total_sectors(bs, bs->total_sectors);
5209     if (ret < 0) {
5210         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5211         return;
5212     }
5213 }
5214 
5215 void bdrv_invalidate_cache_all(Error **errp)
5216 {
5217     BlockDriverState *bs;
5218     Error *local_err = NULL;
5219 
5220     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5221         AioContext *aio_context = bdrv_get_aio_context(bs);
5222 
5223         aio_context_acquire(aio_context);
5224         bdrv_invalidate_cache(bs, &local_err);
5225         aio_context_release(aio_context);
5226         if (local_err) {
5227             error_propagate(errp, local_err);
5228             return;
5229         }
5230     }
5231 }
5232 
5233 int bdrv_flush(BlockDriverState *bs)
5234 {
5235     Coroutine *co;
5236     RwCo rwco = {
5237         .bs = bs,
5238         .ret = NOT_DONE,
5239     };
5240 
5241     if (qemu_in_coroutine()) {
5242         /* Fast-path if already in coroutine context */
5243         bdrv_flush_co_entry(&rwco);
5244     } else {
5245         AioContext *aio_context = bdrv_get_aio_context(bs);
5246 
5247         co = qemu_coroutine_create(bdrv_flush_co_entry);
5248         qemu_coroutine_enter(co, &rwco);
5249         while (rwco.ret == NOT_DONE) {
5250             aio_poll(aio_context, true);
5251         }
5252     }
5253 
5254     return rwco.ret;
5255 }
5256 
/* Arguments and result slot handed to bdrv_discard_co_entry(). */
typedef struct DiscardCo {
    BlockDriverState *bs;   /* device to discard on */
    int64_t sector_num;     /* first sector of the range */
    int nb_sectors;         /* number of sectors in the range */
    int ret;                /* result of bdrv_co_discard() */
} DiscardCo;
5263 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5264 {
5265     DiscardCo *rwco = opaque;
5266 
5267     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5268 }
5269 
5270 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5271                                  int nb_sectors)
5272 {
5273     int max_discard, ret;
5274 
5275     if (!bs->drv) {
5276         return -ENOMEDIUM;
5277     }
5278 
5279     ret = bdrv_check_request(bs, sector_num, nb_sectors);
5280     if (ret < 0) {
5281         return ret;
5282     } else if (bs->read_only) {
5283         return -EROFS;
5284     }
5285 
5286     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5287 
5288     /* Do nothing if disabled.  */
5289     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5290         return 0;
5291     }
5292 
5293     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5294         return 0;
5295     }
5296 
5297     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
5298     while (nb_sectors > 0) {
5299         int ret;
5300         int num = nb_sectors;
5301 
5302         /* align request */
5303         if (bs->bl.discard_alignment &&
5304             num >= bs->bl.discard_alignment &&
5305             sector_num % bs->bl.discard_alignment) {
5306             if (num > bs->bl.discard_alignment) {
5307                 num = bs->bl.discard_alignment;
5308             }
5309             num -= sector_num % bs->bl.discard_alignment;
5310         }
5311 
5312         /* limit request size */
5313         if (num > max_discard) {
5314             num = max_discard;
5315         }
5316 
5317         if (bs->drv->bdrv_co_discard) {
5318             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5319         } else {
5320             BlockAIOCB *acb;
5321             CoroutineIOCompletion co = {
5322                 .coroutine = qemu_coroutine_self(),
5323             };
5324 
5325             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5326                                             bdrv_co_io_em_complete, &co);
5327             if (acb == NULL) {
5328                 return -EIO;
5329             } else {
5330                 qemu_coroutine_yield();
5331                 ret = co.ret;
5332             }
5333         }
5334         if (ret && ret != -ENOTSUP) {
5335             return ret;
5336         }
5337 
5338         sector_num += num;
5339         nb_sectors -= num;
5340     }
5341     return 0;
5342 }
5343 
5344 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5345 {
5346     Coroutine *co;
5347     DiscardCo rwco = {
5348         .bs = bs,
5349         .sector_num = sector_num,
5350         .nb_sectors = nb_sectors,
5351         .ret = NOT_DONE,
5352     };
5353 
5354     if (qemu_in_coroutine()) {
5355         /* Fast-path if already in coroutine context */
5356         bdrv_discard_co_entry(&rwco);
5357     } else {
5358         AioContext *aio_context = bdrv_get_aio_context(bs);
5359 
5360         co = qemu_coroutine_create(bdrv_discard_co_entry);
5361         qemu_coroutine_enter(co, &rwco);
5362         while (rwco.ret == NOT_DONE) {
5363             aio_poll(aio_context, true);
5364         }
5365     }
5366 
5367     return rwco.ret;
5368 }
5369 
5370 /**************************************************************/
5371 /* removable device support */
5372 
5373 /**
5374  * Return TRUE if the media is present
5375  */
5376 int bdrv_is_inserted(BlockDriverState *bs)
5377 {
5378     BlockDriver *drv = bs->drv;
5379 
5380     if (!drv)
5381         return 0;
5382     if (!drv->bdrv_is_inserted)
5383         return 1;
5384     return drv->bdrv_is_inserted(bs);
5385 }
5386 
5387 /**
5388  * Return whether the media changed since the last call to this
5389  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5390  */
5391 int bdrv_media_changed(BlockDriverState *bs)
5392 {
5393     BlockDriver *drv = bs->drv;
5394 
5395     if (drv && drv->bdrv_media_changed) {
5396         return drv->bdrv_media_changed(bs);
5397     }
5398     return -ENOTSUP;
5399 }
5400 
5401 /**
5402  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5403  */
5404 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5405 {
5406     BlockDriver *drv = bs->drv;
5407     const char *device_name;
5408 
5409     if (drv && drv->bdrv_eject) {
5410         drv->bdrv_eject(bs, eject_flag);
5411     }
5412 
5413     device_name = bdrv_get_device_name(bs);
5414     if (device_name[0] != '\0') {
5415         qapi_event_send_device_tray_moved(device_name,
5416                                           eject_flag, &error_abort);
5417     }
5418 }
5419 
5420 /**
5421  * Lock or unlock the media (if it is locked, the user won't be able
5422  * to eject it manually).
5423  */
5424 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5425 {
5426     BlockDriver *drv = bs->drv;
5427 
5428     trace_bdrv_lock_medium(bs, locked);
5429 
5430     if (drv && drv->bdrv_lock_medium) {
5431         drv->bdrv_lock_medium(bs, locked);
5432     }
5433 }
5434 
5435 /* needed for generic scsi interface */
5436 
5437 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5438 {
5439     BlockDriver *drv = bs->drv;
5440 
5441     if (drv && drv->bdrv_ioctl)
5442         return drv->bdrv_ioctl(bs, req, buf);
5443     return -ENOTSUP;
5444 }
5445 
5446 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5447         unsigned long int req, void *buf,
5448         BlockCompletionFunc *cb, void *opaque)
5449 {
5450     BlockDriver *drv = bs->drv;
5451 
5452     if (drv && drv->bdrv_aio_ioctl)
5453         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5454     return NULL;
5455 }
5456 
5457 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5458 {
5459     bs->guest_block_size = align;
5460 }
5461 
5462 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5463 {
5464     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5465 }
5466 
5467 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5468 {
5469     return memset(qemu_blockalign(bs, size), 0, size);
5470 }
5471 
5472 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5473 {
5474     size_t align = bdrv_opt_mem_align(bs);
5475 
5476     /* Ensure that NULL is never returned on success */
5477     assert(align > 0);
5478     if (size == 0) {
5479         size = align;
5480     }
5481 
5482     return qemu_try_memalign(align, size);
5483 }
5484 
5485 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5486 {
5487     void *mem = qemu_try_blockalign(bs, size);
5488 
5489     if (mem) {
5490         memset(mem, 0, size);
5491     }
5492 
5493     return mem;
5494 }
5495 
5496 /*
5497  * Check if all memory in this vector is sector aligned.
5498  */
5499 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5500 {
5501     int i;
5502     size_t alignment = bdrv_opt_mem_align(bs);
5503 
5504     for (i = 0; i < qiov->niov; i++) {
5505         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5506             return false;
5507         }
5508         if (qiov->iov[i].iov_len % alignment) {
5509             return false;
5510         }
5511     }
5512 
5513     return true;
5514 }
5515 
5516 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
5517 {
5518     BdrvDirtyBitmap *bm;
5519 
5520     assert(name);
5521     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5522         if (bm->name && !strcmp(name, bm->name)) {
5523             return bm;
5524         }
5525     }
5526     return NULL;
5527 }
5528 
5529 void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap)
5530 {
5531     assert(!bdrv_dirty_bitmap_frozen(bitmap));
5532     g_free(bitmap->name);
5533     bitmap->name = NULL;
5534 }
5535 
5536 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
5537                                           uint32_t granularity,
5538                                           const char *name,
5539                                           Error **errp)
5540 {
5541     int64_t bitmap_size;
5542     BdrvDirtyBitmap *bitmap;
5543     uint32_t sector_granularity;
5544 
5545     assert((granularity & (granularity - 1)) == 0);
5546 
5547     if (name && bdrv_find_dirty_bitmap(bs, name)) {
5548         error_setg(errp, "Bitmap already exists: %s", name);
5549         return NULL;
5550     }
5551     sector_granularity = granularity >> BDRV_SECTOR_BITS;
5552     assert(sector_granularity);
5553     bitmap_size = bdrv_nb_sectors(bs);
5554     if (bitmap_size < 0) {
5555         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5556         errno = -bitmap_size;
5557         return NULL;
5558     }
5559     bitmap = g_new0(BdrvDirtyBitmap, 1);
5560     bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity));
5561     bitmap->size = bitmap_size;
5562     bitmap->name = g_strdup(name);
5563     bitmap->disabled = false;
5564     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5565     return bitmap;
5566 }
5567 
5568 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
5569 {
5570     return bitmap->successor;
5571 }
5572 
5573 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
5574 {
5575     return !(bitmap->disabled || bitmap->successor);
5576 }
5577 
5578 /**
5579  * Create a successor bitmap destined to replace this bitmap after an operation.
5580  * Requires that the bitmap is not frozen and has no successor.
5581  */
5582 int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
5583                                        BdrvDirtyBitmap *bitmap, Error **errp)
5584 {
5585     uint64_t granularity;
5586     BdrvDirtyBitmap *child;
5587 
5588     if (bdrv_dirty_bitmap_frozen(bitmap)) {
5589         error_setg(errp, "Cannot create a successor for a bitmap that is "
5590                    "currently frozen");
5591         return -1;
5592     }
5593     assert(!bitmap->successor);
5594 
5595     /* Create an anonymous successor */
5596     granularity = bdrv_dirty_bitmap_granularity(bitmap);
5597     child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
5598     if (!child) {
5599         return -1;
5600     }
5601 
5602     /* Successor will be on or off based on our current state. */
5603     child->disabled = bitmap->disabled;
5604 
5605     /* Install the successor and freeze the parent */
5606     bitmap->successor = child;
5607     return 0;
5608 }
5609 
5610 /**
5611  * For a bitmap with a successor, yield our name to the successor,
5612  * delete the old bitmap, and return a handle to the new bitmap.
5613  */
5614 BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
5615                                             BdrvDirtyBitmap *bitmap,
5616                                             Error **errp)
5617 {
5618     char *name;
5619     BdrvDirtyBitmap *successor = bitmap->successor;
5620 
5621     if (successor == NULL) {
5622         error_setg(errp, "Cannot relinquish control if "
5623                    "there's no successor present");
5624         return NULL;
5625     }
5626 
5627     name = bitmap->name;
5628     bitmap->name = NULL;
5629     successor->name = name;
5630     bitmap->successor = NULL;
5631     bdrv_release_dirty_bitmap(bs, bitmap);
5632 
5633     return successor;
5634 }
5635 
5636 /**
5637  * In cases of failure where we can no longer safely delete the parent,
5638  * we may wish to re-join the parent and child/successor.
5639  * The merged parent will be un-frozen, but not explicitly re-enabled.
5640  */
5641 BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
5642                                            BdrvDirtyBitmap *parent,
5643                                            Error **errp)
5644 {
5645     BdrvDirtyBitmap *successor = parent->successor;
5646 
5647     if (!successor) {
5648         error_setg(errp, "Cannot reclaim a successor when none is present");
5649         return NULL;
5650     }
5651 
5652     if (!hbitmap_merge(parent->bitmap, successor->bitmap)) {
5653         error_setg(errp, "Merging of parent and successor bitmap failed");
5654         return NULL;
5655     }
5656     bdrv_release_dirty_bitmap(bs, successor);
5657     parent->successor = NULL;
5658 
5659     return parent;
5660 }
5661 
5662 /**
5663  * Truncates _all_ bitmaps attached to a BDS.
5664  */
5665 static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
5666 {
5667     BdrvDirtyBitmap *bitmap;
5668     uint64_t size = bdrv_nb_sectors(bs);
5669 
5670     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5671         if (bdrv_dirty_bitmap_frozen(bitmap)) {
5672             continue;
5673         }
5674         hbitmap_truncate(bitmap->bitmap, size);
5675     }
5676 }
5677 
5678 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5679 {
5680     BdrvDirtyBitmap *bm, *next;
5681     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5682         if (bm == bitmap) {
5683             assert(!bdrv_dirty_bitmap_frozen(bm));
5684             QLIST_REMOVE(bitmap, list);
5685             hbitmap_free(bitmap->bitmap);
5686             g_free(bitmap->name);
5687             g_free(bitmap);
5688             return;
5689         }
5690     }
5691 }
5692 
5693 void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
5694 {
5695     assert(!bdrv_dirty_bitmap_frozen(bitmap));
5696     bitmap->disabled = true;
5697 }
5698 
5699 void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
5700 {
5701     assert(!bdrv_dirty_bitmap_frozen(bitmap));
5702     bitmap->disabled = false;
5703 }
5704 
5705 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5706 {
5707     BdrvDirtyBitmap *bm;
5708     BlockDirtyInfoList *list = NULL;
5709     BlockDirtyInfoList **plist = &list;
5710 
5711     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5712         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5713         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5714         info->count = bdrv_get_dirty_count(bm);
5715         info->granularity = bdrv_dirty_bitmap_granularity(bm);
5716         info->has_name = !!bm->name;
5717         info->name = g_strdup(bm->name);
5718         info->frozen = bdrv_dirty_bitmap_frozen(bm);
5719         entry->value = info;
5720         *plist = entry;
5721         plist = &entry->next;
5722     }
5723 
5724     return list;
5725 }
5726 
5727 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5728 {
5729     if (bitmap) {
5730         return hbitmap_get(bitmap->bitmap, sector);
5731     } else {
5732         return 0;
5733     }
5734 }
5735 
5736 /**
5737  * Chooses a default granularity based on the existing cluster size,
5738  * but clamped between [4K, 64K]. Defaults to 64K in the case that there
5739  * is no cluster size information available.
5740  */
5741 uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs)
5742 {
5743     BlockDriverInfo bdi;
5744     uint32_t granularity;
5745 
5746     if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) {
5747         granularity = MAX(4096, bdi.cluster_size);
5748         granularity = MIN(65536, granularity);
5749     } else {
5750         granularity = 65536;
5751     }
5752 
5753     return granularity;
5754 }
5755 
5756 uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
5757 {
5758     return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
5759 }
5760 
5761 void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5762 {
5763     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5764 }
5765 
5766 void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
5767                            int64_t cur_sector, int nr_sectors)
5768 {
5769     assert(bdrv_dirty_bitmap_enabled(bitmap));
5770     hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5771 }
5772 
5773 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
5774                              int64_t cur_sector, int nr_sectors)
5775 {
5776     assert(bdrv_dirty_bitmap_enabled(bitmap));
5777     hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5778 }
5779 
5780 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap)
5781 {
5782     assert(bdrv_dirty_bitmap_enabled(bitmap));
5783     hbitmap_reset(bitmap->bitmap, 0, bitmap->size);
5784 }
5785 
5786 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5787                     int nr_sectors)
5788 {
5789     BdrvDirtyBitmap *bitmap;
5790     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5791         if (!bdrv_dirty_bitmap_enabled(bitmap)) {
5792             continue;
5793         }
5794         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5795     }
5796 }
5797 
5798 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
5799                       int nr_sectors)
5800 {
5801     BdrvDirtyBitmap *bitmap;
5802     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5803         if (!bdrv_dirty_bitmap_enabled(bitmap)) {
5804             continue;
5805         }
5806         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5807     }
5808 }
5809 
5810 /**
5811  * Advance an HBitmapIter to an arbitrary offset.
5812  */
5813 void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
5814 {
5815     assert(hbi->hb);
5816     hbitmap_iter_init(hbi, hbi->hb, offset);
5817 }
5818 
5819 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
5820 {
5821     return hbitmap_count(bitmap->bitmap);
5822 }
5823 
5824 /* Get a reference to bs */
5825 void bdrv_ref(BlockDriverState *bs)
5826 {
5827     bs->refcnt++;
5828 }
5829 
5830 /* Release a previously grabbed reference to bs.
5831  * If after releasing, reference count is zero, the BlockDriverState is
5832  * deleted. */
5833 void bdrv_unref(BlockDriverState *bs)
5834 {
5835     if (!bs) {
5836         return;
5837     }
5838     assert(bs->refcnt > 0);
5839     if (--bs->refcnt == 0) {
5840         bdrv_delete(bs);
5841     }
5842 }
5843 
5844 struct BdrvOpBlocker {
5845     Error *reason;
5846     QLIST_ENTRY(BdrvOpBlocker) list;
5847 };
5848 
5849 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5850 {
5851     BdrvOpBlocker *blocker;
5852     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5853     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5854         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5855         if (errp) {
5856             error_setg(errp, "Node '%s' is busy: %s",
5857                        bdrv_get_device_or_node_name(bs),
5858                        error_get_pretty(blocker->reason));
5859         }
5860         return true;
5861     }
5862     return false;
5863 }
5864 
5865 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5866 {
5867     BdrvOpBlocker *blocker;
5868     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5869 
5870     blocker = g_new0(BdrvOpBlocker, 1);
5871     blocker->reason = reason;
5872     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5873 }
5874 
5875 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5876 {
5877     BdrvOpBlocker *blocker, *next;
5878     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5879     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5880         if (blocker->reason == reason) {
5881             QLIST_REMOVE(blocker, list);
5882             g_free(blocker);
5883         }
5884     }
5885 }
5886 
5887 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5888 {
5889     int i;
5890     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5891         bdrv_op_block(bs, i, reason);
5892     }
5893 }
5894 
5895 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5896 {
5897     int i;
5898     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5899         bdrv_op_unblock(bs, i, reason);
5900     }
5901 }
5902 
5903 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5904 {
5905     int i;
5906 
5907     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5908         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5909             return false;
5910         }
5911     }
5912     return true;
5913 }
5914 
5915 void bdrv_iostatus_enable(BlockDriverState *bs)
5916 {
5917     bs->iostatus_enabled = true;
5918     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5919 }
5920 
5921 /* The I/O status is only enabled if the drive explicitly
5922  * enables it _and_ the VM is configured to stop on errors */
5923 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5924 {
5925     return (bs->iostatus_enabled &&
5926            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5927             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5928             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5929 }
5930 
5931 void bdrv_iostatus_disable(BlockDriverState *bs)
5932 {
5933     bs->iostatus_enabled = false;
5934 }
5935 
5936 void bdrv_iostatus_reset(BlockDriverState *bs)
5937 {
5938     if (bdrv_iostatus_is_enabled(bs)) {
5939         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5940         if (bs->job) {
5941             block_job_iostatus_reset(bs->job);
5942         }
5943     }
5944 }
5945 
5946 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5947 {
5948     assert(bdrv_iostatus_is_enabled(bs));
5949     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5950         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5951                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5952     }
5953 }
5954 
5955 void bdrv_img_create(const char *filename, const char *fmt,
5956                      const char *base_filename, const char *base_fmt,
5957                      char *options, uint64_t img_size, int flags,
5958                      Error **errp, bool quiet)
5959 {
5960     QemuOptsList *create_opts = NULL;
5961     QemuOpts *opts = NULL;
5962     const char *backing_fmt, *backing_file;
5963     int64_t size;
5964     BlockDriver *drv, *proto_drv;
5965     BlockDriver *backing_drv = NULL;
5966     Error *local_err = NULL;
5967     int ret = 0;
5968 
5969     /* Find driver and parse its options */
5970     drv = bdrv_find_format(fmt);
5971     if (!drv) {
5972         error_setg(errp, "Unknown file format '%s'", fmt);
5973         return;
5974     }
5975 
5976     proto_drv = bdrv_find_protocol(filename, true, errp);
5977     if (!proto_drv) {
5978         return;
5979     }
5980 
5981     if (!drv->create_opts) {
5982         error_setg(errp, "Format driver '%s' does not support image creation",
5983                    drv->format_name);
5984         return;
5985     }
5986 
5987     if (!proto_drv->create_opts) {
5988         error_setg(errp, "Protocol driver '%s' does not support image creation",
5989                    proto_drv->format_name);
5990         return;
5991     }
5992 
5993     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5994     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5995 
5996     /* Create parameter list with default values */
5997     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5998     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
5999 
6000     /* Parse -o options */
6001     if (options) {
6002         qemu_opts_do_parse(opts, options, NULL, &local_err);
6003         if (local_err) {
6004             error_report_err(local_err);
6005             local_err = NULL;
6006             error_setg(errp, "Invalid options for file format '%s'", fmt);
6007             goto out;
6008         }
6009     }
6010 
6011     if (base_filename) {
6012         qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
6013         if (local_err) {
6014             error_setg(errp, "Backing file not supported for file format '%s'",
6015                        fmt);
6016             goto out;
6017         }
6018     }
6019 
6020     if (base_fmt) {
6021         qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
6022         if (local_err) {
6023             error_setg(errp, "Backing file format not supported for file "
6024                              "format '%s'", fmt);
6025             goto out;
6026         }
6027     }
6028 
6029     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
6030     if (backing_file) {
6031         if (!strcmp(filename, backing_file)) {
6032             error_setg(errp, "Error: Trying to create an image with the "
6033                              "same filename as the backing file");
6034             goto out;
6035         }
6036     }
6037 
6038     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
6039     if (backing_fmt) {
6040         backing_drv = bdrv_find_format(backing_fmt);
6041         if (!backing_drv) {
6042             error_setg(errp, "Unknown backing file format '%s'",
6043                        backing_fmt);
6044             goto out;
6045         }
6046     }
6047 
6048     // The size for the image must always be specified, with one exception:
6049     // If we are using a backing file, we can obtain the size from there
6050     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
6051     if (size == -1) {
6052         if (backing_file) {
6053             BlockDriverState *bs;
6054             char *full_backing = g_new0(char, PATH_MAX);
6055             int64_t size;
6056             int back_flags;
6057 
6058             bdrv_get_full_backing_filename_from_filename(filename, backing_file,
6059                                                          full_backing, PATH_MAX,
6060                                                          &local_err);
6061             if (local_err) {
6062                 g_free(full_backing);
6063                 goto out;
6064             }
6065 
6066             /* backing files always opened read-only */
6067             back_flags =
6068                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
6069 
6070             bs = NULL;
6071             ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
6072                             backing_drv, &local_err);
6073             g_free(full_backing);
6074             if (ret < 0) {
6075                 goto out;
6076             }
6077             size = bdrv_getlength(bs);
6078             if (size < 0) {
6079                 error_setg_errno(errp, -size, "Could not get size of '%s'",
6080                                  backing_file);
6081                 bdrv_unref(bs);
6082                 goto out;
6083             }
6084 
6085             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
6086 
6087             bdrv_unref(bs);
6088         } else {
6089             error_setg(errp, "Image creation needs a size parameter");
6090             goto out;
6091         }
6092     }
6093 
6094     if (!quiet) {
6095         printf("Formatting '%s', fmt=%s", filename, fmt);
6096         qemu_opts_print(opts, " ");
6097         puts("");
6098     }
6099 
6100     ret = bdrv_create(drv, filename, opts, &local_err);
6101 
6102     if (ret == -EFBIG) {
6103         /* This is generally a better message than whatever the driver would
6104          * deliver (especially because of the cluster_size_hint), since that
6105          * is most probably not much different from "image too large". */
6106         const char *cluster_size_hint = "";
6107         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
6108             cluster_size_hint = " (try using a larger cluster size)";
6109         }
6110         error_setg(errp, "The image size is too large for file format '%s'"
6111                    "%s", fmt, cluster_size_hint);
6112         error_free(local_err);
6113         local_err = NULL;
6114     }
6115 
6116 out:
6117     qemu_opts_del(opts);
6118     qemu_opts_free(create_opts);
6119     if (local_err) {
6120         error_propagate(errp, local_err);
6121     }
6122 }
6123 
6124 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
6125 {
6126     return bs->aio_context;
6127 }
6128 
6129 void bdrv_detach_aio_context(BlockDriverState *bs)
6130 {
6131     BdrvAioNotifier *baf;
6132 
6133     if (!bs->drv) {
6134         return;
6135     }
6136 
6137     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
6138         baf->detach_aio_context(baf->opaque);
6139     }
6140 
6141     if (bs->io_limits_enabled) {
6142         throttle_detach_aio_context(&bs->throttle_state);
6143     }
6144     if (bs->drv->bdrv_detach_aio_context) {
6145         bs->drv->bdrv_detach_aio_context(bs);
6146     }
6147     if (bs->file) {
6148         bdrv_detach_aio_context(bs->file);
6149     }
6150     if (bs->backing_hd) {
6151         bdrv_detach_aio_context(bs->backing_hd);
6152     }
6153 
6154     bs->aio_context = NULL;
6155 }
6156 
6157 void bdrv_attach_aio_context(BlockDriverState *bs,
6158                              AioContext *new_context)
6159 {
6160     BdrvAioNotifier *ban;
6161 
6162     if (!bs->drv) {
6163         return;
6164     }
6165 
6166     bs->aio_context = new_context;
6167 
6168     if (bs->backing_hd) {
6169         bdrv_attach_aio_context(bs->backing_hd, new_context);
6170     }
6171     if (bs->file) {
6172         bdrv_attach_aio_context(bs->file, new_context);
6173     }
6174     if (bs->drv->bdrv_attach_aio_context) {
6175         bs->drv->bdrv_attach_aio_context(bs, new_context);
6176     }
6177     if (bs->io_limits_enabled) {
6178         throttle_attach_aio_context(&bs->throttle_state, new_context);
6179     }
6180 
6181     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
6182         ban->attached_aio_context(new_context, ban->opaque);
6183     }
6184 }
6185 
6186 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
6187 {
6188     bdrv_drain_all(); /* ensure there are no in-flight requests */
6189 
6190     bdrv_detach_aio_context(bs);
6191 
6192     /* This function executes in the old AioContext so acquire the new one in
6193      * case it runs in a different thread.
6194      */
6195     aio_context_acquire(new_context);
6196     bdrv_attach_aio_context(bs, new_context);
6197     aio_context_release(new_context);
6198 }
6199 
6200 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
6201         void (*attached_aio_context)(AioContext *new_context, void *opaque),
6202         void (*detach_aio_context)(void *opaque), void *opaque)
6203 {
6204     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
6205     *ban = (BdrvAioNotifier){
6206         .attached_aio_context = attached_aio_context,
6207         .detach_aio_context   = detach_aio_context,
6208         .opaque               = opaque
6209     };
6210 
6211     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
6212 }
6213 
6214 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
6215                                       void (*attached_aio_context)(AioContext *,
6216                                                                    void *),
6217                                       void (*detach_aio_context)(void *),
6218                                       void *opaque)
6219 {
6220     BdrvAioNotifier *ban, *ban_next;
6221 
6222     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
6223         if (ban->attached_aio_context == attached_aio_context &&
6224             ban->detach_aio_context   == detach_aio_context   &&
6225             ban->opaque               == opaque)
6226         {
6227             QLIST_REMOVE(ban, list);
6228             g_free(ban);
6229 
6230             return;
6231         }
6232     }
6233 
6234     abort();
6235 }
6236 
6237 void bdrv_add_before_write_notifier(BlockDriverState *bs,
6238                                     NotifierWithReturn *notifier)
6239 {
6240     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
6241 }
6242 
6243 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
6244                        BlockDriverAmendStatusCB *status_cb)
6245 {
6246     if (!bs->drv->bdrv_amend_options) {
6247         return -ENOTSUP;
6248     }
6249     return bs->drv->bdrv_amend_options(bs, opts, status_cb);
6250 }
6251 
6252 /* This function will be called by the bdrv_recurse_is_first_non_filter method
6253  * of block filter and by bdrv_is_first_non_filter.
6254  * It is used to test if the given bs is the candidate or recurse more in the
6255  * node graph.
6256  */
6257 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
6258                                       BlockDriverState *candidate)
6259 {
6260     /* return false if basic checks fails */
6261     if (!bs || !bs->drv) {
6262         return false;
6263     }
6264 
6265     /* the code reached a non block filter driver -> check if the bs is
6266      * the same as the candidate. It's the recursion termination condition.
6267      */
6268     if (!bs->drv->is_filter) {
6269         return bs == candidate;
6270     }
6271     /* Down this path the driver is a block filter driver */
6272 
6273     /* If the block filter recursion method is defined use it to recurse down
6274      * the node graph.
6275      */
6276     if (bs->drv->bdrv_recurse_is_first_non_filter) {
6277         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
6278     }
6279 
6280     /* the driver is a block filter but don't allow to recurse -> return false
6281      */
6282     return false;
6283 }
6284 
6285 /* This function checks if the candidate is the first non filter bs down it's
6286  * bs chain. Since we don't have pointers to parents it explore all bs chains
6287  * from the top. Some filters can choose not to pass down the recursion.
6288  */
6289 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
6290 {
6291     BlockDriverState *bs;
6292 
6293     /* walk down the bs forest recursively */
6294     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
6295         bool perm;
6296 
6297         /* try to recurse in this top level bs */
6298         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
6299 
6300         /* candidate is the first non filter */
6301         if (perm) {
6302             return true;
6303         }
6304     }
6305 
6306     return false;
6307 }
6308 
6309 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
6310 {
6311     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
6312     AioContext *aio_context;
6313 
6314     if (!to_replace_bs) {
6315         error_setg(errp, "Node name '%s' not found", node_name);
6316         return NULL;
6317     }
6318 
6319     aio_context = bdrv_get_aio_context(to_replace_bs);
6320     aio_context_acquire(aio_context);
6321 
6322     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
6323         to_replace_bs = NULL;
6324         goto out;
6325     }
6326 
6327     /* We don't want arbitrary node of the BDS chain to be replaced only the top
6328      * most non filter in order to prevent data corruption.
6329      * Another benefit is that this tests exclude backing files which are
6330      * blocked by the backing blockers.
6331      */
6332     if (!bdrv_is_first_non_filter(to_replace_bs)) {
6333         error_setg(errp, "Only top most non filter can be replaced");
6334         to_replace_bs = NULL;
6335         goto out;
6336     }
6337 
6338 out:
6339     aio_context_release(aio_context);
6340     return to_replace_bs;
6341 }
6342 
6343 void bdrv_io_plug(BlockDriverState *bs)
6344 {
6345     BlockDriver *drv = bs->drv;
6346     if (drv && drv->bdrv_io_plug) {
6347         drv->bdrv_io_plug(bs);
6348     } else if (bs->file) {
6349         bdrv_io_plug(bs->file);
6350     }
6351 }
6352 
6353 void bdrv_io_unplug(BlockDriverState *bs)
6354 {
6355     BlockDriver *drv = bs->drv;
6356     if (drv && drv->bdrv_io_unplug) {
6357         drv->bdrv_io_unplug(bs);
6358     } else if (bs->file) {
6359         bdrv_io_unplug(bs->file);
6360     }
6361 }
6362 
6363 void bdrv_flush_io_queue(BlockDriverState *bs)
6364 {
6365     BlockDriver *drv = bs->drv;
6366     if (drv && drv->bdrv_flush_io_queue) {
6367         drv->bdrv_flush_io_queue(bs);
6368     } else if (bs->file) {
6369         bdrv_flush_io_queue(bs->file);
6370     }
6371 }
6372 
6373 static bool append_open_options(QDict *d, BlockDriverState *bs)
6374 {
6375     const QDictEntry *entry;
6376     bool found_any = false;
6377 
6378     for (entry = qdict_first(bs->options); entry;
6379          entry = qdict_next(bs->options, entry))
6380     {
6381         /* Only take options for this level and exclude all non-driver-specific
6382          * options */
6383         if (!strchr(qdict_entry_key(entry), '.') &&
6384             strcmp(qdict_entry_key(entry), "node-name"))
6385         {
6386             qobject_incref(qdict_entry_value(entry));
6387             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
6388             found_any = true;
6389         }
6390     }
6391 
6392     return found_any;
6393 }
6394 
6395 /* Updates the following BDS fields:
6396  *  - exact_filename: A filename which may be used for opening a block device
6397  *                    which (mostly) equals the given BDS (even without any
6398  *                    other options; so reading and writing must return the same
6399  *                    results, but caching etc. may be different)
6400  *  - full_open_options: Options which, when given when opening a block device
6401  *                       (without a filename), result in a BDS (mostly)
6402  *                       equalling the given one
6403  *  - filename: If exact_filename is set, it is copied here. Otherwise,
6404  *              full_open_options is converted to a JSON object, prefixed with
6405  *              "json:" (for use through the JSON pseudo protocol) and put here.
6406  */
6407 void bdrv_refresh_filename(BlockDriverState *bs)
6408 {
6409     BlockDriver *drv = bs->drv;
6410     QDict *opts;
6411 
6412     if (!drv) {
6413         return;
6414     }
6415 
6416     /* This BDS's file name will most probably depend on its file's name, so
6417      * refresh that first */
6418     if (bs->file) {
6419         bdrv_refresh_filename(bs->file);
6420     }
6421 
6422     if (drv->bdrv_refresh_filename) {
6423         /* Obsolete information is of no use here, so drop the old file name
6424          * information before refreshing it */
6425         bs->exact_filename[0] = '\0';
6426         if (bs->full_open_options) {
6427             QDECREF(bs->full_open_options);
6428             bs->full_open_options = NULL;
6429         }
6430 
6431         drv->bdrv_refresh_filename(bs);
6432     } else if (bs->file) {
6433         /* Try to reconstruct valid information from the underlying file */
6434         bool has_open_options;
6435 
6436         bs->exact_filename[0] = '\0';
6437         if (bs->full_open_options) {
6438             QDECREF(bs->full_open_options);
6439             bs->full_open_options = NULL;
6440         }
6441 
6442         opts = qdict_new();
6443         has_open_options = append_open_options(opts, bs);
6444 
6445         /* If no specific options have been given for this BDS, the filename of
6446          * the underlying file should suffice for this one as well */
6447         if (bs->file->exact_filename[0] && !has_open_options) {
6448             strcpy(bs->exact_filename, bs->file->exact_filename);
6449         }
6450         /* Reconstructing the full options QDict is simple for most format block
6451          * drivers, as long as the full options are known for the underlying
6452          * file BDS. The full options QDict of that file BDS should somehow
6453          * contain a representation of the filename, therefore the following
6454          * suffices without querying the (exact_)filename of this BDS. */
6455         if (bs->file->full_open_options) {
6456             qdict_put_obj(opts, "driver",
6457                           QOBJECT(qstring_from_str(drv->format_name)));
6458             QINCREF(bs->file->full_open_options);
6459             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6460 
6461             bs->full_open_options = opts;
6462         } else {
6463             QDECREF(opts);
6464         }
6465     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6466         /* There is no underlying file BDS (at least referenced by BDS.file),
6467          * so the full options QDict should be equal to the options given
6468          * specifically for this block device when it was opened (plus the
6469          * driver specification).
6470          * Because those options don't change, there is no need to update
6471          * full_open_options when it's already set. */
6472 
6473         opts = qdict_new();
6474         append_open_options(opts, bs);
6475         qdict_put_obj(opts, "driver",
6476                       QOBJECT(qstring_from_str(drv->format_name)));
6477 
6478         if (bs->exact_filename[0]) {
6479             /* This may not work for all block protocol drivers (some may
6480              * require this filename to be parsed), but we have to find some
6481              * default solution here, so just include it. If some block driver
6482              * does not support pure options without any filename at all or
6483              * needs some special format of the options QDict, it needs to
6484              * implement the driver-specific bdrv_refresh_filename() function.
6485              */
6486             qdict_put_obj(opts, "filename",
6487                           QOBJECT(qstring_from_str(bs->exact_filename)));
6488         }
6489 
6490         bs->full_open_options = opts;
6491     }
6492 
6493     if (bs->exact_filename[0]) {
6494         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6495     } else if (bs->full_open_options) {
6496         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6497         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6498                  qstring_get_str(json));
6499         QDECREF(json);
6500     }
6501 }
6502 
6503 /* This accessor function purpose is to allow the device models to access the
6504  * BlockAcctStats structure embedded inside a BlockDriverState without being
6505  * aware of the BlockDriverState structure layout.
6506  * It will go away when the BlockAcctStats structure will be moved inside
6507  * the device models.
6508  */
6509 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6510 {
6511     return &bs->stats;
6512 }
6513