/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors);
static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
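
/* Illustrative behaviour of the helpers above (hypothetical inputs, derived
 * from the checks in the code, not exercised by this file):
 *
 *     is_windows_drive("c:")                    -> 1  (bare drive letter)
 *     is_windows_drive("\\\\.\\PhysicalDrive0") -> 1  (device namespace)
 *     is_windows_drive("c:\\image.qcow2")       -> 0  (path on a drive)
 */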

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O need to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue the next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
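
/* For illustration (hypothetical inputs): a colon appearing before any path
 * separator marks a protocol prefix:
 *
 *     path_has_protocol("nbd://localhost:10809") -> 1
 *     path_has_protocol("relative/name.qcow2")   -> 0
 *     path_has_protocol("/images/a:b")           -> 0  ('/' comes first)
 */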

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
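
/* A sketch of the resulting strings (hypothetical values): the directory (or
 * URL) part of base_path is kept and the relative name is appended:
 *
 *     path_combine(dest, sz, "/images/base.qcow2", "backing.qcow2")
 *         -> "/images/backing.qcow2"
 *     path_combine(dest, sz, "/images/base.qcow2", "/abs/backing.qcow2")
 *         -> "/abs/backing.qcow2"   (absolute names are copied verbatim)
 */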

void bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                  const char *backing,
                                                  char *dest, size_t sz,
                                                  Error **errp)
{
    if (backing[0] == '\0' || path_has_protocol(backing) ||
        path_is_absolute(backing))
    {
        pstrcpy(dest, sz, backing);
    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
        error_setg(errp, "Cannot use relative backing file names for '%s'",
                   backed);
    } else {
        path_combine(dest, sz, backed, backing);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
                                    Error **errp)
{
    char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;

    bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
                                                 dest, sz, errp);
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
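
/* A minimal caller sketch (hypothetical, mirroring how
 * bdrv_append_temp_snapshot() below drives this function):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024,
 *                         &error_abort);
 *     ret = bdrv_create(drv, "/tmp/test.qcow2", opts, &local_err);
 *     qemu_opts_del(opts);
 */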

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true, errp);
    if (drv == NULL) {
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * Try to get @bs's logical and physical block size.
 * On success, store them in @bsz struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_blocksizes) {
        return drv->bdrv_probe_blocksizes(bs, bsz);
    }

    return -ENOTSUP;
}

/**
 * Try to get @bs's geometry (cyls, heads, sectors).
 * On success, store them in @geo struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_geometry) {
        return drv->bdrv_probe_geometry(bs, geo);
    }

    return -ENOTSUP;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
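
/* Usage sketch (hypothetical buffer): on POSIX hosts the returned name has
 * already been created as an empty file by mkstemp():
 *
 *     char tmp[PATH_MAX];
 *     if (get_tmp_filename(tmp, sizeof(tmp)) == 0) {
 *         ... tmp now holds something like "/var/tmp/vl.Ab12Cd" ...
 *     }
 */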

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix,
                                Error **errp)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }

    error_setg(errp, "Unknown protocol '%s'", protocol);
    return NULL;
}
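
/* For illustration (hypothetical names): "/dev/cdrom" is claimed by a host
 * device driver through bdrv_probe_device(); "nbd:localhost:10809" matches
 * the driver whose protocol_name is "nbd"; a plain "/tmp/disk.img" has no
 * protocol prefix and falls through to &bdrv_file.
 */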

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf         contains the image's first @buf_size bytes.
 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 *              but can be smaller if the image file is smaller)
 * @filename    is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
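
/* For example (hypothetical caller):
 *
 *     int flags = 0;
 *     bdrv_parse_discard_flags("unmap", &flags);   -> flags == BDRV_O_UNMAP
 *     bdrv_parse_discard_flags("ignore", &flags);  -> flags == 0 again
 */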

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
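
/* The resulting flag bits per mode, derived from the code above:
 *
 *     none/off       BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     directsync     BDRV_O_NOCACHE
 *     writeback      BDRV_O_CACHE_WB
 *     unsafe         BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *     writethrough   (no bits set; the default)
 */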

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}
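
/* Worked example (hypothetical flags): opening a top layer with
 * BDRV_O_RDWR | BDRV_O_SNAPSHOT yields
 *
 *     bdrv_temp_snapshot_flags() -> BDRV_O_RDWR | BDRV_O_TEMPORARY
 *     bdrv_inherited_flags()     -> BDRV_O_RDWR | BDRV_O_PROTOCOL |
 *                                   BDRV_O_CACHE_WB | BDRV_O_UNMAP
 *     bdrv_backing_flags()       -> 0 (read-only, no snapshot bits)
 */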

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
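
/* For illustration (hypothetical filename): given
 *
 *     "json:{\"driver\": \"qcow2\", \"file\": {\"filename\": \"img\"}}"
 *
 * the nested dictionary is flattened into the dotted keys
 * "driver" -> "qcow2" and "file.filename" -> "img".
 */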

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename, errp);
                if (!drv) {
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to the check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
                                       &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            QDECREF(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
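
/* For illustration (hypothetical options): with bdref_key "file", a flattened
 * options QDict such as
 *
 *     "file.driver" -> "file", "file.filename" -> "/tmp/disk.img"
 *
 * has its "file." entries split off into the image_options passed down to
 * bdrv_open(), while a plain "file" -> "node0" entry is treated as a
 * reference to an existing block device.
 */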

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See the fail path, but now the BDS always has to be closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
1706 
1707 /*
1708  * Reopen multiple BlockDriverStates atomically & transactionally.
1709  *
1710  * The queue passed in (bs_queue) must have been built up previous
1711  * via bdrv_reopen_queue().
1712  *
1713  * Reopens all BDS specified in the queue, with the appropriate
1714  * flags.  All devices are prepared for reopen, and failure of any
1715  * device will cause all device changes to be abandoned, and intermediate
1716  * data cleaned up.
1717  *
1718  * If all devices prepare successfully, then the changes are committed
1719  * to all devices.
1720  *
1721  */
1722 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1723 {
1724     int ret = -1;
1725     BlockReopenQueueEntry *bs_entry, *next;
1726     Error *local_err = NULL;
1727 
1728     assert(bs_queue != NULL);
1729 
1730     bdrv_drain_all();
1731 
1732     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1733         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1734             error_propagate(errp, local_err);
1735             goto cleanup;
1736         }
1737         bs_entry->prepared = true;
1738     }
1739 
1740     /* If we reach this point, we have success and just need to apply the
1741      * changes
1742      */
1743     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1744         bdrv_reopen_commit(&bs_entry->state);
1745     }
1746 
1747     ret = 0;
1748 
1749 cleanup:
1750     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1751         if (ret && bs_entry->prepared) {
1752             bdrv_reopen_abort(&bs_entry->state);
1753         }
1754         g_free(bs_entry);
1755     }
1756     g_free(bs_queue);
1757     return ret;
1758 }
1759 
1760 
1761 /* Reopen a single BlockDriverState with the specified flags. */
1762 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1763 {
1764     int ret = -1;
1765     Error *local_err = NULL;
1766     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1767 
1768     ret = bdrv_reopen_multiple(queue, &local_err);
1769     if (local_err != NULL) {
1770         error_propagate(errp, local_err);
1771     }
1772     return ret;
1773 }
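
/*
 * Illustrative sketch (not part of the original file): reopening two BDSes
 * as one atomic set.  The helper name example_reopen_pair and the choice of
 * BDRV_O_RDWR are assumptions for illustration only.
 */
static int G_GNUC_UNUSED example_reopen_pair(BlockDriverState *bs_a,
                                             BlockDriverState *bs_b,
                                             Error **errp)
{
    BlockReopenQueue *queue;

    /* The first call allocates the queue; later calls append to it */
    queue = bdrv_reopen_queue(NULL, bs_a, bs_a->open_flags | BDRV_O_RDWR);
    queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags | BDRV_O_RDWR);

    /* Prepares every entry, then commits all on success or aborts all on
     * failure; the queue and its entries are freed in either case. */
    return bdrv_reopen_multiple(queue, errp);
}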
1774 
1775 
1776 /*
1777  * Prepares a BlockDriverState for reopen. All changes are staged in the
1778  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1779  * the block driver layer's .bdrv_reopen_prepare() implementation.
1780  *
1781  * bs is the BlockDriverState to reopen
1782  * flags are the new open flags
1783  * queue is the reopen queue
1784  *
1785  * Returns 0 on success, non-zero on error.  On error errp will be set
1786  * as well.
1787  *
1788  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1789  * It is then the responsibility of the caller to call abort() or
1790  * commit() for any other BDS that have been left in a prepare() state.
1791  *
1792  */
1793 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1794                         Error **errp)
1795 {
1796     int ret = -1;
1797     Error *local_err = NULL;
1798     BlockDriver *drv;
1799 
1800     assert(reopen_state != NULL);
1801     assert(reopen_state->bs->drv != NULL);
1802     drv = reopen_state->bs->drv;
1803 
1804     /* if we are to stay read-only, do not allow permission change
1805      * to r/w */
1806     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1807         reopen_state->flags & BDRV_O_RDWR) {
1808         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1809                   bdrv_get_device_name(reopen_state->bs));
1810         goto error;
1811     }
1812 
1814     ret = bdrv_flush(reopen_state->bs);
1815     if (ret) {
1816         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1817                   strerror(-ret));
1818         goto error;
1819     }
1820 
1821     if (drv->bdrv_reopen_prepare) {
1822         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1823         if (ret) {
1824             if (local_err != NULL) {
1825                 error_propagate(errp, local_err);
1826             } else {
1827                 error_setg(errp, "failed while preparing to reopen image '%s'",
1828                            reopen_state->bs->filename);
1829             }
1830             goto error;
1831         }
1832     } else {
1833         /* It is currently mandatory to have a bdrv_reopen_prepare()
1834          * handler for each supported drv. */
1835         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1836                   drv->format_name, bdrv_get_device_name(reopen_state->bs),
1837                   "reopening of file");
1838         ret = -1;
1839         goto error;
1840     }
1841 
1842     ret = 0;
1843 
1844 error:
1845     return ret;
1846 }
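
/*
 * Illustrative sketch: the minimal driver-side .bdrv_reopen_prepare hook
 * that the code above insists on.  A format with no state to stage (in the
 * style of raw) can simply succeed; the name example_reopen_prepare is
 * hypothetical.
 */
static int G_GNUC_UNUSED example_reopen_prepare(BDRVReopenState *state,
                                                BlockReopenQueue *queue,
                                                Error **errp)
{
    /* Nothing to stage in state->opaque for this trivial format */
    return 0;
}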
1847 
1848 /*
1849  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1850  * makes them final by swapping the staging BlockDriverState contents into
1851  * the active BlockDriverState contents.
1852  */
1853 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1854 {
1855     BlockDriver *drv;
1856 
1857     assert(reopen_state != NULL);
1858     drv = reopen_state->bs->drv;
1859     assert(drv != NULL);
1860 
1861     /* If there are any driver-level actions to take */
1862     if (drv->bdrv_reopen_commit) {
1863         drv->bdrv_reopen_commit(reopen_state);
1864     }
1865 
1866     /* set BDS specific flags now */
1867     reopen_state->bs->open_flags         = reopen_state->flags;
1868     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1869                                               BDRV_O_CACHE_WB);
1870     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1871 
1872     bdrv_refresh_limits(reopen_state->bs, NULL);
1873 }
1874 
1875 /*
1876  * Abort the reopen, and delete and free the staged changes in
1877  * reopen_state
1878  */
1879 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1880 {
1881     BlockDriver *drv;
1882 
1883     assert(reopen_state != NULL);
1884     drv = reopen_state->bs->drv;
1885     assert(drv != NULL);
1886 
1887     if (drv->bdrv_reopen_abort) {
1888         drv->bdrv_reopen_abort(reopen_state);
1889     }
1890 }
1891 
1892 
1893 void bdrv_close(BlockDriverState *bs)
1894 {
1895     BdrvAioNotifier *ban, *ban_next;
1896 
1897     if (bs->job) {
1898         block_job_cancel_sync(bs->job);
1899     }
1900     bdrv_drain_all(); /* complete I/O */
1901     bdrv_flush(bs);
1902     bdrv_drain_all(); /* in case flush left pending I/O */
1903     notifier_list_notify(&bs->close_notifiers, bs);
1904 
1905     if (bs->drv) {
1906         if (bs->backing_hd) {
1907             BlockDriverState *backing_hd = bs->backing_hd;
1908             bdrv_set_backing_hd(bs, NULL);
1909             bdrv_unref(backing_hd);
1910         }
1911         bs->drv->bdrv_close(bs);
1912         g_free(bs->opaque);
1913         bs->opaque = NULL;
1914         bs->drv = NULL;
1915         bs->copy_on_read = 0;
1916         bs->backing_file[0] = '\0';
1917         bs->backing_format[0] = '\0';
1918         bs->total_sectors = 0;
1919         bs->encrypted = 0;
1920         bs->valid_key = 0;
1921         bs->sg = 0;
1922         bs->zero_beyond_eof = false;
1923         QDECREF(bs->options);
1924         bs->options = NULL;
1925         QDECREF(bs->full_open_options);
1926         bs->full_open_options = NULL;
1927 
1928         if (bs->file != NULL) {
1929             bdrv_unref(bs->file);
1930             bs->file = NULL;
1931         }
1932     }
1933 
1934     if (bs->blk) {
1935         blk_dev_change_media_cb(bs->blk, false);
1936     }
1937 
1938     /* throttling disk I/O limits */
1939     if (bs->io_limits_enabled) {
1940         bdrv_io_limits_disable(bs);
1941     }
1942 
1943     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1944         g_free(ban);
1945     }
1946     QLIST_INIT(&bs->aio_notifiers);
1947 }
1948 
1949 void bdrv_close_all(void)
1950 {
1951     BlockDriverState *bs;
1952 
1953     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1954         AioContext *aio_context = bdrv_get_aio_context(bs);
1955 
1956         aio_context_acquire(aio_context);
1957         bdrv_close(bs);
1958         aio_context_release(aio_context);
1959     }
1960 }
1961 
1962 /* Check if any requests are in-flight (including throttled requests) */
1963 static bool bdrv_requests_pending(BlockDriverState *bs)
1964 {
1965     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1966         return true;
1967     }
1968     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1969         return true;
1970     }
1971     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1972         return true;
1973     }
1974     if (bs->file && bdrv_requests_pending(bs->file)) {
1975         return true;
1976     }
1977     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1978         return true;
1979     }
1980     return false;
1981 }
1982 
1983 static bool bdrv_drain_one(BlockDriverState *bs)
1984 {
1985     bool bs_busy;
1986 
1987     bdrv_flush_io_queue(bs);
1988     bdrv_start_throttled_reqs(bs);
1989     bs_busy = bdrv_requests_pending(bs);
1990     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1991     return bs_busy;
1992 }
1993 
1994 /*
1995  * Wait for pending requests to complete on a single BlockDriverState subtree
1996  *
1997  * See the warning in bdrv_drain_all().  This function can only be called if
1998  * you are sure nothing can generate I/O because you have op blockers
1999  * installed.
2000  *
2001  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
2002  * AioContext.
2003  */
2004 void bdrv_drain(BlockDriverState *bs)
2005 {
2006     while (bdrv_drain_one(bs)) {
2007         /* Keep iterating */
2008     }
2009 }
2010 
2011 /*
2012  * Wait for pending requests to complete across all BlockDriverStates
2013  *
2014  * This function does not flush data to disk, use bdrv_flush_all() for that
2015  * after calling this function.
2016  *
2017  * Note that completion of an asynchronous I/O operation can trigger any
2018  * number of other I/O operations on other devices---for example a coroutine
2019  * can be arbitrarily complex and a constant flow of I/O can come until the
2020  * coroutine is complete.  Because of this, it is not possible to have a
2021  * function to drain a single device's I/O queue.
2022  */
2023 void bdrv_drain_all(void)
2024 {
2025     /* Always run first iteration so any pending completion BHs run */
2026     bool busy = true;
2027     BlockDriverState *bs;
2028 
2029     while (busy) {
2030         busy = false;
2031 
2032         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2033             AioContext *aio_context = bdrv_get_aio_context(bs);
2034 
2035             aio_context_acquire(aio_context);
2036             busy |= bdrv_drain_one(bs);
2037             aio_context_release(aio_context);
2038         }
2039     }
2040 }
2041 
2042 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
2043  * graph_bdrv_states lists.
2044  * Also, NUL-terminate the node_name to prevent a double remove. */
2045 void bdrv_make_anon(BlockDriverState *bs)
2046 {
2047     /*
2048      * Take care to remove bs from bdrv_states only when it's actually
2049      * in it.  Note that bs->device_list.tqe_prev is initially null,
2050      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
2051      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
2052      * resetting it to null on remove.
2053      */
2054     if (bs->device_list.tqe_prev) {
2055         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
2056         bs->device_list.tqe_prev = NULL;
2057     }
2058     if (bs->node_name[0] != '\0') {
2059         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2060     }
2061     bs->node_name[0] = '\0';
2062 }
2063 
2064 static void bdrv_rebind(BlockDriverState *bs)
2065 {
2066     if (bs->drv && bs->drv->bdrv_rebind) {
2067         bs->drv->bdrv_rebind(bs);
2068     }
2069 }
2070 
2071 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2072                                      BlockDriverState *bs_src)
2073 {
2074     /* move some fields that need to stay attached to the device */
2075 
2076     /* dev info */
2077     bs_dest->guest_block_size   = bs_src->guest_block_size;
2078     bs_dest->copy_on_read       = bs_src->copy_on_read;
2079 
2080     bs_dest->enable_write_cache = bs_src->enable_write_cache;
2081 
2082     /* i/o throttled req */
2083     memcpy(&bs_dest->throttle_state,
2084            &bs_src->throttle_state,
2085            sizeof(ThrottleState));
2086     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
2087     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
2088     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
2089 
2090     /* r/w error */
2091     bs_dest->on_read_error      = bs_src->on_read_error;
2092     bs_dest->on_write_error     = bs_src->on_write_error;
2093 
2094     /* i/o status */
2095     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
2096     bs_dest->iostatus           = bs_src->iostatus;
2097 
2098     /* dirty bitmap */
2099     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2100 
2101     /* reference count */
2102     bs_dest->refcnt             = bs_src->refcnt;
2103 
2104     /* job */
2105     bs_dest->job                = bs_src->job;
2106 
2107     /* keep the same entry in bdrv_states */
2108     bs_dest->device_list = bs_src->device_list;
2109     bs_dest->blk = bs_src->blk;
2110 
2111     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2112            sizeof(bs_dest->op_blockers));
2113 }
2114 
2115 /*
2116  * Swap bs contents for two image chains while they are live,
2117  * while keeping required fields on the BlockDriverState that is
2118  * actually attached to a device.
2119  *
2120  * This will modify the BlockDriverState fields, and swap contents
2121  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2122  *
2123  * bs_new must not be attached to a BlockBackend.
2124  *
2125  * This function does not create any image files.
2126  */
2127 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2128 {
2129     BlockDriverState tmp;
2130 
2131     /* The code needs to swap the node_name but simply swapping node_list won't
2132      * work, so first remove the nodes from the graph list, do the swap, and
2133      * then insert them back if needed.
2134      */
2135     if (bs_new->node_name[0] != '\0') {
2136         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2137     }
2138     if (bs_old->node_name[0] != '\0') {
2139         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2140     }
2141 
2142     /* bs_new must be unattached and shouldn't have anything fancy enabled */
2143     assert(!bs_new->blk);
2144     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2145     assert(bs_new->job == NULL);
2146     assert(bs_new->io_limits_enabled == false);
2147     assert(!throttle_have_timer(&bs_new->throttle_state));
2148 
2149     tmp = *bs_new;
2150     *bs_new = *bs_old;
2151     *bs_old = tmp;
2152 
2153     /* there are some fields that should not be swapped; move them back */
2154     bdrv_move_feature_fields(&tmp, bs_old);
2155     bdrv_move_feature_fields(bs_old, bs_new);
2156     bdrv_move_feature_fields(bs_new, &tmp);
2157 
2158     /* bs_new must remain unattached */
2159     assert(!bs_new->blk);
2160 
2161     /* Check a few fields that should remain attached to the device */
2162     assert(bs_new->job == NULL);
2163     assert(bs_new->io_limits_enabled == false);
2164     assert(!throttle_have_timer(&bs_new->throttle_state));
2165 
2166     /* insert the nodes back into the graph node list if needed */
2167     if (bs_new->node_name[0] != '\0') {
2168         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2169     }
2170     if (bs_old->node_name[0] != '\0') {
2171         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2172     }
2173 
2174     bdrv_rebind(bs_new);
2175     bdrv_rebind(bs_old);
2176 }
2177 
2178 /*
2179  * Add new bs contents at the top of an image chain while the chain is
2180  * live, while keeping required fields on the top layer.
2181  *
2182  * This will modify the BlockDriverState fields, and swap contents
2183  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2184  *
2185  * bs_new must not be attached to a BlockBackend.
2186  *
2187  * This function does not create any image files.
2188  */
2189 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2190 {
2191     bdrv_swap(bs_new, bs_top);
2192 
2193     /* After the swap, the bs_top pointer holds the new image's contents
2194      * and bs_new holds the old top; link the old top in as backing file. */
2195     bdrv_set_backing_hd(bs_top, bs_new);
2196 }
2197 
2198 static void bdrv_delete(BlockDriverState *bs)
2199 {
2200     assert(!bs->job);
2201     assert(bdrv_op_blocker_is_empty(bs));
2202     assert(!bs->refcnt);
2203     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2204 
2205     bdrv_close(bs);
2206 
2207     /* remove from list, if necessary */
2208     bdrv_make_anon(bs);
2209 
2210     g_free(bs);
2211 }
2212 
2213 /*
2214  * Run consistency checks on an image
2215  *
2216  * Returns 0 if the check could be completed (it doesn't mean that the image is
2217  * free of errors) or -errno when an internal error occurred. The results of the
2218  * check are stored in res.
2219  */
2220 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2221 {
2222     if (bs->drv == NULL) {
2223         return -ENOMEDIUM;
2224     }
2225     if (bs->drv->bdrv_check == NULL) {
2226         return -ENOTSUP;
2227     }
2228 
2229     memset(res, 0, sizeof(*res));
2230     return bs->drv->bdrv_check(bs, res, fix);
2231 }
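
/*
 * Usage sketch (illustrative, hypothetical helper): note that a return
 * value of 0 from bdrv_check() only means the check ran; the counters in
 * the result still have to be inspected for actual damage.
 */
static int G_GNUC_UNUSED example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, 0); /* 0: report only, do not repair */

    if (ret < 0) {
        return ret; /* the check itself could not be completed */
    }
    return (res.corruptions || res.check_errors) ? -EIO : 0;
}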
2232 
2233 #define COMMIT_BUF_SECTORS 2048
2234 
2235 /* commit COW file into the raw image */
2236 int bdrv_commit(BlockDriverState *bs)
2237 {
2238     BlockDriver *drv = bs->drv;
2239     int64_t sector, total_sectors, length, backing_length;
2240     int n, ro, open_flags;
2241     int ret = 0;
2242     uint8_t *buf = NULL;
2243 
2244     if (!drv) {
2245         return -ENOMEDIUM;
2246     }

2247     if (!bs->backing_hd) {
2248         return -ENOTSUP;
2249     }
2250 
2251     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
2252         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
2253         return -EBUSY;
2254     }
2255 
2256     ro = bs->backing_hd->read_only;
2257     open_flags = bs->backing_hd->open_flags;
2258 
2259     if (ro) {
2260         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2261             return -EACCES;
2262         }
2263     }
2264 
2265     length = bdrv_getlength(bs);
2266     if (length < 0) {
2267         ret = length;
2268         goto ro_cleanup;
2269     }
2270 
2271     backing_length = bdrv_getlength(bs->backing_hd);
2272     if (backing_length < 0) {
2273         ret = backing_length;
2274         goto ro_cleanup;
2275     }
2276 
2277     /* If our top snapshot is larger than the backing file image,
2278      * grow the backing file image if possible.  If not possible,
2279      * we must return an error */
2280     if (length > backing_length) {
2281         ret = bdrv_truncate(bs->backing_hd, length);
2282         if (ret < 0) {
2283             goto ro_cleanup;
2284         }
2285     }
2286 
2287     total_sectors = length >> BDRV_SECTOR_BITS;
2288 
2289     /* qemu_try_blockalign() for bs will choose an alignment that works for
2290      * bs->backing_hd as well, so no need to compare the alignment manually. */
2291     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2292     if (buf == NULL) {
2293         ret = -ENOMEM;
2294         goto ro_cleanup;
2295     }
2296 
2297     for (sector = 0; sector < total_sectors; sector += n) {
2298         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2299         if (ret < 0) {
2300             goto ro_cleanup;
2301         }
2302         if (ret) {
2303             ret = bdrv_read(bs, sector, buf, n);
2304             if (ret < 0) {
2305                 goto ro_cleanup;
2306             }
2307 
2308             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2309             if (ret < 0) {
2310                 goto ro_cleanup;
2311             }
2312         }
2313     }
2314 
2315     if (drv->bdrv_make_empty) {
2316         ret = drv->bdrv_make_empty(bs);
2317         if (ret < 0) {
2318             goto ro_cleanup;
2319         }
2320         bdrv_flush(bs);
2321     }
2322 
2323     /*
2324      * Make sure all data we wrote to the backing device is actually
2325      * stable on disk.
2326      */
2327     if (bs->backing_hd) {
2328         bdrv_flush(bs->backing_hd);
2329     }
2330 
2331     ret = 0;
2332 ro_cleanup:
2333     qemu_vfree(buf);
2334 
2335     if (ro) {
2336         /* ignoring error return here */
2337         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2338     }
2339 
2340     return ret;
2341 }
2342 
2343 int bdrv_commit_all(void)
2344 {
2345     BlockDriverState *bs;
2346 
2347     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2348         AioContext *aio_context = bdrv_get_aio_context(bs);
2349 
2350         aio_context_acquire(aio_context);
2351         if (bs->drv && bs->backing_hd) {
2352             int ret = bdrv_commit(bs);
2353             if (ret < 0) {
2354                 aio_context_release(aio_context);
2355                 return ret;
2356             }
2357         }
2358         aio_context_release(aio_context);
2359     }
2360     return 0;
2361 }
2362 
2363 /**
2364  * Remove an active request from the tracked requests list
2365  *
2366  * This function should be called when a tracked request is completing.
2367  */
2368 static void tracked_request_end(BdrvTrackedRequest *req)
2369 {
2370     if (req->serialising) {
2371         req->bs->serialising_in_flight--;
2372     }
2373 
2374     QLIST_REMOVE(req, list);
2375     qemu_co_queue_restart_all(&req->wait_queue);
2376 }
2377 
2378 /**
2379  * Add an active request to the tracked requests list
2380  */
2381 static void tracked_request_begin(BdrvTrackedRequest *req,
2382                                   BlockDriverState *bs,
2383                                   int64_t offset,
2384                                   unsigned int bytes, bool is_write)
2385 {
2386     *req = (BdrvTrackedRequest){
2387         .bs = bs,
2388         .offset         = offset,
2389         .bytes          = bytes,
2390         .is_write       = is_write,
2391         .co             = qemu_coroutine_self(),
2392         .serialising    = false,
2393         .overlap_offset = offset,
2394         .overlap_bytes  = bytes,
2395     };
2396 
2397     qemu_co_queue_init(&req->wait_queue);
2398 
2399     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2400 }
2401 
2402 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2403 {
2404     int64_t overlap_offset = req->offset & ~(align - 1);
2405     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2406                                - overlap_offset;
2407 
2408     if (!req->serialising) {
2409         req->bs->serialising_in_flight++;
2410         req->serialising = true;
2411     }
2412 
2413     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2414     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2415 }
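
/*
 * Worked example (illustrative values): a request with offset == 1000 and
 * bytes == 100 serialised at align == 512 gets overlap_offset == 512 and
 * overlap_bytes == ROUND_UP(1100, 512) - 512 == 1024, i.e. it conflicts
 * with anything touching [512, 1536).
 */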
2416 
2417 /**
2418  * Round a region to cluster boundaries
2419  */
2420 void bdrv_round_to_clusters(BlockDriverState *bs,
2421                             int64_t sector_num, int nb_sectors,
2422                             int64_t *cluster_sector_num,
2423                             int *cluster_nb_sectors)
2424 {
2425     BlockDriverInfo bdi;
2426 
2427     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2428         *cluster_sector_num = sector_num;
2429         *cluster_nb_sectors = nb_sectors;
2430     } else {
2431         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2432         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2433         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2434                                             nb_sectors, c);
2435     }
2436 }
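
/*
 * Worked example (illustrative values): with a 64 KiB cluster size, i.e.
 * 128 sectors of 512 bytes, a request for sectors [130, 140) rounds to
 * *cluster_sector_num == 128 and *cluster_nb_sectors == 128, covering the
 * whole cluster [128, 256).
 */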
2437 
2438 static int bdrv_get_cluster_size(BlockDriverState *bs)
2439 {
2440     BlockDriverInfo bdi;
2441     int ret;
2442 
2443     ret = bdrv_get_info(bs, &bdi);
2444     if (ret < 0 || bdi.cluster_size == 0) {
2445         return bs->request_alignment;
2446     } else {
2447         return bdi.cluster_size;
2448     }
2449 }
2450 
2451 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2452                                      int64_t offset, unsigned int bytes)
2453 {
2454     /*        aaaa   bbbb */
2455     if (offset >= req->overlap_offset + req->overlap_bytes) {
2456         return false;
2457     }
2458     /* bbbb   aaaa        */
2459     if (req->overlap_offset >= offset + bytes) {
2460         return false;
2461     }
2462     return true;
2463 }
2464 
2465 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2466 {
2467     BlockDriverState *bs = self->bs;
2468     BdrvTrackedRequest *req;
2469     bool retry;
2470     bool waited = false;
2471 
2472     if (!bs->serialising_in_flight) {
2473         return false;
2474     }
2475 
2476     do {
2477         retry = false;
2478         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2479             if (req == self || (!req->serialising && !self->serialising)) {
2480                 continue;
2481             }
2482             if (tracked_request_overlaps(req, self->overlap_offset,
2483                                          self->overlap_bytes))
2484             {
2485                 /* Hitting this means there was a reentrant request, for
2486                  * example, a block driver issuing nested requests.  This must
2487                  * never happen since it means deadlock.
2488                  */
2489                 assert(qemu_coroutine_self() != req->co);
2490 
2491                 /* If the request is already (indirectly) waiting for us, or
2492                  * will wait for us as soon as it wakes up, then just go on
2493                  * (instead of producing a deadlock in the former case). */
2494                 if (!req->waiting_for) {
2495                     self->waiting_for = req;
2496                     qemu_co_queue_wait(&req->wait_queue);
2497                     self->waiting_for = NULL;
2498                     retry = true;
2499                     waited = true;
2500                     break;
2501                 }
2502             }
2503         }
2504     } while (retry);
2505 
2506     return waited;
2507 }
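
/*
 * Illustrative scenario for the waiting_for check above: request A marks
 * itself serialising, overlaps request B and blocks on B's wait_queue with
 * A->waiting_for == B.  If B later finds that it overlaps A, it sees that
 * A->waiting_for is set and proceeds instead of waiting, so the pair
 * cannot deadlock.
 */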
2508 
2509 /*
2510  * Return values:
2511  * 0        - success
2512  * -EINVAL  - backing format specified, but no file
2513  * -ENOSPC  - can't update the backing file because no space is left in the
2514  *            image file header
2515  * -ENOTSUP - format driver doesn't support changing the backing file
2516  */
2517 int bdrv_change_backing_file(BlockDriverState *bs,
2518     const char *backing_file, const char *backing_fmt)
2519 {
2520     BlockDriver *drv = bs->drv;
2521     int ret;
2522 
2523     /* Backing file format doesn't make sense without a backing file */
2524     if (backing_fmt && !backing_file) {
2525         return -EINVAL;
2526     }
2527 
2528     if (drv->bdrv_change_backing_file != NULL) {
2529         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2530     } else {
2531         ret = -ENOTSUP;
2532     }
2533 
2534     if (ret == 0) {
2535         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2536         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2537     }
2538     return ret;
2539 }
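
/*
 * Usage sketch (illustrative; the file name and format are examples only):
 * after a commit or rebase, point an overlay at its new backing image:
 *
 *     ret = bdrv_change_backing_file(overlay_bs, "base.qcow2", "qcow2");
 */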
2540 
2541 /*
2542  * Finds the image layer in the chain that has 'bs' as its backing file.
2543  *
2544  * active is the current topmost image.
2545  *
2546  * Returns NULL if bs is not found in active's image chain,
2547  * or if active == bs.
2548  *
2549  * Returns the bottommost base image if bs == NULL.
2550  */
2551 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2552                                     BlockDriverState *bs)
2553 {
2554     while (active && bs != active->backing_hd) {
2555         active = active->backing_hd;
2556     }
2557 
2558     return active;
2559 }
2560 
2561 /* Given a BDS, searches for the base layer. */
2562 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2563 {
2564     return bdrv_find_overlay(bs, NULL);
2565 }
2566 
2567 typedef struct BlkIntermediateStates {
2568     BlockDriverState *bs;
2569     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2570 } BlkIntermediateStates;
2571 
2572 
2573 /*
2574  * Drops images above 'base' up to and including 'top', and sets the image
2575  * above 'top' to have base as its backing file.
2576  *
2577  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2578  * information in 'bs' can be properly updated.
2579  *
2580  * E.g., this will convert the following chain:
2581  * bottom <- base <- intermediate <- top <- active
2582  *
2583  * to
2584  *
2585  * bottom <- base <- active
2586  *
2587  * It is allowed for bottom==base, in which case it converts:
2588  *
2589  * base <- intermediate <- top <- active
2590  *
2591  * to
2592  *
2593  * base <- active
2594  *
2595  * If backing_file_str is non-NULL, it will be used when modifying top's
2596  * overlay image metadata.
2597  *
2598  * Error conditions:
2599  *  if active == top, that is considered an error
2600  *
2601  */
2602 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2603                            BlockDriverState *base, const char *backing_file_str)
2604 {
2605     BlockDriverState *intermediate;
2606     BlockDriverState *base_bs = NULL;
2607     BlockDriverState *new_top_bs = NULL;
2608     BlkIntermediateStates *intermediate_state, *next;
2609     int ret = -EIO;
2610 
2611     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2612     QSIMPLEQ_INIT(&states_to_delete);
2613 
2614     if (!top->drv || !base->drv) {
2615         goto exit;
2616     }
2617 
2618     new_top_bs = bdrv_find_overlay(active, top);
2619 
2620     if (new_top_bs == NULL) {
2621         /* we could not find the image above 'top', this is an error */
2622         goto exit;
2623     }
2624 
2625     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2626      * to do, no intermediate images */
2627     if (new_top_bs->backing_hd == base) {
2628         ret = 0;
2629         goto exit;
2630     }
2631 
2632     intermediate = top;
2633 
2634     /* now we will go down through the list, and add each BDS we find
2635      * into our deletion queue, until we hit the 'base'
2636      */
2637     while (intermediate) {
2638         intermediate_state = g_new0(BlkIntermediateStates, 1);
2639         intermediate_state->bs = intermediate;
2640         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2641 
2642         if (intermediate->backing_hd == base) {
2643             base_bs = intermediate->backing_hd;
2644             break;
2645         }
2646         intermediate = intermediate->backing_hd;
2647     }
2648     if (base_bs == NULL) {
2649         /* Something went wrong: we did not end at the base. Safely
2650          * unravel everything and exit with an error. */
2651         goto exit;
2652     }
2653 
2654     /* success - we can delete the intermediate states, and link top->base */
2655     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2656     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2657                                    base_bs->drv ? base_bs->drv->format_name : "");
2658     if (ret) {
2659         goto exit;
2660     }
2661     bdrv_set_backing_hd(new_top_bs, base_bs);
2662 
2663     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2664         /* so that bdrv_close() does not recursively close the chain */
2665         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2666         bdrv_unref(intermediate_state->bs);
2667     }
2668     ret = 0;
2669 
2670 exit:
2671     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2672         g_free(intermediate_state);
2673     }
2674     return ret;
2675 }
2676 
2677 
2678 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2679                                    size_t size)
2680 {
2681     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
2682         return -EIO;
2683     }
2684 
2685     if (!bdrv_is_inserted(bs)) {
2686         return -ENOMEDIUM;
2687     }
2688 
2689     if (offset < 0) {
2690         return -EIO;
2691     }
2692 
2693     return 0;
2694 }
2695 
2696 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2697                               int nb_sectors)
2698 {
2699     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2700         return -EIO;
2701     }
2702 
2703     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2704                                    nb_sectors * BDRV_SECTOR_SIZE);
2705 }
2706 
2707 typedef struct RwCo {
2708     BlockDriverState *bs;
2709     int64_t offset;
2710     QEMUIOVector *qiov;
2711     bool is_write;
2712     int ret;
2713     BdrvRequestFlags flags;
2714 } RwCo;
2715 
2716 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2717 {
2718     RwCo *rwco = opaque;
2719 
2720     if (!rwco->is_write) {
2721         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2722                                       rwco->qiov->size, rwco->qiov,
2723                                       rwco->flags);
2724     } else {
2725         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2726                                        rwco->qiov->size, rwco->qiov,
2727                                        rwco->flags);
2728     }
2729 }
2730 
2731 /*
2732  * Process a vectored synchronous request using coroutines
2733  */
2734 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2735                         QEMUIOVector *qiov, bool is_write,
2736                         BdrvRequestFlags flags)
2737 {
2738     Coroutine *co;
2739     RwCo rwco = {
2740         .bs = bs,
2741         .offset = offset,
2742         .qiov = qiov,
2743         .is_write = is_write,
2744         .ret = NOT_DONE,
2745         .flags = flags,
2746     };
2747 
2748     /*
2749      * In a synchronous call context, when the vcpu is blocked, the
2750      * throttling timer will not fire, so I/O throttling has to be disabled
2751      * here if it has been enabled.
2752      */
2753     if (bs->io_limits_enabled) {
2754         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2755                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2756         bdrv_io_limits_disable(bs);
2757     }
2758 
2759     if (qemu_in_coroutine()) {
2760         /* Fast-path if already in coroutine context */
2761         bdrv_rw_co_entry(&rwco);
2762     } else {
2763         AioContext *aio_context = bdrv_get_aio_context(bs);
2764 
2765         co = qemu_coroutine_create(bdrv_rw_co_entry);
2766         qemu_coroutine_enter(co, &rwco);
2767         while (rwco.ret == NOT_DONE) {
2768             aio_poll(aio_context, true);
2769         }
2770     }
2771     return rwco.ret;
2772 }
2773 
2774 /*
2775  * Process a synchronous request using coroutines
2776  */
2777 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2778                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2779 {
2780     QEMUIOVector qiov;
2781     struct iovec iov = {
2782         .iov_base = (void *)buf,
2783         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2784     };
2785 
2786     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2787         return -EINVAL;
2788     }
2789 
2790     qemu_iovec_init_external(&qiov, &iov, 1);
2791     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2792                         &qiov, is_write, flags);
2793 }
2794 
2795 /* return < 0 if error. See bdrv_write() for the return codes */
2796 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2797               uint8_t *buf, int nb_sectors)
2798 {
2799     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2800 }
2801 
2802 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2803 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2804                           uint8_t *buf, int nb_sectors)
2805 {
2806     bool enabled;
2807     int ret;
2808 
2809     enabled = bs->io_limits_enabled;
2810     bs->io_limits_enabled = false;
2811     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2812     bs->io_limits_enabled = enabled;
2813     return ret;
2814 }
2815 
2816 /* Return < 0 on error. Important errors are:
2817  *  -EIO         generic I/O error (may happen for all errors)
2818  *  -ENOMEDIUM   no media inserted
2819  *  -EINVAL      invalid sector number or nb_sectors
2820  *  -EACCES      trying to write to a read-only device
2821  */
2822 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2823                const uint8_t *buf, int nb_sectors)
2824 {
2825     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2826 }
2827 
2828 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2829                       int nb_sectors, BdrvRequestFlags flags)
2830 {
2831     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2832                       BDRV_REQ_ZERO_WRITE | flags);
2833 }
2834 
2835 /*
2836  * Completely zero out a block device with the help of bdrv_write_zeroes.
2837  * The operation is sped up by checking the block status and only writing
2838  * zeroes to the device if they currently do not return zeroes. Optional
2839  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2840  *
2841  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2842  */
2843 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2844 {
2845     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2846     int n;
2847 
2848     target_sectors = bdrv_nb_sectors(bs);
2849     if (target_sectors < 0) {
2850         return target_sectors;
2851     }
2852 
2853     for (;;) {
2854         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
2855         if (nb_sectors <= 0) {
2856             return 0;
2857         }
2858         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2859         if (ret < 0) {
2860             error_report("error getting block status at sector %" PRId64 ": %s",
2861                          sector_num, strerror(-ret));
2862             return ret;
2863         }
2864         if (ret & BDRV_BLOCK_ZERO) {
2865             sector_num += n;
2866             continue;
2867         }
2868         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2869         if (ret < 0) {
2870             error_report("error writing zeroes at sector %" PRId64 ": %s",
2871                          sector_num, strerror(-ret));
2872             return ret;
2873         }
2874         sector_num += n;
2875     }
2876 }
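
/*
 * Usage sketch (illustrative): fully zero a device, letting the driver
 * unmap/discard ranges instead of writing explicit zeroes where it can:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */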
2877 
2878 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2879 {
2880     QEMUIOVector qiov;
2881     struct iovec iov = {
2882         .iov_base = (void *)buf,
2883         .iov_len = bytes,
2884     };
2885     int ret;
2886 
2887     if (bytes < 0) {
2888         return -EINVAL;
2889     }
2890 
2891     qemu_iovec_init_external(&qiov, &iov, 1);
2892     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2893     if (ret < 0) {
2894         return ret;
2895     }
2896 
2897     return bytes;
2898 }
2899 
2900 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2901 {
2902     int ret;
2903 
2904     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2905     if (ret < 0) {
2906         return ret;
2907     }
2908 
2909     return qiov->size;
2910 }
2911 
2912 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2913                 const void *buf, int bytes)
2914 {
2915     QEMUIOVector qiov;
2916     struct iovec iov = {
2917         .iov_base   = (void *) buf,
2918         .iov_len    = bytes,
2919     };
2920 
2921     if (bytes < 0) {
2922         return -EINVAL;
2923     }
2924 
2925     qemu_iovec_init_external(&qiov, &iov, 1);
2926     return bdrv_pwritev(bs, offset, &qiov);
2927 }
2928 
2929 /*
2930  * Writes to the file and ensures that no writes are reordered across this
2931  * request (acts as a barrier)
2932  *
2933  * Returns 0 on success, -errno in error cases.
2934  */
2935 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2936     const void *buf, int count)
2937 {
2938     int ret;
2939 
2940     ret = bdrv_pwrite(bs, offset, buf, count);
2941     if (ret < 0) {
2942         return ret;
2943     }
2944 
2945     /* No flush needed for cache modes that already do it */
2946     if (bs->enable_write_cache) {
2947         bdrv_flush(bs);
2948     }
2949 
2950     return 0;
2951 }
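
/*
 * Usage sketch (illustrative; 'header' and its offset are hypothetical):
 * persist a small metadata update before writing anything that depends on
 * it:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 */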
2952 
2953 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2954         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2955 {
2956     /* Perform I/O through a temporary buffer so that users who scribble over
2957      * their read buffer while the operation is in progress do not end up
2958      * modifying the image file.  This is critical for zero-copy guest I/O
2959      * where anything might happen inside guest memory.
2960      */
2961     void *bounce_buffer;
2962 
2963     BlockDriver *drv = bs->drv;
2964     struct iovec iov;
2965     QEMUIOVector bounce_qiov;
2966     int64_t cluster_sector_num;
2967     int cluster_nb_sectors;
2968     size_t skip_bytes;
2969     int ret;
2970 
2971     /* Cover the entire cluster so that no additional backing file I/O is
2972      * required when allocating a cluster in the image file.
2973      */
2974     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2975                            &cluster_sector_num, &cluster_nb_sectors);
2976 
2977     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2978                                    cluster_sector_num, cluster_nb_sectors);
2979 
2980     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2981     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2982     if (bounce_buffer == NULL) {
2983         ret = -ENOMEM;
2984         goto err;
2985     }
2986 
2987     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2988 
2989     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2990                              &bounce_qiov);
2991     if (ret < 0) {
2992         goto err;
2993     }
2994 
2995     if (drv->bdrv_co_write_zeroes &&
2996         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2997         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2998                                       cluster_nb_sectors, 0);
2999     } else {
3000         /* This does not change the data on the disk, so it is not necessary
3001          * to flush even in cache=writethrough mode.
3002          */
3003         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3004                                   &bounce_qiov);
3005     }
3006 
3007     if (ret < 0) {
3008         /* It might be okay to ignore write errors for guest requests.  If this
3009          * is a deliberate copy-on-read then we don't want to ignore the error.
3010          * Simply report it in all cases.
3011          */
3012         goto err;
3013     }
3014 
3015     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3016     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3017                         nb_sectors * BDRV_SECTOR_SIZE);
3018 
3019 err:
3020     qemu_vfree(bounce_buffer);
3021     return ret;
3022 }
3023 
3024 /*
3025  * Forwards an already correctly aligned request to the BlockDriver. This
3026  * handles copy on read and zeroing after EOF; any other features must be
3027  * implemented by the caller.
3028  */
3029 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3030     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3031     int64_t align, QEMUIOVector *qiov, int flags)
3032 {
3033     BlockDriver *drv = bs->drv;
3034     int ret;
3035 
3036     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3037     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3038 
3039     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3040     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3041     assert(!qiov || bytes == qiov->size);
3042 
3043     /* Handle Copy on Read and associated serialisation */
3044     if (flags & BDRV_REQ_COPY_ON_READ) {
3045         /* If we touch the same cluster it counts as an overlap.  This
3046          * guarantees that allocating writes will be serialized and not race
3047          * with each other for the same cluster.  For example, in copy-on-read
3048          * it ensures that the CoR read and write operations are atomic and
3049          * guest writes cannot interleave between them. */
3050         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3051     }
3052 
3053     wait_serialising_requests(req);
3054 
3055     if (flags & BDRV_REQ_COPY_ON_READ) {
3056         int pnum;
3057 
3058         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3059         if (ret < 0) {
3060             goto out;
3061         }
3062 
3063         if (!ret || pnum != nb_sectors) {
3064             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3065             goto out;
3066         }
3067     }
3068 
3069     /* Forward the request to the BlockDriver */
3070     if (!bs->zero_beyond_eof) {
3071         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3072     } else {
3073         /* Read zeros after EOF */
3074         int64_t total_sectors, max_nb_sectors;
3075 
3076         total_sectors = bdrv_nb_sectors(bs);
3077         if (total_sectors < 0) {
3078             ret = total_sectors;
3079             goto out;
3080         }
3081 
3082         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3083                                   align >> BDRV_SECTOR_BITS);
3084         if (nb_sectors < max_nb_sectors) {
3085             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3086         } else if (max_nb_sectors > 0) {
3087             QEMUIOVector local_qiov;
3088 
3089             qemu_iovec_init(&local_qiov, qiov->niov);
3090             qemu_iovec_concat(&local_qiov, qiov, 0,
3091                               max_nb_sectors * BDRV_SECTOR_SIZE);
3092 
3093             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3094                                      &local_qiov);
3095 
3096             qemu_iovec_destroy(&local_qiov);
3097         } else {
3098             ret = 0;
3099         }
3100 
3101         /* Reading beyond end of file is supposed to produce zeroes */
3102         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3103             uint64_t offset = MAX(0, total_sectors - sector_num);
3104             uint64_t bytes = (sector_num + nb_sectors - offset) *
3105                               BDRV_SECTOR_SIZE;
3106             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3107         }
3108     }
3109 
3110 out:
3111     return ret;
3112 }
3113 
3114 /*
3115  * Handle a read request in coroutine context
3116  */
3117 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3118     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3119     BdrvRequestFlags flags)
3120 {
3121     BlockDriver *drv = bs->drv;
3122     BdrvTrackedRequest req;
3123 
3124     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3125     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3126     uint8_t *head_buf = NULL;
3127     uint8_t *tail_buf = NULL;
3128     QEMUIOVector local_qiov;
3129     bool use_local_qiov = false;
3130     int ret;
3131 
3132     if (!drv) {
3133         return -ENOMEDIUM;
3134     }
3135 
3136     ret = bdrv_check_byte_request(bs, offset, bytes);
3137     if (ret < 0) {
3138         return ret;
3139     }
3140 
3141     if (bs->copy_on_read) {
3142         flags |= BDRV_REQ_COPY_ON_READ;
3143     }
3144 
3145     /* throttling disk I/O */
3146     if (bs->io_limits_enabled) {
3147         bdrv_io_limits_intercept(bs, bytes, false);
3148     }
3149 
3150     /* Align read if necessary by padding qiov */
3151     if (offset & (align - 1)) {
3152         head_buf = qemu_blockalign(bs, align);
3153         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3154         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3155         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3156         use_local_qiov = true;
3157 
3158         bytes += offset & (align - 1);
3159         offset = offset & ~(align - 1);
3160     }
3161 
3162     if ((offset + bytes) & (align - 1)) {
3163         if (!use_local_qiov) {
3164             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3165             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3166             use_local_qiov = true;
3167         }
3168         tail_buf = qemu_blockalign(bs, align);
3169         qemu_iovec_add(&local_qiov, tail_buf,
3170                        align - ((offset + bytes) & (align - 1)));
3171 
3172         bytes = ROUND_UP(bytes, align);
3173     }
3174 
3175     tracked_request_begin(&req, bs, offset, bytes, false);
3176     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3177                               use_local_qiov ? &local_qiov : qiov,
3178                               flags);
3179     tracked_request_end(&req);
3180 
3181     if (use_local_qiov) {
3182         qemu_iovec_destroy(&local_qiov);
3183         qemu_vfree(head_buf);
3184         qemu_vfree(tail_buf);
3185     }
3186 
3187     return ret;
3188 }
3189 
3190 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3191     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3192     BdrvRequestFlags flags)
3193 {
3194     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3195         return -EINVAL;
3196     }
3197 
3198     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3199                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3200 }
3201 
3202 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3203     int nb_sectors, QEMUIOVector *qiov)
3204 {
3205     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3206 
3207     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3208 }
3209 
3210 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3211     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3212 {
3213     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3214 
3215     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3216                             BDRV_REQ_COPY_ON_READ);
3217 }
3218 
3219 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
3220 
3221 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3222     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3223 {
3224     BlockDriver *drv = bs->drv;
3225     QEMUIOVector qiov;
3226     struct iovec iov = {0};
3227     int ret = 0;
3228 
3229     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
3230                                         BDRV_REQUEST_MAX_SECTORS);
3231 
3232     while (nb_sectors > 0 && !ret) {
3233         int num = nb_sectors;
3234 
3235         /* Align request.  Block drivers can expect the "bulk" of the request
3236          * to be aligned.
3237          */
3238         if (bs->bl.write_zeroes_alignment
3239             && num > bs->bl.write_zeroes_alignment) {
3240             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3241                 /* Make a small request up to the first aligned sector.  */
3242                 num = bs->bl.write_zeroes_alignment;
3243                 num -= sector_num % bs->bl.write_zeroes_alignment;
3244             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3245                 /* Shorten the request to the last aligned sector.  num cannot
3246                  * underflow because num > bs->bl.write_zeroes_alignment.
3247                  */
3248                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3249             }
3250         }
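
        /* Worked example (illustrative): with write_zeroes_alignment == 8,
         * a request for sectors [5, 105) is issued as [5, 8) to reach the
         * first aligned sector, then [8, 104) in aligned chunks, then the
         * unaligned tail [104, 105). */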
3251 
3252         /* limit request size */
3253         if (num > max_write_zeroes) {
3254             num = max_write_zeroes;
3255         }
3256 
3257         ret = -ENOTSUP;
3258         /* First try the efficient write zeroes operation */
3259         if (drv->bdrv_co_write_zeroes) {
3260             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3261         }
3262 
3263         if (ret == -ENOTSUP) {
3264             /* Fall back to bounce buffer if write zeroes is unsupported */
3265             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
3266                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
3267             num = MIN(num, max_xfer_len);
3268             iov.iov_len = num * BDRV_SECTOR_SIZE;
3269             if (iov.iov_base == NULL) {
3270                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3271                 if (iov.iov_base == NULL) {
3272                     ret = -ENOMEM;
3273                     goto fail;
3274                 }
3275                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3276             }
3277             qemu_iovec_init_external(&qiov, &iov, 1);
3278 
3279             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3280 
3281             /* Keep the bounce buffer around if it is big enough for
3282              * all future requests.
3283              */
3284             if (num < max_xfer_len) {
3285                 qemu_vfree(iov.iov_base);
3286                 iov.iov_base = NULL;
3287             }
3288         }
3289 
3290         sector_num += num;
3291         nb_sectors -= num;
3292     }
3293 
3294 fail:
3295     qemu_vfree(iov.iov_base);
3296     return ret;
3297 }
3298 
3299 /*
3300  * Forwards an already correctly aligned write request to the BlockDriver.
3301  */
3302 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3303     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3304     QEMUIOVector *qiov, int flags)
3305 {
3306     BlockDriver *drv = bs->drv;
3307     bool waited;
3308     int ret;
3309 
3310     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3311     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3312 
3313     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3314     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3315     assert(!qiov || bytes == qiov->size);
3316 
3317     waited = wait_serialising_requests(req);
3318     assert(!waited || !req->serialising);
3319     assert(req->overlap_offset <= offset);
3320     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3321 
3322     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3323 
3324     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3325         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3326         qemu_iovec_is_zero(qiov)) {
3327         flags |= BDRV_REQ_ZERO_WRITE;
3328         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3329             flags |= BDRV_REQ_MAY_UNMAP;
3330         }
3331     }
3332 
3333     if (ret < 0) {
3334         /* Do nothing; a write notifier decided to fail this request */
3335     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3336         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3337         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3338     } else {
3339         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3340         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3341     }
3342     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3343 
3344     if (ret == 0 && !bs->enable_write_cache) {
3345         ret = bdrv_co_flush(bs);
3346     }
3347 
3348     bdrv_set_dirty(bs, sector_num, nb_sectors);
3349 
3350     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3351 
3352     if (ret >= 0) {
3353         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3354     }
3355 
3356     return ret;
3357 }
3358 
3359 /*
3360  * Handle a write request in coroutine context
3361  */
3362 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3363     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3364     BdrvRequestFlags flags)
3365 {
3366     BdrvTrackedRequest req;
3367     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3368     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3369     uint8_t *head_buf = NULL;
3370     uint8_t *tail_buf = NULL;
3371     QEMUIOVector local_qiov;
3372     bool use_local_qiov = false;
3373     int ret;
3374 
3375     if (!bs->drv) {
3376         return -ENOMEDIUM;
3377     }
3378     if (bs->read_only) {
3379         return -EACCES;
3380     }
3381 
3382     ret = bdrv_check_byte_request(bs, offset, bytes);
3383     if (ret < 0) {
3384         return ret;
3385     }
3386 
3387     /* throttling disk I/O */
3388     if (bs->io_limits_enabled) {
3389         bdrv_io_limits_intercept(bs, bytes, true);
3390     }
3391 
3392     /*
3393      * Align write if necessary by performing a read-modify-write cycle.
3394      * Pad qiov with the read parts and be sure to have a tracked request not
3395      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3396      */
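
    /* As an illustration (the values here are chosen for the example, not
     * taken from the code): with align == 512, a guest write of bytes
     * [700, 1800) reads head sector [512, 1024) and keeps bytes [512, 700),
     * reads tail sector [1536, 2048) and keeps bytes [1800, 2048), and then
     * issues a single aligned write covering [512, 2048).
     */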
3397     tracked_request_begin(&req, bs, offset, bytes, true);
3398 
3399     if (offset & (align - 1)) {
3400         QEMUIOVector head_qiov;
3401         struct iovec head_iov;
3402 
3403         mark_request_serialising(&req, align);
3404         wait_serialising_requests(&req);
3405 
3406         head_buf = qemu_blockalign(bs, align);
3407         head_iov = (struct iovec) {
3408             .iov_base   = head_buf,
3409             .iov_len    = align,
3410         };
3411         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3412 
3413         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3414         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3415                                   align, &head_qiov, 0);
3416         if (ret < 0) {
3417             goto fail;
3418         }
3419         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3420 
3421         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3422         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3423         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3424         use_local_qiov = true;
3425 
3426         bytes += offset & (align - 1);
3427         offset = offset & ~(align - 1);
3428     }
3429 
3430     if ((offset + bytes) & (align - 1)) {
3431         QEMUIOVector tail_qiov;
3432         struct iovec tail_iov;
3433         size_t tail_bytes;
3434         bool waited;
3435 
3436         mark_request_serialising(&req, align);
3437         waited = wait_serialising_requests(&req);
3438         assert(!waited || !use_local_qiov);
3439 
3440         tail_buf = qemu_blockalign(bs, align);
3441         tail_iov = (struct iovec) {
3442             .iov_base   = tail_buf,
3443             .iov_len    = align,
3444         };
3445         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3446 
3447         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3448         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3449                                   align, &tail_qiov, 0);
3450         if (ret < 0) {
3451             goto fail;
3452         }
3453         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3454 
3455         if (!use_local_qiov) {
3456             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3457             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3458             use_local_qiov = true;
3459         }
3460 
3461         tail_bytes = (offset + bytes) & (align - 1);
3462         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3463 
3464         bytes = ROUND_UP(bytes, align);
3465     }
3466 
3467     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3468                                use_local_qiov ? &local_qiov : qiov,
3469                                flags);
3470 
3471 fail:
3472     tracked_request_end(&req);
3473 
3474     if (use_local_qiov) {
3475         qemu_iovec_destroy(&local_qiov);
3476     }
3477     qemu_vfree(head_buf);
3478     qemu_vfree(tail_buf);
3479 
3480     return ret;
3481 }
3482 
3483 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3484     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3485     BdrvRequestFlags flags)
3486 {
3487     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3488         return -EINVAL;
3489     }
3490 
3491     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3492                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3493 }
3494 
3495 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3496     int nb_sectors, QEMUIOVector *qiov)
3497 {
3498     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3499 
3500     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3501 }
3502 
3503 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3504                                       int64_t sector_num, int nb_sectors,
3505                                       BdrvRequestFlags flags)
3506 {
3507     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3508 
3509     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3510         flags &= ~BDRV_REQ_MAY_UNMAP;
3511     }
3512 
3513     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3514                              BDRV_REQ_ZERO_WRITE | flags);
3515 }
3516 
3517 /**
3518  * Truncate file to 'offset' bytes (needed only for file protocols)
3519  */
3520 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3521 {
3522     BlockDriver *drv = bs->drv;
3523     int ret;
3524     if (!drv)
3525         return -ENOMEDIUM;
3526     if (!drv->bdrv_truncate)
3527         return -ENOTSUP;
3528     if (bs->read_only)
3529         return -EACCES;
3530 
3531     ret = drv->bdrv_truncate(bs, offset);
3532     if (ret == 0) {
3533         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3534         if (bs->blk) {
3535             blk_dev_resize_cb(bs->blk);
3536         }
3537     }
3538     return ret;
3539 }
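
/* A minimal usage sketch (illustrative only): grow an image to 1 GiB.  On
 * success, bdrv_truncate() refreshes the sector count and notifies any
 * attached device model through blk_dev_resize_cb().
 *
 *     int ret = bdrv_truncate(bs, 1024 * 1024 * 1024);
 *     if (ret < 0) {
 *         error_report("resize failed: %s", strerror(-ret));
 *     }
 */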
3540 
3541 /**
3542  * Length of an allocated file in bytes. Sparse files are counted by their
3543  * actual allocated space. Returns < 0 on error or if unknown.
3544  */
3545 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3546 {
3547     BlockDriver *drv = bs->drv;
3548     if (!drv) {
3549         return -ENOMEDIUM;
3550     }
3551     if (drv->bdrv_get_allocated_file_size) {
3552         return drv->bdrv_get_allocated_file_size(bs);
3553     }
3554     if (bs->file) {
3555         return bdrv_get_allocated_file_size(bs->file);
3556     }
3557     return -ENOTSUP;
3558 }
3559 
3560 /**
3561  * Return number of sectors on success, -errno on error.
3562  */
3563 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3564 {
3565     BlockDriver *drv = bs->drv;
3566 
3567     if (!drv)
3568         return -ENOMEDIUM;
3569 
3570     if (drv->has_variable_length) {
3571         int ret = refresh_total_sectors(bs, bs->total_sectors);
3572         if (ret < 0) {
3573             return ret;
3574         }
3575     }
3576     return bs->total_sectors;
3577 }
3578 
3579 /**
3580  * Return length in bytes on success, -errno on error.
3581  * The length is always a multiple of BDRV_SECTOR_SIZE.
3582  */
3583 int64_t bdrv_getlength(BlockDriverState *bs)
3584 {
3585     int64_t ret = bdrv_nb_sectors(bs);
3586 
3587     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3588 }
3589 
3590 /* Return 0 as the number of sectors if no device is present or on error */
3591 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3592 {
3593     int64_t nb_sectors = bdrv_nb_sectors(bs);
3594 
3595     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3596 }
3597 
3598 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3599                        BlockdevOnError on_write_error)
3600 {
3601     bs->on_read_error = on_read_error;
3602     bs->on_write_error = on_write_error;
3603 }
3604 
3605 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3606 {
3607     return is_read ? bs->on_read_error : bs->on_write_error;
3608 }
3609 
3610 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3611 {
3612     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3613 
3614     switch (on_err) {
3615     case BLOCKDEV_ON_ERROR_ENOSPC:
3616         return (error == ENOSPC) ?
3617                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3618     case BLOCKDEV_ON_ERROR_STOP:
3619         return BLOCK_ERROR_ACTION_STOP;
3620     case BLOCKDEV_ON_ERROR_REPORT:
3621         return BLOCK_ERROR_ACTION_REPORT;
3622     case BLOCKDEV_ON_ERROR_IGNORE:
3623         return BLOCK_ERROR_ACTION_IGNORE;
3624     default:
3625         abort();
3626     }
3627 }
3628 
3629 static void send_qmp_error_event(BlockDriverState *bs,
3630                                  BlockErrorAction action,
3631                                  bool is_read, int error)
3632 {
3633     IoOperationType optype;
3634 
3635     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3636     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3637                                    bdrv_iostatus_is_enabled(bs),
3638                                    error == ENOSPC, strerror(error),
3639                                    &error_abort);
3640 }
3641 
3642 /* This is done by device models because, while the block layer knows
3643  * about the error, it does not know whether an operation comes from
3644  * the device or the block layer (from a job, for example).
3645  */
3646 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3647                        bool is_read, int error)
3648 {
3649     assert(error >= 0);
3650 
3651     if (action == BLOCK_ERROR_ACTION_STOP) {
3652         /* First set the iostatus, so that "info block" returns an iostatus
3653          * that matches the events raised so far (an additional error iostatus
3654          * is fine, but not a lost one).
3655          */
3656         bdrv_iostatus_set_err(bs, error);
3657 
3658         /* Then raise the request to stop the VM and the event.
3659          * qemu_system_vmstop_request_prepare has two effects.  First,
3660          * it ensures that the STOP event always comes after the
3661          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3662          * can observe the STOP event and do a "cont" before the STOP
3663          * event is issued, the VM will not stop.  In this case, vm_start()
3664          * also ensures that the STOP/RESUME pair of events is emitted.
3665          */
3666         qemu_system_vmstop_request_prepare();
3667         send_qmp_error_event(bs, action, is_read, error);
3668         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3669     } else {
3670         send_qmp_error_event(bs, action, is_read, error);
3671     }
3672 }
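
/* A sketch of the call pattern expected from a device model (hypothetical
 * device code, not part of this file).  'error' is a positive errno, as the
 * assertion above requires:
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);
 *     if (action == BLOCK_ERROR_ACTION_STOP) {
 *         ... queue the request so it can be retried after "cont" ...
 *     }
 *     bdrv_error_action(bs, action, is_read, error);
 */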
3673 
3674 int bdrv_is_read_only(BlockDriverState *bs)
3675 {
3676     return bs->read_only;
3677 }
3678 
3679 int bdrv_is_sg(BlockDriverState *bs)
3680 {
3681     return bs->sg;
3682 }
3683 
3684 int bdrv_enable_write_cache(BlockDriverState *bs)
3685 {
3686     return bs->enable_write_cache;
3687 }
3688 
3689 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3690 {
3691     bs->enable_write_cache = wce;
3692 
3693     /* so a reopen() will preserve wce */
3694     if (wce) {
3695         bs->open_flags |= BDRV_O_CACHE_WB;
3696     } else {
3697         bs->open_flags &= ~BDRV_O_CACHE_WB;
3698     }
3699 }
3700 
3701 int bdrv_is_encrypted(BlockDriverState *bs)
3702 {
3703     if (bs->backing_hd && bs->backing_hd->encrypted)
3704         return 1;
3705     return bs->encrypted;
3706 }
3707 
3708 int bdrv_key_required(BlockDriverState *bs)
3709 {
3710     BlockDriverState *backing_hd = bs->backing_hd;
3711 
3712     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3713         return 1;
3714     return (bs->encrypted && !bs->valid_key);
3715 }
3716 
3717 int bdrv_set_key(BlockDriverState *bs, const char *key)
3718 {
3719     int ret;
3720     if (bs->backing_hd && bs->backing_hd->encrypted) {
3721         ret = bdrv_set_key(bs->backing_hd, key);
3722         if (ret < 0)
3723             return ret;
3724         if (!bs->encrypted)
3725             return 0;
3726     }
3727     if (!bs->encrypted) {
3728         return -EINVAL;
3729     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3730         return -ENOMEDIUM;
3731     }
3732     ret = bs->drv->bdrv_set_key(bs, key);
3733     if (ret < 0) {
3734         bs->valid_key = 0;
3735     } else if (!bs->valid_key) {
3736         bs->valid_key = 1;
3737         if (bs->blk) {
3738             /* call the change callback now; we skipped it on open */
3739             blk_dev_change_media_cb(bs->blk, true);
3740         }
3741     }
3742     return ret;
3743 }
3744 
3745 /*
3746  * Provide an encryption key for @bs.
3747  * If @key is non-null:
3748  *     If @bs is not encrypted, fail.
3749  *     Else if the key is invalid, fail.
3750  *     Else set @bs's key to @key, replacing the existing key, if any.
3751  * If @key is null:
3752  *     If @bs is encrypted and still lacks a key, fail.
3753  *     Else do nothing.
3754  * On failure, store an error object through @errp if non-null.
3755  */
3756 void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3757 {
3758     if (key) {
3759         if (!bdrv_is_encrypted(bs)) {
3760             error_setg(errp, "Device '%s' is not encrypted",
3761                       bdrv_get_device_name(bs));
3762         } else if (bdrv_set_key(bs, key) < 0) {
3763             error_set(errp, QERR_INVALID_PASSWORD);
3764         }
3765     } else {
3766         if (bdrv_key_required(bs)) {
3767             error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3768                       "'%s' (%s) is encrypted",
3769                       bdrv_get_device_name(bs),
3770                       bdrv_get_encrypted_filename(bs));
3771         }
3772     }
3773 }
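
/* Example (illustrative): supplying a password from monitor-style code.
 * 'mon' and 'password' are hypothetical names here.
 *
 *     Error *err = NULL;
 *     bdrv_add_key(bs, password, &err);
 *     if (err) {
 *         monitor_printf(mon, "%s\n", error_get_pretty(err));
 *         error_free(err);
 *     }
 */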
3774 
3775 const char *bdrv_get_format_name(BlockDriverState *bs)
3776 {
3777     return bs->drv ? bs->drv->format_name : NULL;
3778 }
3779 
3780 static int qsort_strcmp(const void *a, const void *b)
3781 {
3782     return strcmp(a, b);
3783 }
3784 
3785 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3786                          void *opaque)
3787 {
3788     BlockDriver *drv;
3789     int count = 0;
3790     int i;
3791     const char **formats = NULL;
3792 
3793     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3794         if (drv->format_name) {
3795             bool found = false;
3796             int i = count;
3797             while (formats && i && !found) {
3798                 found = !strcmp(formats[--i], drv->format_name);
3799             }
3800 
3801             if (!found) {
3802                 formats = g_renew(const char *, formats, count + 1);
3803                 formats[count++] = drv->format_name;
3804             }
3805         }
3806     }
3807 
3808     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3809 
3810     for (i = 0; i < count; i++) {
3811         it(opaque, formats[i]);
3812     }
3813 
3814     g_free(formats);
3815 }
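
/* Example (illustrative): print the sorted, de-duplicated format names the
 * way a "Supported formats:" listing might.
 *
 *     static void print_format(void *opaque, const char *name)
 *     {
 *         printf(" %s", name);
 *     }
 *     ...
 *     bdrv_iterate_format(print_format, NULL);
 */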
3816 
3817 /* This function finds the BlockDriverState owned by a named block backend */
3818 /* TODO convert callers to blk_by_name(), then remove */
3819 BlockDriverState *bdrv_find(const char *name)
3820 {
3821     BlockBackend *blk = blk_by_name(name);
3822 
3823     return blk ? blk_bs(blk) : NULL;
3824 }
3825 
3826 /* This function finds a node by name in the graph of BlockDriverStates */
3827 BlockDriverState *bdrv_find_node(const char *node_name)
3828 {
3829     BlockDriverState *bs;
3830 
3831     assert(node_name);
3832 
3833     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3834         if (!strcmp(node_name, bs->node_name)) {
3835             return bs;
3836         }
3837     }
3838     return NULL;
3839 }
3840 
3841 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3842 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3843 {
3844     BlockDeviceInfoList *list, *entry;
3845     BlockDriverState *bs;
3846 
3847     list = NULL;
3848     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3849         entry = g_malloc0(sizeof(*entry));
3850         entry->value = bdrv_block_device_info(bs);
3851         entry->next = list;
3852         list = entry;
3853     }
3854 
3855     return list;
3856 }
3857 
3858 BlockDriverState *bdrv_lookup_bs(const char *device,
3859                                  const char *node_name,
3860                                  Error **errp)
3861 {
3862     BlockBackend *blk;
3863     BlockDriverState *bs;
3864 
3865     if (device) {
3866         blk = blk_by_name(device);
3867 
3868         if (blk) {
3869             return blk_bs(blk);
3870         }
3871     }
3872 
3873     if (node_name) {
3874         bs = bdrv_find_node(node_name);
3875 
3876         if (bs) {
3877             return bs;
3878         }
3879     }
3880 
3881     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3882                      device ? device : "",
3883                      node_name ? node_name : "");
3884     return NULL;
3885 }
3886 
3887 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3888  * return false.  If either argument is NULL, return false. */
3889 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3890 {
3891     while (top && top != base) {
3892         top = top->backing_hd;
3893     }
3894 
3895     return top != NULL;
3896 }
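
/* For example (illustrative), given the chain base <- mid <- top:
 * bdrv_chain_contains(top, base) and bdrv_chain_contains(top, mid) are both
 * true, while bdrv_chain_contains(mid, top) is false because the walk only
 * follows backing_hd links downwards from its first argument.
 */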
3897 
3898 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3899 {
3900     if (!bs) {
3901         return QTAILQ_FIRST(&graph_bdrv_states);
3902     }
3903     return QTAILQ_NEXT(bs, node_list);
3904 }
3905 
3906 BlockDriverState *bdrv_next(BlockDriverState *bs)
3907 {
3908     if (!bs) {
3909         return QTAILQ_FIRST(&bdrv_states);
3910     }
3911     return QTAILQ_NEXT(bs, device_list);
3912 }
3913 
3914 const char *bdrv_get_node_name(const BlockDriverState *bs)
3915 {
3916     return bs->node_name;
3917 }
3918 
3919 /* TODO check what callers really want: bs->node_name or blk_name() */
3920 const char *bdrv_get_device_name(const BlockDriverState *bs)
3921 {
3922     return bs->blk ? blk_name(bs->blk) : "";
3923 }
3924 
3925 int bdrv_get_flags(BlockDriverState *bs)
3926 {
3927     return bs->open_flags;
3928 }
3929 
3930 int bdrv_flush_all(void)
3931 {
3932     BlockDriverState *bs;
3933     int result = 0;
3934 
3935     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3936         AioContext *aio_context = bdrv_get_aio_context(bs);
3937         int ret;
3938 
3939         aio_context_acquire(aio_context);
3940         ret = bdrv_flush(bs);
3941         if (ret < 0 && !result) {
3942             result = ret;
3943         }
3944         aio_context_release(aio_context);
3945     }
3946 
3947     return result;
3948 }
3949 
3950 int bdrv_has_zero_init_1(BlockDriverState *bs)
3951 {
3952     return 1;
3953 }
3954 
3955 int bdrv_has_zero_init(BlockDriverState *bs)
3956 {
3957     assert(bs->drv);
3958 
3959     /* If BS is a copy-on-write image, it is initialized to
3960        the contents of the base image, which may not be zeroes.  */
3961     if (bs->backing_hd) {
3962         return 0;
3963     }
3964     if (bs->drv->bdrv_has_zero_init) {
3965         return bs->drv->bdrv_has_zero_init(bs);
3966     }
3967 
3968     /* safe default */
3969     return 0;
3970 }
3971 
3972 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3973 {
3974     BlockDriverInfo bdi;
3975 
3976     if (bs->backing_hd) {
3977         return false;
3978     }
3979 
3980     if (bdrv_get_info(bs, &bdi) == 0) {
3981         return bdi.unallocated_blocks_are_zero;
3982     }
3983 
3984     return false;
3985 }
3986 
3987 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3988 {
3989     BlockDriverInfo bdi;
3990 
3991     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3992         return false;
3993     }
3994 
3995     if (bdrv_get_info(bs, &bdi) == 0) {
3996         return bdi.can_write_zeroes_with_unmap;
3997     }
3998 
3999     return false;
4000 }
4001 
4002 typedef struct BdrvCoGetBlockStatusData {
4003     BlockDriverState *bs;
4004     BlockDriverState *base;
4005     int64_t sector_num;
4006     int nb_sectors;
4007     int *pnum;
4008     int64_t ret;
4009     bool done;
4010 } BdrvCoGetBlockStatusData;
4011 
4012 /*
4013  * Returns the allocation status of the specified sectors.
4014  * Drivers not implementing the functionality are assumed to not support
4015  * backing files, hence all their sectors are reported as allocated.
4016  *
4017  * If 'sector_num' is beyond the end of the disk image the return value is 0
4018  * and 'pnum' is set to 0.
4019  *
4020  * 'pnum' is set to the number of sectors (including and immediately following
4021  * the specified sector) that are known to be in the same
4022  * allocated/unallocated state.
4023  *
4024  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
4025  * beyond the end of the disk image it will be clamped.
4026  */
4027 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4028                                                      int64_t sector_num,
4029                                                      int nb_sectors, int *pnum)
4030 {
4031     int64_t total_sectors;
4032     int64_t n;
4033     int64_t ret, ret2;
4034 
4035     total_sectors = bdrv_nb_sectors(bs);
4036     if (total_sectors < 0) {
4037         return total_sectors;
4038     }
4039 
4040     if (sector_num >= total_sectors) {
4041         *pnum = 0;
4042         return 0;
4043     }
4044 
4045     n = total_sectors - sector_num;
4046     if (n < nb_sectors) {
4047         nb_sectors = n;
4048     }
4049 
4050     if (!bs->drv->bdrv_co_get_block_status) {
4051         *pnum = nb_sectors;
4052         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4053         if (bs->drv->protocol_name) {
4054             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4055         }
4056         return ret;
4057     }
4058 
4059     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4060     if (ret < 0) {
4061         *pnum = 0;
4062         return ret;
4063     }
4064 
4065     if (ret & BDRV_BLOCK_RAW) {
4066         assert(ret & BDRV_BLOCK_OFFSET_VALID);
4067         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4068                                      *pnum, pnum);
4069     }
4070 
4071     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4072         ret |= BDRV_BLOCK_ALLOCATED;
4073     }
4074 
4075     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4076         if (bdrv_unallocated_blocks_are_zero(bs)) {
4077             ret |= BDRV_BLOCK_ZERO;
4078         } else if (bs->backing_hd) {
4079             BlockDriverState *bs2 = bs->backing_hd;
4080             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4081             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4082                 ret |= BDRV_BLOCK_ZERO;
4083             }
4084         }
4085     }
4086 
4087     if (bs->file &&
4088         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4089         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4090         int file_pnum;
4091 
4092         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4093                                         *pnum, &file_pnum);
4094         if (ret2 >= 0) {
4095             /* Ignore errors.  This is just providing extra information; it
4096              * is useful but not necessary.
4097              */
4098             if (!file_pnum) {
4099                 /* !file_pnum indicates an offset at or beyond the EOF; it is
4100                  * perfectly valid for the format block driver to point to such
4101                  * offsets, so catch it and mark everything as zero */
4102                 ret |= BDRV_BLOCK_ZERO;
4103             } else {
4104                 /* Limit request to the range reported by the protocol driver */
4105                 *pnum = file_pnum;
4106                 ret |= (ret2 & BDRV_BLOCK_ZERO);
4107             }
4108         }
4109     }
4110 
4111     return ret;
4112 }
4113 
4114 /* Coroutine wrapper for bdrv_get_block_status() */
4115 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4116 {
4117     BdrvCoGetBlockStatusData *data = opaque;
4118     BlockDriverState *bs = data->bs;
4119 
4120     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4121                                          data->pnum);
4122     data->done = true;
4123 }
4124 
4125 /*
4126  * Synchronous wrapper around bdrv_co_get_block_status().
4127  *
4128  * See bdrv_co_get_block_status() for details.
4129  */
4130 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4131                               int nb_sectors, int *pnum)
4132 {
4133     Coroutine *co;
4134     BdrvCoGetBlockStatusData data = {
4135         .bs = bs,
4136         .sector_num = sector_num,
4137         .nb_sectors = nb_sectors,
4138         .pnum = pnum,
4139         .done = false,
4140     };
4141 
4142     if (qemu_in_coroutine()) {
4143         /* Fast-path if already in coroutine context */
4144         bdrv_get_block_status_co_entry(&data);
4145     } else {
4146         AioContext *aio_context = bdrv_get_aio_context(bs);
4147 
4148         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4149         qemu_coroutine_enter(co, &data);
4150         while (!data.done) {
4151             aio_poll(aio_context, true);
4152         }
4153     }
4154     return data.ret;
4155 }
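
/* A minimal sketch (illustrative) of walking an image with the synchronous
 * wrapper, e.g. to enumerate allocated extents:
 *
 *     int64_t sector = 0, total = bdrv_nb_sectors(bs);
 *     while (sector < total) {
 *         int num;
 *         int64_t st = bdrv_get_block_status(bs, sector,
 *                                            MIN(total - sector, INT_MAX),
 *                                            &num);
 *         if (st < 0 || num == 0) {
 *             break;
 *         }
 *         if (st & BDRV_BLOCK_ALLOCATED) {
 *             ... record the extent [sector, sector + num) ...
 *         }
 *         sector += num;
 *     }
 */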
4156 
4157 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4158                                    int nb_sectors, int *pnum)
4159 {
4160     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4161     if (ret < 0) {
4162         return ret;
4163     }
4164     return !!(ret & BDRV_BLOCK_ALLOCATED);
4165 }
4166 
4167 /*
4168  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4169  *
4170  * Return true if the given sector is allocated in any image between
4171  * BASE and TOP (TOP inclusive, BASE excluded).  BASE can be NULL to check if
4172  * the given sector is allocated in any image of the chain.  Return false otherwise.
4173  *
4174  * 'pnum' is set to the number of sectors (including and immediately following
4175  *  the specified sector) that are known to be in the same
4176  *  allocated/unallocated state.
4177  *
4178  */
4179 int bdrv_is_allocated_above(BlockDriverState *top,
4180                             BlockDriverState *base,
4181                             int64_t sector_num,
4182                             int nb_sectors, int *pnum)
4183 {
4184     BlockDriverState *intermediate;
4185     int ret, n = nb_sectors;
4186 
4187     intermediate = top;
4188     while (intermediate && intermediate != base) {
4189         int pnum_inter;
4190         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4191                                 &pnum_inter);
4192         if (ret < 0) {
4193             return ret;
4194         } else if (ret) {
4195             *pnum = pnum_inter;
4196             return 1;
4197         }
4198 
4199         /*
4200          * [sector_num, nb_sectors] is unallocated on top but intermediate
4201          * might have
4202          *
4203          * [sector_num+x, nr_sectors] allocated.
4204          * [sector_num+x, nb_sectors] allocated.
4205         if (n > pnum_inter &&
4206             (intermediate == top ||
4207              sector_num + pnum_inter < intermediate->total_sectors)) {
4208             n = pnum_inter;
4209         }
4210 
4211         intermediate = intermediate->backing_hd;
4212     }
4213 
4214     *pnum = n;
4215     return 0;
4216 }
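
/* For example (illustrative), given the chain base <- mid <- top with a
 * sector written only in 'mid': bdrv_is_allocated_above(top, base, ...)
 * returns 1 because 'mid' is searched, while bdrv_is_allocated_above(top,
 * mid, ...) returns 0 because the walk stops before examining BASE itself.
 */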
4217 
4218 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4219 {
4220     if (bs->backing_hd && bs->backing_hd->encrypted)
4221         return bs->backing_file;
4222     else if (bs->encrypted)
4223         return bs->filename;
4224     else
4225         return NULL;
4226 }
4227 
4228 void bdrv_get_backing_filename(BlockDriverState *bs,
4229                                char *filename, int filename_size)
4230 {
4231     pstrcpy(filename, filename_size, bs->backing_file);
4232 }
4233 
4234 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4235                           const uint8_t *buf, int nb_sectors)
4236 {
4237     BlockDriver *drv = bs->drv;
4238     int ret;
4239 
4240     if (!drv) {
4241         return -ENOMEDIUM;
4242     }
4243     if (!drv->bdrv_write_compressed) {
4244         return -ENOTSUP;
4245     }
4246     ret = bdrv_check_request(bs, sector_num, nb_sectors);
4247     if (ret < 0) {
4248         return ret;
4249     }
4250 
4251     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4252 
4253     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4254 }
4255 
4256 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4257 {
4258     BlockDriver *drv = bs->drv;
4259     if (!drv)
4260         return -ENOMEDIUM;
4261     if (!drv->bdrv_get_info)
4262         return -ENOTSUP;
4263     memset(bdi, 0, sizeof(*bdi));
4264     return drv->bdrv_get_info(bs, bdi);
4265 }
4266 
4267 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4268 {
4269     BlockDriver *drv = bs->drv;
4270     if (drv && drv->bdrv_get_specific_info) {
4271         return drv->bdrv_get_specific_info(bs);
4272     }
4273     return NULL;
4274 }
4275 
4276 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4277                       int64_t pos, int size)
4278 {
4279     QEMUIOVector qiov;
4280     struct iovec iov = {
4281         .iov_base   = (void *) buf,
4282         .iov_len    = size,
4283     };
4284 
4285     qemu_iovec_init_external(&qiov, &iov, 1);
4286     return bdrv_writev_vmstate(bs, &qiov, pos);
4287 }
4288 
4289 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4290 {
4291     BlockDriver *drv = bs->drv;
4292 
4293     if (!drv) {
4294         return -ENOMEDIUM;
4295     } else if (drv->bdrv_save_vmstate) {
4296         return drv->bdrv_save_vmstate(bs, qiov, pos);
4297     } else if (bs->file) {
4298         return bdrv_writev_vmstate(bs->file, qiov, pos);
4299     }
4300 
4301     return -ENOTSUP;
4302 }
4303 
4304 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4305                       int64_t pos, int size)
4306 {
4307     BlockDriver *drv = bs->drv;
4308     if (!drv)
4309         return -ENOMEDIUM;
4310     if (drv->bdrv_load_vmstate)
4311         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4312     if (bs->file)
4313         return bdrv_load_vmstate(bs->file, buf, pos, size);
4314     return -ENOTSUP;
4315 }
4316 
4317 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4318 {
4319     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4320         return;
4321     }
4322 
4323     bs->drv->bdrv_debug_event(bs, event);
4324 }
4325 
4326 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4327                           const char *tag)
4328 {
4329     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4330         bs = bs->file;
4331     }
4332 
4333     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4334         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4335     }
4336 
4337     return -ENOTSUP;
4338 }
4339 
4340 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4341 {
4342     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4343         bs = bs->file;
4344     }
4345 
4346     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4347         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4348     }
4349 
4350     return -ENOTSUP;
4351 }
4352 
4353 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4354 {
4355     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4356         bs = bs->file;
4357     }
4358 
4359     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4360         return bs->drv->bdrv_debug_resume(bs, tag);
4361     }
4362 
4363     return -ENOTSUP;
4364 }
4365 
4366 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4367 {
4368     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4369         bs = bs->file;
4370     }
4371 
4372     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4373         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4374     }
4375 
4376     return false;
4377 }
4378 
4379 int bdrv_is_snapshot(BlockDriverState *bs)
4380 {
4381     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4382 }
4383 
4384 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4385  * relative, it must be relative to the chain.  So, passing in bs->filename
4386  * from a BDS as backing_file should not be done, as that may be relative to
4387  * the CWD rather than the chain. */
4388 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4389         const char *backing_file)
4390 {
4391     char *filename_full = NULL;
4392     char *backing_file_full = NULL;
4393     char *filename_tmp = NULL;
4394     int is_protocol = 0;
4395     BlockDriverState *curr_bs = NULL;
4396     BlockDriverState *retval = NULL;
4397 
4398     if (!bs || !bs->drv || !backing_file) {
4399         return NULL;
4400     }
4401 
4402     filename_full     = g_malloc(PATH_MAX);
4403     backing_file_full = g_malloc(PATH_MAX);
4404     filename_tmp      = g_malloc(PATH_MAX);
4405 
4406     is_protocol = path_has_protocol(backing_file);
4407 
4408     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4409 
4410         /* If either of the filename paths is actually a protocol, then
4411          * compare unmodified paths; otherwise make paths relative */
4412         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4413             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4414                 retval = curr_bs->backing_hd;
4415                 break;
4416             }
4417         } else {
4418             /* If not an absolute filename path, make it relative to the current
4419              * image's filename path */
4420             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4421                          backing_file);
4422 
4423             /* We are going to compare absolute pathnames */
4424             if (!realpath(filename_tmp, filename_full)) {
4425                 continue;
4426             }
4427 
4428             /* We need to make sure the backing filename we are comparing against
4429              * is relative to the current image filename (or absolute) */
4430             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4431                          curr_bs->backing_file);
4432 
4433             if (!realpath(filename_tmp, backing_file_full)) {
4434                 continue;
4435             }
4436 
4437             if (strcmp(backing_file_full, filename_full) == 0) {
4438                 retval = curr_bs->backing_hd;
4439                 break;
4440             }
4441         }
4442     }
4443 
4444     g_free(filename_full);
4445     g_free(backing_file_full);
4446     g_free(filename_tmp);
4447     return retval;
4448 }
4449 
4450 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4451 {
4452     if (!bs->drv) {
4453         return 0;
4454     }
4455 
4456     if (!bs->backing_hd) {
4457         return 0;
4458     }
4459 
4460     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4461 }
4462 
4463 /**************************************************************/
4464 /* async I/Os */
4465 
4466 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4467                            QEMUIOVector *qiov, int nb_sectors,
4468                            BlockCompletionFunc *cb, void *opaque)
4469 {
4470     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4471 
4472     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4473                                  cb, opaque, false);
4474 }
4475 
4476 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4477                             QEMUIOVector *qiov, int nb_sectors,
4478                             BlockCompletionFunc *cb, void *opaque)
4479 {
4480     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4481 
4482     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4483                                  cb, opaque, true);
4484 }
4485 
4486 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4487         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4488         BlockCompletionFunc *cb, void *opaque)
4489 {
4490     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4491 
4492     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4493                                  BDRV_REQ_ZERO_WRITE | flags,
4494                                  cb, opaque, true);
4495 }
4496 
4497 
4498 typedef struct MultiwriteCB {
4499     int error;
4500     int num_requests;
4501     int num_callbacks;
4502     struct {
4503         BlockCompletionFunc *cb;
4504         void *opaque;
4505         QEMUIOVector *free_qiov;
4506     } callbacks[];
4507 } MultiwriteCB;
4508 
4509 static void multiwrite_user_cb(MultiwriteCB *mcb)
4510 {
4511     int i;
4512 
4513     for (i = 0; i < mcb->num_callbacks; i++) {
4514         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4515         if (mcb->callbacks[i].free_qiov) {
4516             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4517         }
4518         g_free(mcb->callbacks[i].free_qiov);
4519     }
4520 }
4521 
4522 static void multiwrite_cb(void *opaque, int ret)
4523 {
4524     MultiwriteCB *mcb = opaque;
4525 
4526     trace_multiwrite_cb(mcb, ret);
4527 
4528     if (ret < 0 && !mcb->error) {
4529         mcb->error = ret;
4530     }
4531 
4532     mcb->num_requests--;
4533     if (mcb->num_requests == 0) {
4534         multiwrite_user_cb(mcb);
4535         g_free(mcb);
4536     }
4537 }
4538 
4539 static int multiwrite_req_compare(const void *a, const void *b)
4540 {
4541     const BlockRequest *req1 = a, *req2 = b;
4542 
4543     /*
4544      * Note that we can't simply subtract req2->sector from req1->sector
4545      * here as that could overflow the return value.
4546      */
4547     if (req1->sector > req2->sector) {
4548         return 1;
4549     } else if (req1->sector < req2->sector) {
4550         return -1;
4551     } else {
4552         return 0;
4553     }
4554 }
4555 
4556 /*
4557  * Takes a bunch of requests and tries to merge them. Returns the number of
4558  * requests that remain after merging.
4559  */
4560 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4561     int num_reqs, MultiwriteCB *mcb)
4562 {
4563     int i, outidx;
4564 
4565     // Sort requests by start sector
4566     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4567 
4568     // Check if adjacent requests are exactly sequential or overlapping.
4569     // If so, combine them; merging never needs to insert zero sectors.
4570     outidx = 0;
4571     for (i = 1; i < num_reqs; i++) {
4572         int merge = 0;
4573         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4574 
4575         // Handle exactly sequential writes and overlapping writes.
4576         if (reqs[i].sector <= oldreq_last) {
4577             merge = 1;
4578         }
4579 
4580         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4581             merge = 0;
4582         }
4583 
4584         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4585             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4586             merge = 0;
4587         }
4588 
4589         if (merge) {
4590             size_t size;
4591             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4592             qemu_iovec_init(qiov,
4593                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4594 
4595             // Add the first request to the merged one. If the requests are
4596             // overlapping, drop the last sectors of the first request.
4597             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4598             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4599 
4600             // We shouldn't need to add any zeros between the two requests
4601             assert(reqs[i].sector <= oldreq_last);
4602 
4603             // Add the second request
4604             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4605 
4606             // Add tail of first request, if necessary
4607             if (qiov->size < reqs[outidx].qiov->size) {
4608                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4609                                   reqs[outidx].qiov->size - qiov->size);
4610             }
4611 
4612             reqs[outidx].nb_sectors = qiov->size >> 9;
4613             reqs[outidx].qiov = qiov;
4614 
4615             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4616         } else {
4617             outidx++;
4618             reqs[outidx].sector     = reqs[i].sector;
4619             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4620             reqs[outidx].qiov       = reqs[i].qiov;
4621         }
4622     }
4623 
4624     block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
4625 
4626     return outidx + 1;
4627 }
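
/* A worked example (illustrative): three 8-sector requests starting at
 * sectors 0, 8 and 32 arrive unsorted.  After sorting, the first two are
 * exactly sequential (reqs[1].sector == oldreq_last == 8) and are merged
 * into one 16-sector request at sector 0; the request at sector 32 leaves a
 * gap and stays separate, so multiwrite_merge() returns 2.
 */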
4628 
4629 /*
4630  * Submit multiple AIO write requests at once.
4631  *
4632  * On success, the function returns 0 and all requests in the reqs array have
4633  * been submitted. On error, this function returns -1, and any of the
4634  * requests may or may not have been submitted yet. In particular, this means
4635  * that the callback will be called for some of the requests but not for
4636  * others. The caller must check the error field of each BlockRequest to know
4637  * which callbacks to wait for (if error != 0, no callback will be called).
4638  *
4639  * The implementation may modify the contents of the reqs array, e.g. to merge
4640  * requests. However, the fields opaque and error are left unmodified as they
4641  * are used to signal failure for a single request to the caller.
4642  */
4643 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4644 {
4645     MultiwriteCB *mcb;
4646     int i;
4647 
4648     /* don't submit writes if we don't have a medium */
4649     if (bs->drv == NULL) {
4650         for (i = 0; i < num_reqs; i++) {
4651             reqs[i].error = -ENOMEDIUM;
4652         }
4653         return -1;
4654     }
4655 
4656     if (num_reqs == 0) {
4657         return 0;
4658     }
4659 
4660     // Create MultiwriteCB structure
4661     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4662     mcb->num_requests = 0;
4663     mcb->num_callbacks = num_reqs;
4664 
4665     for (i = 0; i < num_reqs; i++) {
4666         mcb->callbacks[i].cb = reqs[i].cb;
4667         mcb->callbacks[i].opaque = reqs[i].opaque;
4668     }
4669 
4670     // Check for mergeable requests
4671     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4672 
4673     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4674 
4675     /* Run the aio requests. */
4676     mcb->num_requests = num_reqs;
4677     for (i = 0; i < num_reqs; i++) {
4678         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4679                               reqs[i].nb_sectors, reqs[i].flags,
4680                               multiwrite_cb, mcb,
4681                               true);
4682     }
4683 
4684     return 0;
4685 }
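
/* A minimal caller sketch (illustrative; virtio-blk style request batching
 * is the intended use).  'qiov0', 'qiov1', 'write_done', 'req0' and 'req1'
 * are hypothetical:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = write_done, .opaque = req0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = write_done, .opaque = req1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... check reqs[i].error to see which callbacks will still run ...
 *     }
 */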
4686 
4687 void bdrv_aio_cancel(BlockAIOCB *acb)
4688 {
4689     qemu_aio_ref(acb);
4690     bdrv_aio_cancel_async(acb);
4691     while (acb->refcnt > 1) {
4692         if (acb->aiocb_info->get_aio_context) {
4693             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4694         } else if (acb->bs) {
4695             aio_poll(bdrv_get_aio_context(acb->bs), true);
4696         } else {
4697             abort();
4698         }
4699     }
4700     qemu_aio_unref(acb);
4701 }
4702 
4703 /* Async version of aio cancel. The caller is not blocked if the acb implements
4704  * cancel_async; otherwise we do nothing and let the request complete normally.
4705  * In either case the completion callback must be called. */
4706 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4707 {
4708     if (acb->aiocb_info->cancel_async) {
4709         acb->aiocb_info->cancel_async(acb);
4710     }
4711 }
4712 
4713 /**************************************************************/
4714 /* async block device emulation */
4715 
4716 typedef struct BlockAIOCBSync {
4717     BlockAIOCB common;
4718     QEMUBH *bh;
4719     int ret;
4720     /* vector translation state */
4721     QEMUIOVector *qiov;
4722     uint8_t *bounce;
4723     int is_write;
4724 } BlockAIOCBSync;
4725 
4726 static const AIOCBInfo bdrv_em_aiocb_info = {
4727     .aiocb_size         = sizeof(BlockAIOCBSync),
4728 };
4729 
4730 static void bdrv_aio_bh_cb(void *opaque)
4731 {
4732     BlockAIOCBSync *acb = opaque;
4733 
4734     if (!acb->is_write && acb->ret >= 0) {
4735         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4736     }
4737     qemu_vfree(acb->bounce);
4738     acb->common.cb(acb->common.opaque, acb->ret);
4739     qemu_bh_delete(acb->bh);
4740     acb->bh = NULL;
4741     qemu_aio_unref(acb);
4742 }
4743 
4744 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4745                                       int64_t sector_num,
4746                                       QEMUIOVector *qiov,
4747                                       int nb_sectors,
4748                                       BlockCompletionFunc *cb,
4749                                       void *opaque,
4750                                       int is_write)
4751 
4752 {
4753     BlockAIOCBSync *acb;
4754 
4755     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4756     acb->is_write = is_write;
4757     acb->qiov = qiov;
4758     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4759     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4760 
4761     if (acb->bounce == NULL) {
4762         acb->ret = -ENOMEM;
4763     } else if (is_write) {
4764         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4765         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4766     } else {
4767         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4768     }
4769 
4770     qemu_bh_schedule(acb->bh);
4771 
4772     return &acb->common;
4773 }
4774 
4775 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4776         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4777         BlockCompletionFunc *cb, void *opaque)
4778 {
4779     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4780 }
4781 
4782 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4783         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4784         BlockCompletionFunc *cb, void *opaque)
4785 {
4786     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4787 }
4788 
4789 
4790 typedef struct BlockAIOCBCoroutine {
4791     BlockAIOCB common;
4792     BlockRequest req;
4793     bool is_write;
4794     bool *done;
4795     QEMUBH* bh;
4796 } BlockAIOCBCoroutine;
4797 
4798 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4799     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
4800 };
4801 
4802 static void bdrv_co_em_bh(void *opaque)
4803 {
4804     BlockAIOCBCoroutine *acb = opaque;
4805 
4806     acb->common.cb(acb->common.opaque, acb->req.error);
4807 
4808     qemu_bh_delete(acb->bh);
4809     qemu_aio_unref(acb);
4810 }
4811 
4812 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4813 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4814 {
4815     BlockAIOCBCoroutine *acb = opaque;
4816     BlockDriverState *bs = acb->common.bs;
4817 
4818     if (!acb->is_write) {
4819         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4820             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4821     } else {
4822         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4823             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4824     }
4825 
4826     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4827     qemu_bh_schedule(acb->bh);
4828 }
4829 
4830 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4831                                          int64_t sector_num,
4832                                          QEMUIOVector *qiov,
4833                                          int nb_sectors,
4834                                          BdrvRequestFlags flags,
4835                                          BlockCompletionFunc *cb,
4836                                          void *opaque,
4837                                          bool is_write)
4838 {
4839     Coroutine *co;
4840     BlockAIOCBCoroutine *acb;
4841 
4842     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4843     acb->req.sector = sector_num;
4844     acb->req.nb_sectors = nb_sectors;
4845     acb->req.qiov = qiov;
4846     acb->req.flags = flags;
4847     acb->is_write = is_write;
4848 
4849     co = qemu_coroutine_create(bdrv_co_do_rw);
4850     qemu_coroutine_enter(co, acb);
4851 
4852     return &acb->common;
4853 }
4854 
4855 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4856 {
4857     BlockAIOCBCoroutine *acb = opaque;
4858     BlockDriverState *bs = acb->common.bs;
4859 
4860     acb->req.error = bdrv_co_flush(bs);
4861     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4862     qemu_bh_schedule(acb->bh);
4863 }
4864 
4865 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4866         BlockCompletionFunc *cb, void *opaque)
4867 {
4868     trace_bdrv_aio_flush(bs, opaque);
4869 
4870     Coroutine *co;
4871     BlockAIOCBCoroutine *acb;
4872 
4873     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4874 
4875     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4876     qemu_coroutine_enter(co, acb);
4877 
4878     return &acb->common;
4879 }
4880 
4881 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4882 {
4883     BlockAIOCBCoroutine *acb = opaque;
4884     BlockDriverState *bs = acb->common.bs;
4885 
4886     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4887     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4888     qemu_bh_schedule(acb->bh);
4889 }
4890 
4891 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4892         int64_t sector_num, int nb_sectors,
4893         BlockCompletionFunc *cb, void *opaque)
4894 {
4895     Coroutine *co;
4896     BlockAIOCBCoroutine *acb;
4897 
4898     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4899 
4900     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4901     acb->req.sector = sector_num;
4902     acb->req.nb_sectors = nb_sectors;
4903     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4904     qemu_coroutine_enter(co, acb);
4905 
4906     return &acb->common;
4907 }
4908 
4909 void bdrv_init(void)
4910 {
4911     module_call_init(MODULE_INIT_BLOCK);
4912 }
4913 
4914 void bdrv_init_with_whitelist(void)
4915 {
4916     use_bdrv_whitelist = 1;
4917     bdrv_init();
4918 }
4919 
4920 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4921                    BlockCompletionFunc *cb, void *opaque)
4922 {
4923     BlockAIOCB *acb;
4924 
4925     acb = g_slice_alloc(aiocb_info->aiocb_size);
4926     acb->aiocb_info = aiocb_info;
4927     acb->bs = bs;
4928     acb->cb = cb;
4929     acb->opaque = opaque;
4930     acb->refcnt = 1;
4931     return acb;
4932 }
4933 
4934 void qemu_aio_ref(void *p)
4935 {
4936     BlockAIOCB *acb = p;
4937     acb->refcnt++;
4938 }
4939 
4940 void qemu_aio_unref(void *p)
4941 {
4942     BlockAIOCB *acb = p;
4943     assert(acb->refcnt > 0);
4944     if (--acb->refcnt == 0) {
4945         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4946     }
4947 }
4948 
4949 /**************************************************************/
4950 /* Coroutine block device emulation */
4951 
4952 typedef struct CoroutineIOCompletion {
4953     Coroutine *coroutine;
4954     int ret;
4955 } CoroutineIOCompletion;
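
/* The adapters below bridge the callback-based AIO interface to coroutines:
 * the coroutine records itself in a CoroutineIOCompletion, submits the AIO
 * request with bdrv_co_io_em_complete() as the completion callback and
 * yields; the callback stores the return value and re-enters the coroutine,
 * which then resumes and picks up co.ret.
 */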
4956 
4957 static void bdrv_co_io_em_complete(void *opaque, int ret)
4958 {
4959     CoroutineIOCompletion *co = opaque;
4960 
4961     co->ret = ret;
4962     qemu_coroutine_enter(co->coroutine, NULL);
4963 }
4964 
4965 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4966                                       int nb_sectors, QEMUIOVector *iov,
4967                                       bool is_write)
4968 {
4969     CoroutineIOCompletion co = {
4970         .coroutine = qemu_coroutine_self(),
4971     };
4972     BlockAIOCB *acb;
4973 
4974     if (is_write) {
4975         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4976                                        bdrv_co_io_em_complete, &co);
4977     } else {
4978         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4979                                       bdrv_co_io_em_complete, &co);
4980     }
4981 
4982     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4983     if (!acb) {
4984         return -EIO;
4985     }
4986     qemu_coroutine_yield();
4987 
4988     return co.ret;
4989 }
4990 
4991 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4992                                          int64_t sector_num, int nb_sectors,
4993                                          QEMUIOVector *iov)
4994 {
4995     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4996 }
4997 
4998 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4999                                          int64_t sector_num, int nb_sectors,
5000                                          QEMUIOVector *iov)
5001 {
5002     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
5003 }
5004 
5005 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
5006 {
5007     RwCo *rwco = opaque;
5008 
5009     rwco->ret = bdrv_co_flush(rwco->bs);
5010 }
5011 
5012 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
5013 {
5014     int ret;
5015 
5016     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
5017         return 0;
5018     }
5019 
5020     /* Write back cached data to the OS even with cache=unsafe */
5021     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
5022     if (bs->drv->bdrv_co_flush_to_os) {
5023         ret = bs->drv->bdrv_co_flush_to_os(bs);
5024         if (ret < 0) {
5025             return ret;
5026         }
5027     }
5028 
5029     /* But don't actually force it to the disk with cache=unsafe */
5030     if (bs->open_flags & BDRV_O_NO_FLUSH) {
5031         goto flush_parent;
5032     }
5033 
5034     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
5035     if (bs->drv->bdrv_co_flush_to_disk) {
5036         ret = bs->drv->bdrv_co_flush_to_disk(bs);
5037     } else if (bs->drv->bdrv_aio_flush) {
5038         BlockAIOCB *acb;
5039         CoroutineIOCompletion co = {
5040             .coroutine = qemu_coroutine_self(),
5041         };
5042 
5043         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5044         if (acb == NULL) {
5045             ret = -EIO;
5046         } else {
5047             qemu_coroutine_yield();
5048             ret = co.ret;
5049         }
5050     } else {
5051         /*
5052          * Some block drivers always operate in either writethrough or unsafe
5053          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
5054          * know how the server works (because the behaviour is hardcoded or
5055          * depends on server-side configuration), so we can't ensure that
5056          * everything is safe on disk. Returning an error doesn't work because
5057          * that would break guests even if the server operates in writethrough
5058          * mode.
5059          *
5060          * Let's hope the user knows what he's doing.
5061          */
5062         ret = 0;
5063     }
5064     if (ret < 0) {
5065         return ret;
5066     }
5067 
5068     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
5069      * in the case of cache=unsafe, so there are no useless flushes.
5070      */
5071 flush_parent:
5072     return bdrv_co_flush(bs->file);
5073 }
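
/*
 * Drivers plug into the two flush stages above via their BlockDriver
 * callbacks. A minimal sketch, with a purely hypothetical driver
 * ("mydrv" and my_backend_sync() are illustrative, not real QEMU code):
 *
 *     static int coroutine_fn mydrv_co_flush_to_disk(BlockDriverState *bs)
 *     {
 *         MyDrvState *s = bs->opaque;     // hypothetical driver state
 *         return my_backend_sync(s);      // e.g. an fdatasync() equivalent
 *     }
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name           = "mydrv",
 *         .bdrv_co_flush_to_disk = mydrv_co_flush_to_disk,
 *     };
 *
 * A driver that keeps its own metadata cache would additionally implement
 * bdrv_co_flush_to_os so that cached data reaches the OS even with
 * cache=unsafe.
 */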
5074 
5075 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5076 {
5077     Error *local_err = NULL;
5078     int ret;
5079 
5080     if (!bs->drv)  {
5081         return;
5082     }
5083 
5084     if (!(bs->open_flags & BDRV_O_INCOMING)) {
5085         return;
5086     }
5087     bs->open_flags &= ~BDRV_O_INCOMING;
5088 
5089     if (bs->drv->bdrv_invalidate_cache) {
5090         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5091     } else if (bs->file) {
5092         bdrv_invalidate_cache(bs->file, &local_err);
5093     }
5094     if (local_err) {
5095         error_propagate(errp, local_err);
5096         return;
5097     }
5098 
5099     ret = refresh_total_sectors(bs, bs->total_sectors);
5100     if (ret < 0) {
5101         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5102         return;
5103     }
5104 }
5105 
5106 void bdrv_invalidate_cache_all(Error **errp)
5107 {
5108     BlockDriverState *bs;
5109     Error *local_err = NULL;
5110 
5111     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5112         AioContext *aio_context = bdrv_get_aio_context(bs);
5113 
5114         aio_context_acquire(aio_context);
5115         bdrv_invalidate_cache(bs, &local_err);
5116         aio_context_release(aio_context);
5117         if (local_err) {
5118             error_propagate(errp, local_err);
5119             return;
5120         }
5121     }
5122 }
5123 
5124 int bdrv_flush(BlockDriverState *bs)
5125 {
5126     Coroutine *co;
5127     RwCo rwco = {
5128         .bs = bs,
5129         .ret = NOT_DONE,
5130     };
5131 
5132     if (qemu_in_coroutine()) {
5133         /* Fast-path if already in coroutine context */
5134         bdrv_flush_co_entry(&rwco);
5135     } else {
5136         AioContext *aio_context = bdrv_get_aio_context(bs);
5137 
5138         co = qemu_coroutine_create(bdrv_flush_co_entry);
5139         qemu_coroutine_enter(co, &rwco);
5140         while (rwco.ret == NOT_DONE) {
5141             aio_poll(aio_context, true);
5142         }
5143     }
5144 
5145     return rwco.ret;
5146 }
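
/*
 * bdrv_flush() is safe to call from both coroutine and non-coroutine
 * context; outside a coroutine it spawns one itself and polls the
 * AioContext until NOT_DONE is overwritten. A hedged usage sketch:
 *
 *     int ret = bdrv_flush(bs);
 *     if (ret < 0) {
 *         error_report("flush failed: %s", strerror(-ret));
 *     }
 */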
5147 
5148 typedef struct DiscardCo {
5149     BlockDriverState *bs;
5150     int64_t sector_num;
5151     int nb_sectors;
5152     int ret;
5153 } DiscardCo;
5154 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5155 {
5156     DiscardCo *rwco = opaque;
5157 
5158     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5159 }
5160 
5161 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5162                                  int nb_sectors)
5163 {
5164     int max_discard, ret;
5165 
5166     if (!bs->drv) {
5167         return -ENOMEDIUM;
5168     }
5169 
5170     ret = bdrv_check_request(bs, sector_num, nb_sectors);
5171     if (ret < 0) {
5172         return ret;
5173     } else if (bs->read_only) {
5174         return -EROFS;
5175     }
5176 
5177     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5178 
5179     /* Do nothing if disabled.  */
5180     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5181         return 0;
5182     }
5183 
5184     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5185         return 0;
5186     }
5187 
5188     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
5189     while (nb_sectors > 0) {
5190         int ret;
5191         int num = nb_sectors;
5192 
5193         /* align request */
5194         if (bs->bl.discard_alignment &&
5195             num >= bs->bl.discard_alignment &&
5196             sector_num % bs->bl.discard_alignment) {
5197             if (num > bs->bl.discard_alignment) {
5198                 num = bs->bl.discard_alignment;
5199             }
5200             num -= sector_num % bs->bl.discard_alignment;
5201         }
5202 
5203         /* limit request size */
5204         if (num > max_discard) {
5205             num = max_discard;
5206         }
5207 
5208         if (bs->drv->bdrv_co_discard) {
5209             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5210         } else {
5211             BlockAIOCB *acb;
5212             CoroutineIOCompletion co = {
5213                 .coroutine = qemu_coroutine_self(),
5214             };
5215 
5216             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5217                                             bdrv_co_io_em_complete, &co);
5218             if (acb == NULL) {
5219                 return -EIO;
5220             } else {
5221                 qemu_coroutine_yield();
5222                 ret = co.ret;
5223             }
5224         }
5225         if (ret && ret != -ENOTSUP) {
5226             return ret;
5227         }
5228 
5229         sector_num += num;
5230         nb_sectors -= num;
5231     }
5232     return 0;
5233 }
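
/*
 * Worked example of the request splitting above (illustrative numbers,
 * assuming max_discard does not limit the request): with
 * bs->bl.discard_alignment == 8, sector_num == 5 and nb_sectors == 20:
 *
 *     iteration 1: num = 8 - (5 % 8) = 3   -> discards sectors [5, 8)
 *     iteration 2: num = 17 (aligned now)  -> discards sectors [8, 25)
 *
 * so only the unaligned head is split off; the remainder starts on a
 * discard_alignment boundary.
 */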
5234 
5235 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5236 {
5237     Coroutine *co;
5238     DiscardCo rwco = {
5239         .bs = bs,
5240         .sector_num = sector_num,
5241         .nb_sectors = nb_sectors,
5242         .ret = NOT_DONE,
5243     };
5244 
5245     if (qemu_in_coroutine()) {
5246         /* Fast-path if already in coroutine context */
5247         bdrv_discard_co_entry(&rwco);
5248     } else {
5249         AioContext *aio_context = bdrv_get_aio_context(bs);
5250 
5251         co = qemu_coroutine_create(bdrv_discard_co_entry);
5252         qemu_coroutine_enter(co, &rwco);
5253         while (rwco.ret == NOT_DONE) {
5254             aio_poll(aio_context, true);
5255         }
5256     }
5257 
5258     return rwco.ret;
5259 }
5260 
5261 /**************************************************************/
5262 /* removable device support */
5263 
5264 /**
5265  * Return TRUE if the media is present
5266  */
5267 int bdrv_is_inserted(BlockDriverState *bs)
5268 {
5269     BlockDriver *drv = bs->drv;
5270 
5271     if (!drv)
5272         return 0;
5273     if (!drv->bdrv_is_inserted)
5274         return 1;
5275     return drv->bdrv_is_inserted(bs);
5276 }
5277 
5278 /**
5279  * Return whether the media changed since the last call to this
5280  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5281  */
5282 int bdrv_media_changed(BlockDriverState *bs)
5283 {
5284     BlockDriver *drv = bs->drv;
5285 
5286     if (drv && drv->bdrv_media_changed) {
5287         return drv->bdrv_media_changed(bs);
5288     }
5289     return -ENOTSUP;
5290 }
5291 
5292 /**
5293  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5294  * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5295 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5296 {
5297     BlockDriver *drv = bs->drv;
5298     const char *device_name;
5299 
5300     if (drv && drv->bdrv_eject) {
5301         drv->bdrv_eject(bs, eject_flag);
5302     }
5303 
5304     device_name = bdrv_get_device_name(bs);
5305     if (device_name[0] != '\0') {
5306         qapi_event_send_device_tray_moved(device_name,
5307                                           eject_flag, &error_abort);
5308     }
5309 }
5310 
5311 /**
5312  * Lock or unlock the media (if it is locked, the user won't be able
5313  * to eject it manually).
5314  */
5315 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5316 {
5317     BlockDriver *drv = bs->drv;
5318 
5319     trace_bdrv_lock_medium(bs, locked);
5320 
5321     if (drv && drv->bdrv_lock_medium) {
5322         drv->bdrv_lock_medium(bs, locked);
5323     }
5324 }
5325 
5326 /* needed for generic scsi interface */
5327 
5328 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5329 {
5330     BlockDriver *drv = bs->drv;
5331 
5332     if (drv && drv->bdrv_ioctl)
5333         return drv->bdrv_ioctl(bs, req, buf);
5334     return -ENOTSUP;
5335 }
5336 
5337 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5338         unsigned long int req, void *buf,
5339         BlockCompletionFunc *cb, void *opaque)
5340 {
5341     BlockDriver *drv = bs->drv;
5342 
5343     if (drv && drv->bdrv_aio_ioctl)
5344         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5345     return NULL;
5346 }
5347 
5348 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5349 {
5350     bs->guest_block_size = align;
5351 }
5352 
5353 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5354 {
5355     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5356 }
5357 
5358 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5359 {
5360     return memset(qemu_blockalign(bs, size), 0, size);
5361 }
5362 
5363 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5364 {
5365     size_t align = bdrv_opt_mem_align(bs);
5366 
5367     /* Ensure that NULL is never returned on success */
5368     assert(align > 0);
5369     if (size == 0) {
5370         size = align;
5371     }
5372 
5373     return qemu_try_memalign(align, size);
5374 }
5375 
5376 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5377 {
5378     void *mem = qemu_try_blockalign(bs, size);
5379 
5380     if (mem) {
5381         memset(mem, 0, size);
5382     }
5383 
5384     return mem;
5385 }
5386 
5387 /*
5388  * Check if all memory in this vector is sector aligned.
5389  */
5390 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5391 {
5392     int i;
5393     size_t alignment = bdrv_opt_mem_align(bs);
5394 
5395     for (i = 0; i < qiov->niov; i++) {
5396         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5397             return false;
5398         }
5399         if (qiov->iov[i].iov_len % alignment) {
5400             return false;
5401         }
5402     }
5403 
5404     return true;
5405 }
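
/*
 * Callers that need this check to succeed build the vector from
 * qemu_blockalign()ed buffers. Sketch (len is assumed to be a multiple
 * of the memory alignment):
 *
 *     QEMUIOVector qiov;
 *     void *buf = qemu_blockalign(bs, len);
 *
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buf, len);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 */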
5406 
5407 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5408                                           Error **errp)
5409 {
5410     int64_t bitmap_size;
5411     BdrvDirtyBitmap *bitmap;
5412 
5413     assert((granularity & (granularity - 1)) == 0);
5414 
5415     granularity >>= BDRV_SECTOR_BITS;
5416     assert(granularity);
5417     bitmap_size = bdrv_nb_sectors(bs);
5418     if (bitmap_size < 0) {
5419         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5420         errno = -bitmap_size;
5421         return NULL;
5422     }
5423     bitmap = g_new0(BdrvDirtyBitmap, 1);
5424     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5425     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5426     return bitmap;
5427 }
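
/*
 * Example (illustrative): with a granularity of 65536 bytes, the shift
 * above yields 128 sectors per bit and ffs(128) - 1 == 7, so the HBitmap
 * tracks the device in 2^7-sector chunks:
 *
 *     BdrvDirtyBitmap *bm = bdrv_create_dirty_bitmap(bs, 65536, &err);
 */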
5428 
5429 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5430 {
5431     BdrvDirtyBitmap *bm, *next;
5432     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5433         if (bm == bitmap) {
5434             QLIST_REMOVE(bitmap, list);
5435             hbitmap_free(bitmap->bitmap);
5436             g_free(bitmap);
5437             return;
5438         }
5439     }
5440 }
5441 
5442 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5443 {
5444     BdrvDirtyBitmap *bm;
5445     BlockDirtyInfoList *list = NULL;
5446     BlockDirtyInfoList **plist = &list;
5447 
5448     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5449         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5450         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5451         info->count = bdrv_get_dirty_count(bs, bm);
5452         info->granularity =
5453             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5454         entry->value = info;
5455         *plist = entry;
5456         plist = &entry->next;
5457     }
5458 
5459     return list;
5460 }
5461 
5462 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5463 {
5464     if (bitmap) {
5465         return hbitmap_get(bitmap->bitmap, sector);
5466     } else {
5467         return 0;
5468     }
5469 }
5470 
5471 void bdrv_dirty_iter_init(BlockDriverState *bs,
5472                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5473 {
5474     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5475 }
5476 
5477 void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
5478                            int64_t cur_sector, int nr_sectors)
5479 {
5480     hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5481 }
5482 
5483 void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
5484                              int64_t cur_sector, int nr_sectors)
5485 {
5486     hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5487 }
5488 
5489 static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5490                            int nr_sectors)
5491 {
5492     BdrvDirtyBitmap *bitmap;
5493     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5494         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5495     }
5496 }
5497 
5498 static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
5499                              int nr_sectors)
5500 {
5501     BdrvDirtyBitmap *bitmap;
5502     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5503         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5504     }
5505 }
5506 
5507 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5508 {
5509     return hbitmap_count(bitmap->bitmap);
5510 }
5511 
5512 /* Get a reference to bs */
5513 void bdrv_ref(BlockDriverState *bs)
5514 {
5515     bs->refcnt++;
5516 }
5517 
5518 /* Release a previously grabbed reference to bs.
5519  * If after releasing, reference count is zero, the BlockDriverState is
5520  * deleted. */
5521 void bdrv_unref(BlockDriverState *bs)
5522 {
5523     if (!bs) {
5524         return;
5525     }
5526     assert(bs->refcnt > 0);
5527     if (--bs->refcnt == 0) {
5528         bdrv_delete(bs);
5529     }
5530 }
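
/*
 * Typical pattern: hold a reference for as long as an operation uses bs,
 * so a concurrent bdrv_unref() elsewhere cannot delete it underneath
 * (do_long_running_work() is hypothetical):
 *
 *     bdrv_ref(bs);
 *     do_long_running_work(bs);
 *     bdrv_unref(bs);
 */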
5531 
5532 struct BdrvOpBlocker {
5533     Error *reason;
5534     QLIST_ENTRY(BdrvOpBlocker) list;
5535 };
5536 
5537 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5538 {
5539     BdrvOpBlocker *blocker;
5540     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5541     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5542         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5543         if (errp) {
5544             error_setg(errp, "Device '%s' is busy: %s",
5545                        bdrv_get_device_name(bs),
5546                        error_get_pretty(blocker->reason));
5547         }
5548         return true;
5549     }
5550     return false;
5551 }
5552 
5553 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5554 {
5555     BdrvOpBlocker *blocker;
5556     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5557 
5558     blocker = g_new0(BdrvOpBlocker, 1);
5559     blocker->reason = reason;
5560     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5561 }
5562 
5563 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5564 {
5565     BdrvOpBlocker *blocker, *next;
5566     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5567     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5568         if (blocker->reason == reason) {
5569             QLIST_REMOVE(blocker, list);
5570             g_free(blocker);
5571         }
5572     }
5573 }
5574 
5575 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5576 {
5577     int i;
5578     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5579         bdrv_op_block(bs, i, reason);
5580     }
5581 }
5582 
5583 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5584 {
5585     int i;
5586     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5587         bdrv_op_unblock(bs, i, reason);
5588     }
5589 }
5590 
5591 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5592 {
5593     int i;
5594 
5595     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5596         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5597             return false;
5598         }
5599     }
5600     return true;
5601 }
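
/*
 * Op blockers are paired: whoever blocks must later unblock with the
 * same Error pointer. Sketch of how a job might protect its node
 * (mirrors the blockjob usage, names illustrative):
 *
 *     Error *blocker = NULL;
 *
 *     error_setg(&blocker, "Node is in use by my-job");
 *     bdrv_op_block_all(bs, blocker);
 *     ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */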
5602 
5603 void bdrv_iostatus_enable(BlockDriverState *bs)
5604 {
5605     bs->iostatus_enabled = true;
5606     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5607 }
5608 
5609 /* The I/O status is only enabled if the drive explicitly
5610  * enables it _and_ the VM is configured to stop on errors */
5611 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5612 {
5613     return (bs->iostatus_enabled &&
5614            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5615             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5616             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5617 }
5618 
5619 void bdrv_iostatus_disable(BlockDriverState *bs)
5620 {
5621     bs->iostatus_enabled = false;
5622 }
5623 
5624 void bdrv_iostatus_reset(BlockDriverState *bs)
5625 {
5626     if (bdrv_iostatus_is_enabled(bs)) {
5627         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5628         if (bs->job) {
5629             block_job_iostatus_reset(bs->job);
5630         }
5631     }
5632 }
5633 
5634 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5635 {
5636     assert(bdrv_iostatus_is_enabled(bs));
5637     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5638         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5639                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5640     }
5641 }
5642 
5643 void bdrv_img_create(const char *filename, const char *fmt,
5644                      const char *base_filename, const char *base_fmt,
5645                      char *options, uint64_t img_size, int flags,
5646                      Error **errp, bool quiet)
5647 {
5648     QemuOptsList *create_opts = NULL;
5649     QemuOpts *opts = NULL;
5650     const char *backing_fmt, *backing_file;
5651     int64_t size;
5652     BlockDriver *drv, *proto_drv;
5653     BlockDriver *backing_drv = NULL;
5654     Error *local_err = NULL;
5655     int ret = 0;
5656 
5657     /* Find driver and parse its options */
5658     drv = bdrv_find_format(fmt);
5659     if (!drv) {
5660         error_setg(errp, "Unknown file format '%s'", fmt);
5661         return;
5662     }
5663 
5664     proto_drv = bdrv_find_protocol(filename, true, errp);
5665     if (!proto_drv) {
5666         return;
5667     }
5668 
5669     if (!drv->create_opts) {
5670         error_setg(errp, "Format driver '%s' does not support image creation",
5671                    drv->format_name);
5672         return;
5673     }
5674 
5675     if (!proto_drv->create_opts) {
5676         error_setg(errp, "Protocol driver '%s' does not support image creation",
5677                    proto_drv->format_name);
5678         return;
5679     }
5680 
5681     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5682     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5683 
5684     /* Create parameter list with default values */
5685     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5686     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
5687 
5688     /* Parse -o options */
5689     if (options) {
5690         qemu_opts_do_parse(opts, options, NULL, &local_err);
5691         if (local_err) {
5692             error_report_err(local_err);
5693             local_err = NULL;
5694             error_setg(errp, "Invalid options for file format '%s'", fmt);
5695             goto out;
5696         }
5697     }
5698 
5699     if (base_filename) {
5700         qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
5701         if (local_err) {
5702             error_setg(errp, "Backing file not supported for file format '%s'",
5703                        fmt);
5704             goto out;
5705         }
5706     }
5707 
5708     if (base_fmt) {
5709         qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
5710         if (local_err) {
5711             error_setg(errp, "Backing file format not supported for file "
5712                              "format '%s'", fmt);
5713             goto out;
5714         }
5715     }
5716 
5717     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5718     if (backing_file) {
5719         if (!strcmp(filename, backing_file)) {
5720             error_setg(errp, "Trying to create an image with the "
5721                              "same filename as the backing file");
5722             goto out;
5723         }
5724     }
5725 
5726     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5727     if (backing_fmt) {
5728         backing_drv = bdrv_find_format(backing_fmt);
5729         if (!backing_drv) {
5730             error_setg(errp, "Unknown backing file format '%s'",
5731                        backing_fmt);
5732             goto out;
5733         }
5734     }
5735 
5736     /* The size for the image must always be specified, with one exception:
5737      * if we are using a backing file, we can obtain the size from there. */
5738     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5739     if (size == -1) {
5740         if (backing_file) {
5741             BlockDriverState *bs;
5742             char *full_backing = g_new0(char, PATH_MAX);
5743             int64_t size;
5744             int back_flags;
5745 
5746             bdrv_get_full_backing_filename_from_filename(filename, backing_file,
5747                                                          full_backing, PATH_MAX,
5748                                                          &local_err);
5749             if (local_err) {
5750                 g_free(full_backing);
5751                 goto out;
5752             }
5753 
5754             /* backing files always opened read-only */
5755             back_flags =
5756                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5757 
5758             bs = NULL;
5759             ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
5760                             backing_drv, &local_err);
5761             g_free(full_backing);
5762             if (ret < 0) {
5763                 goto out;
5764             }
5765             size = bdrv_getlength(bs);
5766             if (size < 0) {
5767                 error_setg_errno(errp, -size, "Could not get size of '%s'",
5768                                  backing_file);
5769                 bdrv_unref(bs);
5770                 goto out;
5771             }
5772 
5773             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
5774 
5775             bdrv_unref(bs);
5776         } else {
5777             error_setg(errp, "Image creation needs a size parameter");
5778             goto out;
5779         }
5780     }
5781 
5782     if (!quiet) {
5783         printf("Formatting '%s', fmt=%s", filename, fmt);
5784         qemu_opts_print(opts, " ");
5785         puts("");
5786     }
5787 
5788     ret = bdrv_create(drv, filename, opts, &local_err);
5789 
5790     if (ret == -EFBIG) {
5791         /* This is generally a better message than whatever the driver would
5792          * deliver (especially because of the cluster_size_hint), since that
5793          * is most probably not much different from "image too large". */
5794         const char *cluster_size_hint = "";
5795         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5796             cluster_size_hint = " (try using a larger cluster size)";
5797         }
5798         error_setg(errp, "The image size is too large for file format '%s'"
5799                    "%s", fmt, cluster_size_hint);
5800         error_free(local_err);
5801         local_err = NULL;
5802     }
5803 
5804 out:
5805     qemu_opts_del(opts);
5806     qemu_opts_free(create_opts);
5807     if (local_err) {
5808         error_propagate(errp, local_err);
5809     }
5810 }
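
/*
 * Usage sketch (a simplified version of what qemu-img create does; the
 * filename and size are illustrative):
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                     1024 * 1024 * 1024, 0, &err, false);
 *     if (err) {
 *         error_report_err(err);
 *     }
 */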
5811 
5812 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5813 {
5814     return bs->aio_context;
5815 }
5816 
5817 void bdrv_detach_aio_context(BlockDriverState *bs)
5818 {
5819     BdrvAioNotifier *baf;
5820 
5821     if (!bs->drv) {
5822         return;
5823     }
5824 
5825     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5826         baf->detach_aio_context(baf->opaque);
5827     }
5828 
5829     if (bs->io_limits_enabled) {
5830         throttle_detach_aio_context(&bs->throttle_state);
5831     }
5832     if (bs->drv->bdrv_detach_aio_context) {
5833         bs->drv->bdrv_detach_aio_context(bs);
5834     }
5835     if (bs->file) {
5836         bdrv_detach_aio_context(bs->file);
5837     }
5838     if (bs->backing_hd) {
5839         bdrv_detach_aio_context(bs->backing_hd);
5840     }
5841 
5842     bs->aio_context = NULL;
5843 }
5844 
5845 void bdrv_attach_aio_context(BlockDriverState *bs,
5846                              AioContext *new_context)
5847 {
5848     BdrvAioNotifier *ban;
5849 
5850     if (!bs->drv) {
5851         return;
5852     }
5853 
5854     bs->aio_context = new_context;
5855 
5856     if (bs->backing_hd) {
5857         bdrv_attach_aio_context(bs->backing_hd, new_context);
5858     }
5859     if (bs->file) {
5860         bdrv_attach_aio_context(bs->file, new_context);
5861     }
5862     if (bs->drv->bdrv_attach_aio_context) {
5863         bs->drv->bdrv_attach_aio_context(bs, new_context);
5864     }
5865     if (bs->io_limits_enabled) {
5866         throttle_attach_aio_context(&bs->throttle_state, new_context);
5867     }
5868 
5869     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5870         ban->attached_aio_context(new_context, ban->opaque);
5871     }
5872 }
5873 
5874 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5875 {
5876     bdrv_drain_all(); /* ensure there are no in-flight requests */
5877 
5878     bdrv_detach_aio_context(bs);
5879 
5880     /* This function executes in the old AioContext so acquire the new one in
5881      * case it runs in a different thread.
5882      */
5883     aio_context_acquire(new_context);
5884     bdrv_attach_aio_context(bs, new_context);
5885     aio_context_release(new_context);
5886 }
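
/*
 * Sketch of handing a BDS over to an IOThread (must be called from the
 * old context; iothread is assumed to be a configured IOThread object):
 *
 *     AioContext *ctx = iothread_get_aio_context(iothread);
 *
 *     bdrv_set_aio_context(bs, ctx);
 */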
5887 
5888 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5889         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5890         void (*detach_aio_context)(void *opaque), void *opaque)
5891 {
5892     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5893     *ban = (BdrvAioNotifier){
5894         .attached_aio_context = attached_aio_context,
5895         .detach_aio_context   = detach_aio_context,
5896         .opaque               = opaque
5897     };
5898 
5899     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5900 }
5901 
5902 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5903                                       void (*attached_aio_context)(AioContext *,
5904                                                                    void *),
5905                                       void (*detach_aio_context)(void *),
5906                                       void *opaque)
5907 {
5908     BdrvAioNotifier *ban, *ban_next;
5909 
5910     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5911         if (ban->attached_aio_context == attached_aio_context &&
5912             ban->detach_aio_context   == detach_aio_context   &&
5913             ban->opaque               == opaque)
5914         {
5915             QLIST_REMOVE(ban, list);
5916             g_free(ban);
5917 
5918             return;
5919         }
5920     }
5921 
5922     abort();
5923 }
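
/*
 * The notifier calls are paired: removal must pass exactly the same
 * (attached_aio_context, detach_aio_context, opaque) triple that was
 * registered, or the abort() above fires. Sketch (callbacks are
 * hypothetical):
 *
 *     bdrv_add_aio_context_notifier(bs, my_attach, my_detach, dev);
 *     ...
 *     bdrv_remove_aio_context_notifier(bs, my_attach, my_detach, dev);
 */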
5924 
5925 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5926                                     NotifierWithReturn *notifier)
5927 {
5928     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5929 }
5930 
5931 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5932                        BlockDriverAmendStatusCB *status_cb)
5933 {
5934     if (!bs->drv->bdrv_amend_options) {
5935         return -ENOTSUP;
5936     }
5937     return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5938 }
5939 
5940 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5941  * of block filters and by bdrv_is_first_non_filter.
5942  * It is used to test whether the given bs is the candidate or to recurse
5943  * further into the node graph.
5944  */
5945 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5946                                       BlockDriverState *candidate)
5947 {
5948     /* return false if basic checks fail */
5949     if (!bs || !bs->drv) {
5950         return false;
5951     }
5952 
5953     /* the code reached a non-filter block driver -> check whether the bs is
5954      * the same as the candidate. It's the recursion termination condition.
5955      */
5956     if (!bs->drv->is_filter) {
5957         return bs == candidate;
5958     }
5959     /* Down this path the driver is a block filter driver */
5960 
5961     /* If the block filter recursion method is defined use it to recurse down
5962      * the node graph.
5963      */
5964     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5965         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5966     }
5967 
5968     /* the driver is a block filter but doesn't allow recursion
5969      * -> return false */
5970     return false;
5971 }
5972 
5973 /* This function checks whether the candidate is the first non-filter bs down
5974  * its bs chain. Since we don't have pointers to parents, it explores all bs
5975  * chains from the top. Some filters can choose not to pass down the recursion.
5976  */
5977 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5978 {
5979     BlockDriverState *bs;
5980 
5981     /* walk down the bs forest recursively */
5982     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5983         bool perm;
5984 
5985         /* try to recurse in this top level bs */
5986         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5987 
5988         /* candidate is the first non filter */
5989         if (perm) {
5990             return true;
5991         }
5992     }
5993 
5994     return false;
5995 }
5996 
5997 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5998 {
5999     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
6000     AioContext *aio_context;
6001 
6002     if (!to_replace_bs) {
6003         error_setg(errp, "Node name '%s' not found", node_name);
6004         return NULL;
6005     }
6006 
6007     aio_context = bdrv_get_aio_context(to_replace_bs);
6008     aio_context_acquire(aio_context);
6009 
6010     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
6011         to_replace_bs = NULL;
6012         goto out;
6013     }
6014 
6015     /* We don't want an arbitrary node of the BDS chain to be replaced, only
6016      * the topmost non-filter, in order to prevent data corruption.
6017      * Another benefit is that this test excludes backing files, which are
6018      * blocked by the backing blockers.
6019      */
6020     if (!bdrv_is_first_non_filter(to_replace_bs)) {
6021         error_setg(errp, "Only the topmost non-filter node can be replaced");
6022         to_replace_bs = NULL;
6023         goto out;
6024     }
6025 
6026 out:
6027     aio_context_release(aio_context);
6028     return to_replace_bs;
6029 }
6030 
6031 void bdrv_io_plug(BlockDriverState *bs)
6032 {
6033     BlockDriver *drv = bs->drv;
6034     if (drv && drv->bdrv_io_plug) {
6035         drv->bdrv_io_plug(bs);
6036     } else if (bs->file) {
6037         bdrv_io_plug(bs->file);
6038     }
6039 }
6040 
6041 void bdrv_io_unplug(BlockDriverState *bs)
6042 {
6043     BlockDriver *drv = bs->drv;
6044     if (drv && drv->bdrv_io_unplug) {
6045         drv->bdrv_io_unplug(bs);
6046     } else if (bs->file) {
6047         bdrv_io_unplug(bs->file);
6048     }
6049 }
6050 
6051 void bdrv_flush_io_queue(BlockDriverState *bs)
6052 {
6053     BlockDriver *drv = bs->drv;
6054     if (drv && drv->bdrv_flush_io_queue) {
6055         drv->bdrv_flush_io_queue(bs);
6056     } else if (bs->file) {
6057         bdrv_flush_io_queue(bs->file);
6058     }
6059 }
6060 
6061 static bool append_open_options(QDict *d, BlockDriverState *bs)
6062 {
6063     const QDictEntry *entry;
6064     bool found_any = false;
6065 
6066     for (entry = qdict_first(bs->options); entry;
6067          entry = qdict_next(bs->options, entry))
6068     {
6069         /* Only take options for this level and exclude all non-driver-specific
6070          * options */
6071         if (!strchr(qdict_entry_key(entry), '.') &&
6072             strcmp(qdict_entry_key(entry), "node-name"))
6073         {
6074             qobject_incref(qdict_entry_value(entry));
6075             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
6076             found_any = true;
6077         }
6078     }
6079 
6080     return found_any;
6081 }
6082 
6083 /* Updates the following BDS fields:
6084  *  - exact_filename: A filename which may be used for opening a block device
6085  *                    which (mostly) equals the given BDS (even without any
6086  *                    other options; so reading and writing must return the same
6087  *                    results, but caching etc. may be different)
6088  *  - full_open_options: Options which, when given when opening a block device
6089  *                       (without a filename), result in a BDS (mostly)
6090  *                       equalling the given one
6091  *  - filename: If exact_filename is set, it is copied here. Otherwise,
6092  *              full_open_options is converted to a JSON object, prefixed with
6093  *              "json:" (for use through the JSON pseudo protocol) and put here.
6094  */
6095 void bdrv_refresh_filename(BlockDriverState *bs)
6096 {
6097     BlockDriver *drv = bs->drv;
6098     QDict *opts;
6099 
6100     if (!drv) {
6101         return;
6102     }
6103 
6104     /* This BDS's file name will most probably depend on its file's name, so
6105      * refresh that first */
6106     if (bs->file) {
6107         bdrv_refresh_filename(bs->file);
6108     }
6109 
6110     if (drv->bdrv_refresh_filename) {
6111         /* Obsolete information is of no use here, so drop the old file name
6112          * information before refreshing it */
6113         bs->exact_filename[0] = '\0';
6114         if (bs->full_open_options) {
6115             QDECREF(bs->full_open_options);
6116             bs->full_open_options = NULL;
6117         }
6118 
6119         drv->bdrv_refresh_filename(bs);
6120     } else if (bs->file) {
6121         /* Try to reconstruct valid information from the underlying file */
6122         bool has_open_options;
6123 
6124         bs->exact_filename[0] = '\0';
6125         if (bs->full_open_options) {
6126             QDECREF(bs->full_open_options);
6127             bs->full_open_options = NULL;
6128         }
6129 
6130         opts = qdict_new();
6131         has_open_options = append_open_options(opts, bs);
6132 
6133         /* If no specific options have been given for this BDS, the filename of
6134          * the underlying file should suffice for this one as well */
6135         if (bs->file->exact_filename[0] && !has_open_options) {
6136             strcpy(bs->exact_filename, bs->file->exact_filename);
6137         }
6138         /* Reconstructing the full options QDict is simple for most format block
6139          * drivers, as long as the full options are known for the underlying
6140          * file BDS. The full options QDict of that file BDS should somehow
6141          * contain a representation of the filename, therefore the following
6142          * suffices without querying the (exact_)filename of this BDS. */
6143         if (bs->file->full_open_options) {
6144             qdict_put_obj(opts, "driver",
6145                           QOBJECT(qstring_from_str(drv->format_name)));
6146             QINCREF(bs->file->full_open_options);
6147             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6148 
6149             bs->full_open_options = opts;
6150         } else {
6151             QDECREF(opts);
6152         }
6153     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6154         /* There is no underlying file BDS (at least referenced by BDS.file),
6155          * so the full options QDict should be equal to the options given
6156          * specifically for this block device when it was opened (plus the
6157          * driver specification).
6158          * Because those options don't change, there is no need to update
6159          * full_open_options when it's already set. */
6160 
6161         opts = qdict_new();
6162         append_open_options(opts, bs);
6163         qdict_put_obj(opts, "driver",
6164                       QOBJECT(qstring_from_str(drv->format_name)));
6165 
6166         if (bs->exact_filename[0]) {
6167             /* This may not work for all block protocol drivers (some may
6168              * require this filename to be parsed), but we have to find some
6169              * default solution here, so just include it. If some block driver
6170              * does not support pure options without any filename at all or
6171              * needs some special format of the options QDict, it needs to
6172              * implement the driver-specific bdrv_refresh_filename() function.
6173              */
6174             qdict_put_obj(opts, "filename",
6175                           QOBJECT(qstring_from_str(bs->exact_filename)));
6176         }
6177 
6178         bs->full_open_options = opts;
6179     }
6180 
6181     if (bs->exact_filename[0]) {
6182         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6183     } else if (bs->full_open_options) {
6184         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6185         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6186                  qstring_get_str(json));
6187         QDECREF(json);
6188     }
6189 }
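
/*
 * Example result (illustrative): a qcow2 BDS whose options cannot be
 * reduced to a plain path ends up with a json: pseudo-protocol filename
 * such as:
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *         "filename": "/tmp/test.qcow2"}}
 */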
6190 
6191 /* This accessor function's purpose is to allow the device models to access
6192  * the BlockAcctStats structure embedded inside a BlockDriverState without
6193  * being aware of the BlockDriverState structure layout.
6194  * It will go away when the BlockAcctStats structure is moved inside
6195  * the device models.
6196  */
6197 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6198 {
6199     return &bs->stats;
6200 }
6201