/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "sysemu/qtest.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors);
static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}
/* this function drains all the throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    int clock_type = QEMU_CLOCK_REALTIME;

    if (qtest_enabled()) {
        /* For testing block IO throttling only */
        clock_type = QEMU_CLOCK_VIRTUAL;
    }
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  clock_type,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O request wait if needed
 *
 * @bytes:    the number of bytes of this I/O
 * @is_write: whether this I/O is a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* must this I/O wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or if any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);


    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

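/*
 * Illustrative sketch: the request paths are expected to call the
 * throttling hook like this before submitting an I/O; the coroutine
 * context and the "bytes" variable are assumptions of the example.
 *
 *     if (bs->io_limits_enabled) {
 *         bdrv_io_limits_intercept(bs, bytes, false);   // false: a read
 *     }
 *     // ... issue the actual read once the throttle admits the request ...
 */
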
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

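/*
 * Examples (illustrative, assuming a POSIX host):
 *
 *     path_has_protocol("nbd:unix:/tmp/sock")  -> 1  ("nbd" prefix)
 *     path_has_protocol("/var/lib/img.qcow2")  -> 0  ('/' found before ':')
 *     path_has_protocol("relative/name.img")   -> 0  (no ':' before a '/')
 */
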
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

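/*
 * Examples (illustrative):
 *
 *     path_combine(buf, sizeof(buf), "/images/base.qcow2", "backing.qcow2");
 *         -> "/images/backing.qcow2"
 *     path_combine(buf, sizeof(buf), "http://host/dir/a.img", "b.img");
 *         -> "http://host/dir/b.img"
 *     path_combine(buf, sizeof(buf), "/images/base.qcow2", "/abs/b.img");
 *         -> "/abs/b.img"   (absolute filenames are copied unchanged)
 */
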
void bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                  const char *backing,
                                                  char *dest, size_t sz,
                                                  Error **errp)
{
    if (backing[0] == '\0' || path_has_protocol(backing) ||
        path_is_absolute(backing))
    {
        pstrcpy(dest, sz, backing);
    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
        error_setg(errp, "Cannot use relative backing file names for '%s'",
                   backed);
    } else {
        path_combine(dest, sz, backed, backing);
    }
}

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
                                    Error **errp)
{
    char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;

    bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
                                                 dest, sz, errp);
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

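/*
 * Illustrative sketch: a driver normally registers itself from a module
 * constructor; "mydrv" and its callbacks are hypothetical.
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",
 *         .bdrv_open     = mydrv_open,        // hypothetical callback
 *         .bdrv_co_readv = mydrv_co_readv,    // hypothetical callback
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *     block_init(bdrv_mydrv_init);
 */
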
BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

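/*
 * Illustrative sketch: creating a 1 MB qcow2 image; the filename and the
 * choice of format are assumptions of the example.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     Error *err = NULL;
 *
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024,
 *                         &error_abort);
 *     if (bdrv_create(drv, "/tmp/test.qcow2", opts, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 *     qemu_opts_del(opts);
 */
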
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true, errp);
    if (drv == NULL) {
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * Try to get @bs's logical and physical block size.
 * On success, store them in @bsz struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_blocksizes) {
        return drv->bdrv_probe_blocksizes(bs, bsz);
    }

    return -ENOTSUP;
}

/**
 * Try to get @bs's geometry (cyls, heads, sectors).
 * On success, store them in @geo struct and return 0.
 * On failure return -errno.
 * @bs must not be empty.
 */
int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_probe_geometry) {
        return drv->bdrv_probe_geometry(bs, geo);
    }

    return -ENOTSUP;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

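/*
 * Illustrative usage:
 *
 *     char tmp[PATH_MAX];
 *     if (get_tmp_filename(tmp, sizeof(tmp)) < 0) {
 *         // handle the error
 *     }
 *     // tmp now names an empty temporary file that the caller must unlink
 */
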
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix,
                                Error **errp)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }

    error_setg(errp, "Unknown protocol '%s'", protocol);
    return NULL;
}

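/*
 * Examples (illustrative; which drivers exist depends on the build):
 *
 *     bdrv_find_protocol("nbd://host:10809/export", true, NULL)
 *         -> the "nbd" protocol driver, if it is compiled in
 *     bdrv_find_protocol("/dev/cdrom", true, NULL)
 *         -> a host device driver, if one probes the path successfully
 *     bdrv_find_protocol("plain-file.img", true, NULL)
 *         -> &bdrv_file (no protocol prefix present)
 */
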
/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf         contains the image's first @buf_size bytes.
 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 *              but can be smaller if the image file is smaller)
 * @filename    is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

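/*
 * Examples (illustrative):
 *
 *     int flags = 0;
 *     bdrv_parse_discard_flags("unmap", &flags);   // sets BDRV_O_UNMAP
 *     bdrv_parse_discard_flags("ignore", &flags);  // clears BDRV_O_UNMAP
 *     bdrv_parse_discard_flags("bogus", &flags);   // returns -1
 */
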
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

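/*
 * The resulting flag combinations, summarized (illustrative):
 *
 *     mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *     writethrough        -                -                -
 *     writeback           -                x                -
 *     none/off            x                x                -
 *     directsync          x                -                -
 *     unsafe              -                x                x
 */
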
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s conflicts with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    if (bs->encrypted) {
        error_report("Encrypted images are deprecated");
        error_printf("Support for them will be removed in a future release.\n"
                     "You can use 'qemu-img convert' to convert your image"
                     " to an unencrypted one.\n");
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

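/*
 * Example of the json: pseudo-protocol (illustrative):
 *
 *     json:{"driver": "qcow2",
 *           "file": {"driver": "file", "filename": "/tmp/test.qcow2"}}
 *
 * After qdict_flatten(), this yields the flat keys "driver", "file.driver"
 * and "file.filename".
 */
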
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename, errp);
                if (!drv) {
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file().
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
                                       &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            QDECREF(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

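/*
 * Illustrative sketch: opening a data file referenced under the "file" key
 * of an options QDict; "options", "flags" and "errp" are assumed to come
 * from the surrounding context.
 *
 *     BlockDriverState *data_file = NULL;
 *     int ret = bdrv_open_image(&data_file, NULL, options, "file",
 *                               bdrv_inherited_flags(flags),
 *                               false, errp);    // false: "file" is required
 *     if (ret < 0) {
 *         // neither a "file" reference nor "file.*" options worked out
 *     }
 */
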
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

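/*
 * Illustrative sketch: opening an image read/write with format probing;
 * the filename is an assumption of the example.
 *
 *     BlockDriverState *bs = NULL;
 *     Error *err = NULL;
 *
 *     if (bdrv_open(&bs, "/tmp/test.qcow2", NULL, NULL,
 *                   BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, &err) < 0) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     } else {
 *         // ... use bs ...
 *         bdrv_unref(bs);
 *     }
 */
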
typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

1721 /*
1722  * Reopen multiple BlockDriverStates atomically & transactionally.
1723  *
1724  * The queue passed in (bs_queue) must have been built up previously
1725  * via bdrv_reopen_queue().
1726  *
1727  * Reopens all BDS specified in the queue, with the appropriate
1728  * flags.  All devices are prepared for reopen, and failure of any
1729  * device will cause all device changes to be abandoned, and intermediate
1730  * data cleaned up.
1731  *
1732  * If all devices prepare successfully, then the changes are committed
1733  * to all devices.
1734  *
1735  */
1736 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1737 {
1738     int ret = -1;
1739     BlockReopenQueueEntry *bs_entry, *next;
1740     Error *local_err = NULL;
1741 
1742     assert(bs_queue != NULL);
1743 
1744     bdrv_drain_all();
1745 
1746     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1747         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1748             error_propagate(errp, local_err);
1749             goto cleanup;
1750         }
1751         bs_entry->prepared = true;
1752     }
1753 
1754     /* If we reach this point, we have success and just need to apply the
1755      * changes
1756      */
1757     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1758         bdrv_reopen_commit(&bs_entry->state);
1759     }
1760 
1761     ret = 0;
1762 
1763 cleanup:
1764     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1765         if (ret && bs_entry->prepared) {
1766             bdrv_reopen_abort(&bs_entry->state);
1767         }
1768         g_free(bs_entry);
1769     }
1770     g_free(bs_queue);
1771     return ret;
1772 }
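/*
 * Example (illustrative sketch, not part of this revision): queue two
 * related BDSes and reopen them as one atomic set.  The helper name and
 * the flag choice are assumptions for illustration only; the calls match
 * the signatures above, and bdrv_reopen_multiple() frees the queue itself.
 *
 *     static int reopen_pair_rw(BlockDriverState *a, BlockDriverState *b,
 *                               Error **errp)
 *     {
 *         BlockReopenQueue *queue;
 *
 *         queue = bdrv_reopen_queue(NULL, a, a->open_flags | BDRV_O_RDWR);
 *         queue = bdrv_reopen_queue(queue, b, b->open_flags | BDRV_O_RDWR);
 *         return bdrv_reopen_multiple(queue, errp);
 *     }
 */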
1773 
1774 
1775 /* Reopen a single BlockDriverState with the specified flags. */
1776 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1777 {
1778     int ret = -1;
1779     Error *local_err = NULL;
1780     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1781 
1782     ret = bdrv_reopen_multiple(queue, &local_err);
1783     if (local_err != NULL) {
1784         error_propagate(errp, local_err);
1785     }
1786     return ret;
1787 }
1788 
1789 
1790 /*
1791  * Prepares a BlockDriverState for reopen. All changes are staged in the
1792  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1793  * the block driver's .bdrv_reopen_prepare() callback.
1794  *
1795  * bs is the BlockDriverState to reopen
1796  * flags are the new open flags
1797  * queue is the reopen queue
1798  *
1799  * Returns 0 on success, non-zero on error.  On error errp will be set
1800  * as well.
1801  *
1802  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1803  * It is the responsibility of the caller to then call bdrv_reopen_abort()
1804  * or bdrv_reopen_commit() for any other BDS that have been left in the
1805  * prepare() state.
1806  */
1807 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1808                         Error **errp)
1809 {
1810     int ret = -1;
1811     Error *local_err = NULL;
1812     BlockDriver *drv;
1813 
1814     assert(reopen_state != NULL);
1815     assert(reopen_state->bs->drv != NULL);
1816     drv = reopen_state->bs->drv;
1817 
1818     /* if we are to stay read-only, do not allow permission change
1819      * to r/w */
1820     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1821         reopen_state->flags & BDRV_O_RDWR) {
1822         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1823                   bdrv_get_device_name(reopen_state->bs));
1824         goto error;
1825     }
1826 
1827 
1828     ret = bdrv_flush(reopen_state->bs);
1829     if (ret) {
1830         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1831                   strerror(-ret));
1832         goto error;
1833     }
1834 
1835     if (drv->bdrv_reopen_prepare) {
1836         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1837         if (ret) {
1838             if (local_err != NULL) {
1839                 error_propagate(errp, local_err);
1840             } else {
1841                 error_setg(errp, "failed while preparing to reopen image '%s'",
1842                            reopen_state->bs->filename);
1843             }
1844             goto error;
1845         }
1846     } else {
1847         /* It is currently mandatory to have a bdrv_reopen_prepare()
1848          * handler for each supported drv. */
1849         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1850                   drv->format_name, bdrv_get_device_name(reopen_state->bs),
1851                   "reopening of file");
1852         ret = -1;
1853         goto error;
1854     }
1855 
1856     ret = 0;
1857 
1858 error:
1859     return ret;
1860 }
1861 
1862 /*
1863  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1864  * makes them final by swapping the staging BlockDriverState contents into
1865  * the active BlockDriverState contents.
1866  */
1867 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1868 {
1869     BlockDriver *drv;
1870 
1871     assert(reopen_state != NULL);
1872     drv = reopen_state->bs->drv;
1873     assert(drv != NULL);
1874 
1875     /* If there are any driver level actions to take */
1876     if (drv->bdrv_reopen_commit) {
1877         drv->bdrv_reopen_commit(reopen_state);
1878     }
1879 
1880     /* set BDS specific flags now */
1881     reopen_state->bs->open_flags         = reopen_state->flags;
1882     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1883                                               BDRV_O_CACHE_WB);
1884     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1885 
1886     bdrv_refresh_limits(reopen_state->bs, NULL);
1887 }
1888 
1889 /*
1890  * Abort the reopen, and delete and free the staged changes in
1891  * reopen_state
1892  */
1893 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1894 {
1895     BlockDriver *drv;
1896 
1897     assert(reopen_state != NULL);
1898     drv = reopen_state->bs->drv;
1899     assert(drv != NULL);
1900 
1901     if (drv->bdrv_reopen_abort) {
1902         drv->bdrv_reopen_abort(reopen_state);
1903     }
1904 }
1905 
1906 
1907 void bdrv_close(BlockDriverState *bs)
1908 {
1909     BdrvAioNotifier *ban, *ban_next;
1910 
1911     if (bs->job) {
1912         block_job_cancel_sync(bs->job);
1913     }
1914     bdrv_drain_all(); /* complete I/O */
1915     bdrv_flush(bs);
1916     bdrv_drain_all(); /* in case flush left pending I/O */
1917     notifier_list_notify(&bs->close_notifiers, bs);
1918 
1919     if (bs->drv) {
1920         if (bs->backing_hd) {
1921             BlockDriverState *backing_hd = bs->backing_hd;
1922             bdrv_set_backing_hd(bs, NULL);
1923             bdrv_unref(backing_hd);
1924         }
1925         bs->drv->bdrv_close(bs);
1926         g_free(bs->opaque);
1927         bs->opaque = NULL;
1928         bs->drv = NULL;
1929         bs->copy_on_read = 0;
1930         bs->backing_file[0] = '\0';
1931         bs->backing_format[0] = '\0';
1932         bs->total_sectors = 0;
1933         bs->encrypted = 0;
1934         bs->valid_key = 0;
1935         bs->sg = 0;
1936         bs->zero_beyond_eof = false;
1937         QDECREF(bs->options);
1938         bs->options = NULL;
1939         QDECREF(bs->full_open_options);
1940         bs->full_open_options = NULL;
1941 
1942         if (bs->file != NULL) {
1943             bdrv_unref(bs->file);
1944             bs->file = NULL;
1945         }
1946     }
1947 
1948     if (bs->blk) {
1949         blk_dev_change_media_cb(bs->blk, false);
1950     }
1951 
1952     /* throttling disk I/O limits */
1953     if (bs->io_limits_enabled) {
1954         bdrv_io_limits_disable(bs);
1955     }
1956 
1957     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1958         g_free(ban);
1959     }
1960     QLIST_INIT(&bs->aio_notifiers);
1961 }
1962 
1963 void bdrv_close_all(void)
1964 {
1965     BlockDriverState *bs;
1966 
1967     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1968         AioContext *aio_context = bdrv_get_aio_context(bs);
1969 
1970         aio_context_acquire(aio_context);
1971         bdrv_close(bs);
1972         aio_context_release(aio_context);
1973     }
1974 }
1975 
1976 /* Check if any requests are in-flight (including throttled requests) */
1977 static bool bdrv_requests_pending(BlockDriverState *bs)
1978 {
1979     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1980         return true;
1981     }
1982     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1983         return true;
1984     }
1985     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1986         return true;
1987     }
1988     if (bs->file && bdrv_requests_pending(bs->file)) {
1989         return true;
1990     }
1991     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1992         return true;
1993     }
1994     return false;
1995 }
1996 
1997 static bool bdrv_drain_one(BlockDriverState *bs)
1998 {
1999     bool bs_busy;
2000 
2001     bdrv_flush_io_queue(bs);
2002     bdrv_start_throttled_reqs(bs);
2003     bs_busy = bdrv_requests_pending(bs);
2004     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
2005     return bs_busy;
2006 }
2007 
2008 /*
2009  * Wait for pending requests to complete on a single BlockDriverState subtree
2010  *
2011  * See the warning in bdrv_drain_all().  This function can only be called if
2012  * you are sure nothing can generate I/O because you have op blockers
2013  * installed.
2014  *
2015  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
2016  * AioContext.
2017  */
2018 void bdrv_drain(BlockDriverState *bs)
2019 {
2020     while (bdrv_drain_one(bs)) {
2021         /* Keep iterating */
2022     }
2023 }
2024 
2025 /*
2026  * Wait for pending requests to complete across all BlockDriverStates
2027  *
2028  * This function does not flush data to disk, use bdrv_flush_all() for that
2029  * after calling this function.
2030  *
2031  * Note that completion of an asynchronous I/O operation can trigger any
2032  * number of other I/O operations on other devices---for example a coroutine
2033  * can be arbitrarily complex and a constant flow of I/O can come until the
2034  * coroutine is complete.  Because of this, it is not possible to fully
2035  * drain a single device's I/O queue in isolation from the others.
2036  */
2037 void bdrv_drain_all(void)
2038 {
2039     /* Always run first iteration so any pending completion BHs run */
2040     bool busy = true;
2041     BlockDriverState *bs;
2042 
2043     while (busy) {
2044         busy = false;
2045 
2046         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2047             AioContext *aio_context = bdrv_get_aio_context(bs);
2048 
2049             aio_context_acquire(aio_context);
2050             busy |= bdrv_drain_one(bs);
2051             aio_context_release(aio_context);
2052         }
2053     }
2054 }
2055 
2056 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
2057  * graph_bdrv_states lists.
2058  * Also, empty the node_name to prevent a double remove. */
2059 void bdrv_make_anon(BlockDriverState *bs)
2060 {
2061     /*
2062      * Take care to remove bs from bdrv_states only when it's actually
2063      * in it.  Note that bs->device_list.tqe_prev is initially null,
2064      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
2065      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
2066      * resetting it to null on remove.
2067      */
2068     if (bs->device_list.tqe_prev) {
2069         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
2070         bs->device_list.tqe_prev = NULL;
2071     }
2072     if (bs->node_name[0] != '\0') {
2073         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2074     }
2075     bs->node_name[0] = '\0';
2076 }
2077 
2078 static void bdrv_rebind(BlockDriverState *bs)
2079 {
2080     if (bs->drv && bs->drv->bdrv_rebind) {
2081         bs->drv->bdrv_rebind(bs);
2082     }
2083 }
2084 
2085 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2086                                      BlockDriverState *bs_src)
2087 {
2088     /* move some fields that need to stay attached to the device */
2089 
2090     /* dev info */
2091     bs_dest->guest_block_size   = bs_src->guest_block_size;
2092     bs_dest->copy_on_read       = bs_src->copy_on_read;
2093 
2094     bs_dest->enable_write_cache = bs_src->enable_write_cache;
2095 
2096     /* i/o throttled req */
2097     memcpy(&bs_dest->throttle_state,
2098            &bs_src->throttle_state,
2099            sizeof(ThrottleState));
2100     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
2101     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
2102     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
2103 
2104     /* r/w error */
2105     bs_dest->on_read_error      = bs_src->on_read_error;
2106     bs_dest->on_write_error     = bs_src->on_write_error;
2107 
2108     /* i/o status */
2109     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
2110     bs_dest->iostatus           = bs_src->iostatus;
2111 
2112     /* dirty bitmap */
2113     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2114 
2115     /* reference count */
2116     bs_dest->refcnt             = bs_src->refcnt;
2117 
2118     /* job */
2119     bs_dest->job                = bs_src->job;
2120 
2121     /* keep the same entry in bdrv_states */
2122     bs_dest->device_list = bs_src->device_list;
2123     bs_dest->blk = bs_src->blk;
2124 
2125     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2126            sizeof(bs_dest->op_blockers));
2127 }
2128 
2129 /*
2130  * Swap bs contents for two image chains while they are live,
2131  * while keeping required fields on the BlockDriverState that is
2132  * actually attached to a device.
2133  *
2134  * This will modify the BlockDriverState fields, and swap contents
2135  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2136  *
2137  * bs_new must not be attached to a BlockBackend.
2138  *
2139  * This function does not create any image files.
2140  */
2141 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2142 {
2143     BlockDriverState tmp;
2144 
2145     /* The code needs to swap the node_name, but simply swapping node_list won't
2146      * work, so first remove the nodes from the graph list, do the swap, then
2147      * insert them back if needed.
2148      */
2149     if (bs_new->node_name[0] != '\0') {
2150         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2151     }
2152     if (bs_old->node_name[0] != '\0') {
2153         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2154     }
2155 
2156     /* bs_new must be unattached and shouldn't have anything fancy enabled */
2157     assert(!bs_new->blk);
2158     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2159     assert(bs_new->job == NULL);
2160     assert(bs_new->io_limits_enabled == false);
2161     assert(!throttle_have_timer(&bs_new->throttle_state));
2162 
2163     tmp = *bs_new;
2164     *bs_new = *bs_old;
2165     *bs_old = tmp;
2166 
2167     /* there are some fields that should not be swapped, move them back */
2168     bdrv_move_feature_fields(&tmp, bs_old);
2169     bdrv_move_feature_fields(bs_old, bs_new);
2170     bdrv_move_feature_fields(bs_new, &tmp);
2171 
2172     /* bs_new must remain unattached */
2173     assert(!bs_new->blk);
2174 
2175     /* Check a few fields that should remain attached to the device */
2176     assert(bs_new->job == NULL);
2177     assert(bs_new->io_limits_enabled == false);
2178     assert(!throttle_have_timer(&bs_new->throttle_state));
2179 
2180     /* insert the nodes back into the graph node list if needed */
2181     if (bs_new->node_name[0] != '\0') {
2182         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2183     }
2184     if (bs_old->node_name[0] != '\0') {
2185         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2186     }
2187 
2188     bdrv_rebind(bs_new);
2189     bdrv_rebind(bs_old);
2190 }
2191 
2192 /*
2193  * Add new bs contents at the top of an image chain while the chain is
2194  * live, while keeping required fields on the top layer.
2195  *
2196  * This will modify the BlockDriverState fields, and swap contents
2197  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2198  *
2199  * bs_new must not be attached to a BlockBackend.
2200  *
2201  * This function does not create any image files.
2202  */
2203 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2204 {
2205     bdrv_swap(bs_new, bs_top);
2206 
2207     /* The contents of 'tmp' will become bs_top, as we are
2208      * swapping bs_new and bs_top contents. */
2209     bdrv_set_backing_hd(bs_top, bs_new);
2210 }
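/*
 * Example (illustrative): with the chain "base <- top", where 'top' is the
 * BDS attached to the device, bdrv_append(new, top) leaves the chain as
 *
 *     base <- new (now holding the old top's contents) <- top (the overlay)
 *
 * so the device keeps pointing at the same BDS ('top'), which now presents
 * the freshly opened overlay; this is the shape used when a new image is
 * layered on top of a running chain, e.g. for a snapshot.
 */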
2211 
2212 static void bdrv_delete(BlockDriverState *bs)
2213 {
2214     assert(!bs->job);
2215     assert(bdrv_op_blocker_is_empty(bs));
2216     assert(!bs->refcnt);
2217     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2218 
2219     bdrv_close(bs);
2220 
2221     /* remove from list, if necessary */
2222     bdrv_make_anon(bs);
2223 
2224     g_free(bs);
2225 }
2226 
2227 /*
2228  * Run consistency checks on an image
2229  *
2230  * Returns 0 if the check could be completed (it doesn't mean that the image is
2231  * free of errors) or -errno when an internal error occurred. The results of the
2232  * check are stored in res.
2233  */
2234 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2235 {
2236     if (bs->drv == NULL) {
2237         return -ENOMEDIUM;
2238     }
2239     if (bs->drv->bdrv_check == NULL) {
2240         return -ENOTSUP;
2241     }
2242 
2243     memset(res, 0, sizeof(*res));
2244     return bs->drv->bdrv_check(bs, res, fix);
2245 }
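/*
 * Example (illustrative sketch): run a report-only image check.  Passing 0
 * as the BdrvCheckMode is assumed here to mean "report, do not repair";
 * the per-image counters in 'res' are filled in by the format driver.
 *
 *     BdrvCheckResult res;
 *     int ret = bdrv_check(bs, &res, 0);
 *     if (ret < 0) {
 *         (-ENOMEDIUM: no medium, -ENOTSUP: driver cannot check,
 *          other -errno: internal error while checking)
 *     }
 */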
2246 
2247 #define COMMIT_BUF_SECTORS 2048
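/* 2048 sectors * 512 bytes = at most 1 MiB copied per loop iteration below */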
2248 
2249 /* commit COW file into the raw image */
2250 int bdrv_commit(BlockDriverState *bs)
2251 {
2252     BlockDriver *drv = bs->drv;
2253     int64_t sector, total_sectors, length, backing_length;
2254     int n, ro, open_flags;
2255     int ret = 0;
2256     uint8_t *buf = NULL;
2257 
2258     if (!drv)
2259         return -ENOMEDIUM;
2260 
2261     if (!bs->backing_hd) {
2262         return -ENOTSUP;
2263     }
2264 
2265     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
2266         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
2267         return -EBUSY;
2268     }
2269 
2270     ro = bs->backing_hd->read_only;
2271     open_flags = bs->backing_hd->open_flags;
2272 
2273     if (ro) {
2274         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2275             return -EACCES;
2276         }
2277     }
2278 
2279     length = bdrv_getlength(bs);
2280     if (length < 0) {
2281         ret = length;
2282         goto ro_cleanup;
2283     }
2284 
2285     backing_length = bdrv_getlength(bs->backing_hd);
2286     if (backing_length < 0) {
2287         ret = backing_length;
2288         goto ro_cleanup;
2289     }
2290 
2291     /* If our top snapshot is larger than the backing file image,
2292      * grow the backing file image if possible.  If not possible,
2293      * we must return an error */
2294     if (length > backing_length) {
2295         ret = bdrv_truncate(bs->backing_hd, length);
2296         if (ret < 0) {
2297             goto ro_cleanup;
2298         }
2299     }
2300 
2301     total_sectors = length >> BDRV_SECTOR_BITS;
2302 
2303     /* qemu_try_blockalign() for bs will choose an alignment that works for
2304      * bs->backing_hd as well, so no need to compare the alignment manually. */
2305     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2306     if (buf == NULL) {
2307         ret = -ENOMEM;
2308         goto ro_cleanup;
2309     }
2310 
2311     for (sector = 0; sector < total_sectors; sector += n) {
2312         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2313         if (ret < 0) {
2314             goto ro_cleanup;
2315         }
2316         if (ret) {
2317             ret = bdrv_read(bs, sector, buf, n);
2318             if (ret < 0) {
2319                 goto ro_cleanup;
2320             }
2321 
2322             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2323             if (ret < 0) {
2324                 goto ro_cleanup;
2325             }
2326         }
2327     }
2328 
2329     if (drv->bdrv_make_empty) {
2330         ret = drv->bdrv_make_empty(bs);
2331         if (ret < 0) {
2332             goto ro_cleanup;
2333         }
2334         bdrv_flush(bs);
2335     }
2336 
2337     /*
2338      * Make sure all data we wrote to the backing device is actually
2339      * stable on disk.
2340      */
2341     if (bs->backing_hd) {
2342         bdrv_flush(bs->backing_hd);
2343     }
2344 
2345     ret = 0;
2346 ro_cleanup:
2347     qemu_vfree(buf);
2348 
2349     if (ro) {
2350         /* ignoring error return here */
2351         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2352     }
2353 
2354     return ret;
2355 }
2356 
2357 int bdrv_commit_all(void)
2358 {
2359     BlockDriverState *bs;
2360 
2361     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2362         AioContext *aio_context = bdrv_get_aio_context(bs);
2363 
2364         aio_context_acquire(aio_context);
2365         if (bs->drv && bs->backing_hd) {
2366             int ret = bdrv_commit(bs);
2367             if (ret < 0) {
2368                 aio_context_release(aio_context);
2369                 return ret;
2370             }
2371         }
2372         aio_context_release(aio_context);
2373     }
2374     return 0;
2375 }
2376 
2377 /**
2378  * Remove an active request from the tracked requests list
2379  *
2380  * This function should be called when a tracked request is completing.
2381  */
2382 static void tracked_request_end(BdrvTrackedRequest *req)
2383 {
2384     if (req->serialising) {
2385         req->bs->serialising_in_flight--;
2386     }
2387 
2388     QLIST_REMOVE(req, list);
2389     qemu_co_queue_restart_all(&req->wait_queue);
2390 }
2391 
2392 /**
2393  * Add an active request to the tracked requests list
2394  */
2395 static void tracked_request_begin(BdrvTrackedRequest *req,
2396                                   BlockDriverState *bs,
2397                                   int64_t offset,
2398                                   unsigned int bytes, bool is_write)
2399 {
2400     *req = (BdrvTrackedRequest){
2401         .bs = bs,
2402         .offset         = offset,
2403         .bytes          = bytes,
2404         .is_write       = is_write,
2405         .co             = qemu_coroutine_self(),
2406         .serialising    = false,
2407         .overlap_offset = offset,
2408         .overlap_bytes  = bytes,
2409     };
2410 
2411     qemu_co_queue_init(&req->wait_queue);
2412 
2413     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2414 }
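/*
 * Example (illustrative): the usual request lifecycle, as used by
 * bdrv_co_do_preadv() further down:
 *
 *     BdrvTrackedRequest req;
 *
 *     tracked_request_begin(&req, bs, offset, bytes, false);
 *     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, qiov, flags);
 *     tracked_request_end(&req);
 */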
2415 
2416 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2417 {
2418     int64_t overlap_offset = req->offset & ~(align - 1);
2419     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2420                                - overlap_offset;
2421 
2422     if (!req->serialising) {
2423         req->bs->serialising_in_flight++;
2424         req->serialising = true;
2425     }
2426 
2427     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2428     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2429 }
2430 
2431 /**
2432  * Round a region to cluster boundaries
2433  */
2434 void bdrv_round_to_clusters(BlockDriverState *bs,
2435                             int64_t sector_num, int nb_sectors,
2436                             int64_t *cluster_sector_num,
2437                             int *cluster_nb_sectors)
2438 {
2439     BlockDriverInfo bdi;
2440 
2441     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2442         *cluster_sector_num = sector_num;
2443         *cluster_nb_sectors = nb_sectors;
2444     } else {
2445         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2446         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2447         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2448                                             nb_sectors, c);
2449     }
2450 }
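/*
 * Worked example (assuming 512-byte sectors): with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors, so a request for sectors [130, 140)
 * rounds to *cluster_sector_num = 128 and *cluster_nb_sectors = 128,
 * i.e. the whole containing cluster [128, 256).
 */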
2451 
2452 static int bdrv_get_cluster_size(BlockDriverState *bs)
2453 {
2454     BlockDriverInfo bdi;
2455     int ret;
2456 
2457     ret = bdrv_get_info(bs, &bdi);
2458     if (ret < 0 || bdi.cluster_size == 0) {
2459         return bs->request_alignment;
2460     } else {
2461         return bdi.cluster_size;
2462     }
2463 }
2464 
2465 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2466                                      int64_t offset, unsigned int bytes)
2467 {
2468     /*        aaaa   bbbb */
2469     if (offset >= req->overlap_offset + req->overlap_bytes) {
2470         return false;
2471     }
2472     /* bbbb   aaaa        */
2473     if (req->overlap_offset >= offset + bytes) {
2474         return false;
2475     }
2476     return true;
2477 }
2478 
2479 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2480 {
2481     BlockDriverState *bs = self->bs;
2482     BdrvTrackedRequest *req;
2483     bool retry;
2484     bool waited = false;
2485 
2486     if (!bs->serialising_in_flight) {
2487         return false;
2488     }
2489 
2490     do {
2491         retry = false;
2492         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2493             if (req == self || (!req->serialising && !self->serialising)) {
2494                 continue;
2495             }
2496             if (tracked_request_overlaps(req, self->overlap_offset,
2497                                          self->overlap_bytes))
2498             {
2499                 /* Hitting this means there was a reentrant request, for
2500                  * example, a block driver issuing nested requests.  This must
2501                  * never happen since it means deadlock.
2502                  */
2503                 assert(qemu_coroutine_self() != req->co);
2504 
2505                 /* If the request is already (indirectly) waiting for us, or
2506                  * will wait for us as soon as it wakes up, then just go on
2507                  * (instead of producing a deadlock in the former case). */
2508                 if (!req->waiting_for) {
2509                     self->waiting_for = req;
2510                     qemu_co_queue_wait(&req->wait_queue);
2511                     self->waiting_for = NULL;
2512                     retry = true;
2513                     waited = true;
2514                     break;
2515                 }
2516             }
2517         }
2518     } while (retry);
2519 
2520     return waited;
2521 }
2522 
2523 /*
2524  * Return values:
2525  * 0        - success
2526  * -EINVAL  - backing format specified, but no file
2527  * -ENOSPC  - can't update the backing file because no space is left in the
2528  *            image file header
2529  * -ENOTSUP - format driver doesn't support changing the backing file
2530  */
2531 int bdrv_change_backing_file(BlockDriverState *bs,
2532     const char *backing_file, const char *backing_fmt)
2533 {
2534     BlockDriver *drv = bs->drv;
2535     int ret;
2536 
2537     /* Backing file format doesn't make sense without a backing file */
2538     if (backing_fmt && !backing_file) {
2539         return -EINVAL;
2540     }
2541 
2542     if (drv->bdrv_change_backing_file != NULL) {
2543         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2544     } else {
2545         ret = -ENOTSUP;
2546     }
2547 
2548     if (ret == 0) {
2549         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2550         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2551     }
2552     return ret;
2553 }
2554 
2555 /*
2556  * Finds the image layer in the chain that has 'bs' as its backing file.
2557  *
2558  * active is the current topmost image.
2559  *
2560  * Returns NULL if bs is not found in active's image chain,
2561  * or if active == bs.
2562  *
2563  * Returns the bottommost base image if bs == NULL.
2564  */
2565 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2566                                     BlockDriverState *bs)
2567 {
2568     while (active && bs != active->backing_hd) {
2569         active = active->backing_hd;
2570     }
2571 
2572     return active;
2573 }
2574 
2575 /* Given a BDS, searches for the base layer. */
2576 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2577 {
2578     return bdrv_find_overlay(bs, NULL);
2579 }
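/*
 * Example (illustrative): for the chain "base <- mid <- active",
 * bdrv_find_overlay(active, base) returns 'mid',
 * bdrv_find_overlay(active, active) returns NULL, and
 * bdrv_find_base(active) returns 'base'.
 */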
2580 
2581 typedef struct BlkIntermediateStates {
2582     BlockDriverState *bs;
2583     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2584 } BlkIntermediateStates;
2585 
2586 
2587 /*
2588  * Drops images above 'base' up to and including 'top', and sets the image
2589  * above 'top' to have base as its backing file.
2590  *
2591  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2592  * information in 'bs' can be properly updated.
2593  *
2594  * E.g., this will convert the following chain:
2595  * bottom <- base <- intermediate <- top <- active
2596  *
2597  * to
2598  *
2599  * bottom <- base <- active
2600  *
2601  * It is allowed for bottom==base, in which case it converts:
2602  *
2603  * base <- intermediate <- top <- active
2604  *
2605  * to
2606  *
2607  * base <- active
2608  *
2609  * If backing_file_str is non-NULL, it will be used when modifying top's
2610  * overlay image metadata.
2611  *
2612  * Error conditions:
2613  *  if active == top, that is considered an error
2614  *
2615  */
2616 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2617                            BlockDriverState *base, const char *backing_file_str)
2618 {
2619     BlockDriverState *intermediate;
2620     BlockDriverState *base_bs = NULL;
2621     BlockDriverState *new_top_bs = NULL;
2622     BlkIntermediateStates *intermediate_state, *next;
2623     int ret = -EIO;
2624 
2625     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2626     QSIMPLEQ_INIT(&states_to_delete);
2627 
2628     if (!top->drv || !base->drv) {
2629         goto exit;
2630     }
2631 
2632     new_top_bs = bdrv_find_overlay(active, top);
2633 
2634     if (new_top_bs == NULL) {
2635         /* we could not find the image above 'top', this is an error */
2636         goto exit;
2637     }
2638 
2639     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2640      * to do, no intermediate images */
2641     if (new_top_bs->backing_hd == base) {
2642         ret = 0;
2643         goto exit;
2644     }
2645 
2646     intermediate = top;
2647 
2648     /* now we will go down through the list, and add each BDS we find
2649      * into our deletion queue, until we hit the 'base'
2650      */
2651     while (intermediate) {
2652         intermediate_state = g_new0(BlkIntermediateStates, 1);
2653         intermediate_state->bs = intermediate;
2654         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2655 
2656         if (intermediate->backing_hd == base) {
2657             base_bs = intermediate->backing_hd;
2658             break;
2659         }
2660         intermediate = intermediate->backing_hd;
2661     }
2662     if (base_bs == NULL) {
2663         /* Something went wrong; we did not end at the base.  Safely
2664          * unravel everything and exit with an error. */
2665         goto exit;
2666     }
2667 
2668     /* success - we can delete the intermediate states, and link top->base */
2669     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2670     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2671                                    base_bs->drv ? base_bs->drv->format_name : "");
2672     if (ret) {
2673         goto exit;
2674     }
2675     bdrv_set_backing_hd(new_top_bs, base_bs);
2676 
2677     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2678         /* so that bdrv_close() does not recursively close the chain */
2679         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2680         bdrv_unref(intermediate_state->bs);
2681     }
2682     ret = 0;
2683 
2684 exit:
2685     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2686         g_free(intermediate_state);
2687     }
2688     return ret;
2689 }
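/*
 * Example (illustrative sketch): collapse "base <- mid <- top <- active"
 * into "base <- active", keeping base's own filename in the overlay
 * metadata (a NULL backing_file_str falls back to base_bs->filename):
 *
 *     ret = bdrv_drop_intermediate(active, top, base, NULL);
 */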
2690 
2691 
2692 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2693                                    size_t size)
2694 {
2695     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
2696         return -EIO;
2697     }
2698 
2699     if (!bdrv_is_inserted(bs)) {
2700         return -ENOMEDIUM;
2701     }
2702 
2703     if (offset < 0) {
2704         return -EIO;
2705     }
2706 
2707     return 0;
2708 }
2709 
2710 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2711                               int nb_sectors)
2712 {
2713     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2714         return -EIO;
2715     }
2716 
2717     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2718                                    nb_sectors * BDRV_SECTOR_SIZE);
2719 }
2720 
2721 typedef struct RwCo {
2722     BlockDriverState *bs;
2723     int64_t offset;
2724     QEMUIOVector *qiov;
2725     bool is_write;
2726     int ret;
2727     BdrvRequestFlags flags;
2728 } RwCo;
2729 
2730 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2731 {
2732     RwCo *rwco = opaque;
2733 
2734     if (!rwco->is_write) {
2735         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2736                                       rwco->qiov->size, rwco->qiov,
2737                                       rwco->flags);
2738     } else {
2739         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2740                                        rwco->qiov->size, rwco->qiov,
2741                                        rwco->flags);
2742     }
2743 }
2744 
2745 /*
2746  * Process a vectored synchronous request using coroutines
2747  */
2748 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2749                         QEMUIOVector *qiov, bool is_write,
2750                         BdrvRequestFlags flags)
2751 {
2752     Coroutine *co;
2753     RwCo rwco = {
2754         .bs = bs,
2755         .offset = offset,
2756         .qiov = qiov,
2757         .is_write = is_write,
2758         .ret = NOT_DONE,
2759         .flags = flags,
2760     };
2761 
2762     /**
2763      * In a synchronous call context, when the vCPU is blocked, the
2764      * throttling timer will not fire; so the I/O throttling function has to
2765      * be disabled here if it has been enabled.
2766      */
2767     if (bs->io_limits_enabled) {
2768         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2769                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2770         bdrv_io_limits_disable(bs);
2771     }
2772 
2773     if (qemu_in_coroutine()) {
2774         /* Fast-path if already in coroutine context */
2775         bdrv_rw_co_entry(&rwco);
2776     } else {
2777         AioContext *aio_context = bdrv_get_aio_context(bs);
2778 
2779         co = qemu_coroutine_create(bdrv_rw_co_entry);
2780         qemu_coroutine_enter(co, &rwco);
2781         while (rwco.ret == NOT_DONE) {
2782             aio_poll(aio_context, true);
2783         }
2784     }
2785     return rwco.ret;
2786 }
2787 
2788 /*
2789  * Process a synchronous request using coroutines
2790  */
2791 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2792                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2793 {
2794     QEMUIOVector qiov;
2795     struct iovec iov = {
2796         .iov_base = (void *)buf,
2797         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2798     };
2799 
2800     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2801         return -EINVAL;
2802     }
2803 
2804     qemu_iovec_init_external(&qiov, &iov, 1);
2805     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2806                         &qiov, is_write, flags);
2807 }
2808 
2809 /* return < 0 if error. See bdrv_write() for the return codes */
2810 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2811               uint8_t *buf, int nb_sectors)
2812 {
2813     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2814 }
2815 
2816 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2817 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2818                           uint8_t *buf, int nb_sectors)
2819 {
2820     bool enabled;
2821     int ret;
2822 
2823     enabled = bs->io_limits_enabled;
2824     bs->io_limits_enabled = false;
2825     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2826     bs->io_limits_enabled = enabled;
2827     return ret;
2828 }
2829 
2830 /* Return < 0 if error. Important errors are:
2831  *  -EIO         generic I/O error (may happen for all errors)
2832  *  -ENOMEDIUM   No media inserted.
2833  *  -EINVAL      Invalid sector number or nb_sectors
2834  *  -EACCES      Trying to write to a read-only device
2835  */
2836 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2837                const uint8_t *buf, int nb_sectors)
2838 {
2839     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2840 }
2841 
2842 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2843                       int nb_sectors, BdrvRequestFlags flags)
2844 {
2845     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2846                       BDRV_REQ_ZERO_WRITE | flags);
2847 }
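/*
 * Example (illustrative sketch): synchronous read-modify-write of the
 * first 512-byte sector.  The byte being patched is an assumption for
 * illustration only; the aligned allocation mirrors the helpers used
 * elsewhere in this file.
 *
 *     uint8_t *sector = qemu_blockalign(bs, BDRV_SECTOR_SIZE);
 *     int ret = bdrv_read(bs, 0, sector, 1);
 *     if (ret == 0) {
 *         sector[0] ^= 1;
 *         ret = bdrv_write(bs, 0, sector, 1);
 *     }
 *     qemu_vfree(sector);
 */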
2848 
2849 /*
2850  * Completely zero out a block device with the help of bdrv_write_zeroes.
2851  * The operation is sped up by checking the block status and only writing
2852  * zeroes to the device if they currently do not return zeroes. Optional
2853  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2854  *
2855  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2856  */
2857 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2858 {
2859     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2860     int n;
2861 
2862     target_sectors = bdrv_nb_sectors(bs);
2863     if (target_sectors < 0) {
2864         return target_sectors;
2865     }
2866 
2867     for (;;) {
2868         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
2869         if (nb_sectors <= 0) {
2870             return 0;
2871         }
2872         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2873         if (ret < 0) {
2874             error_report("error getting block status at sector %" PRId64 ": %s",
2875                          sector_num, strerror(-ret));
2876             return ret;
2877         }
2878         if (ret & BDRV_BLOCK_ZERO) {
2879             sector_num += n;
2880             continue;
2881         }
2882         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2883         if (ret < 0) {
2884             error_report("error writing zeroes at sector %" PRId64 ": %s",
2885                          sector_num, strerror(-ret));
2886             return ret;
2887         }
2888         sector_num += n;
2889     }
2890 }
2891 
2892 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2893 {
2894     QEMUIOVector qiov;
2895     struct iovec iov = {
2896         .iov_base = (void *)buf,
2897         .iov_len = bytes,
2898     };
2899     int ret;
2900 
2901     if (bytes < 0) {
2902         return -EINVAL;
2903     }
2904 
2905     qemu_iovec_init_external(&qiov, &iov, 1);
2906     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2907     if (ret < 0) {
2908         return ret;
2909     }
2910 
2911     return bytes;
2912 }
2913 
2914 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2915 {
2916     int ret;
2917 
2918     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2919     if (ret < 0) {
2920         return ret;
2921     }
2922 
2923     return qiov->size;
2924 }
2925 
2926 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2927                 const void *buf, int bytes)
2928 {
2929     QEMUIOVector qiov;
2930     struct iovec iov = {
2931         .iov_base   = (void *) buf,
2932         .iov_len    = bytes,
2933     };
2934 
2935     if (bytes < 0) {
2936         return -EINVAL;
2937     }
2938 
2939     qemu_iovec_init_external(&qiov, &iov, 1);
2940     return bdrv_pwritev(bs, offset, &qiov);
2941 }
2942 
2943 /*
2944  * Writes to the file and ensures that no writes are reordered across this
2945  * request (acts as a barrier)
2946  *
2947  * Returns 0 on success, -errno in error cases.
2948  */
2949 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2950     const void *buf, int count)
2951 {
2952     int ret;
2953 
2954     ret = bdrv_pwrite(bs, offset, buf, count);
2955     if (ret < 0) {
2956         return ret;
2957     }
2958 
2959     /* No flush needed for cache modes that already do it */
2960     if (bs->enable_write_cache) {
2961         bdrv_flush(bs);
2962     }
2963 
2964     return 0;
2965 }
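/*
 * Example (illustrative sketch): update a small header field and make sure
 * the write cannot be reordered after later writes.  The offset and the
 * payload are assumptions for illustration only.
 *
 *     uint32_t magic = cpu_to_be32(0xcafebabe);
 *     int ret = bdrv_pwrite_sync(bs, 0, &magic, sizeof(magic));
 *     (ret < 0 means the write, or the flush after it, failed)
 */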
2966 
2967 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2968         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2969 {
2970     /* Perform I/O through a temporary buffer so that users who scribble over
2971      * their read buffer while the operation is in progress do not end up
2972      * modifying the image file.  This is critical for zero-copy guest I/O
2973      * where anything might happen inside guest memory.
2974      */
2975     void *bounce_buffer;
2976 
2977     BlockDriver *drv = bs->drv;
2978     struct iovec iov;
2979     QEMUIOVector bounce_qiov;
2980     int64_t cluster_sector_num;
2981     int cluster_nb_sectors;
2982     size_t skip_bytes;
2983     int ret;
2984 
2985     /* Cover the entire cluster so no additional backing file I/O is required
2986      * when allocating a cluster in the image file.
2987      */
2988     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2989                            &cluster_sector_num, &cluster_nb_sectors);
2990 
2991     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2992                                    cluster_sector_num, cluster_nb_sectors);
2993 
2994     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2995     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2996     if (bounce_buffer == NULL) {
2997         ret = -ENOMEM;
2998         goto err;
2999     }
3000 
3001     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3002 
3003     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3004                              &bounce_qiov);
3005     if (ret < 0) {
3006         goto err;
3007     }
3008 
3009     if (drv->bdrv_co_write_zeroes &&
3010         buffer_is_zero(bounce_buffer, iov.iov_len)) {
3011         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3012                                       cluster_nb_sectors, 0);
3013     } else {
3014         /* This does not change the data on the disk, so it is not necessary
3015          * to flush even in cache=writethrough mode.
3016          */
3017         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3018                                   &bounce_qiov);
3019     }
3020 
3021     if (ret < 0) {
3022         /* It might be okay to ignore write errors for guest requests.  If this
3023          * is a deliberate copy-on-read then we don't want to ignore the error.
3024          * Simply report it in all cases.
3025          */
3026         goto err;
3027     }
3028 
3029     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3030     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3031                         nb_sectors * BDRV_SECTOR_SIZE);
3032 
3033 err:
3034     qemu_vfree(bounce_buffer);
3035     return ret;
3036 }
3037 
3038 /*
3039  * Forwards an already correctly aligned request to the BlockDriver. This
3040  * handles copy on read and zeroing after EOF; any other features must be
3041  * implemented by the caller.
3042  */
3043 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3044     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3045     int64_t align, QEMUIOVector *qiov, int flags)
3046 {
3047     BlockDriver *drv = bs->drv;
3048     int ret;
3049 
3050     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3051     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3052 
3053     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3054     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3055     assert(!qiov || bytes == qiov->size);
3056 
3057     /* Handle Copy on Read and associated serialisation */
3058     if (flags & BDRV_REQ_COPY_ON_READ) {
3059         /* If we touch the same cluster it counts as an overlap.  This
3060          * guarantees that allocating writes will be serialized and not race
3061          * with each other for the same cluster.  For example, in copy-on-read
3062          * it ensures that the CoR read and write operations are atomic and
3063          * guest writes cannot interleave between them. */
3064         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3065     }
3066 
3067     wait_serialising_requests(req);
3068 
3069     if (flags & BDRV_REQ_COPY_ON_READ) {
3070         int pnum;
3071 
3072         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3073         if (ret < 0) {
3074             goto out;
3075         }
3076 
3077         if (!ret || pnum != nb_sectors) {
3078             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3079             goto out;
3080         }
3081     }
3082 
3083     /* Forward the request to the BlockDriver */
3084     if (!bs->zero_beyond_eof) {
3085         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3086     } else {
3087         /* Read zeros after EOF */
3088         int64_t total_sectors, max_nb_sectors;
3089 
3090         total_sectors = bdrv_nb_sectors(bs);
3091         if (total_sectors < 0) {
3092             ret = total_sectors;
3093             goto out;
3094         }
3095 
3096         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3097                                   align >> BDRV_SECTOR_BITS);
3098         if (nb_sectors < max_nb_sectors) {
3099             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3100         } else if (max_nb_sectors > 0) {
3101             QEMUIOVector local_qiov;
3102 
3103             qemu_iovec_init(&local_qiov, qiov->niov);
3104             qemu_iovec_concat(&local_qiov, qiov, 0,
3105                               max_nb_sectors * BDRV_SECTOR_SIZE);
3106 
3107             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3108                                      &local_qiov);
3109 
3110             qemu_iovec_destroy(&local_qiov);
3111         } else {
3112             ret = 0;
3113         }
3114 
3115         /* Reading beyond end of file is supposed to produce zeroes */
3116         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3117             uint64_t offset = MAX(0, total_sectors - sector_num);
3118             uint64_t bytes = (sector_num + nb_sectors - offset) *
3119                               BDRV_SECTOR_SIZE;
3120             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3121         }
3122     }
3123 
3124 out:
3125     return ret;
3126 }
3127 
3128 static inline uint64_t bdrv_get_align(BlockDriverState *bs)
3129 {
3130     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3131     return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3132 }
3133 
3134 static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
3135                                        int64_t offset, size_t bytes)
3136 {
3137     int64_t align = bdrv_get_align(bs);
3138     return !(offset & (align - 1) || (bytes & (align - 1)));
3139 }
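/*
 * Worked example: with request_alignment <= 512 the effective alignment is
 * BDRV_SECTOR_SIZE (512).  offset = 4096, bytes = 1024 is aligned;
 * offset = 4100 is not, since 4100 & 511 == 4.
 */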
3140 
3141 /*
3142  * Handle a read request in coroutine context
3143  */
3144 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3145     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3146     BdrvRequestFlags flags)
3147 {
3148     BlockDriver *drv = bs->drv;
3149     BdrvTrackedRequest req;
3150 
3151     uint64_t align = bdrv_get_align(bs);
3152     uint8_t *head_buf = NULL;
3153     uint8_t *tail_buf = NULL;
3154     QEMUIOVector local_qiov;
3155     bool use_local_qiov = false;
3156     int ret;
3157 
3158     if (!drv) {
3159         return -ENOMEDIUM;
3160     }
3161 
3162     ret = bdrv_check_byte_request(bs, offset, bytes);
3163     if (ret < 0) {
3164         return ret;
3165     }
3166 
3167     if (bs->copy_on_read) {
3168         flags |= BDRV_REQ_COPY_ON_READ;
3169     }
3170 
3171     /* throttling disk I/O */
3172     if (bs->io_limits_enabled) {
3173         bdrv_io_limits_intercept(bs, bytes, false);
3174     }
3175 
3176     /* Align read if necessary by padding qiov */
3177     if (offset & (align - 1)) {
3178         head_buf = qemu_blockalign(bs, align);
3179         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3180         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3181         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3182         use_local_qiov = true;
3183 
3184         bytes += offset & (align - 1);
3185         offset = offset & ~(align - 1);
3186     }
3187 
3188     if ((offset + bytes) & (align - 1)) {
3189         if (!use_local_qiov) {
3190             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3191             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3192             use_local_qiov = true;
3193         }
3194         tail_buf = qemu_blockalign(bs, align);
3195         qemu_iovec_add(&local_qiov, tail_buf,
3196                        align - ((offset + bytes) & (align - 1)));
3197 
3198         bytes = ROUND_UP(bytes, align);
3199     }
3200 
3201     tracked_request_begin(&req, bs, offset, bytes, false);
3202     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3203                               use_local_qiov ? &local_qiov : qiov,
3204                               flags);
3205     tracked_request_end(&req);
3206 
3207     if (use_local_qiov) {
3208         qemu_iovec_destroy(&local_qiov);
3209         qemu_vfree(head_buf);
3210         qemu_vfree(tail_buf);
3211     }
3212 
3213     return ret;
3214 }
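/*
 * Worked example of the padding above (align = 512): a read with
 * offset = 700, bytes = 100 gains a 188-byte head (700 & 511 == 188),
 * becoming offset = 512, bytes = 288; the tail step then pads by
 * 512 - (800 & 511) == 224 bytes and rounds bytes up to 512.  One aligned
 * region [512, 1024) is read, and the caller's qiov receives only the
 * middle 100 bytes.
 */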
3215 
3216 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3217     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3218     BdrvRequestFlags flags)
3219 {
3220     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3221         return -EINVAL;
3222     }
3223 
3224     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3225                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3226 }
3227 
3228 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3229     int nb_sectors, QEMUIOVector *qiov)
3230 {
3231     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3232 
3233     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3234 }
3235 
3236 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3237     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3238 {
3239     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3240 
3241     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3242                             BDRV_REQ_COPY_ON_READ);
3243 }
3244 
3245 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
3246 
3247 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3248     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3249 {
3250     BlockDriver *drv = bs->drv;
3251     QEMUIOVector qiov;
3252     struct iovec iov = {0};
3253     int ret = 0;
3254 
3255     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
3256                                         BDRV_REQUEST_MAX_SECTORS);
3257 
3258     while (nb_sectors > 0 && !ret) {
3259         int num = nb_sectors;
3260 
3261         /* Align request.  Block drivers can expect the "bulk" of the request
3262          * to be aligned.
3263          */
3264         if (bs->bl.write_zeroes_alignment
3265             && num > bs->bl.write_zeroes_alignment) {
3266             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3267                 /* Make a small request up to the first aligned sector.  */
3268                 num = bs->bl.write_zeroes_alignment;
3269                 num -= sector_num % bs->bl.write_zeroes_alignment;
3270             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3271                 /* Shorten the request to the last aligned sector.  num cannot
3272                  * underflow because num > bs->bl.write_zeroes_alignment.
3273                  */
3274                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3275             }
3276         }
3277 
3278         /* limit request size */
3279         if (num > max_write_zeroes) {
3280             num = max_write_zeroes;
3281         }
3282 
3283         ret = -ENOTSUP;
3284         /* First try the efficient write zeroes operation */
3285         if (drv->bdrv_co_write_zeroes) {
3286             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3287         }
3288 
3289         if (ret == -ENOTSUP) {
3290             /* Fall back to bounce buffer if write zeroes is unsupported */
3291             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
3292                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
3293             num = MIN(num, max_xfer_len);
3294             iov.iov_len = num * BDRV_SECTOR_SIZE;
3295             if (iov.iov_base == NULL) {
3296                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3297                 if (iov.iov_base == NULL) {
3298                     ret = -ENOMEM;
3299                     goto fail;
3300                 }
3301                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3302             }
3303             qemu_iovec_init_external(&qiov, &iov, 1);
3304 
3305             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3306 
3307             /* Keep the bounce buffer around if it is big enough for
3308              * all future requests.
3309              */
3310             if (num < max_xfer_len) {
3311                 qemu_vfree(iov.iov_base);
3312                 iov.iov_base = NULL;
3313             }
3314         }
3315 
3316         sector_num += num;
3317         nb_sectors -= num;
3318     }
3319 
3320 fail:
3321     qemu_vfree(iov.iov_base);
3322     return ret;
3323 }
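/*
 * Worked example of the alignment logic above: with
 * bs->bl.write_zeroes_alignment == 128, a request for sectors [130, 430)
 * is issued as [130, 256) (head, 126 sectors), [256, 384) (one aligned
 * 128-sector chunk) and [384, 430) (tail, 46 sectors), assuming
 * max_write_zeroes does not cap the middle chunk first.
 */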
3324 
3325 /*
3326  * Forwards an already correctly aligned write request to the BlockDriver.
3327  */
3328 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3329     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3330     QEMUIOVector *qiov, int flags)
3331 {
3332     BlockDriver *drv = bs->drv;
3333     bool waited;
3334     int ret;
3335 
3336     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3337     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3338 
3339     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3340     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3341     assert(!qiov || bytes == qiov->size);
3342 
3343     waited = wait_serialising_requests(req);
3344     assert(!waited || !req->serialising);
3345     assert(req->overlap_offset <= offset);
3346     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3347 
3348     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3349 
3350     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3351         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3352         qemu_iovec_is_zero(qiov)) {
3353         flags |= BDRV_REQ_ZERO_WRITE;
3354         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3355             flags |= BDRV_REQ_MAY_UNMAP;
3356         }
3357     }
3358 
3359     if (ret < 0) {
3360         /* Do nothing, write notifier decided to fail this request */
3361     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3362         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3363         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3364     } else {
3365         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3366         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3367     }
3368     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3369 
3370     if (ret == 0 && !bs->enable_write_cache) {
3371         ret = bdrv_co_flush(bs);
3372     }
3373 
3374     bdrv_set_dirty(bs, sector_num, nb_sectors);
3375 
3376     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3377 
3378     if (ret >= 0) {
3379         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3380     }
3381 
3382     return ret;
3383 }
3384 
3385 /*
3386  * Handle a write request in coroutine context
3387  */
3388 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3389     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3390     BdrvRequestFlags flags)
3391 {
3392     BdrvTrackedRequest req;
3393     uint64_t align = bdrv_get_align(bs);
3394     uint8_t *head_buf = NULL;
3395     uint8_t *tail_buf = NULL;
3396     QEMUIOVector local_qiov;
3397     bool use_local_qiov = false;
3398     int ret;
3399 
3400     if (!bs->drv) {
3401         return -ENOMEDIUM;
3402     }
3403     if (bs->read_only) {
3404         return -EACCES;
3405     }
3406 
3407     ret = bdrv_check_byte_request(bs, offset, bytes);
3408     if (ret < 0) {
3409         return ret;
3410     }
3411 
3412     /* throttling disk I/O */
3413     if (bs->io_limits_enabled) {
3414         bdrv_io_limits_intercept(bs, bytes, true);
3415     }
3416 
3417     /*
3418      * Align write if necessary by performing a read-modify-write cycle.
3419      * Pad qiov with the read parts and be sure to have a tracked request not
3420      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3421      */
3422     tracked_request_begin(&req, bs, offset, bytes, true);
3423 
3424     if (offset & (align - 1)) {
3425         QEMUIOVector head_qiov;
3426         struct iovec head_iov;
3427 
3428         mark_request_serialising(&req, align);
3429         wait_serialising_requests(&req);
3430 
3431         head_buf = qemu_blockalign(bs, align);
3432         head_iov = (struct iovec) {
3433             .iov_base   = head_buf,
3434             .iov_len    = align,
3435         };
3436         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3437 
3438         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3439         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3440                                   align, &head_qiov, 0);
3441         if (ret < 0) {
3442             goto fail;
3443         }
3444         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3445 
3446         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3447         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3448         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3449         use_local_qiov = true;
3450 
3451         bytes += offset & (align - 1);
3452         offset = offset & ~(align - 1);
3453     }
3454 
3455     if ((offset + bytes) & (align - 1)) {
3456         QEMUIOVector tail_qiov;
3457         struct iovec tail_iov;
3458         size_t tail_bytes;
3459         bool waited;
3460 
3461         mark_request_serialising(&req, align);
3462         waited = wait_serialising_requests(&req);
3463         assert(!waited || !use_local_qiov);
3464 
3465         tail_buf = qemu_blockalign(bs, align);
3466         tail_iov = (struct iovec) {
3467             .iov_base   = tail_buf,
3468             .iov_len    = align,
3469         };
3470         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3471 
3472         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3473         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3474                                   align, &tail_qiov, 0);
3475         if (ret < 0) {
3476             goto fail;
3477         }
3478         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3479 
3480         if (!use_local_qiov) {
3481             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3482             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3483             use_local_qiov = true;
3484         }
3485 
3486         tail_bytes = (offset + bytes) & (align - 1);
3487         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3488 
3489         bytes = ROUND_UP(bytes, align);
3490     }
3491 
3492     if (use_local_qiov) {
3493         /* The padded local qiov contains RMW head/tail data, which may be non-zero. */
3494         flags &= ~BDRV_REQ_ZERO_WRITE;
3495     }
3496     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3497                                use_local_qiov ? &local_qiov : qiov,
3498                                flags);
3499 
3500 fail:
3501     tracked_request_end(&req);
3502 
3503     if (use_local_qiov) {
3504         qemu_iovec_destroy(&local_qiov);
3505     }
3506     qemu_vfree(head_buf);
3507     qemu_vfree(tail_buf);
3508 
3509     return ret;
3510 }
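
/*
 * Illustrative sketch (not part of the original file): the mask
 * arithmetic used by the RMW path above, written out as hypothetical
 * helpers.  Assuming align is a power of two (e.g. 4096), a request
 * [offset = 4097, bytes = 100) is widened to the aligned range
 * [4096, 8192) before being submitted.
 */
static inline int64_t example_align_down(int64_t offset, uint64_t align)
{
    return offset & ~(align - 1);                /* start of the RMW head */
}

static inline int64_t example_align_up(int64_t offset, uint64_t align)
{
    return (offset + align - 1) & ~(align - 1);  /* end of the RMW tail */
}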
3511 
3512 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3513     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3514     BdrvRequestFlags flags)
3515 {
3516     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3517         return -EINVAL;
3518     }
3519 
3520     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3521                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3522 }
3523 
3524 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3525     int nb_sectors, QEMUIOVector *qiov)
3526 {
3527     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3528 
3529     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3530 }
3531 
3532 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3533                                       int64_t sector_num, int nb_sectors,
3534                                       BdrvRequestFlags flags)
3535 {
3536     int ret;
3537 
3538     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3539 
3540     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3541         flags &= ~BDRV_REQ_MAY_UNMAP;
3542     }
3543     if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
3544                             nb_sectors << BDRV_SECTOR_BITS)) {
3545         ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3546                                 BDRV_REQ_ZERO_WRITE | flags);
3547     } else {
3548         uint8_t *buf;
3549         QEMUIOVector local_qiov;
3550         size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
3551 
3552         buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
3553         memset(buf, 0, bytes);
3554         qemu_iovec_init(&local_qiov, 1);
3555         qemu_iovec_add(&local_qiov, buf, bytes);
3556 
3557         ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
3558                                 BDRV_REQ_ZERO_WRITE | flags);
3559         qemu_vfree(buf);
3560     }
3561     return ret;
3562 }
3563 
3564 /**
3565  * Truncate file to 'offset' bytes (needed only for file protocols)
3566  */
3567 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3568 {
3569     BlockDriver *drv = bs->drv;
3570     int ret;
3571     if (!drv)
3572         return -ENOMEDIUM;
3573     if (!drv->bdrv_truncate)
3574         return -ENOTSUP;
3575     if (bs->read_only)
3576         return -EACCES;
3577 
3578     ret = drv->bdrv_truncate(bs, offset);
3579     if (ret == 0) {
3580         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3581         if (bs->blk) {
3582             blk_dev_resize_cb(bs->blk);
3583         }
3584     }
3585     return ret;
3586 }
3587 
3588 /**
3589  * Length of an allocated file in bytes. Sparse files are counted by actual
3590  * allocated space. Return < 0 if error or unknown.
3591  */
3592 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3593 {
3594     BlockDriver *drv = bs->drv;
3595     if (!drv) {
3596         return -ENOMEDIUM;
3597     }
3598     if (drv->bdrv_get_allocated_file_size) {
3599         return drv->bdrv_get_allocated_file_size(bs);
3600     }
3601     if (bs->file) {
3602         return bdrv_get_allocated_file_size(bs->file);
3603     }
3604     return -ENOTSUP;
3605 }
3606 
3607 /**
3608  * Return number of sectors on success, -errno on error.
3609  */
3610 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3611 {
3612     BlockDriver *drv = bs->drv;
3613 
3614     if (!drv)
3615         return -ENOMEDIUM;
3616 
3617     if (drv->has_variable_length) {
3618         int ret = refresh_total_sectors(bs, bs->total_sectors);
3619         if (ret < 0) {
3620             return ret;
3621         }
3622     }
3623     return bs->total_sectors;
3624 }
3625 
3626 /**
3627  * Return length in bytes on success, -errno on error.
3628  * The length is always a multiple of BDRV_SECTOR_SIZE.
3629  */
3630 int64_t bdrv_getlength(BlockDriverState *bs)
3631 {
3632     int64_t ret = bdrv_nb_sectors(bs);
3633 
3634     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3635 }
3636 
3637 /* Return 0 as the number of sectors if no device is present or on error */
3638 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3639 {
3640     int64_t nb_sectors = bdrv_nb_sectors(bs);
3641 
3642     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3643 }
3644 
3645 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3646                        BlockdevOnError on_write_error)
3647 {
3648     bs->on_read_error = on_read_error;
3649     bs->on_write_error = on_write_error;
3650 }
3651 
3652 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3653 {
3654     return is_read ? bs->on_read_error : bs->on_write_error;
3655 }
3656 
3657 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3658 {
3659     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3660 
3661     switch (on_err) {
3662     case BLOCKDEV_ON_ERROR_ENOSPC:
3663         return (error == ENOSPC) ?
3664                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3665     case BLOCKDEV_ON_ERROR_STOP:
3666         return BLOCK_ERROR_ACTION_STOP;
3667     case BLOCKDEV_ON_ERROR_REPORT:
3668         return BLOCK_ERROR_ACTION_REPORT;
3669     case BLOCKDEV_ON_ERROR_IGNORE:
3670         return BLOCK_ERROR_ACTION_IGNORE;
3671     default:
3672         abort();
3673     }
3674 }
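
/*
 * Illustrative sketch: how a (hypothetical) device model typically
 * combines the two functions above -- first map the errno to a policy
 * decision, then let bdrv_error_action() emit the QMP event and, for
 * BLOCK_ERROR_ACTION_STOP, request the VM stop.
 */
static bool example_handle_rw_error(BlockDriverState *bs, bool is_read,
                                    int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    bdrv_error_action(bs, action, is_read, error);
    /* The caller should keep the request around for a retry iff true */
    return action == BLOCK_ERROR_ACTION_STOP;
}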
3675 
3676 static void send_qmp_error_event(BlockDriverState *bs,
3677                                  BlockErrorAction action,
3678                                  bool is_read, int error)
3679 {
3680     IoOperationType optype;
3681 
3682     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3683     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3684                                    bdrv_iostatus_is_enabled(bs),
3685                                    error == ENOSPC, strerror(error),
3686                                    &error_abort);
3687 }
3688 
3689 /* This is done by device models because, while the block layer knows
3690  * about the error, it does not know whether an operation comes from
3691  * the device or the block layer (from a job, for example).
3692  */
3693 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3694                        bool is_read, int error)
3695 {
3696     assert(error >= 0);
3697 
3698     if (action == BLOCK_ERROR_ACTION_STOP) {
3699         /* First set the iostatus, so that "info block" returns an iostatus
3700          * that matches the events raised so far (an additional error iostatus
3701          * is fine, but not a lost one).
3702          */
3703         bdrv_iostatus_set_err(bs, error);
3704 
3705         /* Then raise the request to stop the VM and the event.
3706          * qemu_system_vmstop_request_prepare has two effects.  First,
3707          * it ensures that the STOP event always comes after the
3708          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3709          * can observe the STOP event and do a "cont" before the STOP
3710          * event is issued, the VM will not stop.  In this case, vm_start()
3711          * also ensures that the STOP/RESUME pair of events is emitted.
3712          */
3713         qemu_system_vmstop_request_prepare();
3714         send_qmp_error_event(bs, action, is_read, error);
3715         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3716     } else {
3717         send_qmp_error_event(bs, action, is_read, error);
3718     }
3719 }
3720 
3721 int bdrv_is_read_only(BlockDriverState *bs)
3722 {
3723     return bs->read_only;
3724 }
3725 
3726 int bdrv_is_sg(BlockDriverState *bs)
3727 {
3728     return bs->sg;
3729 }
3730 
3731 int bdrv_enable_write_cache(BlockDriverState *bs)
3732 {
3733     return bs->enable_write_cache;
3734 }
3735 
3736 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3737 {
3738     bs->enable_write_cache = wce;
3739 
3740     /* so a reopen() will preserve wce */
3741     if (wce) {
3742         bs->open_flags |= BDRV_O_CACHE_WB;
3743     } else {
3744         bs->open_flags &= ~BDRV_O_CACHE_WB;
3745     }
3746 }
3747 
3748 int bdrv_is_encrypted(BlockDriverState *bs)
3749 {
3750     if (bs->backing_hd && bs->backing_hd->encrypted)
3751         return 1;
3752     return bs->encrypted;
3753 }
3754 
3755 int bdrv_key_required(BlockDriverState *bs)
3756 {
3757     BlockDriverState *backing_hd = bs->backing_hd;
3758 
3759     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3760         return 1;
3761     return (bs->encrypted && !bs->valid_key);
3762 }
3763 
3764 int bdrv_set_key(BlockDriverState *bs, const char *key)
3765 {
3766     int ret;
3767     if (bs->backing_hd && bs->backing_hd->encrypted) {
3768         ret = bdrv_set_key(bs->backing_hd, key);
3769         if (ret < 0)
3770             return ret;
3771         if (!bs->encrypted)
3772             return 0;
3773     }
3774     if (!bs->encrypted) {
3775         return -EINVAL;
3776     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3777         return -ENOMEDIUM;
3778     }
3779     ret = bs->drv->bdrv_set_key(bs, key);
3780     if (ret < 0) {
3781         bs->valid_key = 0;
3782     } else if (!bs->valid_key) {
3783         bs->valid_key = 1;
3784         if (bs->blk) {
3785             /* call the change callback now, we skipped it on open */
3786             blk_dev_change_media_cb(bs->blk, true);
3787         }
3788     }
3789     return ret;
3790 }
3791 
3792 /*
3793  * Provide an encryption key for @bs.
3794  * If @key is non-null:
3795  *     If @bs is not encrypted, fail.
3796  *     Else if the key is invalid, fail.
3797  *     Else set @bs's key to @key, replacing the existing key, if any.
3798  * If @key is null:
3799  *     If @bs is encrypted and still lacks a key, fail.
3800  *     Else do nothing.
3801  * On failure, store an error object through @errp if non-null.
3802  */
3803 void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3804 {
3805     if (key) {
3806         if (!bdrv_is_encrypted(bs)) {
3807             error_setg(errp, "Device '%s' is not encrypted",
3808                       bdrv_get_device_name(bs));
3809         } else if (bdrv_set_key(bs, key) < 0) {
3810             error_set(errp, QERR_INVALID_PASSWORD);
3811         }
3812     } else {
3813         if (bdrv_key_required(bs)) {
3814             error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3815                       "'%s' (%s) is encrypted",
3816                       bdrv_get_device_name(bs),
3817                       bdrv_get_encrypted_filename(bs));
3818         }
3819     }
3820 }
3821 
3822 const char *bdrv_get_format_name(BlockDriverState *bs)
3823 {
3824     return bs->drv ? bs->drv->format_name : NULL;
3825 }
3826 
3827 static int qsort_strcmp(const void *a, const void *b)
3828 {
3829     return strcmp(a, b);
3830 }
3831 
3832 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3833                          void *opaque)
3834 {
3835     BlockDriver *drv;
3836     int count = 0;
3837     int i;
3838     const char **formats = NULL;
3839 
3840     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3841         if (drv->format_name) {
3842             bool found = false;
3843             int i = count;
3844             while (formats && i && !found) {
3845                 found = !strcmp(formats[--i], drv->format_name);
3846             }
3847 
3848             if (!found) {
3849                 formats = g_renew(const char *, formats, count + 1);
3850                 formats[count++] = drv->format_name;
3851             }
3852         }
3853     }
3854 
3855     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3856 
3857     for (i = 0; i < count; i++) {
3858         it(opaque, formats[i]);
3859     }
3860 
3861     g_free(formats);
3862 }
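
/*
 * Illustrative sketch (hypothetical callback): consume the sorted,
 * deduplicated format names enumerated above, e.g. for help output.
 * Usage: bdrv_iterate_format(example_print_format, NULL);
 */
static void example_print_format(void *opaque, const char *name)
{
    printf("%s\n", name);  /* opaque is unused in this sketch */
}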
3863 
3864 /* Find a node in the graph of named BDS nodes by its node name */
3865 BlockDriverState *bdrv_find_node(const char *node_name)
3866 {
3867     BlockDriverState *bs;
3868 
3869     assert(node_name);
3870 
3871     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3872         if (!strcmp(node_name, bs->node_name)) {
3873             return bs;
3874         }
3875     }
3876     return NULL;
3877 }
3878 
3879 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3880 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3881 {
3882     BlockDeviceInfoList *list, *entry;
3883     BlockDriverState *bs;
3884 
3885     list = NULL;
3886     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3887         entry = g_malloc0(sizeof(*entry));
3888         entry->value = bdrv_block_device_info(bs);
3889         entry->next = list;
3890         list = entry;
3891     }
3892 
3893     return list;
3894 }
3895 
3896 BlockDriverState *bdrv_lookup_bs(const char *device,
3897                                  const char *node_name,
3898                                  Error **errp)
3899 {
3900     BlockBackend *blk;
3901     BlockDriverState *bs;
3902 
3903     if (device) {
3904         blk = blk_by_name(device);
3905 
3906         if (blk) {
3907             return blk_bs(blk);
3908         }
3909     }
3910 
3911     if (node_name) {
3912         bs = bdrv_find_node(node_name);
3913 
3914         if (bs) {
3915             return bs;
3916         }
3917     }
3918 
3919     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3920                      device ? device : "",
3921                      node_name ? node_name : "");
3922     return NULL;
3923 }
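
/*
 * Illustrative sketch (hypothetical caller): QMP-style commands often
 * accept a single string that may name either a BlockBackend or a
 * node, and simply pass it as both arguments; bdrv_lookup_bs() sets
 * an error only when neither lookup succeeds.
 */
static BlockDriverState *example_resolve_ref(const char *ref, Error **errp)
{
    return bdrv_lookup_bs(ref, ref, errp);
}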
3924 
3925 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3926  * return false.  If either argument is NULL, return false. */
3927 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3928 {
3929     while (top && top != base) {
3930         top = top->backing_hd;
3931     }
3932 
3933     return top != NULL;
3934 }
3935 
3936 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3937 {
3938     if (!bs) {
3939         return QTAILQ_FIRST(&graph_bdrv_states);
3940     }
3941     return QTAILQ_NEXT(bs, node_list);
3942 }
3943 
3944 BlockDriverState *bdrv_next(BlockDriverState *bs)
3945 {
3946     if (!bs) {
3947         return QTAILQ_FIRST(&bdrv_states);
3948     }
3949     return QTAILQ_NEXT(bs, device_list);
3950 }
3951 
3952 const char *bdrv_get_node_name(const BlockDriverState *bs)
3953 {
3954     return bs->node_name;
3955 }
3956 
3957 /* TODO check what callers really want: bs->node_name or blk_name() */
3958 const char *bdrv_get_device_name(const BlockDriverState *bs)
3959 {
3960     return bs->blk ? blk_name(bs->blk) : "";
3961 }
3962 
3963 int bdrv_get_flags(BlockDriverState *bs)
3964 {
3965     return bs->open_flags;
3966 }
3967 
3968 int bdrv_flush_all(void)
3969 {
3970     BlockDriverState *bs;
3971     int result = 0;
3972 
3973     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3974         AioContext *aio_context = bdrv_get_aio_context(bs);
3975         int ret;
3976 
3977         aio_context_acquire(aio_context);
3978         ret = bdrv_flush(bs);
3979         if (ret < 0 && !result) {
3980             result = ret;
3981         }
3982         aio_context_release(aio_context);
3983     }
3984 
3985     return result;
3986 }
3987 
3988 int bdrv_has_zero_init_1(BlockDriverState *bs)
3989 {
3990     return 1;
3991 }
3992 
3993 int bdrv_has_zero_init(BlockDriverState *bs)
3994 {
3995     assert(bs->drv);
3996 
3997     /* If BS is a copy on write image, it is initialized to
3998        the contents of the base image, which may not be zeroes.  */
3999     if (bs->backing_hd) {
4000         return 0;
4001     }
4002     if (bs->drv->bdrv_has_zero_init) {
4003         return bs->drv->bdrv_has_zero_init(bs);
4004     }
4005 
4006     /* safe default */
4007     return 0;
4008 }
4009 
4010 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
4011 {
4012     BlockDriverInfo bdi;
4013 
4014     if (bs->backing_hd) {
4015         return false;
4016     }
4017 
4018     if (bdrv_get_info(bs, &bdi) == 0) {
4019         return bdi.unallocated_blocks_are_zero;
4020     }
4021 
4022     return false;
4023 }
4024 
4025 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
4026 {
4027     BlockDriverInfo bdi;
4028 
4029     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
4030         return false;
4031     }
4032 
4033     if (bdrv_get_info(bs, &bdi) == 0) {
4034         return bdi.can_write_zeroes_with_unmap;
4035     }
4036 
4037     return false;
4038 }
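
/*
 * Illustrative sketch: choose write-zeroes flags from the query above,
 * as a hypothetical device emulating an UNMAP-capable zeroing command
 * might do before calling into the block layer.
 */
static int coroutine_fn example_write_zeroes_smart(BlockDriverState *bs,
                                                   int64_t sector_num,
                                                   int nb_sectors)
{
    BdrvRequestFlags flags = 0;

    if (bdrv_can_write_zeroes_with_unmap(bs)) {
        flags |= BDRV_REQ_MAY_UNMAP;  /* reclaim space where possible */
    }
    return bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
}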
4039 
4040 typedef struct BdrvCoGetBlockStatusData {
4041     BlockDriverState *bs;
4042     BlockDriverState *base;
4043     int64_t sector_num;
4044     int nb_sectors;
4045     int *pnum;
4046     int64_t ret;
4047     bool done;
4048 } BdrvCoGetBlockStatusData;
4049 
4050 /*
4051  * Returns the allocation status of the specified sectors.
4052  * Drivers not implementing the functionality are assumed to not support
4053  * backing files, hence all their sectors are reported as allocated.
4054  *
4055  * If 'sector_num' is beyond the end of the disk image the return value is 0
4056  * and 'pnum' is set to 0.
4057  *
4058  * 'pnum' is set to the number of sectors (including and immediately following
4059  * the specified sector) that are known to be in the same
4060  * allocated/unallocated state.
4061  *
4062  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
4063  * beyond the end of the disk image it will be clamped.
4064  */
4065 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4066                                                      int64_t sector_num,
4067                                                      int nb_sectors, int *pnum)
4068 {
4069     int64_t total_sectors;
4070     int64_t n;
4071     int64_t ret, ret2;
4072 
4073     total_sectors = bdrv_nb_sectors(bs);
4074     if (total_sectors < 0) {
4075         return total_sectors;
4076     }
4077 
4078     if (sector_num >= total_sectors) {
4079         *pnum = 0;
4080         return 0;
4081     }
4082 
4083     n = total_sectors - sector_num;
4084     if (n < nb_sectors) {
4085         nb_sectors = n;
4086     }
4087 
4088     if (!bs->drv->bdrv_co_get_block_status) {
4089         *pnum = nb_sectors;
4090         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4091         if (bs->drv->protocol_name) {
4092             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4093         }
4094         return ret;
4095     }
4096 
4097     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4098     if (ret < 0) {
4099         *pnum = 0;
4100         return ret;
4101     }
4102 
4103     if (ret & BDRV_BLOCK_RAW) {
4104         assert(ret & BDRV_BLOCK_OFFSET_VALID);
4105         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4106                                      *pnum, pnum);
4107     }
4108 
4109     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4110         ret |= BDRV_BLOCK_ALLOCATED;
4111     }
4112 
4113     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4114         if (bdrv_unallocated_blocks_are_zero(bs)) {
4115             ret |= BDRV_BLOCK_ZERO;
4116         } else if (bs->backing_hd) {
4117             BlockDriverState *bs2 = bs->backing_hd;
4118             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4119             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4120                 ret |= BDRV_BLOCK_ZERO;
4121             }
4122         }
4123     }
4124 
4125     if (bs->file &&
4126         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4127         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4128         int file_pnum;
4129 
4130         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4131                                         *pnum, &file_pnum);
4132         if (ret2 >= 0) {
4133             /* Ignore errors.  This is just providing extra information, it
4134              * is useful but not necessary.
4135              */
4136             if (!file_pnum) {
4137                 /* !file_pnum indicates an offset at or beyond the EOF; it is
4138                  * perfectly valid for the format block driver to point to such
4139                  * offsets, so catch it and mark everything as zero */
4140                 ret |= BDRV_BLOCK_ZERO;
4141             } else {
4142                 /* Limit request to the range reported by the protocol driver */
4143                 *pnum = file_pnum;
4144                 ret |= (ret2 & BDRV_BLOCK_ZERO);
4145             }
4146         }
4147     }
4148 
4149     return ret;
4150 }
4151 
4152 /* Coroutine wrapper for bdrv_get_block_status() */
4153 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4154 {
4155     BdrvCoGetBlockStatusData *data = opaque;
4156     BlockDriverState *bs = data->bs;
4157 
4158     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4159                                          data->pnum);
4160     data->done = true;
4161 }
4162 
4163 /*
4164  * Synchronous wrapper around bdrv_co_get_block_status().
4165  *
4166  * See bdrv_co_get_block_status() for details.
4167  */
4168 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4169                               int nb_sectors, int *pnum)
4170 {
4171     Coroutine *co;
4172     BdrvCoGetBlockStatusData data = {
4173         .bs = bs,
4174         .sector_num = sector_num,
4175         .nb_sectors = nb_sectors,
4176         .pnum = pnum,
4177         .done = false,
4178     };
4179 
4180     if (qemu_in_coroutine()) {
4181         /* Fast-path if already in coroutine context */
4182         bdrv_get_block_status_co_entry(&data);
4183     } else {
4184         AioContext *aio_context = bdrv_get_aio_context(bs);
4185 
4186         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4187         qemu_coroutine_enter(co, &data);
4188         while (!data.done) {
4189             aio_poll(aio_context, true);
4190         }
4191     }
4192     return data.ret;
4193 }
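
/*
 * Illustrative sketch: walk a whole image in allocation-status runs,
 * advancing by the *pnum value returned above.  The callback type and
 * helper are hypothetical; the loop mirrors how "qemu-img map"-style
 * tools consume this API.
 */
typedef void (*ExampleRunFn)(int64_t sector_num, int num, int64_t status);

static int64_t example_for_each_run(BlockDriverState *bs, ExampleRunFn fn)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector_num;
    int pnum;

    if (total < 0) {
        return total;
    }
    for (sector_num = 0; sector_num < total; sector_num += pnum) {
        int n = MIN(total - sector_num, BDRV_REQUEST_MAX_SECTORS);
        int64_t status = bdrv_get_block_status(bs, sector_num, n, &pnum);
        if (status < 0) {
            return status;
        }
        fn(sector_num, pnum, status);
    }
    return 0;
}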
4194 
4195 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4196                                    int nb_sectors, int *pnum)
4197 {
4198     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4199     if (ret < 0) {
4200         return ret;
4201     }
4202     return !!(ret & BDRV_BLOCK_ALLOCATED);
4203 }
4204 
4205 /*
4206  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4207  *
4208  * Return true if the given sector is allocated in any image between
4209  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4210  * sector is allocated in any image of the chain.  Return false otherwise.
4211  *
4212  * 'pnum' is set to the number of sectors (including and immediately following
4213  *  the specified sector) that are known to be in the same
4214  *  allocated/unallocated state.
4215  *
4216  */
4217 int bdrv_is_allocated_above(BlockDriverState *top,
4218                             BlockDriverState *base,
4219                             int64_t sector_num,
4220                             int nb_sectors, int *pnum)
4221 {
4222     BlockDriverState *intermediate;
4223     int ret, n = nb_sectors;
4224 
4225     intermediate = top;
4226     while (intermediate && intermediate != base) {
4227         int pnum_inter;
4228         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4229                                 &pnum_inter);
4230         if (ret < 0) {
4231             return ret;
4232         } else if (ret) {
4233             *pnum = pnum_inter;
4234             return 1;
4235         }
4236 
4237         /*
4238          * [sector_num, nb_sectors] is unallocated on top but intermediate
4239          * might have
4240          *
4241          * [sector_num+x, nb_sectors] allocated.
4242          */
4243         if (n > pnum_inter &&
4244             (intermediate == top ||
4245              sector_num + pnum_inter < intermediate->total_sectors)) {
4246             n = pnum_inter;
4247         }
4248 
4249         intermediate = intermediate->backing_hd;
4250     }
4251 
4252     *pnum = n;
4253     return 0;
4254 }
4255 
4256 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4257 {
4258     if (bs->backing_hd && bs->backing_hd->encrypted)
4259         return bs->backing_file;
4260     else if (bs->encrypted)
4261         return bs->filename;
4262     else
4263         return NULL;
4264 }
4265 
4266 void bdrv_get_backing_filename(BlockDriverState *bs,
4267                                char *filename, int filename_size)
4268 {
4269     pstrcpy(filename, filename_size, bs->backing_file);
4270 }
4271 
4272 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4273                           const uint8_t *buf, int nb_sectors)
4274 {
4275     BlockDriver *drv = bs->drv;
4276     int ret;
4277 
4278     if (!drv) {
4279         return -ENOMEDIUM;
4280     }
4281     if (!drv->bdrv_write_compressed) {
4282         return -ENOTSUP;
4283     }
4284     ret = bdrv_check_request(bs, sector_num, nb_sectors);
4285     if (ret < 0) {
4286         return ret;
4287     }
4288 
4289     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4290 
4291     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4292 }
4293 
4294 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4295 {
4296     BlockDriver *drv = bs->drv;
4297     if (!drv)
4298         return -ENOMEDIUM;
4299     if (!drv->bdrv_get_info)
4300         return -ENOTSUP;
4301     memset(bdi, 0, sizeof(*bdi));
4302     return drv->bdrv_get_info(bs, bdi);
4303 }
4304 
4305 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4306 {
4307     BlockDriver *drv = bs->drv;
4308     if (drv && drv->bdrv_get_specific_info) {
4309         return drv->bdrv_get_specific_info(bs);
4310     }
4311     return NULL;
4312 }
4313 
4314 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4315                       int64_t pos, int size)
4316 {
4317     QEMUIOVector qiov;
4318     struct iovec iov = {
4319         .iov_base   = (void *) buf,
4320         .iov_len    = size,
4321     };
4322 
4323     qemu_iovec_init_external(&qiov, &iov, 1);
4324     return bdrv_writev_vmstate(bs, &qiov, pos);
4325 }
4326 
4327 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4328 {
4329     BlockDriver *drv = bs->drv;
4330 
4331     if (!drv) {
4332         return -ENOMEDIUM;
4333     } else if (drv->bdrv_save_vmstate) {
4334         return drv->bdrv_save_vmstate(bs, qiov, pos);
4335     } else if (bs->file) {
4336         return bdrv_writev_vmstate(bs->file, qiov, pos);
4337     }
4338 
4339     return -ENOTSUP;
4340 }
4341 
4342 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4343                       int64_t pos, int size)
4344 {
4345     BlockDriver *drv = bs->drv;
4346     if (!drv)
4347         return -ENOMEDIUM;
4348     if (drv->bdrv_load_vmstate)
4349         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4350     if (bs->file)
4351         return bdrv_load_vmstate(bs->file, buf, pos, size);
4352     return -ENOTSUP;
4353 }
4354 
4355 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4356 {
4357     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4358         return;
4359     }
4360 
4361     bs->drv->bdrv_debug_event(bs, event);
4362 }
4363 
4364 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4365                           const char *tag)
4366 {
4367     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4368         bs = bs->file;
4369     }
4370 
4371     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4372         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4373     }
4374 
4375     return -ENOTSUP;
4376 }
4377 
4378 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4379 {
4380     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4381         bs = bs->file;
4382     }
4383 
4384     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4385         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4386     }
4387 
4388     return -ENOTSUP;
4389 }
4390 
4391 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4392 {
4393     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4394         bs = bs->file;
4395     }
4396 
4397     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4398         return bs->drv->bdrv_debug_resume(bs, tag);
4399     }
4400 
4401     return -ENOTSUP;
4402 }
4403 
4404 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4405 {
4406     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4407         bs = bs->file;
4408     }
4409 
4410     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4411         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4412     }
4413 
4414     return false;
4415 }
4416 
4417 int bdrv_is_snapshot(BlockDriverState *bs)
4418 {
4419     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4420 }
4421 
4422 /* backing_file can be relative, absolute, or a protocol.  If it is
4423  * relative, it must be relative to the chain.  So, passing in bs->filename
4424  * from a BDS as backing_file should not be done, as that may be relative to
4425  * the CWD rather than the chain. */
4426 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4427         const char *backing_file)
4428 {
4429     char *filename_full = NULL;
4430     char *backing_file_full = NULL;
4431     char *filename_tmp = NULL;
4432     int is_protocol = 0;
4433     BlockDriverState *curr_bs = NULL;
4434     BlockDriverState *retval = NULL;
4435 
4436     if (!bs || !bs->drv || !backing_file) {
4437         return NULL;
4438     }
4439 
4440     filename_full     = g_malloc(PATH_MAX);
4441     backing_file_full = g_malloc(PATH_MAX);
4442     filename_tmp      = g_malloc(PATH_MAX);
4443 
4444     is_protocol = path_has_protocol(backing_file);
4445 
4446     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4447 
4448         /* If either of the filename paths is actually a protocol, then
4449          * compare unmodified paths; otherwise make paths relative */
4450         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4451             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4452                 retval = curr_bs->backing_hd;
4453                 break;
4454             }
4455         } else {
4456             /* If not an absolute filename path, make it relative to the current
4457              * image's filename path */
4458             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4459                          backing_file);
4460 
4461             /* We are going to compare absolute pathnames */
4462             if (!realpath(filename_tmp, filename_full)) {
4463                 continue;
4464             }
4465 
4466             /* We need to make sure the backing filename we are comparing against
4467              * is relative to the current image filename (or absolute) */
4468             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4469                          curr_bs->backing_file);
4470 
4471             if (!realpath(filename_tmp, backing_file_full)) {
4472                 continue;
4473             }
4474 
4475             if (strcmp(backing_file_full, filename_full) == 0) {
4476                 retval = curr_bs->backing_hd;
4477                 break;
4478             }
4479         }
4480     }
4481 
4482     g_free(filename_full);
4483     g_free(backing_file_full);
4484     g_free(filename_tmp);
4485     return retval;
4486 }
4487 
4488 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4489 {
4490     if (!bs->drv) {
4491         return 0;
4492     }
4493 
4494     if (!bs->backing_hd) {
4495         return 0;
4496     }
4497 
4498     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4499 }
4500 
4501 /**************************************************************/
4502 /* async I/Os */
4503 
4504 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4505                            QEMUIOVector *qiov, int nb_sectors,
4506                            BlockCompletionFunc *cb, void *opaque)
4507 {
4508     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4509 
4510     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4511                                  cb, opaque, false);
4512 }
4513 
4514 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4515                             QEMUIOVector *qiov, int nb_sectors,
4516                             BlockCompletionFunc *cb, void *opaque)
4517 {
4518     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4519 
4520     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4521                                  cb, opaque, true);
4522 }
4523 
4524 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4525         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4526         BlockCompletionFunc *cb, void *opaque)
4527 {
4528     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4529 
4530     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4531                                  BDRV_REQ_ZERO_WRITE | flags,
4532                                  cb, opaque, true);
4533 }
4534 
4535 
4536 typedef struct MultiwriteCB {
4537     int error;
4538     int num_requests;
4539     int num_callbacks;
4540     struct {
4541         BlockCompletionFunc *cb;
4542         void *opaque;
4543         QEMUIOVector *free_qiov;
4544     } callbacks[];
4545 } MultiwriteCB;
4546 
4547 static void multiwrite_user_cb(MultiwriteCB *mcb)
4548 {
4549     int i;
4550 
4551     for (i = 0; i < mcb->num_callbacks; i++) {
4552         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4553         if (mcb->callbacks[i].free_qiov) {
4554             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4555         }
4556         g_free(mcb->callbacks[i].free_qiov);
4557     }
4558 }
4559 
4560 static void multiwrite_cb(void *opaque, int ret)
4561 {
4562     MultiwriteCB *mcb = opaque;
4563 
4564     trace_multiwrite_cb(mcb, ret);
4565 
4566     if (ret < 0 && !mcb->error) {
4567         mcb->error = ret;
4568     }
4569 
4570     mcb->num_requests--;
4571     if (mcb->num_requests == 0) {
4572         multiwrite_user_cb(mcb);
4573         g_free(mcb);
4574     }
4575 }
4576 
4577 static int multiwrite_req_compare(const void *a, const void *b)
4578 {
4579     const BlockRequest *req1 = a, *req2 = b;
4580 
4581     /*
4582      * Note that we can't simply subtract req2->sector from req1->sector
4583      * here as that could overflow the return value.
4584      */
4585     if (req1->sector > req2->sector) {
4586         return 1;
4587     } else if (req1->sector < req2->sector) {
4588         return -1;
4589     } else {
4590         return 0;
4591     }
4592 }
4593 
4594 /*
4595  * Takes a bunch of requests and tries to merge them. Returns the number of
4596  * requests that remain after merging.
4597  */
4598 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4599     int num_reqs, MultiwriteCB *mcb)
4600 {
4601     int i, outidx;
4602 
4603     // Sort requests by start sector
4604     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4605 
4606     // Check if adjacent requests touch the same clusters. If so, combine
4607     // them; only exactly sequential or overlapping requests are merged.
4608     outidx = 0;
4609     for (i = 1; i < num_reqs; i++) {
4610         int merge = 0;
4611         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4612 
4613         // Handle exactly sequential writes and overlapping writes.
4614         if (reqs[i].sector <= oldreq_last) {
4615             merge = 1;
4616         }
4617 
4618         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4619             merge = 0;
4620         }
4621 
4622         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4623             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4624             merge = 0;
4625         }
4626 
4627         if (merge) {
4628             size_t size;
4629             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4630             qemu_iovec_init(qiov,
4631                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4632 
4633             // Add the first request to the merged one. If the requests are
4634             // overlapping, drop the last sectors of the first request.
4635             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4636             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4637 
4638             // We should not need to add any zeros between the two requests
4639             assert(reqs[i].sector <= oldreq_last);
4640 
4641             // Add the second request
4642             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4643 
4644             // Add tail of first request, if necessary
4645             if (qiov->size < reqs[outidx].qiov->size) {
4646                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4647                                   reqs[outidx].qiov->size - qiov->size);
4648             }
4649 
4650             reqs[outidx].nb_sectors = qiov->size >> 9;
4651             reqs[outidx].qiov = qiov;
4652 
4653             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4654         } else {
4655             outidx++;
4656             reqs[outidx].sector     = reqs[i].sector;
4657             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4658             reqs[outidx].qiov       = reqs[i].qiov;
4659         }
4660     }
4661 
4662     block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
4663 
4664     return outidx + 1;
4665 }
4666 
4667 /*
4668  * Submit multiple AIO write requests at once.
4669  *
4670  * On success, the function returns 0 and all requests in the reqs array have
4671  * been submitted. In error case this function returns -1, and any of the
4672  * requests may or may not be submitted yet. In particular, this means that the
4673  * callback will be called for some of the requests, for others it won't. The
4674  * caller must check the error field of the BlockRequest to wait for the right
4675  * callbacks (if error != 0, no callback will be called).
4676  *
4677  * The implementation may modify the contents of the reqs array, e.g. to merge
4678  * requests. However, the fields opaque and error are left unmodified as they
4679  * are used to signal failure for a single request to the caller.
4680  */
4681 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4682 {
4683     MultiwriteCB *mcb;
4684     int i;
4685 
4686     /* don't submit writes if we don't have a medium */
4687     if (bs->drv == NULL) {
4688         for (i = 0; i < num_reqs; i++) {
4689             reqs[i].error = -ENOMEDIUM;
4690         }
4691         return -1;
4692     }
4693 
4694     if (num_reqs == 0) {
4695         return 0;
4696     }
4697 
4698     // Create MultiwriteCB structure
4699     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4700     mcb->num_requests = 0;
4701     mcb->num_callbacks = num_reqs;
4702 
4703     for (i = 0; i < num_reqs; i++) {
4704         mcb->callbacks[i].cb = reqs[i].cb;
4705         mcb->callbacks[i].opaque = reqs[i].opaque;
4706     }
4707 
4708     // Check for mergeable requests
4709     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4710 
4711     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4712 
4713     /* Run the aio requests. */
4714     mcb->num_requests = num_reqs;
4715     for (i = 0; i < num_reqs; i++) {
4716         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4717                               reqs[i].nb_sectors, reqs[i].flags,
4718                               multiwrite_cb, mcb,
4719                               true);
4720     }
4721 
4722     return 0;
4723 }
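
/*
 * Illustrative sketch (hypothetical callback and layout): batch two
 * back-to-back writes through bdrv_aio_multiwrite().  Each request
 * keeps its own completion callback even when requests get merged.
 */
static void example_multiwrite_cb(void *opaque, int ret)
{
    /* ret < 0 means this particular request failed */
}

static int example_submit_pair(BlockDriverState *bs,
                               QEMUIOVector *qiov0, QEMUIOVector *qiov1)
{
    BlockRequest reqs[2] = {
        {
            .sector     = 0,
            .nb_sectors = (int)(qiov0->size >> BDRV_SECTOR_BITS),
            .qiov       = qiov0,
            .cb         = example_multiwrite_cb,
        },
        {
            .sector     = (int64_t)(qiov0->size >> BDRV_SECTOR_BITS),
            .nb_sectors = (int)(qiov1->size >> BDRV_SECTOR_BITS),
            .qiov       = qiov1,
            .cb         = example_multiwrite_cb,
        },
    };

    return bdrv_aio_multiwrite(bs, reqs, 2);
}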
4724 
4725 void bdrv_aio_cancel(BlockAIOCB *acb)
4726 {
4727     qemu_aio_ref(acb);
4728     bdrv_aio_cancel_async(acb);
4729     while (acb->refcnt > 1) {
4730         if (acb->aiocb_info->get_aio_context) {
4731             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4732         } else if (acb->bs) {
4733             aio_poll(bdrv_get_aio_context(acb->bs), true);
4734         } else {
4735             abort();
4736         }
4737     }
4738     qemu_aio_unref(acb);
4739 }
4740 
4741 /* Async version of aio cancel. The caller is not blocked if the acb implements
4742  * cancel_async; otherwise we do nothing and let the request complete normally.
4743  * In either case the completion callback must be called. */
4744 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4745 {
4746     if (acb->aiocb_info->cancel_async) {
4747         acb->aiocb_info->cancel_async(acb);
4748     }
4749 }
4750 
4751 /**************************************************************/
4752 /* async block device emulation */
4753 
4754 typedef struct BlockAIOCBSync {
4755     BlockAIOCB common;
4756     QEMUBH *bh;
4757     int ret;
4758     /* vector translation state */
4759     QEMUIOVector *qiov;
4760     uint8_t *bounce;
4761     int is_write;
4762 } BlockAIOCBSync;
4763 
4764 static const AIOCBInfo bdrv_em_aiocb_info = {
4765     .aiocb_size         = sizeof(BlockAIOCBSync),
4766 };
4767 
4768 static void bdrv_aio_bh_cb(void *opaque)
4769 {
4770     BlockAIOCBSync *acb = opaque;
4771 
4772     if (!acb->is_write && acb->ret >= 0) {
4773         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4774     }
4775     qemu_vfree(acb->bounce);
4776     acb->common.cb(acb->common.opaque, acb->ret);
4777     qemu_bh_delete(acb->bh);
4778     acb->bh = NULL;
4779     qemu_aio_unref(acb);
4780 }
4781 
4782 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4783                                       int64_t sector_num,
4784                                       QEMUIOVector *qiov,
4785                                       int nb_sectors,
4786                                       BlockCompletionFunc *cb,
4787                                       void *opaque,
4788                                       int is_write)
4789 
4790 {
4791     BlockAIOCBSync *acb;
4792 
4793     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4794     acb->is_write = is_write;
4795     acb->qiov = qiov;
4796     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4797     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4798 
4799     if (acb->bounce == NULL) {
4800         acb->ret = -ENOMEM;
4801     } else if (is_write) {
4802         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4803         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4804     } else {
4805         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4806     }
4807 
4808     qemu_bh_schedule(acb->bh);
4809 
4810     return &acb->common;
4811 }
4812 
4813 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4814         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4815         BlockCompletionFunc *cb, void *opaque)
4816 {
4817     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4818 }
4819 
4820 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4821         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4822         BlockCompletionFunc *cb, void *opaque)
4823 {
4824     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4825 }
4826 
4827 
4828 typedef struct BlockAIOCBCoroutine {
4829     BlockAIOCB common;
4830     BlockRequest req;
4831     bool is_write;
4832     bool *done;
4833     QEMUBH* bh;
4834 } BlockAIOCBCoroutine;
4835 
4836 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4837     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
4838 };
4839 
4840 static void bdrv_co_em_bh(void *opaque)
4841 {
4842     BlockAIOCBCoroutine *acb = opaque;
4843 
4844     acb->common.cb(acb->common.opaque, acb->req.error);
4845 
4846     qemu_bh_delete(acb->bh);
4847     qemu_aio_unref(acb);
4848 }
4849 
4850 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4851 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4852 {
4853     BlockAIOCBCoroutine *acb = opaque;
4854     BlockDriverState *bs = acb->common.bs;
4855 
4856     if (!acb->is_write) {
4857         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4858             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4859     } else {
4860         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4861             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4862     }
4863 
4864     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4865     qemu_bh_schedule(acb->bh);
4866 }
4867 
4868 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4869                                          int64_t sector_num,
4870                                          QEMUIOVector *qiov,
4871                                          int nb_sectors,
4872                                          BdrvRequestFlags flags,
4873                                          BlockCompletionFunc *cb,
4874                                          void *opaque,
4875                                          bool is_write)
4876 {
4877     Coroutine *co;
4878     BlockAIOCBCoroutine *acb;
4879 
4880     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4881     acb->req.sector = sector_num;
4882     acb->req.nb_sectors = nb_sectors;
4883     acb->req.qiov = qiov;
4884     acb->req.flags = flags;
4885     acb->is_write = is_write;
4886 
4887     co = qemu_coroutine_create(bdrv_co_do_rw);
4888     qemu_coroutine_enter(co, acb);
4889 
4890     return &acb->common;
4891 }
4892 
4893 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4894 {
4895     BlockAIOCBCoroutine *acb = opaque;
4896     BlockDriverState *bs = acb->common.bs;
4897 
4898     acb->req.error = bdrv_co_flush(bs);
4899     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4900     qemu_bh_schedule(acb->bh);
4901 }
4902 
4903 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4904         BlockCompletionFunc *cb, void *opaque)
4905 {
4906     trace_bdrv_aio_flush(bs, opaque);
4907 
4908     Coroutine *co;
4909     BlockAIOCBCoroutine *acb;
4910 
4911     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4912 
4913     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4914     qemu_coroutine_enter(co, acb);
4915 
4916     return &acb->common;
4917 }
4918 
4919 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4920 {
4921     BlockAIOCBCoroutine *acb = opaque;
4922     BlockDriverState *bs = acb->common.bs;
4923 
4924     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4925     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4926     qemu_bh_schedule(acb->bh);
4927 }
4928 
4929 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4930         int64_t sector_num, int nb_sectors,
4931         BlockCompletionFunc *cb, void *opaque)
4932 {
4933     Coroutine *co;
4934     BlockAIOCBCoroutine *acb;
4935 
4936     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4937 
4938     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4939     acb->req.sector = sector_num;
4940     acb->req.nb_sectors = nb_sectors;
4941     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4942     qemu_coroutine_enter(co, acb);
4943 
4944     return &acb->common;
4945 }
4946 
4947 void bdrv_init(void)
4948 {
4949     module_call_init(MODULE_INIT_BLOCK);
4950 }
4951 
4952 void bdrv_init_with_whitelist(void)
4953 {
4954     use_bdrv_whitelist = 1;
4955     bdrv_init();
4956 }
4957 
4958 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4959                    BlockCompletionFunc *cb, void *opaque)
4960 {
4961     BlockAIOCB *acb;
4962 
4963     acb = g_slice_alloc(aiocb_info->aiocb_size);
4964     acb->aiocb_info = aiocb_info;
4965     acb->bs = bs;
4966     acb->cb = cb;
4967     acb->opaque = opaque;
4968     acb->refcnt = 1;
4969     return acb;
4970 }
4971 
4972 void qemu_aio_ref(void *p)
4973 {
4974     BlockAIOCB *acb = p;
4975     acb->refcnt++;
4976 }
4977 
4978 void qemu_aio_unref(void *p)
4979 {
4980     BlockAIOCB *acb = p;
4981     assert(acb->refcnt > 0);
4982     if (--acb->refcnt == 0) {
4983         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4984     }
4985 }
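
/*
 * Illustrative sketch: a minimal AIOCB type allocated through
 * qemu_aio_get().  The object starts with refcnt == 1; whoever invokes
 * the completion callback drops that reference with qemu_aio_unref(),
 * while cancellers take an extra reference around their wait (see
 * bdrv_aio_cancel() above).
 */
static const AIOCBInfo example_aiocb_info = {
    .aiocb_size = sizeof(BlockAIOCB),
};

static BlockAIOCB *example_alloc_acb(BlockDriverState *bs,
                                     BlockCompletionFunc *cb, void *opaque)
{
    return qemu_aio_get(&example_aiocb_info, bs, cb, opaque);
}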
4986 
4987 /**************************************************************/
4988 /* Coroutine block device emulation */
4989 
4990 typedef struct CoroutineIOCompletion {
4991     Coroutine *coroutine;
4992     int ret;
4993 } CoroutineIOCompletion;
4994 
4995 static void bdrv_co_io_em_complete(void *opaque, int ret)
4996 {
4997     CoroutineIOCompletion *co = opaque;
4998 
4999     co->ret = ret;
5000     qemu_coroutine_enter(co->coroutine, NULL);
5001 }
5002 
5003 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
5004                                       int nb_sectors, QEMUIOVector *iov,
5005                                       bool is_write)
5006 {
5007     CoroutineIOCompletion co = {
5008         .coroutine = qemu_coroutine_self(),
5009     };
5010     BlockAIOCB *acb;
5011 
5012     if (is_write) {
5013         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
5014                                        bdrv_co_io_em_complete, &co);
5015     } else {
5016         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
5017                                       bdrv_co_io_em_complete, &co);
5018     }
5019 
5020     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
5021     if (!acb) {
5022         return -EIO;
5023     }
5024     qemu_coroutine_yield();
5025 
5026     return co.ret;
5027 }
5028 
5029 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
5030                                          int64_t sector_num, int nb_sectors,
5031                                          QEMUIOVector *iov)
5032 {
5033     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
5034 }
5035 
5036 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
5037                                          int64_t sector_num, int nb_sectors,
5038                                          QEMUIOVector *iov)
5039 {
5040     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
5041 }
5042 
5043 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
5044 {
5045     RwCo *rwco = opaque;
5046 
5047     rwco->ret = bdrv_co_flush(rwco->bs);
5048 }
5049 
5050 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
5051 {
5052     int ret;
5053 
5054     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
5055         return 0;
5056     }
5057 
5058     /* Write back cached data to the OS even with cache=unsafe */
5059     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
5060     if (bs->drv->bdrv_co_flush_to_os) {
5061         ret = bs->drv->bdrv_co_flush_to_os(bs);
5062         if (ret < 0) {
5063             return ret;
5064         }
5065     }
5066 
5067     /* But don't actually force it to the disk with cache=unsafe */
5068     if (bs->open_flags & BDRV_O_NO_FLUSH) {
5069         goto flush_parent;
5070     }
5071 
5072     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
5073     if (bs->drv->bdrv_co_flush_to_disk) {
5074         ret = bs->drv->bdrv_co_flush_to_disk(bs);
5075     } else if (bs->drv->bdrv_aio_flush) {
5076         BlockAIOCB *acb;
5077         CoroutineIOCompletion co = {
5078             .coroutine = qemu_coroutine_self(),
5079         };
5080 
5081         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
5082         if (acb == NULL) {
5083             ret = -EIO;
5084         } else {
5085             qemu_coroutine_yield();
5086             ret = co.ret;
5087         }
5088     } else {
5089         /*
5090          * Some block drivers always operate in either writethrough or unsafe
5091          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
5092          * know how the server works (because the behaviour is hardcoded or
5093          * depends on server-side configuration), so we can't ensure that
5094          * everything is safe on disk. Returning an error doesn't work because
5095          * that would break guests even if the server operates in writethrough
5096          * mode.
5097          *
5098          * Let's hope the user knows what they're doing.
5099          */
5100         ret = 0;
5101     }
5102     if (ret < 0) {
5103         return ret;
5104     }
5105 
5106     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
5107      * in the case of cache=unsafe, so there are no useless flushes.
5108      */
5109 flush_parent:
5110     return bdrv_co_flush(bs->file);
5111 }
5112 
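/*
 * Used when an image was opened with BDRV_O_INCOMING (e.g. on the
 * destination of a migration): the format driver drops or re-reads any
 * metadata it cached before the source's final writes, so stale data is
 * not served after the handover.
 */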
5113 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5114 {
5115     Error *local_err = NULL;
5116     int ret;
5117 
5118     if (!bs->drv)  {
5119         return;
5120     }
5121 
5122     if (!(bs->open_flags & BDRV_O_INCOMING)) {
5123         return;
5124     }
5125     bs->open_flags &= ~BDRV_O_INCOMING;
5126 
5127     if (bs->drv->bdrv_invalidate_cache) {
5128         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5129     } else if (bs->file) {
5130         bdrv_invalidate_cache(bs->file, &local_err);
5131     }
5132     if (local_err) {
5133         error_propagate(errp, local_err);
5134         return;
5135     }
5136 
5137     ret = refresh_total_sectors(bs, bs->total_sectors);
5138     if (ret < 0) {
5139         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5140         return;
5141     }
5142 }
5143 
5144 void bdrv_invalidate_cache_all(Error **errp)
5145 {
5146     BlockDriverState *bs;
5147     Error *local_err = NULL;
5148 
5149     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5150         AioContext *aio_context = bdrv_get_aio_context(bs);
5151 
5152         aio_context_acquire(aio_context);
5153         bdrv_invalidate_cache(bs, &local_err);
5154         aio_context_release(aio_context);
5155         if (local_err) {
5156             error_propagate(errp, local_err);
5157             return;
5158         }
5159     }
5160 }
5161 
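/*
 * Synchronous wrapper around bdrv_co_flush(): in coroutine context the
 * request simply runs inline; otherwise a temporary coroutine is entered
 * and aio_poll() is spun until rwco.ret leaves the NOT_DONE sentinel.
 * bdrv_discard() below follows the same pattern.
 */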
5162 int bdrv_flush(BlockDriverState *bs)
5163 {
5164     Coroutine *co;
5165     RwCo rwco = {
5166         .bs = bs,
5167         .ret = NOT_DONE,
5168     };
5169 
5170     if (qemu_in_coroutine()) {
5171         /* Fast-path if already in coroutine context */
5172         bdrv_flush_co_entry(&rwco);
5173     } else {
5174         AioContext *aio_context = bdrv_get_aio_context(bs);
5175 
5176         co = qemu_coroutine_create(bdrv_flush_co_entry);
5177         qemu_coroutine_enter(co, &rwco);
5178         while (rwco.ret == NOT_DONE) {
5179             aio_poll(aio_context, true);
5180         }
5181     }
5182 
5183     return rwco.ret;
5184 }
5185 
5186 typedef struct DiscardCo {
5187     BlockDriverState *bs;
5188     int64_t sector_num;
5189     int nb_sectors;
5190     int ret;
5191 } DiscardCo;
5192 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5193 {
5194     DiscardCo *rwco = opaque;
5195 
5196     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5197 }
5198 
5199 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5200                                  int nb_sectors)
5201 {
5202     int max_discard, ret;
5203 
5204     if (!bs->drv) {
5205         return -ENOMEDIUM;
5206     }
5207 
5208     ret = bdrv_check_request(bs, sector_num, nb_sectors);
5209     if (ret < 0) {
5210         return ret;
5211     } else if (bs->read_only) {
5212         return -EROFS;
5213     }
5214 
5215     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5216 
5217     /* Do nothing if disabled.  */
5218     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5219         return 0;
5220     }
5221 
5222     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5223         return 0;
5224     }
5225 
5226     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
5227     while (nb_sectors > 0) {
5228         int ret;
5229         int num = nb_sectors;
5230 
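        /*
         * Worked example (editorial): with bl.discard_alignment == 128,
         * sector_num == 100 and nb_sectors == 1000, the first iteration
         * clamps num to 128 and subtracts 100 % 128, discarding the 28
         * sectors 100..127; every later chunk then starts aligned.
         */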
5231         /* align request */
5232         if (bs->bl.discard_alignment &&
5233             num >= bs->bl.discard_alignment &&
5234             sector_num % bs->bl.discard_alignment) {
5235             if (num > bs->bl.discard_alignment) {
5236                 num = bs->bl.discard_alignment;
5237             }
5238             num -= sector_num % bs->bl.discard_alignment;
5239         }
5240 
5241         /* limit request size */
5242         if (num > max_discard) {
5243             num = max_discard;
5244         }
5245 
5246         if (bs->drv->bdrv_co_discard) {
5247             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5248         } else {
5249             BlockAIOCB *acb;
5250             CoroutineIOCompletion co = {
5251                 .coroutine = qemu_coroutine_self(),
5252             };
5253 
5254             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5255                                             bdrv_co_io_em_complete, &co);
5256             if (acb == NULL) {
5257                 return -EIO;
5258             } else {
5259                 qemu_coroutine_yield();
5260                 ret = co.ret;
5261             }
5262         }
5263         if (ret && ret != -ENOTSUP) {
5264             return ret;
5265         }
5266 
5267         sector_num += num;
5268         nb_sectors -= num;
5269     }
5270     return 0;
5271 }
5272 
5273 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5274 {
5275     Coroutine *co;
5276     DiscardCo rwco = {
5277         .bs = bs,
5278         .sector_num = sector_num,
5279         .nb_sectors = nb_sectors,
5280         .ret = NOT_DONE,
5281     };
5282 
5283     if (qemu_in_coroutine()) {
5284         /* Fast-path if already in coroutine context */
5285         bdrv_discard_co_entry(&rwco);
5286     } else {
5287         AioContext *aio_context = bdrv_get_aio_context(bs);
5288 
5289         co = qemu_coroutine_create(bdrv_discard_co_entry);
5290         qemu_coroutine_enter(co, &rwco);
5291         while (rwco.ret == NOT_DONE) {
5292             aio_poll(aio_context, true);
5293         }
5294     }
5295 
5296     return rwco.ret;
5297 }
5298 
5299 /**************************************************************/
5300 /* removable device support */
5301 
5302 /**
5303  * Return TRUE if the media is present
5304  */
5305 int bdrv_is_inserted(BlockDriverState *bs)
5306 {
5307     BlockDriver *drv = bs->drv;
5308 
5309     if (!drv)
5310         return 0;
5311     if (!drv->bdrv_is_inserted)
5312         return 1;
5313     return drv->bdrv_is_inserted(bs);
5314 }
5315 
5316 /**
5317  * Return whether the media changed since the last call to this
5318  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5319  */
5320 int bdrv_media_changed(BlockDriverState *bs)
5321 {
5322     BlockDriver *drv = bs->drv;
5323 
5324     if (drv && drv->bdrv_media_changed) {
5325         return drv->bdrv_media_changed(bs);
5326     }
5327     return -ENOTSUP;
5328 }
5329 
5330 /**
5331  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5332  */
5333 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5334 {
5335     BlockDriver *drv = bs->drv;
5336     const char *device_name;
5337 
5338     if (drv && drv->bdrv_eject) {
5339         drv->bdrv_eject(bs, eject_flag);
5340     }
5341 
5342     device_name = bdrv_get_device_name(bs);
5343     if (device_name[0] != '\0') {
5344         qapi_event_send_device_tray_moved(device_name,
5345                                           eject_flag, &error_abort);
5346     }
5347 }
5348 
5349 /**
5350  * Lock or unlock the media (if it is locked, the user won't be able
5351  * to eject it manually).
5352  */
5353 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5354 {
5355     BlockDriver *drv = bs->drv;
5356 
5357     trace_bdrv_lock_medium(bs, locked);
5358 
5359     if (drv && drv->bdrv_lock_medium) {
5360         drv->bdrv_lock_medium(bs, locked);
5361     }
5362 }
5363 
5364 /* needed for generic scsi interface */
5365 
5366 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5367 {
5368     BlockDriver *drv = bs->drv;
5369 
5370     if (drv && drv->bdrv_ioctl)
5371         return drv->bdrv_ioctl(bs, req, buf);
5372     return -ENOTSUP;
5373 }
5374 
5375 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5376         unsigned long int req, void *buf,
5377         BlockCompletionFunc *cb, void *opaque)
5378 {
5379     BlockDriver *drv = bs->drv;
5380 
5381     if (drv && drv->bdrv_aio_ioctl)
5382         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5383     return NULL;
5384 }
5385 
5386 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5387 {
5388     bs->guest_block_size = align;
5389 }
5390 
5391 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5392 {
5393     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5394 }
5395 
5396 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5397 {
5398     return memset(qemu_blockalign(bs, size), 0, size);
5399 }
5400 
5401 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5402 {
5403     size_t align = bdrv_opt_mem_align(bs);
5404 
5405     /* Ensure that NULL is never returned on success */
5406     assert(align > 0);
5407     if (size == 0) {
5408         size = align;
5409     }
5410 
5411     return qemu_try_memalign(align, size);
5412 }
5413 
5414 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5415 {
5416     void *mem = qemu_try_blockalign(bs, size);
5417 
5418     if (mem) {
5419         memset(mem, 0, size);
5420     }
5421 
5422     return mem;
5423 }
5424 
5425 /*
5426  * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5427  */
5428 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5429 {
5430     int i;
5431     size_t alignment = bdrv_opt_mem_align(bs);
5432 
5433     for (i = 0; i < qiov->niov; i++) {
5434         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5435             return false;
5436         }
5437         if (qiov->iov[i].iov_len % alignment) {
5438             return false;
5439         }
5440     }
5441 
5442     return true;
5443 }
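
/*
 * Editorial sketch (hypothetical usage): building a vector that passes
 * bdrv_qiov_is_aligned() by allocating through qemu_blockalign(), which
 * honours bdrv_opt_mem_align().  The caller would later release it with
 * qemu_iovec_destroy() and qemu_vfree().
 */
static void example_aligned_qiov(BlockDriverState *bs, QEMUIOVector *qiov)
{
    size_t len = bdrv_opt_mem_align(bs);    /* aligned base *and* length */
    void *buf = qemu_blockalign(bs, len);

    qemu_iovec_init(qiov, 1);
    qemu_iovec_add(qiov, buf, len);
    assert(bdrv_qiov_is_aligned(bs, qiov));
}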
5444 
5445 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5446                                           Error **errp)
5447 {
5448     int64_t bitmap_size;
5449     BdrvDirtyBitmap *bitmap;
5450 
5451     assert((granularity & (granularity - 1)) == 0);
5452 
5453     granularity >>= BDRV_SECTOR_BITS;
5454     assert(granularity);
5455     bitmap_size = bdrv_nb_sectors(bs);
5456     if (bitmap_size < 0) {
5457         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5458         errno = -bitmap_size;
5459         return NULL;
5460     }
5461     bitmap = g_new0(BdrvDirtyBitmap, 1);
5462     bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(granularity));
5463     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5464     return bitmap;
5465 }
5466 
5467 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5468 {
5469     BdrvDirtyBitmap *bm, *next;
5470     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5471         if (bm == bitmap) {
5472             QLIST_REMOVE(bitmap, list);
5473             hbitmap_free(bitmap->bitmap);
5474             g_free(bitmap);
5475             return;
5476         }
5477     }
5478 }
5479 
5480 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5481 {
5482     BdrvDirtyBitmap *bm;
5483     BlockDirtyInfoList *list = NULL;
5484     BlockDirtyInfoList **plist = &list;
5485 
5486     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5487         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5488         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5489         info->count = bdrv_get_dirty_count(bs, bm);
5490         info->granularity =
5491             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5492         entry->value = info;
5493         *plist = entry;
5494         plist = &entry->next;
5495     }
5496 
5497     return list;
5498 }
5499 
5500 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5501 {
5502     if (bitmap) {
5503         return hbitmap_get(bitmap->bitmap, sector);
5504     } else {
5505         return 0;
5506     }
5507 }
5508 
5509 void bdrv_dirty_iter_init(BlockDriverState *bs,
5510                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5511 {
5512     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5513 }
5514 
5515 void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
5516                            int64_t cur_sector, int nr_sectors)
5517 {
5518     hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5519 }
5520 
5521 void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
5522                              int64_t cur_sector, int nr_sectors)
5523 {
5524     hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5525 }
5526 
5527 static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5528                            int nr_sectors)
5529 {
5530     BdrvDirtyBitmap *bitmap;
5531     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5532         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5533     }
5534 }
5535 
5536 static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
5537                              int nr_sectors)
5538 {
5539     BdrvDirtyBitmap *bitmap;
5540     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5541         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5542     }
5543 }
5544 
5545 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5546 {
5547     return hbitmap_count(bitmap->bitmap);
5548 }
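
/*
 * Editorial sketch (hypothetical usage): a typical dirty bitmap round
 * trip as a block job would perform it.  The 64 KiB granularity is an
 * assumption for illustration; it must be a power of two.
 */
static void example_dirty_bitmap_roundtrip(BlockDriverState *bs, Error **errp)
{
    BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, errp);

    if (!bitmap) {
        return;
    }
    bdrv_set_dirty_bitmap(bs, bitmap, 0, 32);   /* mark sectors 0..31 */
    /* the count is reported in sectors, rounded up to the granularity */
    if (bdrv_get_dirty_count(bs, bitmap) > 0) {
        bdrv_reset_dirty_bitmap(bs, bitmap, 0, 32);
    }
    bdrv_release_dirty_bitmap(bs, bitmap);
}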
5549 
5550 /* Get a reference to bs */
5551 void bdrv_ref(BlockDriverState *bs)
5552 {
5553     bs->refcnt++;
5554 }
5555 
5556 /* Release a previously grabbed reference to bs.
5557  * If, after releasing, the reference count is zero, the BlockDriverState
5558  * is deleted. */
5559 void bdrv_unref(BlockDriverState *bs)
5560 {
5561     if (!bs) {
5562         return;
5563     }
5564     assert(bs->refcnt > 0);
5565     if (--bs->refcnt == 0) {
5566         bdrv_delete(bs);
5567     }
5568 }
5569 
5570 struct BdrvOpBlocker {
5571     Error *reason;
5572     QLIST_ENTRY(BdrvOpBlocker) list;
5573 };
5574 
5575 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5576 {
5577     BdrvOpBlocker *blocker;
5578     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5579     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5580         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5581         if (errp) {
5582             error_setg(errp, "Device '%s' is busy: %s",
5583                        bdrv_get_device_name(bs),
5584                        error_get_pretty(blocker->reason));
5585         }
5586         return true;
5587     }
5588     return false;
5589 }
5590 
5591 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5592 {
5593     BdrvOpBlocker *blocker;
5594     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5595 
5596     blocker = g_new0(BdrvOpBlocker, 1);
5597     blocker->reason = reason;
5598     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5599 }
5600 
5601 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5602 {
5603     BdrvOpBlocker *blocker, *next;
5604     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5605     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5606         if (blocker->reason == reason) {
5607             QLIST_REMOVE(blocker, list);
5608             g_free(blocker);
5609         }
5610     }
5611 }
5612 
5613 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5614 {
5615     int i;
5616     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5617         bdrv_op_block(bs, i, reason);
5618     }
5619 }
5620 
5621 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5622 {
5623     int i;
5624     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5625         bdrv_op_unblock(bs, i, reason);
5626     }
5627 }
5628 
5629 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5630 {
5631     int i;
5632 
5633     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5634         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5635             return false;
5636         }
5637     }
5638     return true;
5639 }
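
/*
 * Editorial sketch (hypothetical usage): how a job might guard a device
 * against a conflicting operation.  The reason text is illustrative.
 */
static void example_resize_guard(BlockDriverState *bs)
{
    Error *reason = NULL;

    error_setg(&reason, "node is in use by an example job");
    bdrv_op_block(bs, BLOCK_OP_TYPE_RESIZE, reason);

    /* a resize attempt now fails with "Device '...' is busy: ..." */
    assert(bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL));

    bdrv_op_unblock(bs, BLOCK_OP_TYPE_RESIZE, reason);
    error_free(reason);
}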
5640 
5641 void bdrv_iostatus_enable(BlockDriverState *bs)
5642 {
5643     bs->iostatus_enabled = true;
5644     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5645 }
5646 
5647 /* The I/O status is only enabled if the drive explicitly
5648  * enables it _and_ the VM is configured to stop on errors */
5649 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5650 {
5651     return (bs->iostatus_enabled &&
5652            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5653             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5654             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5655 }
5656 
5657 void bdrv_iostatus_disable(BlockDriverState *bs)
5658 {
5659     bs->iostatus_enabled = false;
5660 }
5661 
5662 void bdrv_iostatus_reset(BlockDriverState *bs)
5663 {
5664     if (bdrv_iostatus_is_enabled(bs)) {
5665         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5666         if (bs->job) {
5667             block_job_iostatus_reset(bs->job);
5668         }
5669     }
5670 }
5671 
5672 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5673 {
5674     assert(bdrv_iostatus_is_enabled(bs));
5675     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5676         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5677                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5678     }
5679 }
5680 
5681 void bdrv_img_create(const char *filename, const char *fmt,
5682                      const char *base_filename, const char *base_fmt,
5683                      char *options, uint64_t img_size, int flags,
5684                      Error **errp, bool quiet)
5685 {
5686     QemuOptsList *create_opts = NULL;
5687     QemuOpts *opts = NULL;
5688     const char *backing_fmt, *backing_file;
5689     int64_t size;
5690     BlockDriver *drv, *proto_drv;
5691     BlockDriver *backing_drv = NULL;
5692     Error *local_err = NULL;
5693     int ret = 0;
5694 
5695     /* Find driver and parse its options */
5696     drv = bdrv_find_format(fmt);
5697     if (!drv) {
5698         error_setg(errp, "Unknown file format '%s'", fmt);
5699         return;
5700     }
5701 
5702     proto_drv = bdrv_find_protocol(filename, true, errp);
5703     if (!proto_drv) {
5704         return;
5705     }
5706 
5707     if (!drv->create_opts) {
5708         error_setg(errp, "Format driver '%s' does not support image creation",
5709                    drv->format_name);
5710         return;
5711     }
5712 
5713     if (!proto_drv->create_opts) {
5714         error_setg(errp, "Protocol driver '%s' does not support image creation",
5715                    proto_drv->format_name);
5716         return;
5717     }
5718 
5719     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5720     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5721 
5722     /* Create parameter list with default values */
5723     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5724     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
5725 
5726     /* Parse -o options */
5727     if (options) {
5728         qemu_opts_do_parse(opts, options, NULL, &local_err);
5729         if (local_err) {
5730             error_report_err(local_err);
5731             local_err = NULL;
5732             error_setg(errp, "Invalid options for file format '%s'", fmt);
5733             goto out;
5734         }
5735     }
5736 
5737     if (base_filename) {
5738         qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
5739         if (local_err) {
5740             error_setg(errp, "Backing file not supported for file format '%s'",
5741                        fmt);
5742             goto out;
5743         }
5744     }
5745 
5746     if (base_fmt) {
5747         qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
5748         if (local_err) {
5749             error_setg(errp, "Backing file format not supported for file "
5750                              "format '%s'", fmt);
5751             goto out;
5752         }
5753     }
5754 
5755     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5756     if (backing_file) {
5757         if (!strcmp(filename, backing_file)) {
5758             error_setg(errp, "Trying to create an image with the "
5759                              "same filename as the backing file");
5760             goto out;
5761         }
5762     }
5763 
5764     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5765     if (backing_fmt) {
5766         backing_drv = bdrv_find_format(backing_fmt);
5767         if (!backing_drv) {
5768             error_setg(errp, "Unknown backing file format '%s'",
5769                        backing_fmt);
5770             goto out;
5771         }
5772     }
5773 
5774     /* The size for the image must always be specified, with one exception:
5775      * if we are using a backing file, we can obtain the size from there. */
5776     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5777     if (size == -1) {
5778         if (backing_file) {
5779             BlockDriverState *bs;
5780             char *full_backing = g_new0(char, PATH_MAX);
5782             int back_flags;
5783 
5784             bdrv_get_full_backing_filename_from_filename(filename, backing_file,
5785                                                          full_backing, PATH_MAX,
5786                                                          &local_err);
5787             if (local_err) {
5788                 g_free(full_backing);
5789                 goto out;
5790             }
5791 
5792             /* backing files always opened read-only */
5793             back_flags =
5794                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5795 
5796             bs = NULL;
5797             ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
5798                             backing_drv, &local_err);
5799             g_free(full_backing);
5800             if (ret < 0) {
5801                 goto out;
5802             }
5803             size = bdrv_getlength(bs);
5804             if (size < 0) {
5805                 error_setg_errno(errp, -size, "Could not get size of '%s'",
5806                                  backing_file);
5807                 bdrv_unref(bs);
5808                 goto out;
5809             }
5810 
5811             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
5812 
5813             bdrv_unref(bs);
5814         } else {
5815             error_setg(errp, "Image creation needs a size parameter");
5816             goto out;
5817         }
5818     }
5819 
5820     if (!quiet) {
5821         printf("Formatting '%s', fmt=%s", filename, fmt);
5822         qemu_opts_print(opts, " ");
5823         puts("");
5824     }
5825 
5826     ret = bdrv_create(drv, filename, opts, &local_err);
5827 
5828     if (ret == -EFBIG) {
5829         /* This is generally a better message than whatever the driver would
5830          * deliver (especially because of the cluster_size_hint), since that
5831          * is most probably not much different from "image too large". */
5832         const char *cluster_size_hint = "";
5833         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5834             cluster_size_hint = " (try using a larger cluster size)";
5835         }
5836         error_setg(errp, "The image size is too large for file format '%s'"
5837                    "%s", fmt, cluster_size_hint);
5838         error_free(local_err);
5839         local_err = NULL;
5840     }
5841 
5842 out:
5843     qemu_opts_del(opts);
5844     qemu_opts_free(create_opts);
5845     if (local_err) {
5846         error_propagate(errp, local_err);
5847     }
5848 }
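
/*
 * Editorial sketch (hypothetical usage): creating a 1 GiB qcow2 image
 * much like qemu-img does.  Filename and format are assumptions; passing
 * NULL for the backing file, backing format and option string keeps the
 * defaults.
 */
static void example_img_create(Error **errp)
{
    bdrv_img_create("/tmp/example.qcow2", "qcow2",
                    NULL, NULL, NULL,       /* no backing file, no -o */
                    1024 * 1024 * 1024,     /* 1 GiB */
                    0, errp, true);         /* flags = 0, quiet */
}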
5849 
5850 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5851 {
5852     return bs->aio_context;
5853 }
5854 
5855 void bdrv_detach_aio_context(BlockDriverState *bs)
5856 {
5857     BdrvAioNotifier *baf;
5858 
5859     if (!bs->drv) {
5860         return;
5861     }
5862 
5863     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5864         baf->detach_aio_context(baf->opaque);
5865     }
5866 
5867     if (bs->io_limits_enabled) {
5868         throttle_detach_aio_context(&bs->throttle_state);
5869     }
5870     if (bs->drv->bdrv_detach_aio_context) {
5871         bs->drv->bdrv_detach_aio_context(bs);
5872     }
5873     if (bs->file) {
5874         bdrv_detach_aio_context(bs->file);
5875     }
5876     if (bs->backing_hd) {
5877         bdrv_detach_aio_context(bs->backing_hd);
5878     }
5879 
5880     bs->aio_context = NULL;
5881 }
5882 
5883 void bdrv_attach_aio_context(BlockDriverState *bs,
5884                              AioContext *new_context)
5885 {
5886     BdrvAioNotifier *ban;
5887 
5888     if (!bs->drv) {
5889         return;
5890     }
5891 
5892     bs->aio_context = new_context;
5893 
5894     if (bs->backing_hd) {
5895         bdrv_attach_aio_context(bs->backing_hd, new_context);
5896     }
5897     if (bs->file) {
5898         bdrv_attach_aio_context(bs->file, new_context);
5899     }
5900     if (bs->drv->bdrv_attach_aio_context) {
5901         bs->drv->bdrv_attach_aio_context(bs, new_context);
5902     }
5903     if (bs->io_limits_enabled) {
5904         throttle_attach_aio_context(&bs->throttle_state, new_context);
5905     }
5906 
5907     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5908         ban->attached_aio_context(new_context, ban->opaque);
5909     }
5910 }
5911 
5912 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5913 {
5914     bdrv_drain_all(); /* ensure there are no in-flight requests */
5915 
5916     bdrv_detach_aio_context(bs);
5917 
5918     /* This function executes in the old AioContext so acquire the new one in
5919      * case it runs in a different thread.
5920      */
5921     aio_context_acquire(new_context);
5922     bdrv_attach_aio_context(bs, new_context);
5923     aio_context_release(new_context);
5924 }
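
/*
 * Editorial sketch (hypothetical usage): handing a BDS over to an
 * IOThread.  iothread_get_aio_context() and the IOThread type are assumed
 * to come from "sysemu/iothread.h", which block.c itself does not include.
 */
static void example_move_to_iothread(BlockDriverState *bs, IOThread *iothread)
{
    AioContext *ctx = iothread_get_aio_context(iothread);

    bdrv_set_aio_context(bs, ctx);  /* drains, detaches, then re-attaches */
}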
5925 
5926 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5927         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5928         void (*detach_aio_context)(void *opaque), void *opaque)
5929 {
5930     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5931     *ban = (BdrvAioNotifier){
5932         .attached_aio_context = attached_aio_context,
5933         .detach_aio_context   = detach_aio_context,
5934         .opaque               = opaque
5935     };
5936 
5937     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5938 }
5939 
5940 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5941                                       void (*attached_aio_context)(AioContext *,
5942                                                                    void *),
5943                                       void (*detach_aio_context)(void *),
5944                                       void *opaque)
5945 {
5946     BdrvAioNotifier *ban, *ban_next;
5947 
5948     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5949         if (ban->attached_aio_context == attached_aio_context &&
5950             ban->detach_aio_context   == detach_aio_context   &&
5951             ban->opaque               == opaque)
5952         {
5953             QLIST_REMOVE(ban, list);
5954             g_free(ban);
5955 
5956             return;
5957         }
5958     }
5959 
5960     abort();
5961 }
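
/*
 * Editorial sketch (hypothetical usage): a device with per-context state
 * registers a notifier pair so it can tear that state down and rebuild it
 * whenever the BDS migrates between AioContexts.  Removal must pass the
 * exact same triple, otherwise the function above aborts.
 */
static void example_attached(AioContext *new_context, void *opaque)
{
    /* e.g. recreate bottom halves or timers in new_context */
}

static void example_detach(void *opaque)
{
    /* e.g. destroy bottom halves or timers of the old context */
}

static void example_track_context(BlockDriverState *bs, void *dev)
{
    bdrv_add_aio_context_notifier(bs, example_attached, example_detach, dev);
    /* ... */
    bdrv_remove_aio_context_notifier(bs, example_attached, example_detach, dev);
}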
5962 
5963 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5964                                     NotifierWithReturn *notifier)
5965 {
5966     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5967 }
5968 
5969 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5970                        BlockDriverAmendStatusCB *status_cb)
5971 {
5972     if (!bs->drv->bdrv_amend_options) {
5973         return -ENOTSUP;
5974     }
5975     return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5976 }
5977 
5978 /* This function will be called by the bdrv_recurse_is_first_non_filter
5979  * method of block filters and by bdrv_is_first_non_filter.
5980  * It is used to test whether the given bs is the candidate, or to recurse
5981  * further down the node graph.
5982  */
5983 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5984                                       BlockDriverState *candidate)
5985 {
5986     /* return false if basic checks fail */
5987     if (!bs || !bs->drv) {
5988         return false;
5989     }
5990 
5991     /* The code reached a driver that is not a block filter; check whether
5992      * bs is the same as the candidate. This is the recursion's termination
5993      * condition. */
5994     if (!bs->drv->is_filter) {
5995         return bs == candidate;
5996     }
5997     /* Down this path the driver is a block filter driver */
5998 
5999     /* If the block filter recursion method is defined use it to recurse down
6000      * the node graph.
6001      */
6002     if (bs->drv->bdrv_recurse_is_first_non_filter) {
6003         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
6004     }
6005 
6006     /* The driver is a block filter but does not allow recursion, so
6007      * return false. */
6008     return false;
6009 }
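
/*
 * Editorial sketch (hypothetical driver code): a filter with a single
 * child on bs->file might simply pass the recursion straight down:
 */
static bool example_filter_recurse_is_first_non_filter(BlockDriverState *bs,
                                                       BlockDriverState *cand)
{
    return bdrv_recurse_is_first_non_filter(bs->file, cand);
}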
6010 
6011 /* This function checks if the candidate is the first non filter bs down its
6012  * bs chain. Since we don't have pointers to parents, it explores all bs chains
6013  * from the top. Some filters can choose not to pass down the recursion.
6014  */
6015 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
6016 {
6017     BlockDriverState *bs;
6018 
6019     /* walk down the bs forest recursively */
6020     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
6021         bool perm;
6022 
6023         /* try to recurse in this top level bs */
6024         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
6025 
6026         /* candidate is the first non filter */
6027         if (perm) {
6028             return true;
6029         }
6030     }
6031 
6032     return false;
6033 }
6034 
6035 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
6036 {
6037     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
6038     AioContext *aio_context;
6039 
6040     if (!to_replace_bs) {
6041         error_setg(errp, "Node name '%s' not found", node_name);
6042         return NULL;
6043     }
6044 
6045     aio_context = bdrv_get_aio_context(to_replace_bs);
6046     aio_context_acquire(aio_context);
6047 
6048     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
6049         to_replace_bs = NULL;
6050         goto out;
6051     }
6052 
6053     /* We don't want an arbitrary node of the BDS chain to be replaced,
6054      * only the topmost non-filter, in order to prevent data corruption.
6055      * Another benefit is that this test excludes backing files, which are
6056      * blocked by the backing blockers.
6057      */
6058     if (!bdrv_is_first_non_filter(to_replace_bs)) {
6059         error_setg(errp, "Only the topmost non-filter node can be replaced");
6060         to_replace_bs = NULL;
6061         goto out;
6062     }
6063 
6064 out:
6065     aio_context_release(aio_context);
6066     return to_replace_bs;
6067 }
6068 
6069 void bdrv_io_plug(BlockDriverState *bs)
6070 {
6071     BlockDriver *drv = bs->drv;
6072     if (drv && drv->bdrv_io_plug) {
6073         drv->bdrv_io_plug(bs);
6074     } else if (bs->file) {
6075         bdrv_io_plug(bs->file);
6076     }
6077 }
6078 
6079 void bdrv_io_unplug(BlockDriverState *bs)
6080 {
6081     BlockDriver *drv = bs->drv;
6082     if (drv && drv->bdrv_io_unplug) {
6083         drv->bdrv_io_unplug(bs);
6084     } else if (bs->file) {
6085         bdrv_io_unplug(bs->file);
6086     }
6087 }
6088 
6089 void bdrv_flush_io_queue(BlockDriverState *bs)
6090 {
6091     BlockDriver *drv = bs->drv;
6092     if (drv && drv->bdrv_flush_io_queue) {
6093         drv->bdrv_flush_io_queue(bs);
6094     } else if (bs->file) {
6095         bdrv_flush_io_queue(bs->file);
6096     }
6097 }
6098 
6099 static bool append_open_options(QDict *d, BlockDriverState *bs)
6100 {
6101     const QDictEntry *entry;
6102     bool found_any = false;
6103 
6104     for (entry = qdict_first(bs->options); entry;
6105          entry = qdict_next(bs->options, entry))
6106     {
6107         /* Only take options for this level and exclude all non-driver-specific
6108          * options */
6109         if (!strchr(qdict_entry_key(entry), '.') &&
6110             strcmp(qdict_entry_key(entry), "node-name"))
6111         {
6112             qobject_incref(qdict_entry_value(entry));
6113             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
6114             found_any = true;
6115         }
6116     }
6117 
6118     return found_any;
6119 }
6120 
6121 /* Updates the following BDS fields:
6122  *  - exact_filename: A filename which may be used for opening a block device
6123  *                    which (mostly) equals the given BDS (even without any
6124  *                    other options; so reading and writing must return the same
6125  *                    results, but caching etc. may be different)
6126  *  - full_open_options: Options which, when given when opening a block device
6127  *                       (without a filename), result in a BDS (mostly)
6128  *                       equalling the given one
6129  *  - filename: If exact_filename is set, it is copied here. Otherwise,
6130  *              full_open_options is converted to a JSON object, prefixed with
6131  *              "json:" (for use through the JSON pseudo protocol) and put here.
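 *
 * Editorial illustration (hypothetical values): for a qcow2 node on top
 * of a raw file, this typically yields full_open_options like
 *   {"driver": "qcow2", "file": {"driver": "file",
 *                                "filename": "/tmp/test.qcow2"}}
 * and, when no exact_filename can be constructed, a filename of
 *   json:{"driver": "qcow2", "file": {...}}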
6132  */
6133 void bdrv_refresh_filename(BlockDriverState *bs)
6134 {
6135     BlockDriver *drv = bs->drv;
6136     QDict *opts;
6137 
6138     if (!drv) {
6139         return;
6140     }
6141 
6142     /* This BDS's file name will most probably depend on its file's name, so
6143      * refresh that first */
6144     if (bs->file) {
6145         bdrv_refresh_filename(bs->file);
6146     }
6147 
6148     if (drv->bdrv_refresh_filename) {
6149         /* Obsolete information is of no use here, so drop the old file name
6150          * information before refreshing it */
6151         bs->exact_filename[0] = '\0';
6152         if (bs->full_open_options) {
6153             QDECREF(bs->full_open_options);
6154             bs->full_open_options = NULL;
6155         }
6156 
6157         drv->bdrv_refresh_filename(bs);
6158     } else if (bs->file) {
6159         /* Try to reconstruct valid information from the underlying file */
6160         bool has_open_options;
6161 
6162         bs->exact_filename[0] = '\0';
6163         if (bs->full_open_options) {
6164             QDECREF(bs->full_open_options);
6165             bs->full_open_options = NULL;
6166         }
6167 
6168         opts = qdict_new();
6169         has_open_options = append_open_options(opts, bs);
6170 
6171         /* If no specific options have been given for this BDS, the filename of
6172          * the underlying file should suffice for this one as well */
6173         if (bs->file->exact_filename[0] && !has_open_options) {
6174             strcpy(bs->exact_filename, bs->file->exact_filename);
6175         }
6176         /* Reconstructing the full options QDict is simple for most format block
6177          * drivers, as long as the full options are known for the underlying
6178          * file BDS. The full options QDict of that file BDS should somehow
6179          * contain a representation of the filename, therefore the following
6180          * suffices without querying the (exact_)filename of this BDS. */
6181         if (bs->file->full_open_options) {
6182             qdict_put_obj(opts, "driver",
6183                           QOBJECT(qstring_from_str(drv->format_name)));
6184             QINCREF(bs->file->full_open_options);
6185             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6186 
6187             bs->full_open_options = opts;
6188         } else {
6189             QDECREF(opts);
6190         }
6191     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6192         /* There is no underlying file BDS (at least referenced by BDS.file),
6193          * so the full options QDict should be equal to the options given
6194          * specifically for this block device when it was opened (plus the
6195          * driver specification).
6196          * Because those options don't change, there is no need to update
6197          * full_open_options when it's already set. */
6198 
6199         opts = qdict_new();
6200         append_open_options(opts, bs);
6201         qdict_put_obj(opts, "driver",
6202                       QOBJECT(qstring_from_str(drv->format_name)));
6203 
6204         if (bs->exact_filename[0]) {
6205             /* This may not work for all block protocol drivers (some may
6206              * require this filename to be parsed), but we have to find some
6207              * default solution here, so just include it. If some block driver
6208              * does not support pure options without any filename at all or
6209              * needs some special format of the options QDict, it needs to
6210              * implement the driver-specific bdrv_refresh_filename() function.
6211              */
6212             qdict_put_obj(opts, "filename",
6213                           QOBJECT(qstring_from_str(bs->exact_filename)));
6214         }
6215 
6216         bs->full_open_options = opts;
6217     }
6218 
6219     if (bs->exact_filename[0]) {
6220         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6221     } else if (bs->full_open_options) {
6222         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6223         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6224                  qstring_get_str(json));
6225         QDECREF(json);
6226     }
6227 }
6228 
6229 /* The purpose of this accessor function is to allow the device models to
6230  * access the BlockAcctStats structure embedded inside a BlockDriverState
6231  * without being aware of the BlockDriverState structure layout.
6232  * It will go away once the BlockAcctStats structure is moved inside
6233  * the device models.
6234  */
6235 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6236 {
6237     return &bs->stats;
6238 }
6239