/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

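/* Illustrative examples (not compiled here): on Windows builds,
 * is_windows_drive() matches plain drive names such as "d:" as well as
 * device paths like "\\.\d:" or "//./PhysicalDrive0", while
 * is_windows_drive_prefix() also accepts names like "d:\images\test.img".
 * This is why path_has_protocol() below must special-case such names:
 * the colon after a drive letter is not a protocol separator. */
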
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled IOs */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an IO wait if needed
 *
 * @bytes:    the number of bytes of the IO
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this IO have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already being
     * throttled, queue the IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);


    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

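/* A minimal usage sketch of the throttling API above (hypothetical caller;
 * the cfg values are chosen purely for illustration):
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;   // ~1 MiB/s
 *     if (!bs->io_limits_enabled) {
 *         bdrv_io_limits_enable(bs);  // must precede bdrv_set_io_limits()
 *     }
 *     bdrv_set_io_limits(bs, &cfg);
 *
 * Reads and writes then pass through bdrv_io_limits_intercept(), which
 * parks them on throttled_reqs[] until the leaky-bucket timers fire. */
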
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

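/* Illustrative results: "nbd:localhost:10809" and "http://host/image.img"
 * count as having a protocol prefix, while "/dev/sda", "relative/file.img"
 * and (on Windows) "c:\images\test.img" do not, because a '/' or '\'
 * appears before any ':', or the name is a Windows drive. */
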
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

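/* Worked examples (illustrative):
 *
 *   path_combine(buf, n, "/img/base.qcow2", "backing.qcow2")
 *       -> "/img/backing.qcow2"
 *   path_combine(buf, n, "http://host/dir/base.qcow2", "other.qcow2")
 *       -> "http://host/dir/other.qcow2"
 *   path_combine(buf, n, "/img/base.qcow2", "/abs/backing.qcow2")
 *       -> "/abs/backing.qcow2"   (absolute names are copied verbatim)
 */
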
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

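/* Typical use (as in bdrv_append_temp_snapshot() below):
 *
 *     char *tmp = g_malloc0(PATH_MAX + 1);
 *     int ret = get_tmp_filename(tmp, PATH_MAX + 1);
 *     if (ret < 0) {
 *         // ret is a negative error code; tmp contents are undefined
 *     }
 *
 * The file is created empty and closed again, so the name stays reserved;
 * the caller is responsible for unlinking it when done. */
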
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

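/* Illustrative lookups: "nbd:localhost:10809" resolves through the driver
 * whose protocol_name is "nbd"; a plain path such as "/img/test.qcow2"
 * falls back to the "file" driver; and "/dev/cdrom" is typically claimed
 * first by a host-device driver via find_hdev_driver() above. */
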
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

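/* Example, in the style of "-drive discard=unmap" parsing (hypothetical
 * caller):
 *
 *     int flags = 0;
 *     if (bdrv_parse_discard_flags("unmap", &flags) == 0) {
 *         // flags now contains BDRV_O_UNMAP
 *     }
 */
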
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

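/* Summary of the mapping implemented above:
 *
 *   mode           NOCACHE  CACHE_WB  NO_FLUSH
 *   off/none          x        x
 *   directsync        x
 *   writeback                  x
 *   unsafe                     x         x
 *   writethrough   (none of the flags; this is the default)
 */
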
/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

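/* Example of the flag plumbing above: opening an image with
 * BDRV_O_RDWR | BDRV_O_SNAPSHOT gives its bs->file the result of
 * bdrv_inherited_flags(), i.e. BDRV_O_PROTOCOL, BDRV_O_CACHE_WB and
 * BDRV_O_UNMAP are added while BDRV_O_SNAPSHOT is stripped; the backing
 * file gets bdrv_backing_flags(), which additionally clears BDRV_O_RDWR,
 * so backing files always end up read-only. */
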
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. This
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

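/* Example: a filename of
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "/tmp/test.qcow2"}}
 *
 * is parsed into a QDict and flattened into the entries
 * driver=qcow2, file.driver=file, file.filename=/tmp/test.qcow2,
 * i.e. the same form as options given directly. */
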
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file().
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

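/* Example with bdref_key = "file": the caller's options QDict may either
 * reference an existing node, e.g. {"file": "node0"}, or describe a new
 * one inline via flattened keys such as {"file.driver": "file",
 * "file.filename": "/tmp/test.img"}; the latter are split off into
 * image_options by qdict_extract_subqdict() above. */
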
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

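/* For the canonical usage of this queue API, see bdrv_reopen() below:
 * build a queue with bdrv_reopen_queue(NULL, bs, flags), then commit the
 * whole set with a single bdrv_reopen_multiple() call. */
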
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare() implementation.
 *
 * reopen_state->bs is the BlockDriverState to reopen
 * reopen_state->flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state.
 */
1716 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1717                         Error **errp)
1718 {
1719     int ret = -1;
1720     Error *local_err = NULL;
1721     BlockDriver *drv;
1722 
1723     assert(reopen_state != NULL);
1724     assert(reopen_state->bs->drv != NULL);
1725     drv = reopen_state->bs->drv;
1726 
1727     /* if we are to stay read-only, do not allow permission change
1728      * to r/w */
1729     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1730         reopen_state->flags & BDRV_O_RDWR) {
1731         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1732                   bdrv_get_device_name(reopen_state->bs));
1733         goto error;
1734     }
1735 
1736 
1737     ret = bdrv_flush(reopen_state->bs);
1738     if (ret) {
1739         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1740                   strerror(-ret));
1741         goto error;
1742     }
1743 
1744     if (drv->bdrv_reopen_prepare) {
1745         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1746         if (ret) {
1747             if (local_err != NULL) {
1748                 error_propagate(errp, local_err);
1749             } else {
1750                 error_setg(errp, "failed while preparing to reopen image '%s'",
1751                            reopen_state->bs->filename);
1752             }
1753             goto error;
1754         }
1755     } else {
1756         /* It is currently mandatory to have a bdrv_reopen_prepare()
1757          * handler for each supported drv. */
1758         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1759                   drv->format_name, bdrv_get_device_name(reopen_state->bs),
1760                  "reopening of file");
1761         ret = -1;
1762         goto error;
1763     }
1764 
1765     ret = 0;
1766 
1767 error:
1768     return ret;
1769 }
1770 
1771 /*
1772  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1773  * makes them final by swapping the staging BlockDriverState contents into
1774  * the active BlockDriverState contents.
1775  */
1776 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1777 {
1778     BlockDriver *drv;
1779 
1780     assert(reopen_state != NULL);
1781     drv = reopen_state->bs->drv;
1782     assert(drv != NULL);
1783 
1784     /* If there are any driver level actions to take */
1785     if (drv->bdrv_reopen_commit) {
1786         drv->bdrv_reopen_commit(reopen_state);
1787     }
1788 
1789     /* set BDS specific flags now */
1790     reopen_state->bs->open_flags         = reopen_state->flags;
1791     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1792                                               BDRV_O_CACHE_WB);
1793     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1794 
1795     bdrv_refresh_limits(reopen_state->bs, NULL);
1796 }
1797 
1798 /*
1799  * Abort the reopen, and delete and free the staged changes in
1800  * reopen_state
1801  */
1802 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1803 {
1804     BlockDriver *drv;
1805 
1806     assert(reopen_state != NULL);
1807     drv = reopen_state->bs->drv;
1808     assert(drv != NULL);
1809 
1810     if (drv->bdrv_reopen_abort) {
1811         drv->bdrv_reopen_abort(reopen_state);
1812     }
1813 }
1814 
1815 
1816 void bdrv_close(BlockDriverState *bs)
1817 {
1818     BdrvAioNotifier *ban, *ban_next;
1819 
1820     if (bs->job) {
1821         block_job_cancel_sync(bs->job);
1822     }
1823     bdrv_drain_all(); /* complete I/O */
1824     bdrv_flush(bs);
1825     bdrv_drain_all(); /* in case flush left pending I/O */
1826     notifier_list_notify(&bs->close_notifiers, bs);
1827 
1828     if (bs->drv) {
1829         if (bs->backing_hd) {
1830             BlockDriverState *backing_hd = bs->backing_hd;
1831             bdrv_set_backing_hd(bs, NULL);
1832             bdrv_unref(backing_hd);
1833         }
1834         bs->drv->bdrv_close(bs);
1835         g_free(bs->opaque);
1836         bs->opaque = NULL;
1837         bs->drv = NULL;
1838         bs->copy_on_read = 0;
1839         bs->backing_file[0] = '\0';
1840         bs->backing_format[0] = '\0';
1841         bs->total_sectors = 0;
1842         bs->encrypted = 0;
1843         bs->valid_key = 0;
1844         bs->sg = 0;
1845         bs->growable = 0;
1846         bs->zero_beyond_eof = false;
1847         QDECREF(bs->options);
1848         bs->options = NULL;
1849         QDECREF(bs->full_open_options);
1850         bs->full_open_options = NULL;
1851 
1852         if (bs->file != NULL) {
1853             bdrv_unref(bs->file);
1854             bs->file = NULL;
1855         }
1856     }
1857 
1858     if (bs->blk) {
1859         blk_dev_change_media_cb(bs->blk, false);
1860     }
1861 
1862     /* throttling disk I/O limits */
1863     if (bs->io_limits_enabled) {
1864         bdrv_io_limits_disable(bs);
1865     }
1866 
1867     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1868         g_free(ban);
1869     }
1870     QLIST_INIT(&bs->aio_notifiers);
1871 }
1872 
1873 void bdrv_close_all(void)
1874 {
1875     BlockDriverState *bs;
1876 
1877     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1878         AioContext *aio_context = bdrv_get_aio_context(bs);
1879 
1880         aio_context_acquire(aio_context);
1881         bdrv_close(bs);
1882         aio_context_release(aio_context);
1883     }
1884 }
1885 
1886 /* Check if any requests are in-flight (including throttled requests) */
1887 static bool bdrv_requests_pending(BlockDriverState *bs)
1888 {
1889     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1890         return true;
1891     }
1892     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1893         return true;
1894     }
1895     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1896         return true;
1897     }
1898     if (bs->file && bdrv_requests_pending(bs->file)) {
1899         return true;
1900     }
1901     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1902         return true;
1903     }
1904     return false;
1905 }
1906 
1907 static bool bdrv_drain_one(BlockDriverState *bs)
1908 {
1909     bool bs_busy;
1910 
1911     bdrv_flush_io_queue(bs);
1912     bdrv_start_throttled_reqs(bs);
1913     bs_busy = bdrv_requests_pending(bs);
1914     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1915     return bs_busy;
1916 }
1917 
1918 /*
1919  * Wait for pending requests to complete on a single BlockDriverState subtree
1920  *
1921  * See the warning in bdrv_drain_all().  This function can only be called if
1922  * you are sure nothing can generate I/O because you have op blockers
1923  * installed.
1924  *
1925  * Note that unlike bdrv_drain_all(), the caller must hold the
1926  * BlockDriverState's AioContext.
1927  */
1928 void bdrv_drain(BlockDriverState *bs)
1929 {
1930     while (bdrv_drain_one(bs)) {
1931         /* Keep iterating */
1932     }
1933 }
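
/*
 * Illustrative sketch, not part of the original file: a caller that does not
 * already run in the BDS's home AioContext would bracket bdrv_drain() with
 * acquire/release, as required by the comment above.  The helper name is
 * hypothetical.
 */
static void example_drain_in_context(BlockDriverState *bs)
{
    AioContext *aio_context = bdrv_get_aio_context(bs);

    aio_context_acquire(aio_context);
    bdrv_drain(bs);
    aio_context_release(aio_context);
}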
1934 
1935 /*
1936  * Wait for pending requests to complete across all BlockDriverStates
1937  *
1938  * This function does not flush data to disk, use bdrv_flush_all() for that
1939  * after calling this function.
1940  *
1941  * Note that completion of an asynchronous I/O operation can trigger any
1942  * number of other I/O operations on other devices---for example a coroutine
1943  * can be arbitrarily complex and a constant flow of I/O can come until the
1944  * coroutine is complete.  Because of this, it is generally not possible to
1945  * drain a single device's I/O queue in isolation (but see bdrv_drain()).
1946  */
1947 void bdrv_drain_all(void)
1948 {
1949     /* Always run first iteration so any pending completion BHs run */
1950     bool busy = true;
1951     BlockDriverState *bs;
1952 
1953     while (busy) {
1954         busy = false;
1955 
1956         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1957             AioContext *aio_context = bdrv_get_aio_context(bs);
1958 
1959             aio_context_acquire(aio_context);
1960             busy |= bdrv_drain_one(bs);
1961             aio_context_release(aio_context);
1962         }
1963     }
1964 }
1965 
1966 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
1967  * graph_bdrv_states lists.
1968  * Also, NUL-terminate the node_name to prevent a double remove. */
1969 void bdrv_make_anon(BlockDriverState *bs)
1970 {
1971     /*
1972      * Take care to remove bs from bdrv_states only when it's actually
1973      * in it.  Note that bs->device_list.tqe_prev is initially null,
1974      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
1975      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1976      * resetting it to null on remove.
1977      */
1978     if (bs->device_list.tqe_prev) {
1979         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1980         bs->device_list.tqe_prev = NULL;
1981     }
1982     if (bs->node_name[0] != '\0') {
1983         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1984     }
1985     bs->node_name[0] = '\0';
1986 }
1987 
1988 static void bdrv_rebind(BlockDriverState *bs)
1989 {
1990     if (bs->drv && bs->drv->bdrv_rebind) {
1991         bs->drv->bdrv_rebind(bs);
1992     }
1993 }
1994 
1995 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1996                                      BlockDriverState *bs_src)
1997 {
1998     /* move some fields that need to stay attached to the device */
1999 
2000     /* dev info */
2001     bs_dest->guest_block_size   = bs_src->guest_block_size;
2002     bs_dest->copy_on_read       = bs_src->copy_on_read;
2003 
2004     bs_dest->enable_write_cache = bs_src->enable_write_cache;
2005 
2006     /* i/o throttled req */
2007     memcpy(&bs_dest->throttle_state,
2008            &bs_src->throttle_state,
2009            sizeof(ThrottleState));
2010     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
2011     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
2012     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
2013 
2014     /* r/w error */
2015     bs_dest->on_read_error      = bs_src->on_read_error;
2016     bs_dest->on_write_error     = bs_src->on_write_error;
2017 
2018     /* i/o status */
2019     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
2020     bs_dest->iostatus           = bs_src->iostatus;
2021 
2022     /* dirty bitmap */
2023     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2024 
2025     /* reference count */
2026     bs_dest->refcnt             = bs_src->refcnt;
2027 
2028     /* job */
2029     bs_dest->job                = bs_src->job;
2030 
2031     /* keep the same entry in bdrv_states */
2032     bs_dest->device_list = bs_src->device_list;
2033     bs_dest->blk = bs_src->blk;
2034 
2035     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2036            sizeof(bs_dest->op_blockers));
2037 }
2038 
2039 /*
2040  * Swap bs contents for two image chains while they are live,
2041  * while keeping required fields on the BlockDriverState that is
2042  * actually attached to a device.
2043  *
2044  * This will modify the BlockDriverState fields, and swap contents
2045  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2046  *
2047  * bs_new must not be attached to a BlockBackend.
2048  *
2049  * This function does not create any image files.
2050  */
2051 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2052 {
2053     BlockDriverState tmp;
2054 
2055     /* The code needs to swap the node_name but simply swapping node_list won't
2056      * work, so first remove the nodes from the graph list, do the swap, then
2057      * insert them back if needed.
2058      */
2059     if (bs_new->node_name[0] != '\0') {
2060         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2061     }
2062     if (bs_old->node_name[0] != '\0') {
2063         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2064     }
2065 
2066     /* bs_new must be unattached and shouldn't have anything fancy enabled */
2067     assert(!bs_new->blk);
2068     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2069     assert(bs_new->job == NULL);
2070     assert(bs_new->io_limits_enabled == false);
2071     assert(!throttle_have_timer(&bs_new->throttle_state));
2072 
2073     tmp = *bs_new;
2074     *bs_new = *bs_old;
2075     *bs_old = tmp;
2076 
2077     /* there are some fields that should not be swapped, move them back */
2078     bdrv_move_feature_fields(&tmp, bs_old);
2079     bdrv_move_feature_fields(bs_old, bs_new);
2080     bdrv_move_feature_fields(bs_new, &tmp);
2081 
2082     /* bs_new must remain unattached */
2083     assert(!bs_new->blk);
2084 
2085     /* Check a few fields that should remain attached to the device */
2086     assert(bs_new->job == NULL);
2087     assert(bs_new->io_limits_enabled == false);
2088     assert(!throttle_have_timer(&bs_new->throttle_state));
2089 
2090     /* insert the nodes back into the graph node list if needed */
2091     if (bs_new->node_name[0] != '\0') {
2092         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2093     }
2094     if (bs_old->node_name[0] != '\0') {
2095         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2096     }
2097 
2098     bdrv_rebind(bs_new);
2099     bdrv_rebind(bs_old);
2100 }
2101 
2102 /*
2103  * Add new bs contents at the top of an image chain while the chain is
2104  * live, while keeping required fields on the top layer.
2105  *
2106  * This will modify the BlockDriverState fields, and swap contents
2107  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2108  *
2109  * bs_new must not be attached to a BlockBackend.
2110  *
2111  * This function does not create any image files.
2112  */
2113 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2114 {
2115     bdrv_swap(bs_new, bs_top);
2116 
2117     /* The contents of 'tmp' will become bs_top, as we are
2118      * swapping bs_new and bs_top contents. */
2119     bdrv_set_backing_hd(bs_top, bs_new);
2120 }
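
/*
 * Illustrative sketch, not part of the original file: the typical live
 * snapshot use of bdrv_append().  'bs_new' is assumed to be a freshly opened
 * overlay with no BlockBackend attached; after the call, the 'bs_top'
 * pointer that the guest device holds refers to the new overlay, whose
 * backing file is the former top image.  The helper name is hypothetical.
 */
static void example_push_overlay(BlockDriverState *bs_top,
                                 BlockDriverState *bs_new)
{
    bdrv_append(bs_new, bs_top);
}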
2121 
2122 static void bdrv_delete(BlockDriverState *bs)
2123 {
2124     assert(!bs->job);
2125     assert(bdrv_op_blocker_is_empty(bs));
2126     assert(!bs->refcnt);
2127     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2128 
2129     bdrv_close(bs);
2130 
2131     /* remove from list, if necessary */
2132     bdrv_make_anon(bs);
2133 
2134     g_free(bs);
2135 }
2136 
2137 /*
2138  * Run consistency checks on an image
2139  *
2140  * Returns 0 if the check could be completed (it doesn't mean that the image is
2141  * free of errors) or -errno when an internal error occurred. The results of the
2142  * check are stored in res.
2143  */
2144 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2145 {
2146     if (bs->drv == NULL) {
2147         return -ENOMEDIUM;
2148     }
2149     if (bs->drv->bdrv_check == NULL) {
2150         return -ENOTSUP;
2151     }
2152 
2153     memset(res, 0, sizeof(*res));
2154     return bs->drv->bdrv_check(bs, res, fix);
2155 }
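
/*
 * Illustrative sketch, not part of the original file: running a repairing
 * check and interpreting the two layers of status.  A zero return from
 * bdrv_check() only means the check itself ran; the BdrvCheckResult fields
 * say what it found.  The helper name is hypothetical.
 */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult result;
    int ret = bdrv_check(bs, &result, BDRV_FIX_ERRORS);

    if (ret < 0) {
        return ret;                 /* the check could not be performed */
    }
    return (result.corruptions || result.check_errors) ? -EIO : 0;
}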
2156 
2157 #define COMMIT_BUF_SECTORS 2048
2158 
2159 /* commit COW file into the raw image */
2160 int bdrv_commit(BlockDriverState *bs)
2161 {
2162     BlockDriver *drv = bs->drv;
2163     int64_t sector, total_sectors, length, backing_length;
2164     int n, ro, open_flags;
2165     int ret = 0;
2166     uint8_t *buf = NULL;
2167     char filename[PATH_MAX];
2168 
2169     if (!drv)
2170         return -ENOMEDIUM;
2171 
2172     if (!bs->backing_hd) {
2173         return -ENOTSUP;
2174     }
2175 
2176     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2177         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2178         return -EBUSY;
2179     }
2180 
2181     ro = bs->backing_hd->read_only;
2182     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2183     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2184     open_flags =  bs->backing_hd->open_flags;
2185 
2186     if (ro) {
2187         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2188             return -EACCES;
2189         }
2190     }
2191 
2192     length = bdrv_getlength(bs);
2193     if (length < 0) {
2194         ret = length;
2195         goto ro_cleanup;
2196     }
2197 
2198     backing_length = bdrv_getlength(bs->backing_hd);
2199     if (backing_length < 0) {
2200         ret = backing_length;
2201         goto ro_cleanup;
2202     }
2203 
2204     /* If our top snapshot is larger than the backing file image,
2205      * grow the backing file image if possible.  If not possible,
2206      * we must return an error */
2207     if (length > backing_length) {
2208         ret = bdrv_truncate(bs->backing_hd, length);
2209         if (ret < 0) {
2210             goto ro_cleanup;
2211         }
2212     }
2213 
2214     total_sectors = length >> BDRV_SECTOR_BITS;
2215 
2216     /* qemu_try_blockalign() for bs will choose an alignment that works for
2217      * bs->backing_hd as well, so no need to compare the alignment manually. */
2218     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2219     if (buf == NULL) {
2220         ret = -ENOMEM;
2221         goto ro_cleanup;
2222     }
2223 
2224     for (sector = 0; sector < total_sectors; sector += n) {
2225         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2226         if (ret < 0) {
2227             goto ro_cleanup;
2228         }
2229         if (ret) {
2230             ret = bdrv_read(bs, sector, buf, n);
2231             if (ret < 0) {
2232                 goto ro_cleanup;
2233             }
2234 
2235             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2236             if (ret < 0) {
2237                 goto ro_cleanup;
2238             }
2239         }
2240     }
2241 
2242     if (drv->bdrv_make_empty) {
2243         ret = drv->bdrv_make_empty(bs);
2244         if (ret < 0) {
2245             goto ro_cleanup;
2246         }
2247         bdrv_flush(bs);
2248     }
2249 
2250     /*
2251      * Make sure all data we wrote to the backing device is actually
2252      * stable on disk.
2253      */
2254     if (bs->backing_hd) {
2255         bdrv_flush(bs->backing_hd);
2256     }
2257 
2258     ret = 0;
2259 ro_cleanup:
2260     qemu_vfree(buf);
2261 
2262     if (ro) {
2263         /* ignoring error return here */
2264         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2265     }
2266 
2267     return ret;
2268 }
2269 
2270 int bdrv_commit_all(void)
2271 {
2272     BlockDriverState *bs;
2273 
2274     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2275         AioContext *aio_context = bdrv_get_aio_context(bs);
2276 
2277         aio_context_acquire(aio_context);
2278         if (bs->drv && bs->backing_hd) {
2279             int ret = bdrv_commit(bs);
2280             if (ret < 0) {
2281                 aio_context_release(aio_context);
2282                 return ret;
2283             }
2284         }
2285         aio_context_release(aio_context);
2286     }
2287     return 0;
2288 }
2289 
2290 /**
2291  * Remove an active request from the tracked requests list
2292  *
2293  * This function should be called when a tracked request is completing.
2294  */
2295 static void tracked_request_end(BdrvTrackedRequest *req)
2296 {
2297     if (req->serialising) {
2298         req->bs->serialising_in_flight--;
2299     }
2300 
2301     QLIST_REMOVE(req, list);
2302     qemu_co_queue_restart_all(&req->wait_queue);
2303 }
2304 
2305 /**
2306  * Add an active request to the tracked requests list
2307  */
2308 static void tracked_request_begin(BdrvTrackedRequest *req,
2309                                   BlockDriverState *bs,
2310                                   int64_t offset,
2311                                   unsigned int bytes, bool is_write)
2312 {
2313     *req = (BdrvTrackedRequest){
2314         .bs = bs,
2315         .offset         = offset,
2316         .bytes          = bytes,
2317         .is_write       = is_write,
2318         .co             = qemu_coroutine_self(),
2319         .serialising    = false,
2320         .overlap_offset = offset,
2321         .overlap_bytes  = bytes,
2322     };
2323 
2324     qemu_co_queue_init(&req->wait_queue);
2325 
2326     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2327 }
2328 
2329 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2330 {
2331     int64_t overlap_offset = req->offset & ~(align - 1);
2332     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2333                                - overlap_offset;
2334 
2335     if (!req->serialising) {
2336         req->bs->serialising_in_flight++;
2337         req->serialising = true;
2338     }
2339 
2340     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2341     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2342 }
2343 
2344 /**
2345  * Round a region to cluster boundaries
2346  */
2347 void bdrv_round_to_clusters(BlockDriverState *bs,
2348                             int64_t sector_num, int nb_sectors,
2349                             int64_t *cluster_sector_num,
2350                             int *cluster_nb_sectors)
2351 {
2352     BlockDriverInfo bdi;
2353 
2354     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2355         *cluster_sector_num = sector_num;
2356         *cluster_nb_sectors = nb_sectors;
2357     } else {
2358         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2359         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2360         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2361                                             nb_sectors, c);
2362     }
2363 }
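
/*
 * Worked example, added for illustration: with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors, so a request covering sectors [130, 135)
 * is widened to [128, 256): QEMU_ALIGN_DOWN(130, 128) = 128 and
 * QEMU_ALIGN_UP(130 - 128 + 5, 128) = 128 sectors.
 */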
2364 
2365 static int bdrv_get_cluster_size(BlockDriverState *bs)
2366 {
2367     BlockDriverInfo bdi;
2368     int ret;
2369 
2370     ret = bdrv_get_info(bs, &bdi);
2371     if (ret < 0 || bdi.cluster_size == 0) {
2372         return bs->request_alignment;
2373     } else {
2374         return bdi.cluster_size;
2375     }
2376 }
2377 
2378 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2379                                      int64_t offset, unsigned int bytes)
2380 {
2381     /*        aaaa   bbbb */
2382     if (offset >= req->overlap_offset + req->overlap_bytes) {
2383         return false;
2384     }
2385     /* bbbb   aaaa        */
2386     if (req->overlap_offset >= offset + bytes) {
2387         return false;
2388     }
2389     return true;
2390 }
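
/*
 * Example, added for illustration: the ranges are half-open, so a request
 * whose overlap region is [4096, 8192) does not conflict with one starting
 * at offset 8192, while any request that starts below 8192 and ends above
 * 4096 does.
 */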
2391 
2392 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2393 {
2394     BlockDriverState *bs = self->bs;
2395     BdrvTrackedRequest *req;
2396     bool retry;
2397     bool waited = false;
2398 
2399     if (!bs->serialising_in_flight) {
2400         return false;
2401     }
2402 
2403     do {
2404         retry = false;
2405         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2406             if (req == self || (!req->serialising && !self->serialising)) {
2407                 continue;
2408             }
2409             if (tracked_request_overlaps(req, self->overlap_offset,
2410                                          self->overlap_bytes))
2411             {
2412                 /* Hitting this means there was a reentrant request, for
2413                  * example, a block driver issuing nested requests.  This must
2414                  * never happen since it means deadlock.
2415                  */
2416                 assert(qemu_coroutine_self() != req->co);
2417 
2418                 /* If the request is already (indirectly) waiting for us, or
2419                  * will wait for us as soon as it wakes up, then just go on
2420                  * (instead of producing a deadlock in the former case). */
2421                 if (!req->waiting_for) {
2422                     self->waiting_for = req;
2423                     qemu_co_queue_wait(&req->wait_queue);
2424                     self->waiting_for = NULL;
2425                     retry = true;
2426                     waited = true;
2427                     break;
2428                 }
2429             }
2430         }
2431     } while (retry);
2432 
2433     return waited;
2434 }
2435 
2436 /*
2437  * Return values:
2438  * 0        - success
2439  * -EINVAL  - backing format specified, but no file
2440  * -ENOSPC  - can't update the backing file because no space is left in the
2441  *            image file header
2442  * -ENOTSUP - format driver doesn't support changing the backing file
2443  */
2444 int bdrv_change_backing_file(BlockDriverState *bs,
2445     const char *backing_file, const char *backing_fmt)
2446 {
2447     BlockDriver *drv = bs->drv;
2448     int ret;
2449 
2450     /* Backing file format doesn't make sense without a backing file */
2451     if (backing_fmt && !backing_file) {
2452         return -EINVAL;
2453     }
2454 
2455     if (drv->bdrv_change_backing_file != NULL) {
2456         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2457     } else {
2458         ret = -ENOTSUP;
2459     }
2460 
2461     if (ret == 0) {
2462         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2463         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2464     }
2465     return ret;
2466 }
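
/*
 * Illustrative sketch, not part of the original file: pointing an overlay at
 * a new backing file, with the return codes documented above.  At this
 * layer a NULL string is recorded as an empty entry; whether the driver
 * accepts it depends on its callback.  The helper name is hypothetical.
 */
static int example_retarget_backing(BlockDriverState *bs,
                                    const char *backing_file,
                                    const char *backing_fmt)
{
    int ret = bdrv_change_backing_file(bs, backing_file, backing_fmt);

    if (ret == -ENOTSUP) {
        /* The format driver has no .bdrv_change_backing_file callback */
    }
    return ret;
}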
2467 
2468 /*
2469  * Finds the image layer in the chain that has 'bs' as its backing file.
2470  *
2471  * active is the current topmost image.
2472  *
2473  * Returns NULL if bs is not found in active's image chain,
2474  * or if active == bs.
2475  *
2476  * Returns the bottommost base image if bs == NULL.
2477  */
2478 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2479                                     BlockDriverState *bs)
2480 {
2481     while (active && bs != active->backing_hd) {
2482         active = active->backing_hd;
2483     }
2484 
2485     return active;
2486 }
2487 
2488 /* Given a BDS, searches for the base layer. */
2489 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2490 {
2491     return bdrv_find_overlay(bs, NULL);
2492 }
2493 
2494 typedef struct BlkIntermediateStates {
2495     BlockDriverState *bs;
2496     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2497 } BlkIntermediateStates;
2498 
2499 
2500 /*
2501  * Drops images above 'base' up to and including 'top', and sets the image
2502  * above 'top' to have base as its backing file.
2503  *
2504  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2505  * information in 'bs' can be properly updated.
2506  *
2507  * E.g., this will convert the following chain:
2508  * bottom <- base <- intermediate <- top <- active
2509  *
2510  * to
2511  *
2512  * bottom <- base <- active
2513  *
2514  * It is allowed for bottom==base, in which case it converts:
2515  *
2516  * base <- intermediate <- top <- active
2517  *
2518  * to
2519  *
2520  * base <- active
2521  *
2522  * If backing_file_str is non-NULL, it will be used when modifying top's
2523  * overlay image metadata.
2524  *
2525  * Error conditions:
2526  *  if active == top, that is considered an error
2527  *
2528  */
2529 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2530                            BlockDriverState *base, const char *backing_file_str)
2531 {
2532     BlockDriverState *intermediate;
2533     BlockDriverState *base_bs = NULL;
2534     BlockDriverState *new_top_bs = NULL;
2535     BlkIntermediateStates *intermediate_state, *next;
2536     int ret = -EIO;
2537 
2538     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2539     QSIMPLEQ_INIT(&states_to_delete);
2540 
2541     if (!top->drv || !base->drv) {
2542         goto exit;
2543     }
2544 
2545     new_top_bs = bdrv_find_overlay(active, top);
2546 
2547     if (new_top_bs == NULL) {
2548         /* we could not find the image above 'top', this is an error */
2549         goto exit;
2550     }
2551 
2552     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2553      * to do, no intermediate images */
2554     if (new_top_bs->backing_hd == base) {
2555         ret = 0;
2556         goto exit;
2557     }
2558 
2559     intermediate = top;
2560 
2561     /* now we will go down through the list, and add each BDS we find
2562      * into our deletion queue, until we hit the 'base'
2563      */
2564     while (intermediate) {
2565         intermediate_state = g_new0(BlkIntermediateStates, 1);
2566         intermediate_state->bs = intermediate;
2567         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2568 
2569         if (intermediate->backing_hd == base) {
2570             base_bs = intermediate->backing_hd;
2571             break;
2572         }
2573         intermediate = intermediate->backing_hd;
2574     }
2575     if (base_bs == NULL) {
2576         /* something went wrong, we did not end at the base.  Safely
2577          * unravel everything, and exit with an error */
2578         goto exit;
2579     }
2580 
2581     /* success - we can delete the intermediate states, and link top->base */
2582     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2583     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2584                                    base_bs->drv ? base_bs->drv->format_name : "");
2585     if (ret) {
2586         goto exit;
2587     }
2588     bdrv_set_backing_hd(new_top_bs, base_bs);
2589 
2590     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2591         /* so that bdrv_close() does not recursively close the chain */
2592         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2593         bdrv_unref(intermediate_state->bs);
2594     }
2595     ret = 0;
2596 
2597 exit:
2598     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2599         g_free(intermediate_state);
2600     }
2601     return ret;
2602 }
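
/*
 * Illustrative sketch, not part of the original file: collapsing the whole
 * chain between the active layer and 'base' after a commit-style operation,
 * letting bdrv_drop_intermediate() derive the backing file string from
 * 'base' itself.  The helper name is hypothetical and assumes the chain has
 * at least one image between 'active' and 'base'.
 */
static int example_collapse_chain(BlockDriverState *active,
                                  BlockDriverState *base)
{
    return bdrv_drop_intermediate(active, active->backing_hd, base, NULL);
}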
2603 
2604 
2605 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2606                                    size_t size)
2607 {
2608     int64_t len;
2609 
2610     if (size > INT_MAX) {
2611         return -EIO;
2612     }
2613 
2614     if (!bdrv_is_inserted(bs))
2615         return -ENOMEDIUM;
2616 
2617     if (bs->growable)
2618         return 0;
2619 
2620     len = bdrv_getlength(bs);
2621 
2622     if (offset < 0)
2623         return -EIO;
2624 
2625     if ((offset > len) || (len - offset < size))
2626         return -EIO;
2627 
2628     return 0;
2629 }
2630 
2631 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2632                               int nb_sectors)
2633 {
2634     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2635         return -EIO;
2636     }
2637 
2638     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2639                                    nb_sectors * BDRV_SECTOR_SIZE);
2640 }
2641 
2642 typedef struct RwCo {
2643     BlockDriverState *bs;
2644     int64_t offset;
2645     QEMUIOVector *qiov;
2646     bool is_write;
2647     int ret;
2648     BdrvRequestFlags flags;
2649 } RwCo;
2650 
2651 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2652 {
2653     RwCo *rwco = opaque;
2654 
2655     if (!rwco->is_write) {
2656         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2657                                       rwco->qiov->size, rwco->qiov,
2658                                       rwco->flags);
2659     } else {
2660         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2661                                        rwco->qiov->size, rwco->qiov,
2662                                        rwco->flags);
2663     }
2664 }
2665 
2666 /*
2667  * Process a vectored synchronous request using coroutines
2668  */
2669 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2670                         QEMUIOVector *qiov, bool is_write,
2671                         BdrvRequestFlags flags)
2672 {
2673     Coroutine *co;
2674     RwCo rwco = {
2675         .bs = bs,
2676         .offset = offset,
2677         .qiov = qiov,
2678         .is_write = is_write,
2679         .ret = NOT_DONE,
2680         .flags = flags,
2681     };
2682 
2683     /**
2684      * In a synchronous call context, when the vcpu is blocked, the
2685      * throttling timer will not fire, so I/O throttling has to be disabled
2686      * here if it has been enabled.
2687      */
2688     if (bs->io_limits_enabled) {
2689         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2690                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2691         bdrv_io_limits_disable(bs);
2692     }
2693 
2694     if (qemu_in_coroutine()) {
2695         /* Fast-path if already in coroutine context */
2696         bdrv_rw_co_entry(&rwco);
2697     } else {
2698         AioContext *aio_context = bdrv_get_aio_context(bs);
2699 
2700         co = qemu_coroutine_create(bdrv_rw_co_entry);
2701         qemu_coroutine_enter(co, &rwco);
2702         while (rwco.ret == NOT_DONE) {
2703             aio_poll(aio_context, true);
2704         }
2705     }
2706     return rwco.ret;
2707 }
2708 
2709 /*
2710  * Process a synchronous request using coroutines
2711  */
2712 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2713                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2714 {
2715     QEMUIOVector qiov;
2716     struct iovec iov = {
2717         .iov_base = (void *)buf,
2718         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2719     };
2720 
2721     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2722         return -EINVAL;
2723     }
2724 
2725     qemu_iovec_init_external(&qiov, &iov, 1);
2726     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2727                         &qiov, is_write, flags);
2728 }
2729 
2730 /* return < 0 if error. See bdrv_write() for the return codes */
2731 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2732               uint8_t *buf, int nb_sectors)
2733 {
2734     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2735 }
2736 
2737 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2738 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2739                           uint8_t *buf, int nb_sectors)
2740 {
2741     bool enabled;
2742     int ret;
2743 
2744     enabled = bs->io_limits_enabled;
2745     bs->io_limits_enabled = false;
2746     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2747     bs->io_limits_enabled = enabled;
2748     return ret;
2749 }
2750 
2751 /* Return < 0 if error. Important errors are:
2752   -EIO         generic I/O error (may happen for all errors)
2753   -ENOMEDIUM   No media inserted.
2754   -EINVAL      Invalid sector number or nb_sectors
2755   -EACCES      Trying to write to a read-only device
2756 */
2757 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2758                const uint8_t *buf, int nb_sectors)
2759 {
2760     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2761 }
2762 
2763 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2764                       int nb_sectors, BdrvRequestFlags flags)
2765 {
2766     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2767                       BDRV_REQ_ZERO_WRITE | flags);
2768 }
2769 
2770 /*
2771  * Completely zero out a block device with the help of bdrv_write_zeroes.
2772  * The operation is sped up by checking the block status and only writing
2773  * zeroes to the device if they currently do not return zeroes. Optional
2774  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2775  *
2776  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2777  */
2778 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2779 {
2780     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2781     int n;
2782 
2783     target_sectors = bdrv_nb_sectors(bs);
2784     if (target_sectors < 0) {
2785         return target_sectors;
2786     }
2787 
2788     for (;;) {
2789         nb_sectors = target_sectors - sector_num;
2790         if (nb_sectors <= 0) {
2791             return 0;
2792         }
2793         if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2794             nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2795         }
2796         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2797         if (ret < 0) {
2798             error_report("error getting block status at sector %" PRId64 ": %s",
2799                          sector_num, strerror(-ret));
2800             return ret;
2801         }
2802         if (ret & BDRV_BLOCK_ZERO) {
2803             sector_num += n;
2804             continue;
2805         }
2806         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2807         if (ret < 0) {
2808             error_report("error writing zeroes at sector %" PRId64 ": %s",
2809                          sector_num, strerror(-ret));
2810             return ret;
2811         }
2812         sector_num += n;
2813     }
2814 }
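
/*
 * Illustrative sketch, not part of the original file: blanking a device
 * while letting drivers discard ranges instead of writing literal zeroes
 * where the result still reads back as zero.  The helper name is
 * hypothetical.
 */
static int example_blank_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}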
2815 
2816 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2817 {
2818     QEMUIOVector qiov;
2819     struct iovec iov = {
2820         .iov_base = (void *)buf,
2821         .iov_len = bytes,
2822     };
2823     int ret;
2824 
2825     if (bytes < 0) {
2826         return -EINVAL;
2827     }
2828 
2829     qemu_iovec_init_external(&qiov, &iov, 1);
2830     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2831     if (ret < 0) {
2832         return ret;
2833     }
2834 
2835     return bytes;
2836 }
2837 
2838 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2839 {
2840     int ret;
2841 
2842     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2843     if (ret < 0) {
2844         return ret;
2845     }
2846 
2847     return qiov->size;
2848 }
2849 
2850 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2851                 const void *buf, int bytes)
2852 {
2853     QEMUIOVector qiov;
2854     struct iovec iov = {
2855         .iov_base   = (void *) buf,
2856         .iov_len    = bytes,
2857     };
2858 
2859     if (bytes < 0) {
2860         return -EINVAL;
2861     }
2862 
2863     qemu_iovec_init_external(&qiov, &iov, 1);
2864     return bdrv_pwritev(bs, offset, &qiov);
2865 }
2866 
2867 /*
2868  * Writes to the file and ensures that no writes are reordered across this
2869  * request (acts as a barrier)
2870  *
2871  * Returns 0 on success, -errno in error cases.
2872  */
2873 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2874     const void *buf, int count)
2875 {
2876     int ret;
2877 
2878     ret = bdrv_pwrite(bs, offset, buf, count);
2879     if (ret < 0) {
2880         return ret;
2881     }
2882 
2883     /* No flush needed for cache modes that already do it */
2884     if (bs->enable_write_cache) {
2885         bdrv_flush(bs);
2886     }
2887 
2888     return 0;
2889 }
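
/*
 * Illustrative sketch, not part of the original file: updating a small
 * on-disk header field through the byte-based API.  bdrv_pwrite_sync()
 * flushes afterwards unless the cache mode already writes through, so the
 * update cannot be reordered after later writes.  The helper name and the
 * idea of a magic field at offset 0 are hypothetical (byte order handling
 * omitted for brevity).
 */
static int example_update_magic(BlockDriverState *bs, uint32_t magic)
{
    return bdrv_pwrite_sync(bs, 0, &magic, sizeof(magic));
}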
2890 
2891 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2892         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2893 {
2894     /* Perform I/O through a temporary buffer so that users who scribble over
2895      * their read buffer while the operation is in progress do not end up
2896      * modifying the image file.  This is critical for zero-copy guest I/O
2897      * where anything might happen inside guest memory.
2898      */
2899     void *bounce_buffer;
2900 
2901     BlockDriver *drv = bs->drv;
2902     struct iovec iov;
2903     QEMUIOVector bounce_qiov;
2904     int64_t cluster_sector_num;
2905     int cluster_nb_sectors;
2906     size_t skip_bytes;
2907     int ret;
2908 
2909     /* Cover the entire cluster so no additional backing file I/O is required
2910      * when allocating a cluster in the image file.
2911      */
2912     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2913                            &cluster_sector_num, &cluster_nb_sectors);
2914 
2915     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2916                                    cluster_sector_num, cluster_nb_sectors);
2917 
2918     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2919     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2920     if (bounce_buffer == NULL) {
2921         ret = -ENOMEM;
2922         goto err;
2923     }
2924 
2925     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2926 
2927     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2928                              &bounce_qiov);
2929     if (ret < 0) {
2930         goto err;
2931     }
2932 
2933     if (drv->bdrv_co_write_zeroes &&
2934         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2935         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2936                                       cluster_nb_sectors, 0);
2937     } else {
2938         /* This does not change the data on the disk, so it is not necessary
2939          * to flush even in cache=writethrough mode.
2940          */
2941         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2942                                   &bounce_qiov);
2943     }
2944 
2945     if (ret < 0) {
2946         /* It might be okay to ignore write errors for guest requests.  If this
2947          * is a deliberate copy-on-read then we don't want to ignore the error.
2948          * Simply report it in all cases.
2949          */
2950         goto err;
2951     }
2952 
2953     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2954     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2955                         nb_sectors * BDRV_SECTOR_SIZE);
2956 
2957 err:
2958     qemu_vfree(bounce_buffer);
2959     return ret;
2960 }
2961 
2962 /*
2963  * Forwards an already correctly aligned request to the BlockDriver. This
2964  * handles copy on read and zeroing after EOF; any other features must be
2965  * implemented by the caller.
2966  */
2967 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2968     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2969     int64_t align, QEMUIOVector *qiov, int flags)
2970 {
2971     BlockDriver *drv = bs->drv;
2972     int ret;
2973 
2974     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2975     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2976 
2977     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2978     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2979     assert(!qiov || bytes == qiov->size);
2980 
2981     /* Handle Copy on Read and associated serialisation */
2982     if (flags & BDRV_REQ_COPY_ON_READ) {
2983         /* If we touch the same cluster it counts as an overlap.  This
2984          * guarantees that allocating writes will be serialized and not race
2985          * with each other for the same cluster.  For example, in copy-on-read
2986          * it ensures that the CoR read and write operations are atomic and
2987          * guest writes cannot interleave between them. */
2988         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2989     }
2990 
2991     wait_serialising_requests(req);
2992 
2993     if (flags & BDRV_REQ_COPY_ON_READ) {
2994         int pnum;
2995 
2996         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2997         if (ret < 0) {
2998             goto out;
2999         }
3000 
3001         if (!ret || pnum != nb_sectors) {
3002             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3003             goto out;
3004         }
3005     }
3006 
3007     /* Forward the request to the BlockDriver */
3008     if (!(bs->zero_beyond_eof && bs->growable)) {
3009         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3010     } else {
3011         /* Read zeroes after EOF of growable BDSes */
3012         int64_t total_sectors, max_nb_sectors;
3013 
3014         total_sectors = bdrv_nb_sectors(bs);
3015         if (total_sectors < 0) {
3016             ret = total_sectors;
3017             goto out;
3018         }
3019 
3020         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3021                                   align >> BDRV_SECTOR_BITS);
3022         if (max_nb_sectors > 0) {
3023             QEMUIOVector local_qiov;
3024             size_t local_sectors;
3025 
3026             max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3027             local_sectors = MIN(max_nb_sectors, nb_sectors);
3028 
3029             qemu_iovec_init(&local_qiov, qiov->niov);
3030             qemu_iovec_concat(&local_qiov, qiov, 0,
3031                               local_sectors * BDRV_SECTOR_SIZE);
3032 
3033             ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3034                                      &local_qiov);
3035 
3036             qemu_iovec_destroy(&local_qiov);
3037         } else {
3038             ret = 0;
3039         }
3040 
3041         /* Reading beyond end of file is supposed to produce zeroes */
3042         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3043             uint64_t offset = MAX(0, total_sectors - sector_num);
3044             uint64_t bytes = (sector_num + nb_sectors - offset) *
3045                               BDRV_SECTOR_SIZE;
3046             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3047         }
3048     }
3049 
3050 out:
3051     return ret;
3052 }
3053 
3054 /*
3055  * Handle a read request in coroutine context
3056  */
3057 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3058     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3059     BdrvRequestFlags flags)
3060 {
3061     BlockDriver *drv = bs->drv;
3062     BdrvTrackedRequest req;
3063 
3064     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3065     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3066     uint8_t *head_buf = NULL;
3067     uint8_t *tail_buf = NULL;
3068     QEMUIOVector local_qiov;
3069     bool use_local_qiov = false;
3070     int ret;
3071 
3072     if (!drv) {
3073         return -ENOMEDIUM;
3074     }
3075     if (bdrv_check_byte_request(bs, offset, bytes)) {
3076         return -EIO;
3077     }
3078 
3079     if (bs->copy_on_read) {
3080         flags |= BDRV_REQ_COPY_ON_READ;
3081     }
3082 
3083     /* throttling disk I/O */
3084     if (bs->io_limits_enabled) {
3085         bdrv_io_limits_intercept(bs, bytes, false);
3086     }
3087 
3088     /* Align read if necessary by padding qiov */
3089     if (offset & (align - 1)) {
3090         head_buf = qemu_blockalign(bs, align);
3091         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3092         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3093         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3094         use_local_qiov = true;
3095 
3096         bytes += offset & (align - 1);
3097         offset = offset & ~(align - 1);
3098     }
3099 
3100     if ((offset + bytes) & (align - 1)) {
3101         if (!use_local_qiov) {
3102             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3103             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3104             use_local_qiov = true;
3105         }
3106         tail_buf = qemu_blockalign(bs, align);
3107         qemu_iovec_add(&local_qiov, tail_buf,
3108                        align - ((offset + bytes) & (align - 1)));
3109 
3110         bytes = ROUND_UP(bytes, align);
3111     }
3112 
3113     tracked_request_begin(&req, bs, offset, bytes, false);
3114     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3115                               use_local_qiov ? &local_qiov : qiov,
3116                               flags);
3117     tracked_request_end(&req);
3118 
3119     if (use_local_qiov) {
3120         qemu_iovec_destroy(&local_qiov);
3121         qemu_vfree(head_buf);
3122         qemu_vfree(tail_buf);
3123     }
3124 
3125     return ret;
3126 }
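
/*
 * Worked example, added for illustration: with align = 4096, a 3000-byte
 * read at offset 5000 becomes one aligned read of [4096, 8192): 904 head
 * bytes (5000 & 4095) land in head_buf, the caller's qiov follows, and a
 * 192-byte tail pads the request out to the next 4096-byte boundary.
 */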
3127 
3128 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3129     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3130     BdrvRequestFlags flags)
3131 {
3132     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3133         return -EINVAL;
3134     }
3135 
3136     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3137                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3138 }
3139 
3140 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3141     int nb_sectors, QEMUIOVector *qiov)
3142 {
3143     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3144 
3145     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3146 }
3147 
3148 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3149     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3150 {
3151     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3152 
3153     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3154                             BDRV_REQ_COPY_ON_READ);
3155 }
3156 
3157 /* If no limit is specified in the BlockLimits, use a default
3158  * of 32768 512-byte sectors (16 MiB) per request.
3159  */
3160 #define MAX_WRITE_ZEROES_DEFAULT 32768
3161 
3162 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3163     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3164 {
3165     BlockDriver *drv = bs->drv;
3166     QEMUIOVector qiov;
3167     struct iovec iov = {0};
3168     int ret = 0;
3169 
3170     int max_write_zeroes = bs->bl.max_write_zeroes ?
3171                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3172 
3173     while (nb_sectors > 0 && !ret) {
3174         int num = nb_sectors;
3175 
3176         /* Align request.  Block drivers can expect the "bulk" of the request
3177          * to be aligned.
3178          */
3179         if (bs->bl.write_zeroes_alignment
3180             && num > bs->bl.write_zeroes_alignment) {
3181             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3182                 /* Make a small request up to the first aligned sector.  */
3183                 num = bs->bl.write_zeroes_alignment;
3184                 num -= sector_num % bs->bl.write_zeroes_alignment;
3185             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3186                 /* Shorten the request to the last aligned sector.  num cannot
3187                  * underflow because num > bs->bl.write_zeroes_alignment.
3188                  */
3189                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3190             }
3191         }
3192 
3193         /* limit request size */
3194         if (num > max_write_zeroes) {
3195             num = max_write_zeroes;
3196         }
3197 
3198         ret = -ENOTSUP;
3199         /* First try the efficient write zeroes operation */
3200         if (drv->bdrv_co_write_zeroes) {
3201             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3202         }
3203 
3204         if (ret == -ENOTSUP) {
3205             /* Fall back to bounce buffer if write zeroes is unsupported */
3206             iov.iov_len = num * BDRV_SECTOR_SIZE;
3207             if (iov.iov_base == NULL) {
3208                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3209                 if (iov.iov_base == NULL) {
3210                     ret = -ENOMEM;
3211                     goto fail;
3212                 }
3213                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3214             }
3215             qemu_iovec_init_external(&qiov, &iov, 1);
3216 
3217             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3218 
3219             /* Keep the bounce buffer around if it is big enough for
3220              * all future requests.
3221              */
3222             if (num < max_write_zeroes) {
3223                 qemu_vfree(iov.iov_base);
3224                 iov.iov_base = NULL;
3225             }
3226         }
3227 
3228         sector_num += num;
3229         nb_sectors -= num;
3230     }
3231 
3232 fail:
3233     qemu_vfree(iov.iov_base);
3234     return ret;
3235 }
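
/*
 * Worked example, added for illustration: with write_zeroes_alignment = 128
 * and a request for sectors [130, 1130), the loop above issues three driver
 * calls: 126 sectors up to the first aligned boundary (sector 256), then 768
 * fully aligned sectors, then the 106-sector unaligned tail.
 */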
3236 
3237 /*
3238  * Forwards an already correctly aligned write request to the BlockDriver.
3239  */
3240 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3241     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3242     QEMUIOVector *qiov, int flags)
3243 {
3244     BlockDriver *drv = bs->drv;
3245     bool waited;
3246     int ret;
3247 
3248     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3249     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3250 
3251     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3252     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3253     assert(!qiov || bytes == qiov->size);
3254 
3255     waited = wait_serialising_requests(req);
3256     assert(!waited || !req->serialising);
3257     assert(req->overlap_offset <= offset);
3258     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3259 
3260     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3261 
3262     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3263         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3264         qemu_iovec_is_zero(qiov)) {
3265         flags |= BDRV_REQ_ZERO_WRITE;
3266         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3267             flags |= BDRV_REQ_MAY_UNMAP;
3268         }
3269     }
3270 
3271     if (ret < 0) {
3272         /* Do nothing, write notifier decided to fail this request */
3273     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3274         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3275         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3276     } else {
3277         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3278         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3279     }
3280     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3281 
3282     if (ret == 0 && !bs->enable_write_cache) {
3283         ret = bdrv_co_flush(bs);
3284     }
3285 
3286     bdrv_set_dirty(bs, sector_num, nb_sectors);
3287 
3288     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3289 
3290     if (bs->growable && ret >= 0) {
3291         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3292     }
3293 
3294     return ret;
3295 }
3296 
3297 /*
3298  * Handle a write request in coroutine context
3299  */
3300 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3301     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3302     BdrvRequestFlags flags)
3303 {
3304     BdrvTrackedRequest req;
3305     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3306     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3307     uint8_t *head_buf = NULL;
3308     uint8_t *tail_buf = NULL;
3309     QEMUIOVector local_qiov;
3310     bool use_local_qiov = false;
3311     int ret;
3312 
3313     if (!bs->drv) {
3314         return -ENOMEDIUM;
3315     }
3316     if (bs->read_only) {
3317         return -EACCES;
3318     }
3319     if (bdrv_check_byte_request(bs, offset, bytes)) {
3320         return -EIO;
3321     }
3322 
3323     /* throttling disk I/O */
3324     if (bs->io_limits_enabled) {
3325         bdrv_io_limits_intercept(bs, bytes, true);
3326     }
3327 
3328     /*
3329      * Align write if necessary by performing a read-modify-write cycle.
3330      * Pad qiov with the read parts and be sure to have a tracked request not
3331      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3332      */
3333     tracked_request_begin(&req, bs, offset, bytes, true);
3334 
3335     if (offset & (align - 1)) {
3336         QEMUIOVector head_qiov;
3337         struct iovec head_iov;
3338 
3339         mark_request_serialising(&req, align);
3340         wait_serialising_requests(&req);
3341 
3342         head_buf = qemu_blockalign(bs, align);
3343         head_iov = (struct iovec) {
3344             .iov_base   = head_buf,
3345             .iov_len    = align,
3346         };
3347         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3348 
3349         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3350         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3351                                   align, &head_qiov, 0);
3352         if (ret < 0) {
3353             goto fail;
3354         }
3355         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3356 
3357         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3358         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3359         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3360         use_local_qiov = true;
3361 
3362         bytes += offset & (align - 1);
3363         offset = offset & ~(align - 1);
3364     }
3365 
3366     if ((offset + bytes) & (align - 1)) {
3367         QEMUIOVector tail_qiov;
3368         struct iovec tail_iov;
3369         size_t tail_bytes;
3370         bool waited;
3371 
3372         mark_request_serialising(&req, align);
3373         waited = wait_serialising_requests(&req);
3374         assert(!waited || !use_local_qiov);
3375 
3376         tail_buf = qemu_blockalign(bs, align);
3377         tail_iov = (struct iovec) {
3378             .iov_base   = tail_buf,
3379             .iov_len    = align,
3380         };
3381         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3382 
3383         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3384         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3385                                   align, &tail_qiov, 0);
3386         if (ret < 0) {
3387             goto fail;
3388         }
3389         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3390 
3391         if (!use_local_qiov) {
3392             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3393             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3394             use_local_qiov = true;
3395         }
3396 
3397         tail_bytes = (offset + bytes) & (align - 1);
3398         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3399 
3400         bytes = ROUND_UP(bytes, align);
3401     }
3402 
3403     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3404                                use_local_qiov ? &local_qiov : qiov,
3405                                flags);
3406 
3407 fail:
3408     tracked_request_end(&req);
3409 
3410     if (use_local_qiov) {
3411         qemu_iovec_destroy(&local_qiov);
3412     }
3413     qemu_vfree(head_buf);
3414     qemu_vfree(tail_buf);
3415 
3416     return ret;
3417 }
3418 
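/*
 * Worked example (editor's sketch, hypothetical values): with align = 512,
 * an unaligned request of offset = 300, bytes = 1000 is widened by the RMW
 * logic above as follows:
 *
 *     head: offset & 511 = 300 -> read 512 bytes at offset 0 and prepend
 *           the first 300 of them; now offset = 0, bytes = 1300
 *     tail: (0 + 1300) & 511 = 276 -> read 512 bytes at offset 1024 and
 *           append the last 512 - 276 = 236 of them;
 *           bytes = ROUND_UP(1300, 512) = 1536
 *
 * The driver then sees a single aligned write covering bytes [0, 1536).
 */
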
3419 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3420     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3421     BdrvRequestFlags flags)
3422 {
3423     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3424         return -EINVAL;
3425     }
3426 
3427     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3428                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3429 }
3430 
3431 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3432     int nb_sectors, QEMUIOVector *qiov)
3433 {
3434     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3435 
3436     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3437 }
3438 
3439 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3440                                       int64_t sector_num, int nb_sectors,
3441                                       BdrvRequestFlags flags)
3442 {
3443     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3444 
3445     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3446         flags &= ~BDRV_REQ_MAY_UNMAP;
3447     }
3448 
3449     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3450                              BDRV_REQ_ZERO_WRITE | flags);
3451 }
3452 
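/*
 * Example (editor's sketch): a hypothetical coroutine caller zeroing a
 * region while allowing the driver to unmap it; note that the function
 * above silently drops BDRV_REQ_MAY_UNMAP unless the image was opened
 * with BDRV_O_UNMAP:
 *
 *     ret = bdrv_co_write_zeroes(bs, sector_num, nb_sectors,
 *                                BDRV_REQ_MAY_UNMAP);
 */
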
3453 /**
3454  * Truncate file to 'offset' bytes (needed only for file protocols)
3455  */
3456 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3457 {
3458     BlockDriver *drv = bs->drv;
3459     int ret;
3460     if (!drv)
3461         return -ENOMEDIUM;
3462     if (!drv->bdrv_truncate)
3463         return -ENOTSUP;
3464     if (bs->read_only)
3465         return -EACCES;
3466 
3467     ret = drv->bdrv_truncate(bs, offset);
3468     if (ret == 0) {
3469         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3470         if (bs->blk) {
3471             blk_dev_resize_cb(bs->blk);
3472         }
3473     }
3474     return ret;
3475 }
3476 
3477 /**
3478  * Length of an allocated file in bytes. Sparse files are counted by actual
3479  * allocated space. Returns < 0 on error or if unknown.
3480  */
3481 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3482 {
3483     BlockDriver *drv = bs->drv;
3484     if (!drv) {
3485         return -ENOMEDIUM;
3486     }
3487     if (drv->bdrv_get_allocated_file_size) {
3488         return drv->bdrv_get_allocated_file_size(bs);
3489     }
3490     if (bs->file) {
3491         return bdrv_get_allocated_file_size(bs->file);
3492     }
3493     return -ENOTSUP;
3494 }
3495 
3496 /**
3497  * Return number of sectors on success, -errno on error.
3498  */
3499 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3500 {
3501     BlockDriver *drv = bs->drv;
3502 
3503     if (!drv)
3504         return -ENOMEDIUM;
3505 
3506     if (drv->has_variable_length) {
3507         int ret = refresh_total_sectors(bs, bs->total_sectors);
3508         if (ret < 0) {
3509             return ret;
3510         }
3511     }
3512     return bs->total_sectors;
3513 }
3514 
3515 /**
3516  * Return length in bytes on success, -errno on error.
3517  * The length is always a multiple of BDRV_SECTOR_SIZE.
3518  */
3519 int64_t bdrv_getlength(BlockDriverState *bs)
3520 {
3521     int64_t ret = bdrv_nb_sectors(bs);
3522 
3523     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3524 }
3525 
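/*
 * Example (editor's sketch): callers must treat a negative return value
 * as an errno, e.g. in a hypothetical helper:
 *
 *     int64_t len = bdrv_getlength(bs);
 *     if (len < 0) {
 *         return len;    // e.g. -ENOMEDIUM if no medium is present
 *     }
 *     // len is a byte count, always a multiple of BDRV_SECTOR_SIZE
 */
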
3526 /* return 0 as number of sectors if no device present or error */
3527 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3528 {
3529     int64_t nb_sectors = bdrv_nb_sectors(bs);
3530 
3531     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3532 }
3533 
3534 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3535                        BlockdevOnError on_write_error)
3536 {
3537     bs->on_read_error = on_read_error;
3538     bs->on_write_error = on_write_error;
3539 }
3540 
3541 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3542 {
3543     return is_read ? bs->on_read_error : bs->on_write_error;
3544 }
3545 
3546 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3547 {
3548     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3549 
3550     switch (on_err) {
3551     case BLOCKDEV_ON_ERROR_ENOSPC:
3552         return (error == ENOSPC) ?
3553                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3554     case BLOCKDEV_ON_ERROR_STOP:
3555         return BLOCK_ERROR_ACTION_STOP;
3556     case BLOCKDEV_ON_ERROR_REPORT:
3557         return BLOCK_ERROR_ACTION_REPORT;
3558     case BLOCKDEV_ON_ERROR_IGNORE:
3559         return BLOCK_ERROR_ACTION_IGNORE;
3560     default:
3561         abort();
3562     }
3563 }
3564 
3565 static void send_qmp_error_event(BlockDriverState *bs,
3566                                  BlockErrorAction action,
3567                                  bool is_read, int error)
3568 {
3569     IoOperationType optype;
3570 
3571     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3572     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3573                                    bdrv_iostatus_is_enabled(bs),
3574                                    error == ENOSPC, strerror(error),
3575                                    &error_abort);
3576 }
3577 
3578 /* This is done by device models because, while the block layer knows
3579  * about the error, it does not know whether an operation comes from
3580  * the device or the block layer (from a job, for example).
3581  */
3582 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3583                        bool is_read, int error)
3584 {
3585     assert(error >= 0);
3586 
3587     if (action == BLOCK_ERROR_ACTION_STOP) {
3588         /* First set the iostatus, so that "info block" returns an iostatus
3589          * that matches the events raised so far (an additional error iostatus
3590          * is fine, but not a lost one).
3591          */
3592         bdrv_iostatus_set_err(bs, error);
3593 
3594         /* Then raise the request to stop the VM and the event.
3595          * qemu_system_vmstop_request_prepare has two effects.  First,
3596          * it ensures that the STOP event always comes after the
3597          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3598          * can observe the STOP event and do a "cont" before the STOP
3599          * event is issued, the VM will not stop.  In this case, vm_start()
3600          * also ensures that the STOP/RESUME pair of events is emitted.
3601          */
3602         qemu_system_vmstop_request_prepare();
3603         send_qmp_error_event(bs, action, is_read, error);
3604         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3605     } else {
3606         send_qmp_error_event(bs, action, is_read, error);
3607     }
3608 }
3609 
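/*
 * Example (editor's sketch): how a hypothetical device model might combine
 * the two helpers above when a request fails with a negative errno "ret":
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, is_read, -ret);
 *     if (action == BLOCK_ERROR_ACTION_STOP) {
 *         // queue the request so it can be retried after the VM resumes
 *     }
 *     bdrv_error_action(bs, action, is_read, -ret);
 */
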
3610 int bdrv_is_read_only(BlockDriverState *bs)
3611 {
3612     return bs->read_only;
3613 }
3614 
3615 int bdrv_is_sg(BlockDriverState *bs)
3616 {
3617     return bs->sg;
3618 }
3619 
3620 int bdrv_enable_write_cache(BlockDriverState *bs)
3621 {
3622     return bs->enable_write_cache;
3623 }
3624 
3625 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3626 {
3627     bs->enable_write_cache = wce;
3628 
3629     /* so a reopen() will preserve wce */
3630     if (wce) {
3631         bs->open_flags |= BDRV_O_CACHE_WB;
3632     } else {
3633         bs->open_flags &= ~BDRV_O_CACHE_WB;
3634     }
3635 }
3636 
3637 int bdrv_is_encrypted(BlockDriverState *bs)
3638 {
3639     if (bs->backing_hd && bs->backing_hd->encrypted)
3640         return 1;
3641     return bs->encrypted;
3642 }
3643 
3644 int bdrv_key_required(BlockDriverState *bs)
3645 {
3646     BlockDriverState *backing_hd = bs->backing_hd;
3647 
3648     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3649         return 1;
3650     return (bs->encrypted && !bs->valid_key);
3651 }
3652 
3653 int bdrv_set_key(BlockDriverState *bs, const char *key)
3654 {
3655     int ret;
3656     if (bs->backing_hd && bs->backing_hd->encrypted) {
3657         ret = bdrv_set_key(bs->backing_hd, key);
3658         if (ret < 0)
3659             return ret;
3660         if (!bs->encrypted)
3661             return 0;
3662     }
3663     if (!bs->encrypted) {
3664         return -EINVAL;
3665     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3666         return -ENOMEDIUM;
3667     }
3668     ret = bs->drv->bdrv_set_key(bs, key);
3669     if (ret < 0) {
3670         bs->valid_key = 0;
3671     } else if (!bs->valid_key) {
3672         bs->valid_key = 1;
3673         if (bs->blk) {
3674             /* call the change callback now, we skipped it on open */
3675             blk_dev_change_media_cb(bs->blk, true);
3676         }
3677     }
3678     return ret;
3679 }
3680 
3681 const char *bdrv_get_format_name(BlockDriverState *bs)
3682 {
3683     return bs->drv ? bs->drv->format_name : NULL;
3684 }
3685 
3686 static int qsort_strcmp(const void *a, const void *b)
3687 {
3688     return strcmp(a, b);
3689 }
3690 
3691 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3692                          void *opaque)
3693 {
3694     BlockDriver *drv;
3695     int count = 0;
3696     int i;
3697     const char **formats = NULL;
3698 
3699     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3700         if (drv->format_name) {
3701             bool found = false;
3702             int i = count;
3703             while (formats && i && !found) {
3704                 found = !strcmp(formats[--i], drv->format_name);
3705             }
3706 
3707             if (!found) {
3708                 formats = g_renew(const char *, formats, count + 1);
3709                 formats[count++] = drv->format_name;
3710             }
3711         }
3712     }
3713 
3714     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3715 
3716     for (i = 0; i < count; i++) {
3717         it(opaque, formats[i]);
3718     }
3719 
3720     g_free(formats);
3721 }
3722 
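/*
 * Example (editor's sketch): printing every known format name with a
 * hypothetical callback:
 *
 *     static void print_format(void *opaque, const char *name)
 *     {
 *         printf("%s\n", name);
 *     }
 *
 *     bdrv_iterate_format(print_format, NULL);
 */
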
3723 /* Find the BlockDriverState of the named block backend */
3724 /* TODO convert callers to blk_by_name(), then remove */
3725 BlockDriverState *bdrv_find(const char *name)
3726 {
3727     BlockBackend *blk = blk_by_name(name);
3728 
3729     return blk ? blk_bs(blk) : NULL;
3730 }
3731 
3732 /* Find a BlockDriverState node in the graph by its node name */
3733 BlockDriverState *bdrv_find_node(const char *node_name)
3734 {
3735     BlockDriverState *bs;
3736 
3737     assert(node_name);
3738 
3739     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3740         if (!strcmp(node_name, bs->node_name)) {
3741             return bs;
3742         }
3743     }
3744     return NULL;
3745 }
3746 
3747 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3748 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3749 {
3750     BlockDeviceInfoList *list, *entry;
3751     BlockDriverState *bs;
3752 
3753     list = NULL;
3754     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3755         entry = g_malloc0(sizeof(*entry));
3756         entry->value = bdrv_block_device_info(bs);
3757         entry->next = list;
3758         list = entry;
3759     }
3760 
3761     return list;
3762 }
3763 
3764 BlockDriverState *bdrv_lookup_bs(const char *device,
3765                                  const char *node_name,
3766                                  Error **errp)
3767 {
3768     BlockBackend *blk;
3769     BlockDriverState *bs;
3770 
3771     if (device) {
3772         blk = blk_by_name(device);
3773 
3774         if (blk) {
3775             return blk_bs(blk);
3776         }
3777     }
3778 
3779     if (node_name) {
3780         bs = bdrv_find_node(node_name);
3781 
3782         if (bs) {
3783             return bs;
3784         }
3785     }
3786 
3787     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3788                      device ? device : "",
3789                      node_name ? node_name : "");
3790     return NULL;
3791 }
3792 
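/*
 * Example (editor's sketch): a QMP-style lookup that accepts either a
 * backend name or a node name (the has_* variables are hypothetical):
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs(has_device ? device : NULL,
 *                                           has_node_name ? node_name : NULL,
 *                                           &local_err);
 *     if (!bs) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */
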
3793 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3794  * return false.  If either argument is NULL, return false. */
3795 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3796 {
3797     while (top && top != base) {
3798         top = top->backing_hd;
3799     }
3800 
3801     return top != NULL;
3802 }
3803 
3804 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3805 {
3806     if (!bs) {
3807         return QTAILQ_FIRST(&graph_bdrv_states);
3808     }
3809     return QTAILQ_NEXT(bs, node_list);
3810 }
3811 
3812 BlockDriverState *bdrv_next(BlockDriverState *bs)
3813 {
3814     if (!bs) {
3815         return QTAILQ_FIRST(&bdrv_states);
3816     }
3817     return QTAILQ_NEXT(bs, device_list);
3818 }
3819 
3820 const char *bdrv_get_node_name(const BlockDriverState *bs)
3821 {
3822     return bs->node_name;
3823 }
3824 
3825 /* TODO check what callers really want: bs->node_name or blk_name() */
3826 const char *bdrv_get_device_name(const BlockDriverState *bs)
3827 {
3828     return bs->blk ? blk_name(bs->blk) : "";
3829 }
3830 
3831 int bdrv_get_flags(BlockDriverState *bs)
3832 {
3833     return bs->open_flags;
3834 }
3835 
3836 int bdrv_flush_all(void)
3837 {
3838     BlockDriverState *bs;
3839     int result = 0;
3840 
3841     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3842         AioContext *aio_context = bdrv_get_aio_context(bs);
3843         int ret;
3844 
3845         aio_context_acquire(aio_context);
3846         ret = bdrv_flush(bs);
3847         if (ret < 0 && !result) {
3848             result = ret;
3849         }
3850         aio_context_release(aio_context);
3851     }
3852 
3853     return result;
3854 }
3855 
3856 int bdrv_has_zero_init_1(BlockDriverState *bs)
3857 {
3858     return 1;
3859 }
3860 
3861 int bdrv_has_zero_init(BlockDriverState *bs)
3862 {
3863     assert(bs->drv);
3864 
3865     /* If BS is a copy-on-write image, it is initialized to
3866        the contents of the base image, which may not be zeroes.  */
3867     if (bs->backing_hd) {
3868         return 0;
3869     }
3870     if (bs->drv->bdrv_has_zero_init) {
3871         return bs->drv->bdrv_has_zero_init(bs);
3872     }
3873 
3874     /* safe default */
3875     return 0;
3876 }
3877 
3878 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3879 {
3880     BlockDriverInfo bdi;
3881 
3882     if (bs->backing_hd) {
3883         return false;
3884     }
3885 
3886     if (bdrv_get_info(bs, &bdi) == 0) {
3887         return bdi.unallocated_blocks_are_zero;
3888     }
3889 
3890     return false;
3891 }
3892 
3893 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3894 {
3895     BlockDriverInfo bdi;
3896 
3897     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3898         return false;
3899     }
3900 
3901     if (bdrv_get_info(bs, &bdi) == 0) {
3902         return bdi.can_write_zeroes_with_unmap;
3903     }
3904 
3905     return false;
3906 }
3907 
3908 typedef struct BdrvCoGetBlockStatusData {
3909     BlockDriverState *bs;
3910     BlockDriverState *base;
3911     int64_t sector_num;
3912     int nb_sectors;
3913     int *pnum;
3914     int64_t ret;
3915     bool done;
3916 } BdrvCoGetBlockStatusData;
3917 
3918 /*
3919  * Returns the allocation status of the specified sectors.
3920  * Drivers not implementing the functionality are assumed to not support
3921  * backing files, hence all their sectors are reported as allocated.
3922  *
3923  * If 'sector_num' is beyond the end of the disk image the return value is 0
3924  * and 'pnum' is set to 0.
3925  *
3926  * 'pnum' is set to the number of sectors (including and immediately following
3927  * the specified sector) that are known to be in the same
3928  * allocated/unallocated state.
3929  *
3930  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3931  * beyond the end of the disk image it will be clamped.
3932  */
3933 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3934                                                      int64_t sector_num,
3935                                                      int nb_sectors, int *pnum)
3936 {
3937     int64_t total_sectors;
3938     int64_t n;
3939     int64_t ret, ret2;
3940 
3941     total_sectors = bdrv_nb_sectors(bs);
3942     if (total_sectors < 0) {
3943         return total_sectors;
3944     }
3945 
3946     if (sector_num >= total_sectors) {
3947         *pnum = 0;
3948         return 0;
3949     }
3950 
3951     n = total_sectors - sector_num;
3952     if (n < nb_sectors) {
3953         nb_sectors = n;
3954     }
3955 
3956     if (!bs->drv->bdrv_co_get_block_status) {
3957         *pnum = nb_sectors;
3958         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3959         if (bs->drv->protocol_name) {
3960             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3961         }
3962         return ret;
3963     }
3964 
3965     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3966     if (ret < 0) {
3967         *pnum = 0;
3968         return ret;
3969     }
3970 
3971     if (ret & BDRV_BLOCK_RAW) {
3972         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3973         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3974                                      *pnum, pnum);
3975     }
3976 
3977     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3978         ret |= BDRV_BLOCK_ALLOCATED;
3979     }
3980 
3981     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3982         if (bdrv_unallocated_blocks_are_zero(bs)) {
3983             ret |= BDRV_BLOCK_ZERO;
3984         } else if (bs->backing_hd) {
3985             BlockDriverState *bs2 = bs->backing_hd;
3986             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
3987             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
3988                 ret |= BDRV_BLOCK_ZERO;
3989             }
3990         }
3991     }
3992 
3993     if (bs->file &&
3994         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3995         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3996         int file_pnum;
3997 
3998         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3999                                         *pnum, &file_pnum);
4000         if (ret2 >= 0) {
4001             /* Ignore errors.  This is just providing extra information;
4002              * it is useful but not necessary.
4003              */
4004             if (!file_pnum) {
4005                 /* !file_pnum indicates an offset at or beyond the EOF; it is
4006                  * perfectly valid for the format block driver to point to such
4007                  * offsets, so catch it and mark everything as zero */
4008                 ret |= BDRV_BLOCK_ZERO;
4009             } else {
4010                 /* Limit request to the range reported by the protocol driver */
4011                 *pnum = file_pnum;
4012                 ret |= (ret2 & BDRV_BLOCK_ZERO);
4013             }
4014         }
4015     }
4016 
4017     return ret;
4018 }
4019 
4020 /* Coroutine wrapper for bdrv_get_block_status() */
4021 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4022 {
4023     BdrvCoGetBlockStatusData *data = opaque;
4024     BlockDriverState *bs = data->bs;
4025 
4026     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4027                                          data->pnum);
4028     data->done = true;
4029 }
4030 
4031 /*
4032  * Synchronous wrapper around bdrv_co_get_block_status().
4033  *
4034  * See bdrv_co_get_block_status() for details.
4035  */
4036 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4037                               int nb_sectors, int *pnum)
4038 {
4039     Coroutine *co;
4040     BdrvCoGetBlockStatusData data = {
4041         .bs = bs,
4042         .sector_num = sector_num,
4043         .nb_sectors = nb_sectors,
4044         .pnum = pnum,
4045         .done = false,
4046     };
4047 
4048     if (qemu_in_coroutine()) {
4049         /* Fast-path if already in coroutine context */
4050         bdrv_get_block_status_co_entry(&data);
4051     } else {
4052         AioContext *aio_context = bdrv_get_aio_context(bs);
4053 
4054         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4055         qemu_coroutine_enter(co, &data);
4056         while (!data.done) {
4057             aio_poll(aio_context, true);
4058         }
4059     }
4060     return data.ret;
4061 }
4062 
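/*
 * Example (editor's sketch): how a hypothetical caller interprets the
 * returned bit mask:
 *
 *     int pnum;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     if (ret & BDRV_BLOCK_ZERO) {
 *         // the next pnum sectors read as zeroes
 *     }
 *     if (ret & BDRV_BLOCK_OFFSET_VALID) {
 *         // data for these sectors starts at sector
 *         // (ret >> BDRV_SECTOR_BITS) of bs->file
 *     }
 */
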
4063 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4064                                    int nb_sectors, int *pnum)
4065 {
4066     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4067     if (ret < 0) {
4068         return ret;
4069     }
4070     return !!(ret & BDRV_BLOCK_ALLOCATED);
4071 }
4072 
4073 /*
4074  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4075  *
4076  * Return true if the given sector is allocated in any image between
4077  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4078  * sector is allocated in any image of the chain.  Return false otherwise.
4079  *
4080  * 'pnum' is set to the number of sectors (including and immediately following
4081  *  the specified sector) that are known to be in the same
4082  *  allocated/unallocated state.
4083  *
4084  */
4085 int bdrv_is_allocated_above(BlockDriverState *top,
4086                             BlockDriverState *base,
4087                             int64_t sector_num,
4088                             int nb_sectors, int *pnum)
4089 {
4090     BlockDriverState *intermediate;
4091     int ret, n = nb_sectors;
4092 
4093     intermediate = top;
4094     while (intermediate && intermediate != base) {
4095         int pnum_inter;
4096         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4097                                 &pnum_inter);
4098         if (ret < 0) {
4099             return ret;
4100         } else if (ret) {
4101             *pnum = pnum_inter;
4102             return 1;
4103         }
4104 
4105         /*
4106          * [sector_num, sector_num + nb_sectors) is unallocated on top but
4107          * intermediate might have
4108          *
4109          * [sector_num + x, sector_num + nb_sectors) allocated.
4110          */
4111         if (n > pnum_inter &&
4112             (intermediate == top ||
4113              sector_num + pnum_inter < intermediate->total_sectors)) {
4114             n = pnum_inter;
4115         }
4116 
4117         intermediate = intermediate->backing_hd;
4118     }
4119 
4120     *pnum = n;
4121     return 0;
4122 }
4123 
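/*
 * Example (editor's sketch): given the hypothetical chain
 * base <- mid <- top, ask whether a sector is provided by anything
 * above "base":
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num, 1, &pnum);
 *     // ret == 1: a read is served by "top" or "mid"
 *     // ret == 0: a read falls through to "base" (or reads as zeroes)
 */
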
4124 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4125 {
4126     if (bs->backing_hd && bs->backing_hd->encrypted)
4127         return bs->backing_file;
4128     else if (bs->encrypted)
4129         return bs->filename;
4130     else
4131         return NULL;
4132 }
4133 
4134 void bdrv_get_backing_filename(BlockDriverState *bs,
4135                                char *filename, int filename_size)
4136 {
4137     pstrcpy(filename, filename_size, bs->backing_file);
4138 }
4139 
4140 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4141                           const uint8_t *buf, int nb_sectors)
4142 {
4143     BlockDriver *drv = bs->drv;
4144     if (!drv)
4145         return -ENOMEDIUM;
4146     if (!drv->bdrv_write_compressed)
4147         return -ENOTSUP;
4148     if (bdrv_check_request(bs, sector_num, nb_sectors))
4149         return -EIO;
4150 
4151     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4152 
4153     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4154 }
4155 
4156 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4157 {
4158     BlockDriver *drv = bs->drv;
4159     if (!drv)
4160         return -ENOMEDIUM;
4161     if (!drv->bdrv_get_info)
4162         return -ENOTSUP;
4163     memset(bdi, 0, sizeof(*bdi));
4164     return drv->bdrv_get_info(bs, bdi);
4165 }
4166 
4167 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4168 {
4169     BlockDriver *drv = bs->drv;
4170     if (drv && drv->bdrv_get_specific_info) {
4171         return drv->bdrv_get_specific_info(bs);
4172     }
4173     return NULL;
4174 }
4175 
4176 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4177                       int64_t pos, int size)
4178 {
4179     QEMUIOVector qiov;
4180     struct iovec iov = {
4181         .iov_base   = (void *) buf,
4182         .iov_len    = size,
4183     };
4184 
4185     qemu_iovec_init_external(&qiov, &iov, 1);
4186     return bdrv_writev_vmstate(bs, &qiov, pos);
4187 }
4188 
4189 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4190 {
4191     BlockDriver *drv = bs->drv;
4192 
4193     if (!drv) {
4194         return -ENOMEDIUM;
4195     } else if (drv->bdrv_save_vmstate) {
4196         return drv->bdrv_save_vmstate(bs, qiov, pos);
4197     } else if (bs->file) {
4198         return bdrv_writev_vmstate(bs->file, qiov, pos);
4199     }
4200 
4201     return -ENOTSUP;
4202 }
4203 
4204 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4205                       int64_t pos, int size)
4206 {
4207     BlockDriver *drv = bs->drv;
4208     if (!drv)
4209         return -ENOMEDIUM;
4210     if (drv->bdrv_load_vmstate)
4211         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4212     if (bs->file)
4213         return bdrv_load_vmstate(bs->file, buf, pos, size);
4214     return -ENOTSUP;
4215 }
4216 
4217 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4218 {
4219     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4220         return;
4221     }
4222 
4223     bs->drv->bdrv_debug_event(bs, event);
4224 }
4225 
4226 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4227                           const char *tag)
4228 {
4229     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4230         bs = bs->file;
4231     }
4232 
4233     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4234         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4235     }
4236 
4237     return -ENOTSUP;
4238 }
4239 
4240 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4241 {
4242     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4243         bs = bs->file;
4244     }
4245 
4246     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4247         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4248     }
4249 
4250     return -ENOTSUP;
4251 }
4252 
4253 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4254 {
4255     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4256         bs = bs->file;
4257     }
4258 
4259     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4260         return bs->drv->bdrv_debug_resume(bs, tag);
4261     }
4262 
4263     return -ENOTSUP;
4264 }
4265 
4266 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4267 {
4268     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4269         bs = bs->file;
4270     }
4271 
4272     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4273         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4274     }
4275 
4276     return false;
4277 }
4278 
4279 int bdrv_is_snapshot(BlockDriverState *bs)
4280 {
4281     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4282 }
4283 
4284 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4285  * relative, it must be relative to the chain.  So, passing in bs->filename
4286  * from a BDS as backing_file should not be done, as that may be relative to
4287  * the CWD rather than the chain. */
4288 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4289         const char *backing_file)
4290 {
4291     char *filename_full = NULL;
4292     char *backing_file_full = NULL;
4293     char *filename_tmp = NULL;
4294     int is_protocol = 0;
4295     BlockDriverState *curr_bs = NULL;
4296     BlockDriverState *retval = NULL;
4297 
4298     if (!bs || !bs->drv || !backing_file) {
4299         return NULL;
4300     }
4301 
4302     filename_full     = g_malloc(PATH_MAX);
4303     backing_file_full = g_malloc(PATH_MAX);
4304     filename_tmp      = g_malloc(PATH_MAX);
4305 
4306     is_protocol = path_has_protocol(backing_file);
4307 
4308     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4309 
4310         /* If either of the filename paths is actually a protocol, then
4311          * compare unmodified paths; otherwise make paths relative */
4312         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4313             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4314                 retval = curr_bs->backing_hd;
4315                 break;
4316             }
4317         } else {
4318             /* If not an absolute filename path, make it relative to the current
4319              * image's filename path */
4320             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4321                          backing_file);
4322 
4323             /* We are going to compare absolute pathnames */
4324             if (!realpath(filename_tmp, filename_full)) {
4325                 continue;
4326             }
4327 
4328             /* We need to make sure the backing filename we are comparing against
4329              * is relative to the current image filename (or absolute) */
4330             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4331                          curr_bs->backing_file);
4332 
4333             if (!realpath(filename_tmp, backing_file_full)) {
4334                 continue;
4335             }
4336 
4337             if (strcmp(backing_file_full, filename_full) == 0) {
4338                 retval = curr_bs->backing_hd;
4339                 break;
4340             }
4341         }
4342     }
4343 
4344     g_free(filename_full);
4345     g_free(backing_file_full);
4346     g_free(filename_tmp);
4347     return retval;
4348 }
4349 
4350 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4351 {
4352     if (!bs->drv) {
4353         return 0;
4354     }
4355 
4356     if (!bs->backing_hd) {
4357         return 0;
4358     }
4359 
4360     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4361 }
4362 
4363 /**************************************************************/
4364 /* async I/Os */
4365 
4366 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4367                            QEMUIOVector *qiov, int nb_sectors,
4368                            BlockCompletionFunc *cb, void *opaque)
4369 {
4370     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4371 
4372     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4373                                  cb, opaque, false);
4374 }
4375 
4376 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4377                             QEMUIOVector *qiov, int nb_sectors,
4378                             BlockCompletionFunc *cb, void *opaque)
4379 {
4380     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4381 
4382     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4383                                  cb, opaque, true);
4384 }
4385 
4386 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4387         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4388         BlockCompletionFunc *cb, void *opaque)
4389 {
4390     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4391 
4392     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4393                                  BDRV_REQ_ZERO_WRITE | flags,
4394                                  cb, opaque, true);
4395 }
4396 
4397 
4398 typedef struct MultiwriteCB {
4399     int error;
4400     int num_requests;
4401     int num_callbacks;
4402     struct {
4403         BlockCompletionFunc *cb;
4404         void *opaque;
4405         QEMUIOVector *free_qiov;
4406     } callbacks[];
4407 } MultiwriteCB;
4408 
4409 static void multiwrite_user_cb(MultiwriteCB *mcb)
4410 {
4411     int i;
4412 
4413     for (i = 0; i < mcb->num_callbacks; i++) {
4414         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4415         if (mcb->callbacks[i].free_qiov) {
4416             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4417         }
4418         g_free(mcb->callbacks[i].free_qiov);
4419     }
4420 }
4421 
4422 static void multiwrite_cb(void *opaque, int ret)
4423 {
4424     MultiwriteCB *mcb = opaque;
4425 
4426     trace_multiwrite_cb(mcb, ret);
4427 
4428     if (ret < 0 && !mcb->error) {
4429         mcb->error = ret;
4430     }
4431 
4432     mcb->num_requests--;
4433     if (mcb->num_requests == 0) {
4434         multiwrite_user_cb(mcb);
4435         g_free(mcb);
4436     }
4437 }
4438 
4439 static int multiwrite_req_compare(const void *a, const void *b)
4440 {
4441     const BlockRequest *req1 = a, *req2 = b;
4442 
4443     /*
4444      * Note that we can't simply subtract req2->sector from req1->sector
4445      * here as that could overflow the return value.
4446      */
4447     if (req1->sector > req2->sector) {
4448         return 1;
4449     } else if (req1->sector < req2->sector) {
4450         return -1;
4451     } else {
4452         return 0;
4453     }
4454 }
4455 
4456 /*
4457  * Takes a bunch of requests and tries to merge them. Returns the number of
4458  * requests that remain after merging.
4459  */
4460 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4461     int num_reqs, MultiwriteCB *mcb)
4462 {
4463     int i, outidx;
4464 
4465     // Sort requests by start sector
4466     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4467 
4468     // Check if adjacent requests touch the same clusters. If so, combine them.
4469     // Only sequential or overlapping requests are merged, so no gaps arise.
4470     outidx = 0;
4471     for (i = 1; i < num_reqs; i++) {
4472         int merge = 0;
4473         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4474 
4475         // Handle exactly sequential writes and overlapping writes.
4476         if (reqs[i].sector <= oldreq_last) {
4477             merge = 1;
4478         }
4479 
4480         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4481             merge = 0;
4482         }
4483 
4484         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4485             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4486             merge = 0;
4487         }
4488 
4489         if (merge) {
4490             size_t size;
4491             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4492             qemu_iovec_init(qiov,
4493                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4494 
4495             // Add the first request to the merged one. If the requests are
4496             // overlapping, drop the last sectors of the first request.
4497             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4498             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4499 
4500             // We should not need to add any zeros between the two requests
4501             assert (reqs[i].sector <= oldreq_last);
4502 
4503             // Add the second request
4504             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4505 
4506             // Add tail of first request, if necessary
4507             if (qiov->size < reqs[outidx].qiov->size) {
4508                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4509                                   reqs[outidx].qiov->size - qiov->size);
4510             }
4511 
4512             reqs[outidx].nb_sectors = qiov->size >> 9;
4513             reqs[outidx].qiov = qiov;
4514 
4515             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4516         } else {
4517             outidx++;
4518             reqs[outidx].sector     = reqs[i].sector;
4519             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4520             reqs[outidx].qiov       = reqs[i].qiov;
4521         }
4522     }
4523 
4524     return outidx + 1;
4525 }
4526 
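/*
 * Worked example (editor's sketch, hypothetical requests): two 8-sector
 * writes at sectors 0 and 4 overlap and are merged by the loop above:
 *
 *     oldreq_last = 0 + 8 = 8, reqs[1].sector = 4 <= 8 -> merge
 *     size = (4 - 0) << 9: keep the first 4 sectors of request 0,
 *     then append all 8 sectors of request 1;
 *     merged result: sector 0, nb_sectors = 12, no tail needed
 */
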
4527 /*
4528  * Submit multiple AIO write requests at once.
4529  *
4530  * On success, the function returns 0 and all requests in the reqs array have
4531  * been submitted. In the error case, this function returns -1, and any of the
4532  * requests may or may not be submitted yet. In particular, this means that the
4533  * callback will be called for some of the requests, for others it won't. The
4534  * caller must check the error field of the BlockRequest to wait for the right
4535  * callbacks (if error != 0, no callback will be called).
4536  *
4537  * The implementation may modify the contents of the reqs array, e.g. to merge
4538  * requests. However, the fields opaque and error are left unmodified as they
4539  * are used to signal failure for a single request to the caller.
4540  */
4541 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4542 {
4543     MultiwriteCB *mcb;
4544     int i;
4545 
4546     /* don't submit writes if we don't have a medium */
4547     if (bs->drv == NULL) {
4548         for (i = 0; i < num_reqs; i++) {
4549             reqs[i].error = -ENOMEDIUM;
4550         }
4551         return -1;
4552     }
4553 
4554     if (num_reqs == 0) {
4555         return 0;
4556     }
4557 
4558     // Create MultiwriteCB structure
4559     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4560     mcb->num_requests = 0;
4561     mcb->num_callbacks = num_reqs;
4562 
4563     for (i = 0; i < num_reqs; i++) {
4564         mcb->callbacks[i].cb = reqs[i].cb;
4565         mcb->callbacks[i].opaque = reqs[i].opaque;
4566     }
4567 
4568     // Check for mergable requests
4569     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4570 
4571     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4572 
4573     /* Run the aio requests. */
4574     mcb->num_requests = num_reqs;
4575     for (i = 0; i < num_reqs; i++) {
4576         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4577                               reqs[i].nb_sectors, reqs[i].flags,
4578                               multiwrite_cb, mcb,
4579                               true);
4580     }
4581 
4582     return 0;
4583 }
4584 
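/*
 * Example (editor's sketch): submitting two writes at once; the variables
 * and callback are hypothetical, and cb/opaque must be set per request:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_cb, .opaque = req0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_cb, .opaque = req1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // inspect reqs[i].error to learn which callbacks will still run
 *     }
 */
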
4585 void bdrv_aio_cancel(BlockAIOCB *acb)
4586 {
4587     qemu_aio_ref(acb);
4588     bdrv_aio_cancel_async(acb);
4589     while (acb->refcnt > 1) {
4590         if (acb->aiocb_info->get_aio_context) {
4591             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4592         } else if (acb->bs) {
4593             aio_poll(bdrv_get_aio_context(acb->bs), true);
4594         } else {
4595             abort();
4596         }
4597     }
4598     qemu_aio_unref(acb);
4599 }
4600 
4601 /* Async version of aio cancel. The caller is not blocked if the acb implements
4602  * cancel_async; otherwise we do nothing and let the request complete normally.
4603  * In either case the completion callback must be called. */
4604 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4605 {
4606     if (acb->aiocb_info->cancel_async) {
4607         acb->aiocb_info->cancel_async(acb);
4608     }
4609 }
4610 
4611 /**************************************************************/
4612 /* async block device emulation */
4613 
4614 typedef struct BlockAIOCBSync {
4615     BlockAIOCB common;
4616     QEMUBH *bh;
4617     int ret;
4618     /* vector translation state */
4619     QEMUIOVector *qiov;
4620     uint8_t *bounce;
4621     int is_write;
4622 } BlockAIOCBSync;
4623 
4624 static const AIOCBInfo bdrv_em_aiocb_info = {
4625     .aiocb_size         = sizeof(BlockAIOCBSync),
4626 };
4627 
4628 static void bdrv_aio_bh_cb(void *opaque)
4629 {
4630     BlockAIOCBSync *acb = opaque;
4631 
4632     if (!acb->is_write && acb->ret >= 0) {
4633         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4634     }
4635     qemu_vfree(acb->bounce);
4636     acb->common.cb(acb->common.opaque, acb->ret);
4637     qemu_bh_delete(acb->bh);
4638     acb->bh = NULL;
4639     qemu_aio_unref(acb);
4640 }
4641 
4642 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4643                                       int64_t sector_num,
4644                                       QEMUIOVector *qiov,
4645                                       int nb_sectors,
4646                                       BlockCompletionFunc *cb,
4647                                       void *opaque,
4648                                       int is_write)
4649 
4650 {
4651     BlockAIOCBSync *acb;
4652 
4653     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4654     acb->is_write = is_write;
4655     acb->qiov = qiov;
4656     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4657     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4658 
4659     if (acb->bounce == NULL) {
4660         acb->ret = -ENOMEM;
4661     } else if (is_write) {
4662         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4663         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4664     } else {
4665         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4666     }
4667 
4668     qemu_bh_schedule(acb->bh);
4669 
4670     return &acb->common;
4671 }
4672 
4673 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4674         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4675         BlockCompletionFunc *cb, void *opaque)
4676 {
4677     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4678 }
4679 
4680 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4681         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4682         BlockCompletionFunc *cb, void *opaque)
4683 {
4684     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4685 }
4686 
4687 
4688 typedef struct BlockAIOCBCoroutine {
4689     BlockAIOCB common;
4690     BlockRequest req;
4691     bool is_write;
4692     bool *done;
4693     QEMUBH* bh;
4694 } BlockAIOCBCoroutine;
4695 
4696 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4697     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
4698 };
4699 
4700 static void bdrv_co_em_bh(void *opaque)
4701 {
4702     BlockAIOCBCoroutine *acb = opaque;
4703 
4704     acb->common.cb(acb->common.opaque, acb->req.error);
4705 
4706     qemu_bh_delete(acb->bh);
4707     qemu_aio_unref(acb);
4708 }
4709 
4710 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4711 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4712 {
4713     BlockAIOCBCoroutine *acb = opaque;
4714     BlockDriverState *bs = acb->common.bs;
4715 
4716     if (!acb->is_write) {
4717         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4718             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4719     } else {
4720         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4721             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4722     }
4723 
4724     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4725     qemu_bh_schedule(acb->bh);
4726 }
4727 
4728 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4729                                          int64_t sector_num,
4730                                          QEMUIOVector *qiov,
4731                                          int nb_sectors,
4732                                          BdrvRequestFlags flags,
4733                                          BlockCompletionFunc *cb,
4734                                          void *opaque,
4735                                          bool is_write)
4736 {
4737     Coroutine *co;
4738     BlockAIOCBCoroutine *acb;
4739 
4740     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4741     acb->req.sector = sector_num;
4742     acb->req.nb_sectors = nb_sectors;
4743     acb->req.qiov = qiov;
4744     acb->req.flags = flags;
4745     acb->is_write = is_write;
4746 
4747     co = qemu_coroutine_create(bdrv_co_do_rw);
4748     qemu_coroutine_enter(co, acb);
4749 
4750     return &acb->common;
4751 }
4752 
4753 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4754 {
4755     BlockAIOCBCoroutine *acb = opaque;
4756     BlockDriverState *bs = acb->common.bs;
4757 
4758     acb->req.error = bdrv_co_flush(bs);
4759     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4760     qemu_bh_schedule(acb->bh);
4761 }
4762 
4763 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4764         BlockCompletionFunc *cb, void *opaque)
4765 {
4766     trace_bdrv_aio_flush(bs, opaque);
4767 
4768     Coroutine *co;
4769     BlockAIOCBCoroutine *acb;
4770 
4771     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4772 
4773     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4774     qemu_coroutine_enter(co, acb);
4775 
4776     return &acb->common;
4777 }
4778 
4779 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4780 {
4781     BlockAIOCBCoroutine *acb = opaque;
4782     BlockDriverState *bs = acb->common.bs;
4783 
4784     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4785     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4786     qemu_bh_schedule(acb->bh);
4787 }
4788 
4789 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4790         int64_t sector_num, int nb_sectors,
4791         BlockCompletionFunc *cb, void *opaque)
4792 {
4793     Coroutine *co;
4794     BlockAIOCBCoroutine *acb;
4795 
4796     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4797 
4798     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4799     acb->req.sector = sector_num;
4800     acb->req.nb_sectors = nb_sectors;
4801     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4802     qemu_coroutine_enter(co, acb);
4803 
4804     return &acb->common;
4805 }
4806 
4807 void bdrv_init(void)
4808 {
4809     module_call_init(MODULE_INIT_BLOCK);
4810 }
4811 
4812 void bdrv_init_with_whitelist(void)
4813 {
4814     use_bdrv_whitelist = 1;
4815     bdrv_init();
4816 }
4817 
4818 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4819                    BlockCompletionFunc *cb, void *opaque)
4820 {
4821     BlockAIOCB *acb;
4822 
4823     acb = g_slice_alloc(aiocb_info->aiocb_size);
4824     acb->aiocb_info = aiocb_info;
4825     acb->bs = bs;
4826     acb->cb = cb;
4827     acb->opaque = opaque;
4828     acb->refcnt = 1;
4829     return acb;
4830 }
4831 
4832 void qemu_aio_ref(void *p)
4833 {
4834     BlockAIOCB *acb = p;
4835     acb->refcnt++;
4836 }
4837 
4838 void qemu_aio_unref(void *p)
4839 {
4840     BlockAIOCB *acb = p;
4841     assert(acb->refcnt > 0);
4842     if (--acb->refcnt == 0) {
4843         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4844     }
4845 }
4846 
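/*
 * Example (editor's sketch): how an AIO implementation allocates its own
 * ACB type with qemu_aio_get(); the names are hypothetical, but the
 * pattern of embedding BlockAIOCB as the first member matches the
 * emulation code below:
 *
 *     typedef struct MyAIOCB {
 *         BlockAIOCB common;    // must be the first member
 *         int my_state;
 *     } MyAIOCB;
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 */
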
4847 /**************************************************************/
4848 /* Coroutine block device emulation */
4849 
4850 typedef struct CoroutineIOCompletion {
4851     Coroutine *coroutine;
4852     int ret;
4853 } CoroutineIOCompletion;
4854 
4855 static void bdrv_co_io_em_complete(void *opaque, int ret)
4856 {
4857     CoroutineIOCompletion *co = opaque;
4858 
4859     co->ret = ret;
4860     qemu_coroutine_enter(co->coroutine, NULL);
4861 }
4862 
4863 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4864                                       int nb_sectors, QEMUIOVector *iov,
4865                                       bool is_write)
4866 {
4867     CoroutineIOCompletion co = {
4868         .coroutine = qemu_coroutine_self(),
4869     };
4870     BlockAIOCB *acb;
4871 
4872     if (is_write) {
4873         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4874                                        bdrv_co_io_em_complete, &co);
4875     } else {
4876         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4877                                       bdrv_co_io_em_complete, &co);
4878     }
4879 
4880     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4881     if (!acb) {
4882         return -EIO;
4883     }
4884     qemu_coroutine_yield();
4885 
4886     return co.ret;
4887 }
4888 
4889 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4890                                          int64_t sector_num, int nb_sectors,
4891                                          QEMUIOVector *iov)
4892 {
4893     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4894 }
4895 
4896 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4897                                          int64_t sector_num, int nb_sectors,
4898                                          QEMUIOVector *iov)
4899 {
4900     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4901 }
4902 
4903 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4904 {
4905     RwCo *rwco = opaque;
4906 
4907     rwco->ret = bdrv_co_flush(rwco->bs);
4908 }
4909 
4910 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4911 {
4912     int ret;
4913 
4914     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4915         return 0;
4916     }
4917 
4918     /* Write back cached data to the OS even with cache=unsafe */
4919     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4920     if (bs->drv->bdrv_co_flush_to_os) {
4921         ret = bs->drv->bdrv_co_flush_to_os(bs);
4922         if (ret < 0) {
4923             return ret;
4924         }
4925     }
4926 
4927     /* But don't actually force it to the disk with cache=unsafe */
4928     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4929         goto flush_parent;
4930     }
4931 
4932     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4933     if (bs->drv->bdrv_co_flush_to_disk) {
4934         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4935     } else if (bs->drv->bdrv_aio_flush) {
4936         BlockAIOCB *acb;
4937         CoroutineIOCompletion co = {
4938             .coroutine = qemu_coroutine_self(),
4939         };
4940 
4941         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4942         if (acb == NULL) {
4943             ret = -EIO;
4944         } else {
4945             qemu_coroutine_yield();
4946             ret = co.ret;
4947         }
4948     } else {
4949         /*
4950          * Some block drivers always operate in either writethrough or unsafe
4951          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4952          * know how the server works (because the behaviour is hardcoded or
4953          * depends on server-side configuration), so we can't ensure that
4954          * everything is safe on disk. Returning an error doesn't work because
4955          * that would break guests even if the server operates in writethrough
4956          * mode.
4957          *
4958          * Let's hope the user knows what they're doing.
4959          */
4960         ret = 0;
4961     }
4962     if (ret < 0) {
4963         return ret;
4964     }
4965 
4966     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4967      * in the case of cache=unsafe, so there are no useless flushes.
4968      */
4969 flush_parent:
4970     return bdrv_co_flush(bs->file);
4971 }
4972 
4973 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4974 {
4975     Error *local_err = NULL;
4976     int ret;
4977 
4978     if (!bs->drv)  {
4979         return;
4980     }
4981 
4982     if (!(bs->open_flags & BDRV_O_INCOMING)) {
4983         return;
4984     }
4985     bs->open_flags &= ~BDRV_O_INCOMING;
4986 
4987     if (bs->drv->bdrv_invalidate_cache) {
4988         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4989     } else if (bs->file) {
4990         bdrv_invalidate_cache(bs->file, &local_err);
4991     }
4992     if (local_err) {
4993         error_propagate(errp, local_err);
4994         return;
4995     }
4996 
4997     ret = refresh_total_sectors(bs, bs->total_sectors);
4998     if (ret < 0) {
4999         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5000         return;
5001     }
5002 }
5003 
5004 void bdrv_invalidate_cache_all(Error **errp)
5005 {
5006     BlockDriverState *bs;
5007     Error *local_err = NULL;
5008 
5009     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5010         AioContext *aio_context = bdrv_get_aio_context(bs);
5011 
5012         aio_context_acquire(aio_context);
5013         bdrv_invalidate_cache(bs, &local_err);
5014         aio_context_release(aio_context);
5015         if (local_err) {
5016             error_propagate(errp, local_err);
5017             return;
5018         }
5019     }
5020 }
5021 
5022 int bdrv_flush(BlockDriverState *bs)
5023 {
5024     Coroutine *co;
5025     RwCo rwco = {
5026         .bs = bs,
5027         .ret = NOT_DONE,
5028     };
5029 
5030     if (qemu_in_coroutine()) {
5031         /* Fast-path if already in coroutine context */
5032         bdrv_flush_co_entry(&rwco);
5033     } else {
5034         AioContext *aio_context = bdrv_get_aio_context(bs);
5035 
5036         co = qemu_coroutine_create(bdrv_flush_co_entry);
5037         qemu_coroutine_enter(co, &rwco);
5038         while (rwco.ret == NOT_DONE) {
5039             aio_poll(aio_context, true);
5040         }
5041     }
5042 
5043     return rwco.ret;
5044 }
5045 
5046 typedef struct DiscardCo {
5047     BlockDriverState *bs;
5048     int64_t sector_num;
5049     int nb_sectors;
5050     int ret;
5051 } DiscardCo;
5052 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5053 {
5054     DiscardCo *rwco = opaque;
5055 
5056     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5057 }
5058 
5059 /* If no limit is specified in the BlockLimits, use a default
5060  * of 32768 512-byte sectors (16 MiB) per request.
5061  */
5062 #define MAX_DISCARD_DEFAULT 32768
5063 
5064 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5065                                  int nb_sectors)
5066 {
5067     int max_discard;
5068 
5069     if (!bs->drv) {
5070         return -ENOMEDIUM;
5071     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5072         return -EIO;
5073     } else if (bs->read_only) {
5074         return -EROFS;
5075     }
5076 
5077     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5078 
5079     /* Do nothing if disabled.  */
5080     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5081         return 0;
5082     }
5083 
5084     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5085         return 0;
5086     }
5087 
    max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5089     while (nb_sectors > 0) {
5090         int ret;
5091         int num = nb_sectors;
5092 
5093         /* align request */
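        /* For example, with discard_alignment == 8, sector_num == 5 and
         * nb_sectors == 20, the first pass clamps num to 8 and then
         * subtracts 5 % 8, issuing a 3-sector discard so that the next
         * iteration starts at the aligned sector 8.  (Values illustrative.)
         */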
5094         if (bs->bl.discard_alignment &&
5095             num >= bs->bl.discard_alignment &&
5096             sector_num % bs->bl.discard_alignment) {
5097             if (num > bs->bl.discard_alignment) {
5098                 num = bs->bl.discard_alignment;
5099             }
5100             num -= sector_num % bs->bl.discard_alignment;
5101         }
5102 
5103         /* limit request size */
5104         if (num > max_discard) {
5105             num = max_discard;
5106         }
5107 
5108         if (bs->drv->bdrv_co_discard) {
5109             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5110         } else {
5111             BlockAIOCB *acb;
5112             CoroutineIOCompletion co = {
5113                 .coroutine = qemu_coroutine_self(),
5114             };
5115 
            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5117                                             bdrv_co_io_em_complete, &co);
5118             if (acb == NULL) {
5119                 return -EIO;
5120             } else {
5121                 qemu_coroutine_yield();
5122                 ret = co.ret;
5123             }
5124         }
5125         if (ret && ret != -ENOTSUP) {
5126             return ret;
5127         }
5128 
5129         sector_num += num;
5130         nb_sectors -= num;
5131     }
5132     return 0;
5133 }
5134 
5135 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5136 {
5137     Coroutine *co;
5138     DiscardCo rwco = {
5139         .bs = bs,
5140         .sector_num = sector_num,
5141         .nb_sectors = nb_sectors,
5142         .ret = NOT_DONE,
5143     };
5144 
5145     if (qemu_in_coroutine()) {
5146         /* Fast-path if already in coroutine context */
5147         bdrv_discard_co_entry(&rwco);
5148     } else {
5149         AioContext *aio_context = bdrv_get_aio_context(bs);
5150 
5151         co = qemu_coroutine_create(bdrv_discard_co_entry);
5152         qemu_coroutine_enter(co, &rwco);
5153         while (rwco.ret == NOT_DONE) {
5154             aio_poll(aio_context, true);
5155         }
5156     }
5157 
5158     return rwco.ret;
5159 }
5160 
5161 /**************************************************************/
5162 /* removable device support */
5163 
5164 /**
5165  * Return TRUE if the media is present
5166  */
5167 int bdrv_is_inserted(BlockDriverState *bs)
5168 {
5169     BlockDriver *drv = bs->drv;
5170 
    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
5175     return drv->bdrv_is_inserted(bs);
5176 }
5177 
5178 /**
5179  * Return whether the media changed since the last call to this
5180  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5181  */
5182 int bdrv_media_changed(BlockDriverState *bs)
5183 {
5184     BlockDriver *drv = bs->drv;
5185 
5186     if (drv && drv->bdrv_media_changed) {
5187         return drv->bdrv_media_changed(bs);
5188     }
5189     return -ENOTSUP;
5190 }
5191 
5192 /**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5194  */
5195 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5196 {
5197     BlockDriver *drv = bs->drv;
5198     const char *device_name;
5199 
5200     if (drv && drv->bdrv_eject) {
5201         drv->bdrv_eject(bs, eject_flag);
5202     }
5203 
5204     device_name = bdrv_get_device_name(bs);
5205     if (device_name[0] != '\0') {
5206         qapi_event_send_device_tray_moved(device_name,
5207                                           eject_flag, &error_abort);
5208     }
5209 }
5210 
5211 /**
5212  * Lock or unlock the media (if it is locked, the user won't be able
5213  * to eject it manually).
5214  */
5215 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5216 {
5217     BlockDriver *drv = bs->drv;
5218 
5219     trace_bdrv_lock_medium(bs, locked);
5220 
5221     if (drv && drv->bdrv_lock_medium) {
5222         drv->bdrv_lock_medium(bs, locked);
5223     }
5224 }
5225 
5226 /* needed for generic scsi interface */
5227 
5228 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5229 {
5230     BlockDriver *drv = bs->drv;
5231 
    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
5234     return -ENOTSUP;
5235 }
5236 
5237 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5238         unsigned long int req, void *buf,
5239         BlockCompletionFunc *cb, void *opaque)
5240 {
5241     BlockDriver *drv = bs->drv;
5242 
    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
5245     return NULL;
5246 }
5247 
5248 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5249 {
5250     bs->guest_block_size = align;
5251 }
5252 
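/* Allocate a buffer suitably aligned for I/O on bs (see bdrv_opt_mem_align());
 * qemu_blockalign() aborts on allocation failure, while qemu_try_blockalign()
 * below returns NULL instead.  A rough usage sketch (free with qemu_vfree()):
 *
 *     void *buf = qemu_blockalign(bs, 4096);
 *     ...
 *     qemu_vfree(buf);
 */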
5253 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5254 {
5255     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5256 }
5257 
5258 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5259 {
5260     return memset(qemu_blockalign(bs, size), 0, size);
5261 }
5262 
5263 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5264 {
5265     size_t align = bdrv_opt_mem_align(bs);
5266 
5267     /* Ensure that NULL is never returned on success */
5268     assert(align > 0);
5269     if (size == 0) {
5270         size = align;
5271     }
5272 
5273     return qemu_try_memalign(align, size);
5274 }
5275 
5276 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5277 {
5278     void *mem = qemu_try_blockalign(bs, size);
5279 
5280     if (mem) {
5281         memset(mem, 0, size);
5282     }
5283 
5284     return mem;
5285 }
5286 
5287 /*
 * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5289  */
5290 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5291 {
5292     int i;
5293     size_t alignment = bdrv_opt_mem_align(bs);
5294 
5295     for (i = 0; i < qiov->niov; i++) {
5296         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5297             return false;
5298         }
5299         if (qiov->iov[i].iov_len % alignment) {
5300             return false;
5301         }
5302     }
5303 
5304     return true;
5305 }
5306 
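/* Create a dirty bitmap that tracks writes to bs.  The granularity is given
 * in bytes and must be a power of two no smaller than BDRV_SECTOR_SIZE.
 * For example, a granularity of 65536 bytes becomes 128 sectors after the
 * shift below, so hbitmap_alloc() is called with ffs(128) - 1 == 7 and one
 * bitmap bit covers 2^7 sectors (64 KiB) of the device.
 */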
5307 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5308                                           Error **errp)
5309 {
5310     int64_t bitmap_size;
5311     BdrvDirtyBitmap *bitmap;
5312 
5313     assert((granularity & (granularity - 1)) == 0);
5314 
5315     granularity >>= BDRV_SECTOR_BITS;
5316     assert(granularity);
5317     bitmap_size = bdrv_nb_sectors(bs);
5318     if (bitmap_size < 0) {
5319         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5320         errno = -bitmap_size;
5321         return NULL;
5322     }
5323     bitmap = g_new0(BdrvDirtyBitmap, 1);
5324     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5325     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5326     return bitmap;
5327 }
5328 
5329 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5330 {
5331     BdrvDirtyBitmap *bm, *next;
5332     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5333         if (bm == bitmap) {
5334             QLIST_REMOVE(bitmap, list);
5335             hbitmap_free(bitmap->bitmap);
5336             g_free(bitmap);
5337             return;
5338         }
5339     }
5340 }
5341 
5342 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5343 {
5344     BdrvDirtyBitmap *bm;
5345     BlockDirtyInfoList *list = NULL;
5346     BlockDirtyInfoList **plist = &list;
5347 
5348     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5349         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5350         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5351         info->count = bdrv_get_dirty_count(bs, bm);
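        /* convert the hbitmap granularity (log2 of sectors per bit) back
         * to bytes */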
5352         info->granularity =
5353             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5354         entry->value = info;
5355         *plist = entry;
5356         plist = &entry->next;
5357     }
5358 
5359     return list;
5360 }
5361 
5362 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5363 {
5364     if (bitmap) {
5365         return hbitmap_get(bitmap->bitmap, sector);
5366     } else {
5367         return 0;
5368     }
5369 }
5370 
5371 void bdrv_dirty_iter_init(BlockDriverState *bs,
5372                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5373 {
5374     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5375 }
5376 
5377 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5378                     int nr_sectors)
5379 {
5380     BdrvDirtyBitmap *bitmap;
5381     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5382         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5383     }
5384 }
5385 
5386 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5387 {
5388     BdrvDirtyBitmap *bitmap;
5389     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5390         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5391     }
5392 }
5393 
5394 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5395 {
5396     return hbitmap_count(bitmap->bitmap);
5397 }
5398 
5399 /* Get a reference to bs */
5400 void bdrv_ref(BlockDriverState *bs)
5401 {
5402     bs->refcnt++;
5403 }
5404 
5405 /* Release a previously grabbed reference to bs.
 * If, after releasing, the reference count drops to zero, the
 * BlockDriverState is deleted. */
5408 void bdrv_unref(BlockDriverState *bs)
5409 {
5410     if (!bs) {
5411         return;
5412     }
5413     assert(bs->refcnt > 0);
5414     if (--bs->refcnt == 0) {
5415         bdrv_delete(bs);
5416     }
5417 }
5418 
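/* Operation blockers attach a human-readable reason why a category of
 * operations is currently forbidden on a BDS.  A typical user, such as a
 * block job, might do roughly:
 *
 *     Error *blocker = NULL;
 *     error_setg(&blocker, "node is in use by ...");
 *     bdrv_op_block_all(bs, blocker);
 *     ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */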
5419 struct BdrvOpBlocker {
5420     Error *reason;
5421     QLIST_ENTRY(BdrvOpBlocker) list;
5422 };
5423 
5424 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5425 {
5426     BdrvOpBlocker *blocker;
5427     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5428     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5429         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5430         if (errp) {
5431             error_setg(errp, "Device '%s' is busy: %s",
5432                        bdrv_get_device_name(bs),
5433                        error_get_pretty(blocker->reason));
5434         }
5435         return true;
5436     }
5437     return false;
5438 }
5439 
5440 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5441 {
5442     BdrvOpBlocker *blocker;
5443     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5444 
5445     blocker = g_new0(BdrvOpBlocker, 1);
5446     blocker->reason = reason;
5447     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5448 }
5449 
5450 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5451 {
5452     BdrvOpBlocker *blocker, *next;
5453     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5454     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5455         if (blocker->reason == reason) {
5456             QLIST_REMOVE(blocker, list);
5457             g_free(blocker);
5458         }
5459     }
5460 }
5461 
5462 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5463 {
5464     int i;
5465     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5466         bdrv_op_block(bs, i, reason);
5467     }
5468 }
5469 
5470 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5471 {
5472     int i;
5473     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5474         bdrv_op_unblock(bs, i, reason);
5475     }
5476 }
5477 
5478 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5479 {
5480     int i;
5481 
5482     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5483         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5484             return false;
5485         }
5486     }
5487     return true;
5488 }
5489 
5490 void bdrv_iostatus_enable(BlockDriverState *bs)
5491 {
5492     bs->iostatus_enabled = true;
5493     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5494 }
5495 
5496 /* The I/O status is only enabled if the drive explicitly
5497  * enables it _and_ the VM is configured to stop on errors */
5498 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5499 {
5500     return (bs->iostatus_enabled &&
5501            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5502             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5503             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5504 }
5505 
5506 void bdrv_iostatus_disable(BlockDriverState *bs)
5507 {
5508     bs->iostatus_enabled = false;
5509 }
5510 
5511 void bdrv_iostatus_reset(BlockDriverState *bs)
5512 {
5513     if (bdrv_iostatus_is_enabled(bs)) {
5514         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5515         if (bs->job) {
5516             block_job_iostatus_reset(bs->job);
5517         }
5518     }
5519 }
5520 
5521 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5522 {
5523     assert(bdrv_iostatus_is_enabled(bs));
5524     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5525         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5526                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5527     }
5528 }
5529 
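/* Create a new image file.  This is the backend of "qemu-img create"; for
 * instance, something like
 *
 *     qemu-img create -f qcow2 -b base.qcow2 overlay.qcow2
 *
 * ends up here with fmt = "qcow2", base_filename = "base.qcow2" and
 * filename = "overlay.qcow2" (paths shown are illustrative).
 */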
5530 void bdrv_img_create(const char *filename, const char *fmt,
5531                      const char *base_filename, const char *base_fmt,
5532                      char *options, uint64_t img_size, int flags,
5533                      Error **errp, bool quiet)
5534 {
5535     QemuOptsList *create_opts = NULL;
5536     QemuOpts *opts = NULL;
5537     const char *backing_fmt, *backing_file;
5538     int64_t size;
5539     BlockDriver *drv, *proto_drv;
5540     BlockDriver *backing_drv = NULL;
5541     Error *local_err = NULL;
5542     int ret = 0;
5543 
5544     /* Find driver and parse its options */
5545     drv = bdrv_find_format(fmt);
5546     if (!drv) {
5547         error_setg(errp, "Unknown file format '%s'", fmt);
5548         return;
5549     }
5550 
5551     proto_drv = bdrv_find_protocol(filename, true);
5552     if (!proto_drv) {
5553         error_setg(errp, "Unknown protocol '%s'", filename);
5554         return;
5555     }
5556 
5557     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5558     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5559 
5560     /* Create parameter list with default values */
5561     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5562     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5563 
5564     /* Parse -o options */
5565     if (options) {
5566         if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5567             error_setg(errp, "Invalid options for file format '%s'", fmt);
5568             goto out;
5569         }
5570     }
5571 
5572     if (base_filename) {
5573         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5574             error_setg(errp, "Backing file not supported for file format '%s'",
5575                        fmt);
5576             goto out;
5577         }
5578     }
5579 
5580     if (base_fmt) {
5581         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5582             error_setg(errp, "Backing file format not supported for file "
5583                              "format '%s'", fmt);
5584             goto out;
5585         }
5586     }
5587 
5588     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5589     if (backing_file) {
5590         if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
5593             goto out;
5594         }
5595     }
5596 
5597     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5598     if (backing_fmt) {
5599         backing_drv = bdrv_find_format(backing_fmt);
5600         if (!backing_drv) {
5601             error_setg(errp, "Unknown backing file format '%s'",
5602                        backing_fmt);
5603             goto out;
5604         }
5605     }
5606 
    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
5609     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5610     if (size == -1) {
5611         if (backing_file) {
5612             BlockDriverState *bs;
5613             int64_t size;
5614             int back_flags;
5615 
5616             /* backing files always opened read-only */
5617             back_flags =
5618                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5619 
5620             bs = NULL;
5621             ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5622                             backing_drv, &local_err);
5623             if (ret < 0) {
5624                 goto out;
5625             }
5626             size = bdrv_getlength(bs);
5627             if (size < 0) {
5628                 error_setg_errno(errp, -size, "Could not get size of '%s'",
5629                                  backing_file);
5630                 bdrv_unref(bs);
5631                 goto out;
5632             }
5633 
5634             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5635 
5636             bdrv_unref(bs);
5637         } else {
5638             error_setg(errp, "Image creation needs a size parameter");
5639             goto out;
5640         }
5641     }
5642 
5643     if (!quiet) {
5644         printf("Formatting '%s', fmt=%s ", filename, fmt);
5645         qemu_opts_print(opts);
5646         puts("");
5647     }
5648 
5649     ret = bdrv_create(drv, filename, opts, &local_err);
5650 
5651     if (ret == -EFBIG) {
5652         /* This is generally a better message than whatever the driver would
5653          * deliver (especially because of the cluster_size_hint), since that
5654          * is most probably not much different from "image too large". */
5655         const char *cluster_size_hint = "";
5656         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5657             cluster_size_hint = " (try using a larger cluster size)";
5658         }
5659         error_setg(errp, "The image size is too large for file format '%s'"
5660                    "%s", fmt, cluster_size_hint);
5661         error_free(local_err);
5662         local_err = NULL;
5663     }
5664 
5665 out:
5666     qemu_opts_del(opts);
5667     qemu_opts_free(create_opts);
5668     if (local_err) {
5669         error_propagate(errp, local_err);
5670     }
5671 }
5672 
5673 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5674 {
5675     return bs->aio_context;
5676 }
5677 
5678 void bdrv_detach_aio_context(BlockDriverState *bs)
5679 {
5680     BdrvAioNotifier *baf;
5681 
5682     if (!bs->drv) {
5683         return;
5684     }
5685 
5686     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5687         baf->detach_aio_context(baf->opaque);
5688     }
5689 
5690     if (bs->io_limits_enabled) {
5691         throttle_detach_aio_context(&bs->throttle_state);
5692     }
5693     if (bs->drv->bdrv_detach_aio_context) {
5694         bs->drv->bdrv_detach_aio_context(bs);
5695     }
5696     if (bs->file) {
5697         bdrv_detach_aio_context(bs->file);
5698     }
5699     if (bs->backing_hd) {
5700         bdrv_detach_aio_context(bs->backing_hd);
5701     }
5702 
5703     bs->aio_context = NULL;
5704 }
5705 
5706 void bdrv_attach_aio_context(BlockDriverState *bs,
5707                              AioContext *new_context)
5708 {
5709     BdrvAioNotifier *ban;
5710 
5711     if (!bs->drv) {
5712         return;
5713     }
5714 
5715     bs->aio_context = new_context;
5716 
5717     if (bs->backing_hd) {
5718         bdrv_attach_aio_context(bs->backing_hd, new_context);
5719     }
5720     if (bs->file) {
5721         bdrv_attach_aio_context(bs->file, new_context);
5722     }
5723     if (bs->drv->bdrv_attach_aio_context) {
5724         bs->drv->bdrv_attach_aio_context(bs, new_context);
5725     }
5726     if (bs->io_limits_enabled) {
5727         throttle_attach_aio_context(&bs->throttle_state, new_context);
5728     }
5729 
5730     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5731         ban->attached_aio_context(new_context, ban->opaque);
5732     }
5733 }
5734 
5735 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5736 {
5737     bdrv_drain_all(); /* ensure there are no in-flight requests */
5738 
5739     bdrv_detach_aio_context(bs);
5740 
5741     /* This function executes in the old AioContext so acquire the new one in
5742      * case it runs in a different thread.
5743      */
5744     aio_context_acquire(new_context);
5745     bdrv_attach_aio_context(bs, new_context);
5746     aio_context_release(new_context);
5747 }
5748 
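/* Register a pair of callbacks to run when bs switches AioContext.  A device
 * model that caches the context might register, with hypothetical callback
 * names:
 *
 *     bdrv_add_aio_context_notifier(bs, my_attach_cb, my_detach_cb, dev);
 *
 * bdrv_remove_aio_context_notifier() must later be called with the exact
 * same (attached, detach, opaque) triple to unregister.
 */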
5749 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5750         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5751         void (*detach_aio_context)(void *opaque), void *opaque)
5752 {
5753     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5754     *ban = (BdrvAioNotifier){
5755         .attached_aio_context = attached_aio_context,
5756         .detach_aio_context   = detach_aio_context,
5757         .opaque               = opaque
5758     };
5759 
5760     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5761 }
5762 
5763 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5764                                       void (*attached_aio_context)(AioContext *,
5765                                                                    void *),
5766                                       void (*detach_aio_context)(void *),
5767                                       void *opaque)
5768 {
5769     BdrvAioNotifier *ban, *ban_next;
5770 
5771     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5772         if (ban->attached_aio_context == attached_aio_context &&
5773             ban->detach_aio_context   == detach_aio_context   &&
5774             ban->opaque               == opaque)
5775         {
5776             QLIST_REMOVE(ban, list);
5777             g_free(ban);
5778 
5779             return;
5780         }
5781     }
5782 
5783     abort();
5784 }
5785 
5786 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5787                                     NotifierWithReturn *notifier)
5788 {
5789     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5790 }
5791 
5792 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5793                        BlockDriverAmendStatusCB *status_cb)
5794 {
5795     if (!bs->drv->bdrv_amend_options) {
5796         return -ENOTSUP;
5797     }
5798     return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5799 }
5800 
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test whether the given bs is the candidate or to recurse
 * further into the node graph.
5805  */
5806 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5807                                       BlockDriverState *candidate)
5808 {
    /* return false if basic checks fail */
5810     if (!bs || !bs->drv) {
5811         return false;
5812     }
5813 
    /* the code reached a non-block-filter driver -> check whether bs is the
     * same as the candidate. This is the recursion's termination condition.
5816      */
5817     if (!bs->drv->is_filter) {
5818         return bs == candidate;
5819     }
5820     /* Down this path the driver is a block filter driver */
5821 
    /* If the block filter recursion method is defined, use it to recurse down
5823      * the node graph.
5824      */
5825     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5826         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5827     }
5828 
    /* the driver is a block filter but does not allow recursing further
     * -> return false */
5831     return false;
5832 }
5833 
/* This function checks whether the candidate is the first non-filter bs down
 * its bs chain. Since we don't have pointers to parents, it explores all bs
 * chains from the top. Some filters can choose not to pass down the recursion.
5837  */
5838 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5839 {
5840     BlockDriverState *bs;
5841 
5842     /* walk down the bs forest recursively */
5843     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5844         bool perm;
5845 
5846         /* try to recurse in this top level bs */
5847         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5848 
        /* candidate is the first non-filter */
5850         if (perm) {
5851             return true;
5852         }
5853     }
5854 
5855     return false;
5856 }
5857 
5858 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5859 {
5860     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5861     AioContext *aio_context;
5862 
5863     if (!to_replace_bs) {
5864         error_setg(errp, "Node name '%s' not found", node_name);
5865         return NULL;
5866     }
5867 
5868     aio_context = bdrv_get_aio_context(to_replace_bs);
5869     aio_context_acquire(aio_context);
5870 
5871     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5872         to_replace_bs = NULL;
5873         goto out;
5874     }
5875 
    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
5880      */
5881     if (!bdrv_is_first_non_filter(to_replace_bs)) {
        error_setg(errp, "Only the topmost non-filter can be replaced");
5883         to_replace_bs = NULL;
5884         goto out;
5885     }
5886 
5887 out:
5888     aio_context_release(aio_context);
5889     return to_replace_bs;
5890 }
5891 
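/* io_plug/io_unplug let a caller batch up several queued requests and have
 * the driver submit them to the host in one go.  The expected use is,
 * roughly:
 *
 *     bdrv_io_plug(bs);
 *     ... issue a number of aio requests ...
 *     bdrv_io_unplug(bs);
 */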
5892 void bdrv_io_plug(BlockDriverState *bs)
5893 {
5894     BlockDriver *drv = bs->drv;
5895     if (drv && drv->bdrv_io_plug) {
5896         drv->bdrv_io_plug(bs);
5897     } else if (bs->file) {
5898         bdrv_io_plug(bs->file);
5899     }
5900 }
5901 
5902 void bdrv_io_unplug(BlockDriverState *bs)
5903 {
5904     BlockDriver *drv = bs->drv;
5905     if (drv && drv->bdrv_io_unplug) {
5906         drv->bdrv_io_unplug(bs);
5907     } else if (bs->file) {
5908         bdrv_io_unplug(bs->file);
5909     }
5910 }
5911 
5912 void bdrv_flush_io_queue(BlockDriverState *bs)
5913 {
5914     BlockDriver *drv = bs->drv;
5915     if (drv && drv->bdrv_flush_io_queue) {
5916         drv->bdrv_flush_io_queue(bs);
5917     } else if (bs->file) {
5918         bdrv_flush_io_queue(bs->file);
5919     }
5920 }
5921 
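/* Copy this node's own driver-specific options from bs->options into d.
 * Dotted keys (e.g. "file.filename", which belongs to a child node) and
 * "node-name" are skipped.  Returns true if at least one option was copied.
 * (The key names above are illustrative.)
 */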
5922 static bool append_open_options(QDict *d, BlockDriverState *bs)
5923 {
5924     const QDictEntry *entry;
5925     bool found_any = false;
5926 
5927     for (entry = qdict_first(bs->options); entry;
5928          entry = qdict_next(bs->options, entry))
5929     {
5930         /* Only take options for this level and exclude all non-driver-specific
5931          * options */
5932         if (!strchr(qdict_entry_key(entry), '.') &&
5933             strcmp(qdict_entry_key(entry), "node-name"))
5934         {
5935             qobject_incref(qdict_entry_value(entry));
5936             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5937             found_any = true;
5938         }
5939     }
5940 
5941     return found_any;
5942 }
5943 
5944 /* Updates the following BDS fields:
5945  *  - exact_filename: A filename which may be used for opening a block device
5946  *                    which (mostly) equals the given BDS (even without any
5947  *                    other options; so reading and writing must return the same
5948  *                    results, but caching etc. may be different)
5949  *  - full_open_options: Options which, when given when opening a block device
5950  *                       (without a filename), result in a BDS (mostly)
5951  *                       equalling the given one
5952  *  - filename: If exact_filename is set, it is copied here. Otherwise,
5953  *              full_open_options is converted to a JSON object, prefixed with
5954  *              "json:" (for use through the JSON pseudo protocol) and put here.
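 *              The resulting pseudo-filename might look roughly like
 *                  json:{"driver": "qcow2", "file": {...}} (illustrative).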
5955  */
5956 void bdrv_refresh_filename(BlockDriverState *bs)
5957 {
5958     BlockDriver *drv = bs->drv;
5959     QDict *opts;
5960 
5961     if (!drv) {
5962         return;
5963     }
5964 
5965     /* This BDS's file name will most probably depend on its file's name, so
5966      * refresh that first */
5967     if (bs->file) {
5968         bdrv_refresh_filename(bs->file);
5969     }
5970 
5971     if (drv->bdrv_refresh_filename) {
5972         /* Obsolete information is of no use here, so drop the old file name
5973          * information before refreshing it */
5974         bs->exact_filename[0] = '\0';
5975         if (bs->full_open_options) {
5976             QDECREF(bs->full_open_options);
5977             bs->full_open_options = NULL;
5978         }
5979 
5980         drv->bdrv_refresh_filename(bs);
5981     } else if (bs->file) {
5982         /* Try to reconstruct valid information from the underlying file */
5983         bool has_open_options;
5984 
5985         bs->exact_filename[0] = '\0';
5986         if (bs->full_open_options) {
5987             QDECREF(bs->full_open_options);
5988             bs->full_open_options = NULL;
5989         }
5990 
5991         opts = qdict_new();
5992         has_open_options = append_open_options(opts, bs);
5993 
5994         /* If no specific options have been given for this BDS, the filename of
5995          * the underlying file should suffice for this one as well */
5996         if (bs->file->exact_filename[0] && !has_open_options) {
5997             strcpy(bs->exact_filename, bs->file->exact_filename);
5998         }
5999         /* Reconstructing the full options QDict is simple for most format block
6000          * drivers, as long as the full options are known for the underlying
6001          * file BDS. The full options QDict of that file BDS should somehow
6002          * contain a representation of the filename, therefore the following
6003          * suffices without querying the (exact_)filename of this BDS. */
6004         if (bs->file->full_open_options) {
6005             qdict_put_obj(opts, "driver",
6006                           QOBJECT(qstring_from_str(drv->format_name)));
6007             QINCREF(bs->file->full_open_options);
6008             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6009 
6010             bs->full_open_options = opts;
6011         } else {
6012             QDECREF(opts);
6013         }
6014     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6015         /* There is no underlying file BDS (at least referenced by BDS.file),
6016          * so the full options QDict should be equal to the options given
6017          * specifically for this block device when it was opened (plus the
6018          * driver specification).
6019          * Because those options don't change, there is no need to update
6020          * full_open_options when it's already set. */
6021 
6022         opts = qdict_new();
6023         append_open_options(opts, bs);
6024         qdict_put_obj(opts, "driver",
6025                       QOBJECT(qstring_from_str(drv->format_name)));
6026 
6027         if (bs->exact_filename[0]) {
6028             /* This may not work for all block protocol drivers (some may
6029              * require this filename to be parsed), but we have to find some
6030              * default solution here, so just include it. If some block driver
6031              * does not support pure options without any filename at all or
6032              * needs some special format of the options QDict, it needs to
6033              * implement the driver-specific bdrv_refresh_filename() function.
6034              */
6035             qdict_put_obj(opts, "filename",
6036                           QOBJECT(qstring_from_str(bs->exact_filename)));
6037         }
6038 
6039         bs->full_open_options = opts;
6040     }
6041 
6042     if (bs->exact_filename[0]) {
6043         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6044     } else if (bs->full_open_options) {
6045         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6046         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6047                  qstring_get_str(json));
6048         QDECREF(json);
6049     }
6050 }
6051 
/* The purpose of this accessor function is to allow the device models to
 * access the BlockAcctStats structure embedded inside a BlockDriverState
 * without being aware of the BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure is moved inside the
 * device models.
6057  */
6058 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6059 {
6060     return &bs->stats;
6061 }
6062