/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* must be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
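
/*
 * Example (illustrative sketch, not called from this file): the expected
 * call order is to enable the limits first and then configure them.  The
 * bucket index and field names below are taken from the throttling API in
 * include/qemu/throttle.h and are assumptions about that header, not
 * definitions made here:
 *
 *     ThrottleConfig cfg = { 0 };
 *
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */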

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or if any request of this type is already queued,
     * throttle this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
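
/*
 * Informal examples: path_has_protocol("nbd://localhost:10809") and
 * path_has_protocol("fat:floppy:/dir") return non-zero because a ':'
 * appears before any path separator, while path_has_protocol("a.img")
 * and path_has_protocol("/images/disk.qcow2") return 0.  On Windows,
 * "c:\disk.img" is caught by the drive-prefix check above and is not
 * mistaken for a one-letter protocol.
 */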

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
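
/*
 * A worked example (informal): with base_path "/images/base.qcow2" and
 * filename "backing.qcow2", everything up to and including the last '/'
 * of base_path is kept, giving "/images/backing.qcow2".  For a protocol
 * base_path without a directory part, such as "nbd:exportname", the
 * prefix up to the ':' wins instead, giving "nbd:backing.qcow2".
 */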

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
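
/*
 * A typical caller looks like the following sketch (this mirrors what
 * bdrv_append_temp_snapshot() below does for its qcow2 overlay; the image
 * size and file name are made up for the example):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     Error *err = NULL;
 *
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     if (bdrv_create(drv, "/tmp/test.qcow2", opts, &err) < 0) {
 *         // report and free err
 *     }
 *     qemu_opts_del(opts);
 */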

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
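
/*
 * Usage sketch (this is how bdrv_append_temp_snapshot() below uses it):
 *
 *     char *tmp_filename = g_malloc0(PATH_MAX + 1);
 *     int ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
 *
 *     if (ret < 0) {
 *         // -ret is an errno value (or a Windows error code on _WIN32)
 *     }
 */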

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
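
/*
 * Informal examples: "nbd:unix:/tmp/sock" resolves to the driver whose
 * protocol_name is "nbd", a plain path such as "/tmp/disk.img" falls back
 * to the "file" driver, and "/dev/cdrom" is claimed first by a host
 * device driver through its bdrv_probe_device() score, per the XXX note
 * above.
 */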

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value.
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
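
/*
 * Example (sketch): translating the command-line values cache=none and
 * discard=unmap into open flags before an open:
 *
 *     int flags = 0;
 *
 *     if (bdrv_parse_cache_flags("none", &flags) ||
 *         bdrv_parse_discard_flags("unmap", &flags)) {
 *         // invalid mode string
 *     }
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_UNMAP
 */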

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}
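
/*
 * A worked example (informal): a top layer opened with
 * BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_COPY_ON_READ hands its bs->file
 * BDRV_O_RDWR | BDRV_O_PROTOCOL | BDRV_O_CACHE_WB | BDRV_O_UNMAP via
 * bdrv_inherited_flags(), while bdrv_backing_flags() strips BDRV_O_RDWR,
 * BDRV_O_COPY_ON_READ and BDRV_O_SNAPSHOT so the backing file is opened
 * read-only.
 */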

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
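
/*
 * For example (informal), the pseudo-protocol filename
 *
 *     json:{"driver": "qcow2", "file.filename": "/tmp/test.qcow2"}
 *
 * yields the flattened option entries driver=qcow2 and
 * file.filename=/tmp/test.qcow2, exactly as if they had been passed in
 * the options QDict directly.
 */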

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to the check in bdrv_commit() */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
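
/*
 * Usage sketch (this mirrors how bdrv_open() below opens its protocol
 * layer):
 *
 *     BlockDriverState *file = NULL;
 *     ret = bdrv_open_image(&file, filename, options, "file",
 *                           bdrv_inherited_flags(flags), true, &err);
 *
 * All "file." options are consumed from @options; alternatively, if a
 * "file" reference entry is present, the existing named BDS is looked up
 * instead of opening a new one.
 */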

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
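
/*
 * Minimal caller sketch (format probing, no options QDict, read-write;
 * the file name is made up for the example):
 *
 *     BlockDriverState *bs = NULL;
 *     Error *err = NULL;
 *
 *     if (bdrv_open(&bs, "/tmp/test.qcow2", NULL, NULL, BDRV_O_RDWR,
 *                   NULL, &err) < 0) {
 *         // report and free err
 *     }
 *     ...
 *     bdrv_unref(bs);
 */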
1574 
1575 typedef struct BlockReopenQueueEntry {
1576      bool prepared;
1577      BDRVReopenState state;
1578      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1579 } BlockReopenQueueEntry;
1580 
1581 /*
1582  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1583  * reopen of multiple devices.
1584  *
1585  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1586  * already performed, or alternatively may be NULL a new BlockReopenQueue will
1587  * be created and initialized. This newly created BlockReopenQueue should be
1588  * passed back in for subsequent calls that are intended to be of the same
1589  * atomic 'set'.
1590  *
1591  * bs is the BlockDriverState to add to the reopen queue.
1592  *
1593  * flags contains the open flags for the associated bs
1594  *
1595  * returns a pointer to bs_queue, which is either the newly allocated
1596  * bs_queue, or the existing bs_queue being used.
1597  *
1598  */
1599 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1600                                     BlockDriverState *bs, int flags)
1601 {
1602     assert(bs != NULL);
1603 
1604     BlockReopenQueueEntry *bs_entry;
1605     if (bs_queue == NULL) {
1606         bs_queue = g_new0(BlockReopenQueue, 1);
1607         QSIMPLEQ_INIT(bs_queue);
1608     }
1609 
1610     /* bdrv_open() masks this flag out */
1611     flags &= ~BDRV_O_PROTOCOL;
1612 
1613     if (bs->file) {
1614         bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1615     }
1616 
1617     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1618     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1619 
1620     bs_entry->state.bs = bs;
1621     bs_entry->state.flags = flags;
1622 
1623     return bs_queue;
1624 }
1625 
1626 /*
1627  * Reopen multiple BlockDriverStates atomically & transactionally.
1628  *
1629  * The queue passed in (bs_queue) must have been built up previous
1630  * via bdrv_reopen_queue().
1631  *
1632  * Reopens all BDS specified in the queue, with the appropriate
1633  * flags.  All devices are prepared for reopen, and failure of any
1634  * device will cause all device changes to be abandonded, and intermediate
1635  * data cleaned up.
1636  *
1637  * If all devices prepare successfully, then the changes are committed
1638  * to all devices.
1639  *
1640  */
1641 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1642 {
1643     int ret = -1;
1644     BlockReopenQueueEntry *bs_entry, *next;
1645     Error *local_err = NULL;
1646 
1647     assert(bs_queue != NULL);
1648 
1649     bdrv_drain_all();
1650 
1651     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1652         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1653             error_propagate(errp, local_err);
1654             goto cleanup;
1655         }
1656         bs_entry->prepared = true;
1657     }
1658 
1659     /* If we reach this point, we have success and just need to apply the
1660      * changes
1661      */
1662     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1663         bdrv_reopen_commit(&bs_entry->state);
1664     }
1665 
1666     ret = 0;
1667 
1668 cleanup:
1669     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1670         if (ret && bs_entry->prepared) {
1671             bdrv_reopen_abort(&bs_entry->state);
1672         }
1673         g_free(bs_entry);
1674     }
1675     g_free(bs_queue);
1676     return ret;
1677 }
1678 
1679 
1680 /* Reopen a single BlockDriverState with the specified flags. */
1681 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1682 {
1683     int ret = -1;
1684     Error *local_err = NULL;
1685     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1686 
1687     ret = bdrv_reopen_multiple(queue, &local_err);
1688     if (local_err != NULL) {
1689         error_propagate(errp, local_err);
1690     }
1691     return ret;
1692 }
1693 
1694 
1695 /*
1696  * Prepares a BlockDriverState for reopen. All changes are staged in the
1697  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1698  * the block driver layer .bdrv_reopen_prepare()
1699  *
1700  * bs is the BlockDriverState to reopen
1701  * flags are the new open flags
1702  * queue is the reopen queue
1703  *
1704  * Returns 0 on success, non-zero on error.  On error errp will be set
1705  * as well.
1706  *
1707  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1708  * It is the responsibility of the caller to then call the abort() or
1709  * commit() for any other BDS that have been left in a prepare() state
1710  *
1711  */
1712 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1713                         Error **errp)
1714 {
1715     int ret = -1;
1716     Error *local_err = NULL;
1717     BlockDriver *drv;
1718 
1719     assert(reopen_state != NULL);
1720     assert(reopen_state->bs->drv != NULL);
1721     drv = reopen_state->bs->drv;
1722 
1723     /* if we are to stay read-only, do not allow permission change
1724      * to r/w */
1725     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1726         reopen_state->flags & BDRV_O_RDWR) {
1727         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1728                   bdrv_get_device_name(reopen_state->bs));
1729         goto error;
1730     }
1731 
1732 
1733     ret = bdrv_flush(reopen_state->bs);
1734     if (ret) {
1735         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1736                   strerror(-ret));
1737         goto error;
1738     }
1739 
1740     if (drv->bdrv_reopen_prepare) {
1741         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1742         if (ret) {
1743             if (local_err != NULL) {
1744                 error_propagate(errp, local_err);
1745             } else {
1746                 error_setg(errp, "failed while preparing to reopen image '%s'",
1747                            reopen_state->bs->filename);
1748             }
1749             goto error;
1750         }
1751     } else {
1752         /* It is currently mandatory to have a bdrv_reopen_prepare()
1753          * handler for each supported drv. */
1754         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1755                   drv->format_name, bdrv_get_device_name(reopen_state->bs),
1756                  "reopening of file");
1757         ret = -1;
1758         goto error;
1759     }
1760 
1761     ret = 0;
1762 
1763 error:
1764     return ret;
1765 }
1766 
1767 /*
1768  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1769  * makes them final by swapping the staging BlockDriverState contents into
1770  * the active BlockDriverState contents.
1771  */
1772 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1773 {
1774     BlockDriver *drv;
1775 
1776     assert(reopen_state != NULL);
1777     drv = reopen_state->bs->drv;
1778     assert(drv != NULL);
1779 
1780     /* If there are any driver level actions to take */
1781     if (drv->bdrv_reopen_commit) {
1782         drv->bdrv_reopen_commit(reopen_state);
1783     }
1784 
1785     /* set BDS specific flags now */
1786     reopen_state->bs->open_flags         = reopen_state->flags;
1787     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1788                                               BDRV_O_CACHE_WB);
1789     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1790 
1791     bdrv_refresh_limits(reopen_state->bs, NULL);
1792 }
1793 
1794 /*
1795  * Abort the reopen, and delete and free the staged changes in
1796  * reopen_state
1797  */
1798 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1799 {
1800     BlockDriver *drv;
1801 
1802     assert(reopen_state != NULL);
1803     drv = reopen_state->bs->drv;
1804     assert(drv != NULL);
1805 
1806     if (drv->bdrv_reopen_abort) {
1807         drv->bdrv_reopen_abort(reopen_state);
1808     }
1809 }
1810 
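/*
 * Example: what the three callbacks above look like from a format driver's
 * side.  A schematic sketch only -- the ExampleReopenState type and the
 * staged "resource" are illustrative, not a real driver.  All changes are
 * staged in reopen_state->opaque until commit or abort.
 */
typedef struct ExampleReopenState {
    int staged_fd;   /* resource acquired in prepare, activated in commit */
} ExampleReopenState;

static int example_reopen_prepare(BDRVReopenState *reopen_state,
                                  BlockReopenQueue *queue, Error **errp)
{
    ExampleReopenState *s = g_new0(ExampleReopenState, 1);

    /* Acquire new resources here; nothing visible changes yet. */
    s->staged_fd = -1;
    reopen_state->opaque = s;
    return 0;
}

static void example_reopen_commit(BDRVReopenState *reopen_state)
{
    /* Swap the staged resources into the active state, free the staging. */
    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
}

static void example_reopen_abort(BDRVReopenState *reopen_state)
{
    /* Release the staged resources; the active state is left untouched. */
    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
}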
1811 
1812 void bdrv_close(BlockDriverState *bs)
1813 {
1814     BdrvAioNotifier *ban, *ban_next;
1815 
1816     if (bs->job) {
1817         block_job_cancel_sync(bs->job);
1818     }
1819     bdrv_drain_all(); /* complete I/O */
1820     bdrv_flush(bs);
1821     bdrv_drain_all(); /* in case flush left pending I/O */
1822     notifier_list_notify(&bs->close_notifiers, bs);
1823 
1824     if (bs->drv) {
1825         if (bs->backing_hd) {
1826             BlockDriverState *backing_hd = bs->backing_hd;
1827             bdrv_set_backing_hd(bs, NULL);
1828             bdrv_unref(backing_hd);
1829         }
1830         bs->drv->bdrv_close(bs);
1831         g_free(bs->opaque);
1832         bs->opaque = NULL;
1833         bs->drv = NULL;
1834         bs->copy_on_read = 0;
1835         bs->backing_file[0] = '\0';
1836         bs->backing_format[0] = '\0';
1837         bs->total_sectors = 0;
1838         bs->encrypted = 0;
1839         bs->valid_key = 0;
1840         bs->sg = 0;
1841         bs->growable = 0;
1842         bs->zero_beyond_eof = false;
1843         QDECREF(bs->options);
1844         bs->options = NULL;
1845         QDECREF(bs->full_open_options);
1846         bs->full_open_options = NULL;
1847 
1848         if (bs->file != NULL) {
1849             bdrv_unref(bs->file);
1850             bs->file = NULL;
1851         }
1852     }
1853 
1854     if (bs->blk) {
1855         blk_dev_change_media_cb(bs->blk, false);
1856     }
1857 
1858     /* throttling disk I/O limits */
1859     if (bs->io_limits_enabled) {
1860         bdrv_io_limits_disable(bs);
1861     }
1862 
1863     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1864         g_free(ban);
1865     }
1866     QLIST_INIT(&bs->aio_notifiers);
1867 }
1868 
1869 void bdrv_close_all(void)
1870 {
1871     BlockDriverState *bs;
1872 
1873     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1874         AioContext *aio_context = bdrv_get_aio_context(bs);
1875 
1876         aio_context_acquire(aio_context);
1877         bdrv_close(bs);
1878         aio_context_release(aio_context);
1879     }
1880 }
1881 
1882 /* Check if any requests are in-flight (including throttled requests) */
1883 static bool bdrv_requests_pending(BlockDriverState *bs)
1884 {
1885     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1886         return true;
1887     }
1888     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1889         return true;
1890     }
1891     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1892         return true;
1893     }
1894     if (bs->file && bdrv_requests_pending(bs->file)) {
1895         return true;
1896     }
1897     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1898         return true;
1899     }
1900     return false;
1901 }
1902 
1903 /*
1904  * Wait for pending requests to complete across all BlockDriverStates
1905  *
1906  * This function does not flush data to disk; use bdrv_flush_all() for that
1907  * after calling this function.
1908  *
1909  * Note that completion of an asynchronous I/O operation can trigger any
1910  * number of other I/O operations on other devices---for example a coroutine
1911  * can be arbitrarily complex, and a constant flow of I/O may continue until
1912  * coroutine is complete.  Because of this, it is not possible to have a
1913  * function to drain a single device's I/O queue.
1914  */
1915 void bdrv_drain_all(void)
1916 {
1917     /* Always run first iteration so any pending completion BHs run */
1918     bool busy = true;
1919     BlockDriverState *bs;
1920 
1921     while (busy) {
1922         busy = false;
1923 
1924         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1925             AioContext *aio_context = bdrv_get_aio_context(bs);
1926             bool bs_busy;
1927 
1928             aio_context_acquire(aio_context);
1929             bdrv_flush_io_queue(bs);
1930             bdrv_start_throttled_reqs(bs);
1931             bs_busy = bdrv_requests_pending(bs);
1932             bs_busy |= aio_poll(aio_context, bs_busy);
1933             aio_context_release(aio_context);
1934 
1935             busy |= bs_busy;
1936         }
1937     }
1938 }
1939 
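/*
 * Example: a typical quiesce sequence before a global state change.  A
 * sketch only; drain first so no request is in flight, then flush so the
 * completed writes are stable on disk.
 */
static int example_quiesce_all(void)
{
    bdrv_drain_all();          /* wait for all in-flight requests */
    return bdrv_flush_all();   /* then push completed writes to disk */
}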
1940 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
1941  * and graph_bdrv_states lists.
1942  * Also, NULL-terminate the node_name to prevent double removal. */
1943 void bdrv_make_anon(BlockDriverState *bs)
1944 {
1945     /*
1946      * Take care to remove bs from bdrv_states only when it's actually
1947      * in it.  Note that bs->device_list.tqe_prev is initially null,
1948      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
1949      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1950      * resetting it to null on remove.
1951      */
1952     if (bs->device_list.tqe_prev) {
1953         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1954         bs->device_list.tqe_prev = NULL;
1955     }
1956     if (bs->node_name[0] != '\0') {
1957         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1958     }
1959     bs->node_name[0] = '\0';
1960 }
1961 
1962 static void bdrv_rebind(BlockDriverState *bs)
1963 {
1964     if (bs->drv && bs->drv->bdrv_rebind) {
1965         bs->drv->bdrv_rebind(bs);
1966     }
1967 }
1968 
1969 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1970                                      BlockDriverState *bs_src)
1971 {
1972     /* move some fields that need to stay attached to the device */
1973 
1974     /* dev info */
1975     bs_dest->guest_block_size   = bs_src->guest_block_size;
1976     bs_dest->copy_on_read       = bs_src->copy_on_read;
1977 
1978     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1979 
1980     /* i/o throttled req */
1981     memcpy(&bs_dest->throttle_state,
1982            &bs_src->throttle_state,
1983            sizeof(ThrottleState));
1984     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1985     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1986     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1987 
1988     /* r/w error */
1989     bs_dest->on_read_error      = bs_src->on_read_error;
1990     bs_dest->on_write_error     = bs_src->on_write_error;
1991 
1992     /* i/o status */
1993     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1994     bs_dest->iostatus           = bs_src->iostatus;
1995 
1996     /* dirty bitmap */
1997     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1998 
1999     /* reference count */
2000     bs_dest->refcnt             = bs_src->refcnt;
2001 
2002     /* job */
2003     bs_dest->job                = bs_src->job;
2004 
2005     /* keep the same entry in bdrv_states */
2006     bs_dest->device_list = bs_src->device_list;
2007     bs_dest->blk = bs_src->blk;
2008 
2009     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2010            sizeof(bs_dest->op_blockers));
2011 }
2012 
2013 /*
2014  * Swap bs contents for two image chains while they are live,
2015  * while keeping required fields on the BlockDriverState that is
2016  * actually attached to a device.
2017  *
2018  * This will modify the BlockDriverState fields, and swap contents
2019  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2020  *
2021  * bs_new must not be attached to a BlockBackend.
2022  *
2023  * This function does not create any image files.
2024  */
2025 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2026 {
2027     BlockDriverState tmp;
2028 
2029     /* The code needs to swap the node_name but simply swapping node_list won't
2030      * work so first remove the nodes from the graph list, do the swap then
2031      * insert them back if needed.
2032      */
2033     if (bs_new->node_name[0] != '\0') {
2034         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2035     }
2036     if (bs_old->node_name[0] != '\0') {
2037         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2038     }
2039 
2040     /* bs_new must be unattached and shouldn't have anything fancy enabled */
2041     assert(!bs_new->blk);
2042     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2043     assert(bs_new->job == NULL);
2044     assert(bs_new->io_limits_enabled == false);
2045     assert(!throttle_have_timer(&bs_new->throttle_state));
2046 
2047     tmp = *bs_new;
2048     *bs_new = *bs_old;
2049     *bs_old = tmp;
2050 
2051     /* there are some fields that should not be swapped, move them back */
2052     bdrv_move_feature_fields(&tmp, bs_old);
2053     bdrv_move_feature_fields(bs_old, bs_new);
2054     bdrv_move_feature_fields(bs_new, &tmp);
2055 
2056     /* bs_new must remain unattached */
2057     assert(!bs_new->blk);
2058 
2059     /* Check a few fields that should remain attached to the device */
2060     assert(bs_new->job == NULL);
2061     assert(bs_new->io_limits_enabled == false);
2062     assert(!throttle_have_timer(&bs_new->throttle_state));
2063 
2064     /* insert the nodes back into the graph node list if needed */
2065     if (bs_new->node_name[0] != '\0') {
2066         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2067     }
2068     if (bs_old->node_name[0] != '\0') {
2069         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2070     }
2071 
2072     bdrv_rebind(bs_new);
2073     bdrv_rebind(bs_old);
2074 }
2075 
2076 /*
2077  * Add new bs contents at the top of an image chain while the chain is
2078  * live, while keeping required fields on the top layer.
2079  *
2080  * This will modify the BlockDriverState fields, and swap contents
2081  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2082  *
2083  * bs_new must not be attached to a BlockBackend.
2084  *
2085  * This function does not create any image files.
2086  */
2087 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2088 {
2089     bdrv_swap(bs_new, bs_top);
2090 
2091     /* After the swap, bs_new holds the old contents of bs_top, so it
2092      * becomes the new backing file of bs_top. */
2093     bdrv_set_backing_hd(bs_top, bs_new);
2094 }
2095 
2096 static void bdrv_delete(BlockDriverState *bs)
2097 {
2098     assert(!bs->job);
2099     assert(bdrv_op_blocker_is_empty(bs));
2100     assert(!bs->refcnt);
2101     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2102 
2103     bdrv_close(bs);
2104 
2105     /* remove from list, if necessary */
2106     bdrv_make_anon(bs);
2107 
2108     g_free(bs);
2109 }
2110 
2111 /*
2112  * Run consistency checks on an image
2113  *
2114  * Returns 0 if the check could be completed (it doesn't mean that the image is
2115  * free of errors) or -errno when an internal error occurred. The results of the
2116  * check are stored in res.
2117  */
2118 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2119 {
2120     if (bs->drv == NULL) {
2121         return -ENOMEDIUM;
2122     }
2123     if (bs->drv->bdrv_check == NULL) {
2124         return -ENOTSUP;
2125     }
2126 
2127     memset(res, 0, sizeof(*res));
2128     return bs->drv->bdrv_check(bs, res, fix);
2129 }
2130 
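/*
 * Example: how a caller distinguishes "the check could not run" from "the
 * check ran and found problems".  A sketch; mapping unrepaired corruption to
 * -EIO is this example's choice, not part of the API.
 */
static int example_check_and_repair(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS);

    if (ret < 0) {
        return ret;                     /* internal error, no result */
    }
    return res.corruptions ? -EIO : 0;  /* unrepaired corruption remains */
}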
2131 #define COMMIT_BUF_SECTORS 2048
2132 
2133 /* commit COW file into the raw image */
2134 int bdrv_commit(BlockDriverState *bs)
2135 {
2136     BlockDriver *drv = bs->drv;
2137     int64_t sector, total_sectors, length, backing_length;
2138     int n, ro, open_flags;
2139     int ret = 0;
2140     uint8_t *buf = NULL;
2141     char filename[PATH_MAX];
2142 
2143     if (!drv)
2144         return -ENOMEDIUM;
2145 
2146     if (!bs->backing_hd) {
2147         return -ENOTSUP;
2148     }
2149 
2150     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2151         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2152         return -EBUSY;
2153     }
2154 
2155     ro = bs->backing_hd->read_only;
2156     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2157     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2158     open_flags = bs->backing_hd->open_flags;
2159 
2160     if (ro) {
2161         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2162             return -EACCES;
2163         }
2164     }
2165 
2166     length = bdrv_getlength(bs);
2167     if (length < 0) {
2168         ret = length;
2169         goto ro_cleanup;
2170     }
2171 
2172     backing_length = bdrv_getlength(bs->backing_hd);
2173     if (backing_length < 0) {
2174         ret = backing_length;
2175         goto ro_cleanup;
2176     }
2177 
2178     /* If our top snapshot is larger than the backing file image,
2179      * grow the backing file image if possible.  If not possible,
2180      * we must return an error */
2181     if (length > backing_length) {
2182         ret = bdrv_truncate(bs->backing_hd, length);
2183         if (ret < 0) {
2184             goto ro_cleanup;
2185         }
2186     }
2187 
2188     total_sectors = length >> BDRV_SECTOR_BITS;
2189 
2190     /* qemu_try_blockalign() for bs will choose an alignment that works for
2191      * bs->backing_hd as well, so no need to compare the alignment manually. */
2192     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2193     if (buf == NULL) {
2194         ret = -ENOMEM;
2195         goto ro_cleanup;
2196     }
2197 
2198     for (sector = 0; sector < total_sectors; sector += n) {
2199         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2200         if (ret < 0) {
2201             goto ro_cleanup;
2202         }
2203         if (ret) {
2204             ret = bdrv_read(bs, sector, buf, n);
2205             if (ret < 0) {
2206                 goto ro_cleanup;
2207             }
2208 
2209             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2210             if (ret < 0) {
2211                 goto ro_cleanup;
2212             }
2213         }
2214     }
2215 
2216     if (drv->bdrv_make_empty) {
2217         ret = drv->bdrv_make_empty(bs);
2218         if (ret < 0) {
2219             goto ro_cleanup;
2220         }
2221         bdrv_flush(bs);
2222     }
2223 
2224     /*
2225      * Make sure all data we wrote to the backing device is actually
2226      * stable on disk.
2227      */
2228     if (bs->backing_hd) {
2229         bdrv_flush(bs->backing_hd);
2230     }
2231 
2232     ret = 0;
2233 ro_cleanup:
2234     qemu_vfree(buf);
2235 
2236     if (ro) {
2237         /* ignoring error return here */
2238         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2239     }
2240 
2241     return ret;
2242 }
2243 
2244 int bdrv_commit_all(void)
2245 {
2246     BlockDriverState *bs;
2247 
2248     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2249         AioContext *aio_context = bdrv_get_aio_context(bs);
2250 
2251         aio_context_acquire(aio_context);
2252         if (bs->drv && bs->backing_hd) {
2253             int ret = bdrv_commit(bs);
2254             if (ret < 0) {
2255                 aio_context_release(aio_context);
2256                 return ret;
2257             }
2258         }
2259         aio_context_release(aio_context);
2260     }
2261     return 0;
2262 }
2263 
2264 /**
2265  * Remove an active request from the tracked requests list
2266  *
2267  * This function should be called when a tracked request is completing.
2268  */
2269 static void tracked_request_end(BdrvTrackedRequest *req)
2270 {
2271     if (req->serialising) {
2272         req->bs->serialising_in_flight--;
2273     }
2274 
2275     QLIST_REMOVE(req, list);
2276     qemu_co_queue_restart_all(&req->wait_queue);
2277 }
2278 
2279 /**
2280  * Add an active request to the tracked requests list
2281  */
2282 static void tracked_request_begin(BdrvTrackedRequest *req,
2283                                   BlockDriverState *bs,
2284                                   int64_t offset,
2285                                   unsigned int bytes, bool is_write)
2286 {
2287     *req = (BdrvTrackedRequest){
2288         .bs = bs,
2289         .offset         = offset,
2290         .bytes          = bytes,
2291         .is_write       = is_write,
2292         .co             = qemu_coroutine_self(),
2293         .serialising    = false,
2294         .overlap_offset = offset,
2295         .overlap_bytes  = bytes,
2296     };
2297 
2298     qemu_co_queue_init(&req->wait_queue);
2299 
2300     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2301 }
2302 
2303 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2304 {
2305     int64_t overlap_offset = req->offset & ~(align - 1);
2306     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2307                                - overlap_offset;
2308 
2309     if (!req->serialising) {
2310         req->bs->serialising_in_flight++;
2311         req->serialising = true;
2312     }
2313 
2314     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2315     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2316 }
2317 
2318 /**
2319  * Round a region to cluster boundaries
2320  */
2321 void bdrv_round_to_clusters(BlockDriverState *bs,
2322                             int64_t sector_num, int nb_sectors,
2323                             int64_t *cluster_sector_num,
2324                             int *cluster_nb_sectors)
2325 {
2326     BlockDriverInfo bdi;
2327 
2328     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2329         *cluster_sector_num = sector_num;
2330         *cluster_nb_sectors = nb_sectors;
2331     } else {
2332         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2333         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2334         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2335                                             nb_sectors, c);
2336     }
2337 }
2338 
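/*
 * Worked example for the rounding above, assuming a 64 KiB cluster size
 * (128 sectors of 512 bytes): a request for sectors [130, 140) yields
 * c = 128, *cluster_sector_num = 128 and *cluster_nb_sectors = 128, i.e.
 * the containing cluster [128, 256).
 */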
2339 static int bdrv_get_cluster_size(BlockDriverState *bs)
2340 {
2341     BlockDriverInfo bdi;
2342     int ret;
2343 
2344     ret = bdrv_get_info(bs, &bdi);
2345     if (ret < 0 || bdi.cluster_size == 0) {
2346         return bs->request_alignment;
2347     } else {
2348         return bdi.cluster_size;
2349     }
2350 }
2351 
2352 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2353                                      int64_t offset, unsigned int bytes)
2354 {
2355     /*        aaaa   bbbb */
2356     if (offset >= req->overlap_offset + req->overlap_bytes) {
2357         return false;
2358     }
2359     /* bbbb   aaaa        */
2360     if (req->overlap_offset >= offset + bytes) {
2361         return false;
2362     }
2363     return true;
2364 }
2365 
2366 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2367 {
2368     BlockDriverState *bs = self->bs;
2369     BdrvTrackedRequest *req;
2370     bool retry;
2371     bool waited = false;
2372 
2373     if (!bs->serialising_in_flight) {
2374         return false;
2375     }
2376 
2377     do {
2378         retry = false;
2379         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2380             if (req == self || (!req->serialising && !self->serialising)) {
2381                 continue;
2382             }
2383             if (tracked_request_overlaps(req, self->overlap_offset,
2384                                          self->overlap_bytes))
2385             {
2386                 /* Hitting this means there was a reentrant request, for
2387                  * example, a block driver issuing nested requests.  This must
2388                  * never happen since it means deadlock.
2389                  */
2390                 assert(qemu_coroutine_self() != req->co);
2391 
2392                 /* If the request is already (indirectly) waiting for us, or
2393                  * will wait for us as soon as it wakes up, then just go on
2394                  * (instead of producing a deadlock in the former case). */
2395                 if (!req->waiting_for) {
2396                     self->waiting_for = req;
2397                     qemu_co_queue_wait(&req->wait_queue);
2398                     self->waiting_for = NULL;
2399                     retry = true;
2400                     waited = true;
2401                     break;
2402                 }
2403             }
2404         }
2405     } while (retry);
2406 
2407     return waited;
2408 }
2409 
2410 /*
2411  * Return values:
2412  * 0        - success
2413  * -EINVAL  - backing format specified, but no file
2414  * -ENOSPC  - can't update the backing file because no space is left in the
2415  *            image file header
2416  * -ENOTSUP - format driver doesn't support changing the backing file
2417  */
2418 int bdrv_change_backing_file(BlockDriverState *bs,
2419     const char *backing_file, const char *backing_fmt)
2420 {
2421     BlockDriver *drv = bs->drv;
2422     int ret;
2423 
2424     /* Backing file format doesn't make sense without a backing file */
2425     if (backing_fmt && !backing_file) {
2426         return -EINVAL;
2427     }
2428 
2429     if (drv->bdrv_change_backing_file != NULL) {
2430         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2431     } else {
2432         ret = -ENOTSUP;
2433     }
2434 
2435     if (ret == 0) {
2436         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2437         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2438     }
2439     return ret;
2440 }
2441 
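/*
 * Example: dropping the backing-file reference so the image becomes
 * standalone.  A sketch; it assumes the format driver accepts NULL to mean
 * "no backing file" (the pstrcpy calls above already treat NULL as "").
 */
static int example_drop_backing_reference(BlockDriverState *bs)
{
    return bdrv_change_backing_file(bs, NULL, NULL);
}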
2442 /*
2443  * Finds the image layer in the chain that has 'bs' as its backing file.
2444  *
2445  * active is the current topmost image.
2446  *
2447  * Returns NULL if bs is not found in active's image chain,
2448  * or if active == bs.
2449  *
2450  * Returns the bottommost base image if bs == NULL.
2451  */
2452 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2453                                     BlockDriverState *bs)
2454 {
2455     while (active && bs != active->backing_hd) {
2456         active = active->backing_hd;
2457     }
2458 
2459     return active;
2460 }
2461 
2462 /* Given a BDS, searches for the base layer. */
2463 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2464 {
2465     return bdrv_find_overlay(bs, NULL);
2466 }
2467 
2468 typedef struct BlkIntermediateStates {
2469     BlockDriverState *bs;
2470     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2471 } BlkIntermediateStates;
2472 
2473 
2474 /*
2475  * Drops images above 'base' up to and including 'top', and sets the image
2476  * above 'top' to have base as its backing file.
2477  *
2478  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2479  * information in 'bs' can be properly updated.
2480  *
2481  * E.g., this will convert the following chain:
2482  * bottom <- base <- intermediate <- top <- active
2483  *
2484  * to
2485  *
2486  * bottom <- base <- active
2487  *
2488  * It is allowed for bottom==base, in which case it converts:
2489  *
2490  * base <- intermediate <- top <- active
2491  *
2492  * to
2493  *
2494  * base <- active
2495  *
2496  * If backing_file_str is non-NULL, it will be used when modifying top's
2497  * overlay image metadata.
2498  *
2499  * Error conditions:
2500  *  if active == top, that is considered an error
2501  *
2502  */
2503 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2504                            BlockDriverState *base, const char *backing_file_str)
2505 {
2506     BlockDriverState *intermediate;
2507     BlockDriverState *base_bs = NULL;
2508     BlockDriverState *new_top_bs = NULL;
2509     BlkIntermediateStates *intermediate_state, *next;
2510     int ret = -EIO;
2511 
2512     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2513     QSIMPLEQ_INIT(&states_to_delete);
2514 
2515     if (!top->drv || !base->drv) {
2516         goto exit;
2517     }
2518 
2519     new_top_bs = bdrv_find_overlay(active, top);
2520 
2521     if (new_top_bs == NULL) {
2522         /* we could not find the image above 'top'; this is an error */
2523         goto exit;
2524     }
2525 
2526     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2527      * to do, no intermediate images */
2528     if (new_top_bs->backing_hd == base) {
2529         ret = 0;
2530         goto exit;
2531     }
2532 
2533     intermediate = top;
2534 
2535     /* now we will go down through the list, and add each BDS we find
2536      * into our deletion queue, until we hit the 'base'
2537      */
2538     while (intermediate) {
2539         intermediate_state = g_new0(BlkIntermediateStates, 1);
2540         intermediate_state->bs = intermediate;
2541         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2542 
2543         if (intermediate->backing_hd == base) {
2544             base_bs = intermediate->backing_hd;
2545             break;
2546         }
2547         intermediate = intermediate->backing_hd;
2548     }
2549     if (base_bs == NULL) {
2550         /* something went wrong; we did not end at the base.  Safely
2551          * unravel everything, and exit with an error */
2552         goto exit;
2553     }
2554 
2555     /* success - we can delete the intermediate states, and link top->base */
2556     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2557     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2558                                    base_bs->drv ? base_bs->drv->format_name : "");
2559     if (ret) {
2560         goto exit;
2561     }
2562     bdrv_set_backing_hd(new_top_bs, base_bs);
2563 
2564     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2565         /* so that bdrv_close() does not recursively close the chain */
2566         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2567         bdrv_unref(intermediate_state->bs);
2568     }
2569     ret = 0;
2570 
2571 exit:
2572     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2573         g_free(intermediate_state);
2574     }
2575     return ret;
2576 }
2577 
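/*
 * Example: collapsing an entire chain onto its base, i.e. turning
 * base <- i1 <- i2 <- active into base <- active.  A sketch built on the
 * helpers above; the function name is illustrative.
 */
static int example_flatten_onto_base(BlockDriverState *active)
{
    BlockDriverState *top = active->backing_hd;
    BlockDriverState *base = bdrv_find_base(active);

    if (top == NULL || top == base) {
        return 0;   /* no intermediate images to drop */
    }
    return bdrv_drop_intermediate(active, top, base, NULL);
}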
2578 
2579 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2580                                    size_t size)
2581 {
2582     int64_t len;
2583 
2584     if (size > INT_MAX) {
2585         return -EIO;
2586     }
2587 
2588     if (!bdrv_is_inserted(bs))
2589         return -ENOMEDIUM;
2590 
2591     if (bs->growable)
2592         return 0;
2593 
2594     len = bdrv_getlength(bs);
2595 
2596     if (offset < 0)
2597         return -EIO;
2598 
2599     if ((offset > len) || (len - offset < size))
2600         return -EIO;
2601 
2602     return 0;
2603 }
2604 
2605 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2606                               int nb_sectors)
2607 {
2608     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2609         return -EIO;
2610     }
2611 
2612     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2613                                    nb_sectors * BDRV_SECTOR_SIZE);
2614 }
2615 
2616 typedef struct RwCo {
2617     BlockDriverState *bs;
2618     int64_t offset;
2619     QEMUIOVector *qiov;
2620     bool is_write;
2621     int ret;
2622     BdrvRequestFlags flags;
2623 } RwCo;
2624 
2625 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2626 {
2627     RwCo *rwco = opaque;
2628 
2629     if (!rwco->is_write) {
2630         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2631                                       rwco->qiov->size, rwco->qiov,
2632                                       rwco->flags);
2633     } else {
2634         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2635                                        rwco->qiov->size, rwco->qiov,
2636                                        rwco->flags);
2637     }
2638 }
2639 
2640 /*
2641  * Process a vectored synchronous request using coroutines
2642  */
2643 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2644                         QEMUIOVector *qiov, bool is_write,
2645                         BdrvRequestFlags flags)
2646 {
2647     Coroutine *co;
2648     RwCo rwco = {
2649         .bs = bs,
2650         .offset = offset,
2651         .qiov = qiov,
2652         .is_write = is_write,
2653         .ret = NOT_DONE,
2654         .flags = flags,
2655     };
2656 
2657     /**
2658      * In the synchronous call context the vCPU is blocked, so throttling
2659      * timers will never fire; therefore I/O throttling has to be disabled
2660      * here if it has been enabled.
2661      */
2662     if (bs->io_limits_enabled) {
2663         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2664                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2665         bdrv_io_limits_disable(bs);
2666     }
2667 
2668     if (qemu_in_coroutine()) {
2669         /* Fast-path if already in coroutine context */
2670         bdrv_rw_co_entry(&rwco);
2671     } else {
2672         AioContext *aio_context = bdrv_get_aio_context(bs);
2673 
2674         co = qemu_coroutine_create(bdrv_rw_co_entry);
2675         qemu_coroutine_enter(co, &rwco);
2676         while (rwco.ret == NOT_DONE) {
2677             aio_poll(aio_context, true);
2678         }
2679     }
2680     return rwco.ret;
2681 }
2682 
2683 /*
2684  * Process a synchronous request using coroutines
2685  */
2686 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2687                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2688 {
2689     QEMUIOVector qiov;
2690     struct iovec iov = {
2691         .iov_base = (void *)buf,
2692         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2693     };
2694 
2695     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2696         return -EINVAL;
2697     }
2698 
2699     qemu_iovec_init_external(&qiov, &iov, 1);
2700     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2701                         &qiov, is_write, flags);
2702 }
2703 
2704 /* return < 0 if error. See bdrv_write() for the return codes */
2705 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2706               uint8_t *buf, int nb_sectors)
2707 {
2708     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2709 }
2710 
2711 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2712 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2713                           uint8_t *buf, int nb_sectors)
2714 {
2715     bool enabled;
2716     int ret;
2717 
2718     enabled = bs->io_limits_enabled;
2719     bs->io_limits_enabled = false;
2720     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2721     bs->io_limits_enabled = enabled;
2722     return ret;
2723 }
2724 
2725 /* Return < 0 if error. Important errors are:
2726   -EIO         generic I/O error (may happen for all errors)
2727   -ENOMEDIUM   No media inserted.
2728   -EINVAL      Invalid sector number or nb_sectors
2729   -EACCES      Trying to write a read-only device
2730 */
2731 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2732                const uint8_t *buf, int nb_sectors)
2733 {
2734     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2735 }
2736 
2737 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2738                       int nb_sectors, BdrvRequestFlags flags)
2739 {
2740     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2741                       BDRV_REQ_ZERO_WRITE | flags);
2742 }
2743 
2744 /*
2745  * Completely zero out a block device with the help of bdrv_write_zeroes.
2746  * The operation is sped up by checking the block status and only writing
2747  * zeroes to the device if they currently do not return zeroes. Optional
2748  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2749  *
2750  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2751  */
2752 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2753 {
2754     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2755     int n;
2756 
2757     target_sectors = bdrv_nb_sectors(bs);
2758     if (target_sectors < 0) {
2759         return target_sectors;
2760     }
2761 
2762     for (;;) {
2763         nb_sectors = target_sectors - sector_num;
2764         if (nb_sectors <= 0) {
2765             return 0;
2766         }
2767         if (nb_sectors > INT_MAX) {
2768             nb_sectors = INT_MAX;
2769         }
2770         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2771         if (ret < 0) {
2772             error_report("error getting block status at sector %" PRId64 ": %s",
2773                          sector_num, strerror(-ret));
2774             return ret;
2775         }
2776         if (ret & BDRV_BLOCK_ZERO) {
2777             sector_num += n;
2778             continue;
2779         }
2780         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2781         if (ret < 0) {
2782             error_report("error writing zeroes at sector %" PRId64 ": %s",
2783                          sector_num, strerror(-ret));
2784             return ret;
2785         }
2786         sector_num += n;
2787     }
2788 }
2789 
2790 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2791 {
2792     QEMUIOVector qiov;
2793     struct iovec iov = {
2794         .iov_base = (void *)buf,
2795         .iov_len = bytes,
2796     };
2797     int ret;
2798 
2799     if (bytes < 0) {
2800         return -EINVAL;
2801     }
2802 
2803     qemu_iovec_init_external(&qiov, &iov, 1);
2804     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2805     if (ret < 0) {
2806         return ret;
2807     }
2808 
2809     return bytes;
2810 }
2811 
2812 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2813 {
2814     int ret;
2815 
2816     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2817     if (ret < 0) {
2818         return ret;
2819     }
2820 
2821     return qiov->size;
2822 }
2823 
2824 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2825                 const void *buf, int bytes)
2826 {
2827     QEMUIOVector qiov;
2828     struct iovec iov = {
2829         .iov_base   = (void *) buf,
2830         .iov_len    = bytes,
2831     };
2832 
2833     if (bytes < 0) {
2834         return -EINVAL;
2835     }
2836 
2837     qemu_iovec_init_external(&qiov, &iov, 1);
2838     return bdrv_pwritev(bs, offset, &qiov);
2839 }
2840 
2841 /*
2842  * Writes to the file and ensures that no writes are reordered across this
2843  * request (acts as a barrier)
2844  *
2845  * Returns 0 on success, -errno in error cases.
2846  */
2847 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2848     const void *buf, int count)
2849 {
2850     int ret;
2851 
2852     ret = bdrv_pwrite(bs, offset, buf, count);
2853     if (ret < 0) {
2854         return ret;
2855     }
2856 
2857     /* No flush needed for cache modes that already do it */
2858     if (bs->enable_write_cache) {
2859         bdrv_flush(bs);
2860     }
2861 
2862     return 0;
2863 }
2864 
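/*
 * Example: updating an image header field with barrier semantics.  A
 * sketch; the offset 0 and the big-endian magic are illustrative, not a
 * real format.
 */
static int example_update_header_magic(BlockDriverState *bs, uint32_t magic)
{
    uint32_t be_magic = cpu_to_be32(magic);

    /* No later write may be reordered before this header update. */
    return bdrv_pwrite_sync(bs, 0, &be_magic, sizeof(be_magic));
}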
2865 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2866         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2867 {
2868     /* Perform I/O through a temporary buffer so that users who scribble over
2869      * their read buffer while the operation is in progress do not end up
2870      * modifying the image file.  This is critical for zero-copy guest I/O
2871      * where anything might happen inside guest memory.
2872      */
2873     void *bounce_buffer;
2874 
2875     BlockDriver *drv = bs->drv;
2876     struct iovec iov;
2877     QEMUIOVector bounce_qiov;
2878     int64_t cluster_sector_num;
2879     int cluster_nb_sectors;
2880     size_t skip_bytes;
2881     int ret;
2882 
2883     /* Cover entire cluster so no additional backing file I/O is required when
2884      * allocating a cluster in the image file.
2885      */
2886     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2887                            &cluster_sector_num, &cluster_nb_sectors);
2888 
2889     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2890                                    cluster_sector_num, cluster_nb_sectors);
2891 
2892     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2893     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2894     if (bounce_buffer == NULL) {
2895         ret = -ENOMEM;
2896         goto err;
2897     }
2898 
2899     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2900 
2901     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2902                              &bounce_qiov);
2903     if (ret < 0) {
2904         goto err;
2905     }
2906 
2907     if (drv->bdrv_co_write_zeroes &&
2908         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2909         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2910                                       cluster_nb_sectors, 0);
2911     } else {
2912         /* This does not change the data on the disk, so it is not
2913          * necessary to flush even in cache=writethrough mode.
2914          */
2915         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2916                                   &bounce_qiov);
2917     }
2918 
2919     if (ret < 0) {
2920         /* It might be okay to ignore write errors for guest requests.  If this
2921          * is a deliberate copy-on-read then we don't want to ignore the error.
2922          * Simply report it in all cases.
2923          */
2924         goto err;
2925     }
2926 
2927     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2928     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2929                         nb_sectors * BDRV_SECTOR_SIZE);
2930 
2931 err:
2932     qemu_vfree(bounce_buffer);
2933     return ret;
2934 }
2935 
2936 /*
2937  * Forwards an already correctly aligned request to the BlockDriver. This
2938  * handles copy on read and zeroing after EOF; any other features must be
2939  * implemented by the caller.
2940  */
2941 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2942     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2943     int64_t align, QEMUIOVector *qiov, int flags)
2944 {
2945     BlockDriver *drv = bs->drv;
2946     int ret;
2947 
2948     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2949     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2950 
2951     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2952     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2953     assert(!qiov || bytes == qiov->size);
2954 
2955     /* Handle Copy on Read and associated serialisation */
2956     if (flags & BDRV_REQ_COPY_ON_READ) {
2957         /* If we touch the same cluster it counts as an overlap.  This
2958          * guarantees that allocating writes will be serialized and not race
2959          * with each other for the same cluster.  For example, in copy-on-read
2960          * it ensures that the CoR read and write operations are atomic and
2961          * guest writes cannot interleave between them. */
2962         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2963     }
2964 
2965     wait_serialising_requests(req);
2966 
2967     if (flags & BDRV_REQ_COPY_ON_READ) {
2968         int pnum;
2969 
2970         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2971         if (ret < 0) {
2972             goto out;
2973         }
2974 
2975         if (!ret || pnum != nb_sectors) {
2976             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2977             goto out;
2978         }
2979     }
2980 
2981     /* Forward the request to the BlockDriver */
2982     if (!(bs->zero_beyond_eof && bs->growable)) {
2983         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2984     } else {
2985         /* Read zeros after EOF of growable BDSes */
2986         int64_t total_sectors, max_nb_sectors;
2987 
2988         total_sectors = bdrv_nb_sectors(bs);
2989         if (total_sectors < 0) {
2990             ret = total_sectors;
2991             goto out;
2992         }
2993 
2994         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2995                                   align >> BDRV_SECTOR_BITS);
2996         if (max_nb_sectors > 0) {
2997             QEMUIOVector local_qiov;
2998             size_t local_sectors;
2999 
3000             max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3001             local_sectors = MIN(max_nb_sectors, nb_sectors);
3002 
3003             qemu_iovec_init(&local_qiov, qiov->niov);
3004             qemu_iovec_concat(&local_qiov, qiov, 0,
3005                               local_sectors * BDRV_SECTOR_SIZE);
3006 
3007             ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3008                                      &local_qiov);
3009 
3010             qemu_iovec_destroy(&local_qiov);
3011         } else {
3012             ret = 0;
3013         }
3014 
3015         /* Reading beyond end of file is supposed to produce zeroes */
3016         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3017             uint64_t offset = MAX(0, total_sectors - sector_num);
3018             uint64_t bytes = (sector_num + nb_sectors - offset) *
3019                               BDRV_SECTOR_SIZE;
3020             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3021         }
3022     }
3023 
3024 out:
3025     return ret;
3026 }
3027 
3028 /*
3029  * Handle a read request in coroutine context
3030  */
3031 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3032     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3033     BdrvRequestFlags flags)
3034 {
3035     BlockDriver *drv = bs->drv;
3036     BdrvTrackedRequest req;
3037 
3038     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3039     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3040     uint8_t *head_buf = NULL;
3041     uint8_t *tail_buf = NULL;
3042     QEMUIOVector local_qiov;
3043     bool use_local_qiov = false;
3044     int ret;
3045 
3046     if (!drv) {
3047         return -ENOMEDIUM;
3048     }
3049     if (bdrv_check_byte_request(bs, offset, bytes)) {
3050         return -EIO;
3051     }
3052 
3053     if (bs->copy_on_read) {
3054         flags |= BDRV_REQ_COPY_ON_READ;
3055     }
3056 
3057     /* throttling disk I/O */
3058     if (bs->io_limits_enabled) {
3059         bdrv_io_limits_intercept(bs, bytes, false);
3060     }
3061 
3062     /* Align read if necessary by padding qiov */
3063     if (offset & (align - 1)) {
3064         head_buf = qemu_blockalign(bs, align);
3065         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3066         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3067         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3068         use_local_qiov = true;
3069 
3070         bytes += offset & (align - 1);
3071         offset = offset & ~(align - 1);
3072     }
3073 
3074     if ((offset + bytes) & (align - 1)) {
3075         if (!use_local_qiov) {
3076             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3077             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3078             use_local_qiov = true;
3079         }
3080         tail_buf = qemu_blockalign(bs, align);
3081         qemu_iovec_add(&local_qiov, tail_buf,
3082                        align - ((offset + bytes) & (align - 1)));
3083 
3084         bytes = ROUND_UP(bytes, align);
3085     }
3086 
3087     tracked_request_begin(&req, bs, offset, bytes, false);
3088     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3089                               use_local_qiov ? &local_qiov : qiov,
3090                               flags);
3091     tracked_request_end(&req);
3092 
3093     if (use_local_qiov) {
3094         qemu_iovec_destroy(&local_qiov);
3095         qemu_vfree(head_buf);
3096         qemu_vfree(tail_buf);
3097     }
3098 
3099     return ret;
3100 }
3101 
3102 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3103     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3104     BdrvRequestFlags flags)
3105 {
3106     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3107         return -EINVAL;
3108     }
3109 
3110     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3111                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3112 }
3113 
3114 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3115     int nb_sectors, QEMUIOVector *qiov)
3116 {
3117     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3118 
3119     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3120 }
3121 
3122 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3123     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3124 {
3125     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3126 
3127     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3128                             BDRV_REQ_COPY_ON_READ);
3129 }
3130 
3131 /* If no limit is specified in the BlockLimits, use a default
3132  * of 32768 512-byte sectors (16 MiB) per request.
3133  */
3134 #define MAX_WRITE_ZEROES_DEFAULT 32768
3135 
3136 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3137     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3138 {
3139     BlockDriver *drv = bs->drv;
3140     QEMUIOVector qiov;
3141     struct iovec iov = {0};
3142     int ret = 0;
3143 
3144     int max_write_zeroes = bs->bl.max_write_zeroes ?
3145                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3146 
3147     while (nb_sectors > 0 && !ret) {
3148         int num = nb_sectors;
3149 
3150         /* Align request.  Block drivers can expect the "bulk" of the request
3151          * to be aligned.
3152          */
3153         if (bs->bl.write_zeroes_alignment
3154             && num > bs->bl.write_zeroes_alignment) {
3155             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3156                 /* Make a small request up to the first aligned sector.  */
3157                 num = bs->bl.write_zeroes_alignment;
3158                 num -= sector_num % bs->bl.write_zeroes_alignment;
3159             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3160                 /* Shorten the request to the last aligned sector.  num cannot
3161                  * underflow because num > bs->bl.write_zeroes_alignment.
3162                  */
3163                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3164             }
3165         }
3166 
3167         /* limit request size */
3168         if (num > max_write_zeroes) {
3169             num = max_write_zeroes;
3170         }
3171 
3172         ret = -ENOTSUP;
3173         /* First try the efficient write zeroes operation */
3174         if (drv->bdrv_co_write_zeroes) {
3175             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3176         }
3177 
3178         if (ret == -ENOTSUP) {
3179             /* Fall back to bounce buffer if write zeroes is unsupported */
3180             iov.iov_len = num * BDRV_SECTOR_SIZE;
3181             if (iov.iov_base == NULL) {
3182                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3183                 if (iov.iov_base == NULL) {
3184                     ret = -ENOMEM;
3185                     goto fail;
3186                 }
3187                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3188             }
3189             qemu_iovec_init_external(&qiov, &iov, 1);
3190 
3191             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3192 
3193             /* Keep the bounce buffer around if it is big enough for
3194              * all future requests.
3195              */
3196             if (num < max_write_zeroes) {
3197                 qemu_vfree(iov.iov_base);
3198                 iov.iov_base = NULL;
3199             }
3200         }
3201 
3202         sector_num += num;
3203         nb_sectors -= num;
3204     }
3205 
3206 fail:
3207     qemu_vfree(iov.iov_base);
3208     return ret;
3209 }
3210 
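/*
 * Worked example for the alignment loop above, assuming
 * write_zeroes_alignment = 8 and a request for sectors [5, 35): the loop
 * issues three requests -- [5, 8) to reach the first aligned sector,
 * [8, 32) as the aligned bulk, and [32, 35) as the unaligned tail.
 */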
3211 /*
3212  * Forwards an already correctly aligned write request to the BlockDriver.
3213  */
3214 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3215     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3216     QEMUIOVector *qiov, int flags)
3217 {
3218     BlockDriver *drv = bs->drv;
3219     bool waited;
3220     int ret;
3221 
3222     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3223     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3224 
3225     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3226     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3227     assert(!qiov || bytes == qiov->size);
3228 
3229     waited = wait_serialising_requests(req);
3230     assert(!waited || !req->serialising);
3231     assert(req->overlap_offset <= offset);
3232     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3233 
3234     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3235 
3236     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3237         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3238         qemu_iovec_is_zero(qiov)) {
3239         flags |= BDRV_REQ_ZERO_WRITE;
3240         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3241             flags |= BDRV_REQ_MAY_UNMAP;
3242         }
3243     }
3244 
3245     if (ret < 0) {
3246         /* Do nothing, write notifier decided to fail this request */
3247     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3248         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3249         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3250     } else {
3251         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3252         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3253     }
3254     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3255 
3256     if (ret == 0 && !bs->enable_write_cache) {
3257         ret = bdrv_co_flush(bs);
3258     }
3259 
3260     bdrv_set_dirty(bs, sector_num, nb_sectors);
3261 
3262     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3263 
3264     if (bs->growable && ret >= 0) {
3265         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3266     }
3267 
3268     return ret;
3269 }
3270 
3271 /*
3272  * Handle a write request in coroutine context
3273  */
3274 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3275     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3276     BdrvRequestFlags flags)
3277 {
3278     BdrvTrackedRequest req;
3279     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3280     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3281     uint8_t *head_buf = NULL;
3282     uint8_t *tail_buf = NULL;
3283     QEMUIOVector local_qiov;
3284     bool use_local_qiov = false;
3285     int ret;
3286 
3287     if (!bs->drv) {
3288         return -ENOMEDIUM;
3289     }
3290     if (bs->read_only) {
3291         return -EACCES;
3292     }
3293     if (bdrv_check_byte_request(bs, offset, bytes)) {
3294         return -EIO;
3295     }
3296 
3297     /* throttling disk I/O */
3298     if (bs->io_limits_enabled) {
3299         bdrv_io_limits_intercept(bs, bytes, true);
3300     }
3301 
3302     /*
3303      * Align write if necessary by performing a read-modify-write cycle.
3304      * Pad qiov with the read parts and be sure to have a tracked request not
3305      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3306      */
3307     tracked_request_begin(&req, bs, offset, bytes, true);
3308 
3309     if (offset & (align - 1)) {
3310         QEMUIOVector head_qiov;
3311         struct iovec head_iov;
3312 
3313         mark_request_serialising(&req, align);
3314         wait_serialising_requests(&req);
3315 
3316         head_buf = qemu_blockalign(bs, align);
3317         head_iov = (struct iovec) {
3318             .iov_base   = head_buf,
3319             .iov_len    = align,
3320         };
3321         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3322 
3323         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3324         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3325                                   align, &head_qiov, 0);
3326         if (ret < 0) {
3327             goto fail;
3328         }
3329         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3330 
3331         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3332         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3333         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3334         use_local_qiov = true;
3335 
3336         bytes += offset & (align - 1);
3337         offset = offset & ~(align - 1);
3338     }
3339 
3340     if ((offset + bytes) & (align - 1)) {
3341         QEMUIOVector tail_qiov;
3342         struct iovec tail_iov;
3343         size_t tail_bytes;
3344         bool waited;
3345 
3346         mark_request_serialising(&req, align);
3347         waited = wait_serialising_requests(&req);
3348         assert(!waited || !use_local_qiov);
3349 
3350         tail_buf = qemu_blockalign(bs, align);
3351         tail_iov = (struct iovec) {
3352             .iov_base   = tail_buf,
3353             .iov_len    = align,
3354         };
3355         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3356 
3357         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3358         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3359                                   align, &tail_qiov, 0);
3360         if (ret < 0) {
3361             goto fail;
3362         }
3363         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3364 
3365         if (!use_local_qiov) {
3366             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3367             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3368             use_local_qiov = true;
3369         }
3370 
3371         tail_bytes = (offset + bytes) & (align - 1);
3372         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3373 
3374         bytes = ROUND_UP(bytes, align);
3375     }
3376 
3377     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3378                                use_local_qiov ? &local_qiov : qiov,
3379                                flags);
3380 
3381 fail:
3382     tracked_request_end(&req);
3383 
3384     if (use_local_qiov) {
3385         qemu_iovec_destroy(&local_qiov);
3386     }
3387     qemu_vfree(head_buf);
3388     qemu_vfree(tail_buf);
3389 
3390     return ret;
3391 }
3392 
3393 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3394     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3395     BdrvRequestFlags flags)
3396 {
3397     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3398         return -EINVAL;
3399     }
3400 
3401     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3402                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3403 }
3404 
3405 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3406     int nb_sectors, QEMUIOVector *qiov)
3407 {
3408     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3409 
3410     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3411 }
3412 
3413 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3414                                       int64_t sector_num, int nb_sectors,
3415                                       BdrvRequestFlags flags)
3416 {
3417     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3418 
3419     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3420         flags &= ~BDRV_REQ_MAY_UNMAP;
3421     }
3422 
3423     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3424                              BDRV_REQ_ZERO_WRITE | flags);
3425 }
3426 
3427 /**
3428  * Truncate file to 'offset' bytes (needed only for file protocols)
3429  */
3430 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3431 {
3432     BlockDriver *drv = bs->drv;
3433     int ret;
3434     if (!drv)
3435         return -ENOMEDIUM;
3436     if (!drv->bdrv_truncate)
3437         return -ENOTSUP;
3438     if (bs->read_only)
3439         return -EACCES;
3440 
3441     ret = drv->bdrv_truncate(bs, offset);
3442     if (ret == 0) {
3443         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3444         if (bs->blk) {
3445             blk_dev_resize_cb(bs->blk);
3446         }
3447     }
3448     return ret;
3449 }
3450 
3451 /**
3452  * Length of an allocated file in bytes. Sparse files are counted by their
3453  * actual allocated space. Return < 0 on error or if unknown.
3454  */
3455 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3456 {
3457     BlockDriver *drv = bs->drv;
3458     if (!drv) {
3459         return -ENOMEDIUM;
3460     }
3461     if (drv->bdrv_get_allocated_file_size) {
3462         return drv->bdrv_get_allocated_file_size(bs);
3463     }
3464     if (bs->file) {
3465         return bdrv_get_allocated_file_size(bs->file);
3466     }
3467     return -ENOTSUP;
3468 }
3469 
3470 /**
3471  * Return number of sectors on success, -errno on error.
3472  */
3473 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3474 {
3475     BlockDriver *drv = bs->drv;
3476 
3477     if (!drv)
3478         return -ENOMEDIUM;
3479 
3480     if (drv->has_variable_length) {
3481         int ret = refresh_total_sectors(bs, bs->total_sectors);
3482         if (ret < 0) {
3483             return ret;
3484         }
3485     }
3486     return bs->total_sectors;
3487 }
3488 
3489 /**
3490  * Return length in bytes on success, -errno on error.
3491  * The length is always a multiple of BDRV_SECTOR_SIZE.
3492  */
3493 int64_t bdrv_getlength(BlockDriverState *bs)
3494 {
3495     int64_t ret = bdrv_nb_sectors(bs);
3496 
3497     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3498 }
3499 
3500 /* Return 0 as the number of sectors if no device is present or on error */
3501 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3502 {
3503     int64_t nb_sectors = bdrv_nb_sectors(bs);
3504 
3505     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3506 }
3507 
3508 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3509                        BlockdevOnError on_write_error)
3510 {
3511     bs->on_read_error = on_read_error;
3512     bs->on_write_error = on_write_error;
3513 }
3514 
3515 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3516 {
3517     return is_read ? bs->on_read_error : bs->on_write_error;
3518 }
3519 
3520 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3521 {
3522     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3523 
3524     switch (on_err) {
3525     case BLOCKDEV_ON_ERROR_ENOSPC:
3526         return (error == ENOSPC) ?
3527                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3528     case BLOCKDEV_ON_ERROR_STOP:
3529         return BLOCK_ERROR_ACTION_STOP;
3530     case BLOCKDEV_ON_ERROR_REPORT:
3531         return BLOCK_ERROR_ACTION_REPORT;
3532     case BLOCKDEV_ON_ERROR_IGNORE:
3533         return BLOCK_ERROR_ACTION_IGNORE;
3534     default:
3535         abort();
3536     }
3537 }
3538 
3539 static void send_qmp_error_event(BlockDriverState *bs,
3540                                  BlockErrorAction action,
3541                                  bool is_read, int error)
3542 {
3543     IoOperationType optype;
3544 
3545     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3546     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3547                                    bdrv_iostatus_is_enabled(bs),
3548                                    error == ENOSPC, strerror(error),
3549                                    &error_abort);
3550 }
3551 
3552 /* This is done by device models because, while the block layer knows
3553  * about the error, it does not know whether an operation comes from
3554  * the device or the block layer (from a job, for example).
3555  */
3556 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3557                        bool is_read, int error)
3558 {
3559     assert(error >= 0);
3560 
3561     if (action == BLOCK_ERROR_ACTION_STOP) {
3562         /* First set the iostatus, so that "info block" returns an iostatus
3563          * that matches the events raised so far (an additional error iostatus
3564          * is fine, but not a lost one).
3565          */
3566         bdrv_iostatus_set_err(bs, error);
3567 
3568         /* Then raise the request to stop the VM and the event.
3569          * qemu_system_vmstop_request_prepare has two effects.  First,
3570          * it ensures that the STOP event always comes after the
3571          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3572          * can observe the STOP event and do a "cont" before the STOP
3573          * event is issued, the VM will not stop.  In this case, vm_start()
3574          * also ensures that the STOP/RESUME pair of events is emitted.
3575          */
3576         qemu_system_vmstop_request_prepare();
3577         send_qmp_error_event(bs, action, is_read, error);
3578         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3579     } else {
3580         send_qmp_error_event(bs, action, is_read, error);
3581     }
3582 }
3583 
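/* A minimal usage sketch for the two helpers above, in the shape of a
 * device model's completion path (my_complete_request and req are
 * hypothetical, not part of this file):
 *
 *     BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);
 *     bdrv_error_action(bs, action, is_read, error);
 *     if (action == BLOCK_ERROR_ACTION_STOP) {
 *         // keep the request around so it can be retried on "cont"
 *     } else {
 *         my_complete_request(req,
 *             action == BLOCK_ERROR_ACTION_IGNORE ? 0 : -error);
 *     }
 */
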
3584 int bdrv_is_read_only(BlockDriverState *bs)
3585 {
3586     return bs->read_only;
3587 }
3588 
3589 int bdrv_is_sg(BlockDriverState *bs)
3590 {
3591     return bs->sg;
3592 }
3593 
3594 int bdrv_enable_write_cache(BlockDriverState *bs)
3595 {
3596     return bs->enable_write_cache;
3597 }
3598 
3599 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3600 {
3601     bs->enable_write_cache = wce;
3602 
3603     /* so a reopen() will preserve wce */
3604     if (wce) {
3605         bs->open_flags |= BDRV_O_CACHE_WB;
3606     } else {
3607         bs->open_flags &= ~BDRV_O_CACHE_WB;
3608     }
3609 }
3610 
3611 int bdrv_is_encrypted(BlockDriverState *bs)
3612 {
3613     if (bs->backing_hd && bs->backing_hd->encrypted)
3614         return 1;
3615     return bs->encrypted;
3616 }
3617 
3618 int bdrv_key_required(BlockDriverState *bs)
3619 {
3620     BlockDriverState *backing_hd = bs->backing_hd;
3621 
3622     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3623         return 1;
3624     return (bs->encrypted && !bs->valid_key);
3625 }
3626 
3627 int bdrv_set_key(BlockDriverState *bs, const char *key)
3628 {
3629     int ret;
3630     if (bs->backing_hd && bs->backing_hd->encrypted) {
3631         ret = bdrv_set_key(bs->backing_hd, key);
3632         if (ret < 0)
3633             return ret;
3634         if (!bs->encrypted)
3635             return 0;
3636     }
3637     if (!bs->encrypted) {
3638         return -EINVAL;
3639     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3640         return -ENOMEDIUM;
3641     }
3642     ret = bs->drv->bdrv_set_key(bs, key);
3643     if (ret < 0) {
3644         bs->valid_key = 0;
3645     } else if (!bs->valid_key) {
3646         bs->valid_key = 1;
3647         if (bs->blk) {
3648             /* call the change callback now, we skipped it on open */
3649             blk_dev_change_media_cb(bs->blk, true);
3650         }
3651     }
3652     return ret;
3653 }
3654 
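/* A hedged sketch of the intended open-time flow (the password variable
 * is hypothetical):
 *
 *     if (bdrv_key_required(bs)) {
 *         if (bdrv_set_key(bs, password) < 0) {
 *             // wrong or missing key: bs->valid_key stays 0 and the
 *             // device remains unusable until a valid key is set
 *         }
 *     }
 */
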
3655 const char *bdrv_get_format_name(BlockDriverState *bs)
3656 {
3657     return bs->drv ? bs->drv->format_name : NULL;
3658 }
3659 
3660 static int qsort_strcmp(const void *a, const void *b)
3661 {
3662     return strcmp(*(char *const *)a, *(char *const *)b);
3663 }
3664 
3665 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3666                          void *opaque)
3667 {
3668     BlockDriver *drv;
3669     int count = 0;
3670     int i;
3671     const char **formats = NULL;
3672 
3673     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3674         if (drv->format_name) {
3675             bool found = false;
3676             int i = count;
3677             while (formats && i && !found) {
3678                 found = !strcmp(formats[--i], drv->format_name);
3679             }
3680 
3681             if (!found) {
3682                 formats = g_renew(const char *, formats, count + 1);
3683                 formats[count++] = drv->format_name;
3684             }
3685         }
3686     }
3687 
3688     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3689 
3690     for (i = 0; i < count; i++) {
3691         it(opaque, formats[i]);
3692     }
3693 
3694     g_free(formats);
3695 }
3696 
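/* Example caller, as a sketch (print_one is a hypothetical callback):
 * each unique format name is passed to the callback exactly once, in
 * sorted order:
 *
 *     static void print_one(void *opaque, const char *name)
 *     {
 *         printf("%s\n", name);
 *     }
 *
 *     bdrv_iterate_format(print_one, NULL);
 */
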
3697 /* Find the root BlockDriverState of the block backend with this name */
3698 /* TODO convert callers to blk_by_name(), then remove */
3699 BlockDriverState *bdrv_find(const char *name)
3700 {
3701     BlockBackend *blk = blk_by_name(name);
3702 
3703     return blk ? blk_bs(blk) : NULL;
3704 }
3705 
3706 /* Find a node in the graph of named BlockDriverStates */
3707 BlockDriverState *bdrv_find_node(const char *node_name)
3708 {
3709     BlockDriverState *bs;
3710 
3711     assert(node_name);
3712 
3713     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3714         if (!strcmp(node_name, bs->node_name)) {
3715             return bs;
3716         }
3717     }
3718     return NULL;
3719 }
3720 
3721 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3722 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3723 {
3724     BlockDeviceInfoList *list, *entry;
3725     BlockDriverState *bs;
3726 
3727     list = NULL;
3728     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3729         entry = g_malloc0(sizeof(*entry));
3730         entry->value = bdrv_block_device_info(bs);
3731         entry->next = list;
3732         list = entry;
3733     }
3734 
3735     return list;
3736 }
3737 
3738 BlockDriverState *bdrv_lookup_bs(const char *device,
3739                                  const char *node_name,
3740                                  Error **errp)
3741 {
3742     BlockBackend *blk;
3743     BlockDriverState *bs;
3744 
3745     if (device) {
3746         blk = blk_by_name(device);
3747 
3748         if (blk) {
3749             return blk_bs(blk);
3750         }
3751     }
3752 
3753     if (node_name) {
3754         bs = bdrv_find_node(node_name);
3755 
3756         if (bs) {
3757             return bs;
3758         }
3759     }
3760 
3761     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3762                      device ? device : "",
3763                      node_name ? node_name : "");
3764     return NULL;
3765 }
3766 
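/* Usage sketch: resolve a user-supplied reference that may be either a
 * backend device name or a node name (both may be NULL, in which case an
 * error is returned through errp):
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs(device, node_name, &local_err);
 *     if (!bs) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */
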
3767 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3768  * return false.  If either argument is NULL, return false. */
3769 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3770 {
3771     while (top && top != base) {
3772         top = top->backing_hd;
3773     }
3774 
3775     return top != NULL;
3776 }
3777 
3778 BlockDriverState *bdrv_next(BlockDriverState *bs)
3779 {
3780     if (!bs) {
3781         return QTAILQ_FIRST(&bdrv_states);
3782     }
3783     return QTAILQ_NEXT(bs, device_list);
3784 }
3785 
3786 /* TODO check what callers really want: bs->node_name or blk_name() */
3787 const char *bdrv_get_device_name(const BlockDriverState *bs)
3788 {
3789     return bs->blk ? blk_name(bs->blk) : "";
3790 }
3791 
3792 int bdrv_get_flags(BlockDriverState *bs)
3793 {
3794     return bs->open_flags;
3795 }
3796 
3797 int bdrv_flush_all(void)
3798 {
3799     BlockDriverState *bs;
3800     int result = 0;
3801 
3802     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3803         AioContext *aio_context = bdrv_get_aio_context(bs);
3804         int ret;
3805 
3806         aio_context_acquire(aio_context);
3807         ret = bdrv_flush(bs);
3808         if (ret < 0 && !result) {
3809             result = ret;
3810         }
3811         aio_context_release(aio_context);
3812     }
3813 
3814     return result;
3815 }
3816 
3817 int bdrv_has_zero_init_1(BlockDriverState *bs)
3818 {
3819     return 1;
3820 }
3821 
3822 int bdrv_has_zero_init(BlockDriverState *bs)
3823 {
3824     assert(bs->drv);
3825 
3826     /* If BS is a copy-on-write image, it is initialized to
3827        the contents of the base image, which may not be zeroes.  */
3828     if (bs->backing_hd) {
3829         return 0;
3830     }
3831     if (bs->drv->bdrv_has_zero_init) {
3832         return bs->drv->bdrv_has_zero_init(bs);
3833     }
3834 
3835     /* safe default */
3836     return 0;
3837 }
3838 
3839 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3840 {
3841     BlockDriverInfo bdi;
3842 
3843     if (bs->backing_hd) {
3844         return false;
3845     }
3846 
3847     if (bdrv_get_info(bs, &bdi) == 0) {
3848         return bdi.unallocated_blocks_are_zero;
3849     }
3850 
3851     return false;
3852 }
3853 
3854 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3855 {
3856     BlockDriverInfo bdi;
3857 
3858     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3859         return false;
3860     }
3861 
3862     if (bdrv_get_info(bs, &bdi) == 0) {
3863         return bdi.can_write_zeroes_with_unmap;
3864     }
3865 
3866     return false;
3867 }
3868 
3869 typedef struct BdrvCoGetBlockStatusData {
3870     BlockDriverState *bs;
3871     BlockDriverState *base;
3872     int64_t sector_num;
3873     int nb_sectors;
3874     int *pnum;
3875     int64_t ret;
3876     bool done;
3877 } BdrvCoGetBlockStatusData;
3878 
3879 /*
3880  * Returns the allocation status of the specified sectors. Drivers
3881  * not implementing the functionality are assumed to not support backing files,
3882  * hence all their sectors are reported as allocated.
3883  *
3884  * If 'sector_num' is beyond the end of the disk image the return value is 0
3885  * and 'pnum' is set to 0.
3886  *
3887  * 'pnum' is set to the number of sectors (including and immediately following
3888  * the specified sector) that are known to be in the same
3889  * allocated/unallocated state.
3890  *
3891  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3892  * beyond the end of the disk image it will be clamped.
3893  */
3894 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3895                                                      int64_t sector_num,
3896                                                      int nb_sectors, int *pnum)
3897 {
3898     int64_t total_sectors;
3899     int64_t n;
3900     int64_t ret, ret2;
3901 
3902     total_sectors = bdrv_nb_sectors(bs);
3903     if (total_sectors < 0) {
3904         return total_sectors;
3905     }
3906 
3907     if (sector_num >= total_sectors) {
3908         *pnum = 0;
3909         return 0;
3910     }
3911 
3912     n = total_sectors - sector_num;
3913     if (n < nb_sectors) {
3914         nb_sectors = n;
3915     }
3916 
3917     if (!bs->drv->bdrv_co_get_block_status) {
3918         *pnum = nb_sectors;
3919         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3920         if (bs->drv->protocol_name) {
3921             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3922         }
3923         return ret;
3924     }
3925 
3926     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3927     if (ret < 0) {
3928         *pnum = 0;
3929         return ret;
3930     }
3931 
3932     if (ret & BDRV_BLOCK_RAW) {
3933         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3934         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3935                                      *pnum, pnum);
3936     }
3937 
3938     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3939         ret |= BDRV_BLOCK_ALLOCATED;
3940     }
3941 
3942     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3943         if (bdrv_unallocated_blocks_are_zero(bs)) {
3944             ret |= BDRV_BLOCK_ZERO;
3945         } else if (bs->backing_hd) {
3946             BlockDriverState *bs2 = bs->backing_hd;
3947             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
3948             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
3949                 ret |= BDRV_BLOCK_ZERO;
3950             }
3951         }
3952     }
3953 
3954     if (bs->file &&
3955         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3956         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3957         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3958                                         *pnum, pnum);
3959         if (ret2 >= 0) {
3960             /* Ignore errors.  This is just providing extra information; it
3961              * is useful but not necessary.
3962              */
3963             ret |= (ret2 & BDRV_BLOCK_ZERO);
3964         }
3965     }
3966 
3967     return ret;
3968 }
3969 
3970 /* Coroutine wrapper for bdrv_get_block_status() */
3971 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3972 {
3973     BdrvCoGetBlockStatusData *data = opaque;
3974     BlockDriverState *bs = data->bs;
3975 
3976     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3977                                          data->pnum);
3978     data->done = true;
3979 }
3980 
3981 /*
3982  * Synchronous wrapper around bdrv_co_get_block_status().
3983  *
3984  * See bdrv_co_get_block_status() for details.
3985  */
3986 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3987                               int nb_sectors, int *pnum)
3988 {
3989     Coroutine *co;
3990     BdrvCoGetBlockStatusData data = {
3991         .bs = bs,
3992         .sector_num = sector_num,
3993         .nb_sectors = nb_sectors,
3994         .pnum = pnum,
3995         .done = false,
3996     };
3997 
3998     if (qemu_in_coroutine()) {
3999         /* Fast-path if already in coroutine context */
4000         bdrv_get_block_status_co_entry(&data);
4001     } else {
4002         AioContext *aio_context = bdrv_get_aio_context(bs);
4003 
4004         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4005         qemu_coroutine_enter(co, &data);
4006         while (!data.done) {
4007             aio_poll(aio_context, true);
4008         }
4009     }
4010     return data.ret;
4011 }
4012 
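/* A hedged sketch of consuming the returned status word; the BDRV_BLOCK_*
 * flags and BDRV_BLOCK_OFFSET_MASK come from the block layer headers:
 *
 *     int pnum;
 *     int64_t st = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (st < 0) {
 *         // error; pnum is not meaningful
 *     } else if (st & BDRV_BLOCK_ZERO) {
 *         // the next pnum sectors read as zeroes
 *     } else if (st & BDRV_BLOCK_DATA) {
 *         // the next pnum sectors contain data in this layer
 *     }
 *     if (st >= 0 && (st & BDRV_BLOCK_OFFSET_VALID)) {
 *         int64_t host_offset = st & BDRV_BLOCK_OFFSET_MASK;
 *     }
 */
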
4013 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4014                                    int nb_sectors, int *pnum)
4015 {
4016     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4017     if (ret < 0) {
4018         return ret;
4019     }
4020     return !!(ret & BDRV_BLOCK_ALLOCATED);
4021 }
4022 
4023 /*
4024  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4025  *
4026  * Return true if the given sector is allocated in any image between
4027  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4028  * sector is allocated in any image of the chain.  Return false otherwise.
4029  *
4030  * 'pnum' is set to the number of sectors (including and immediately following
4031  *  the specified sector) that are known to be in the same
4032  *  allocated/unallocated state.
4033  *
4034  */
4035 int bdrv_is_allocated_above(BlockDriverState *top,
4036                             BlockDriverState *base,
4037                             int64_t sector_num,
4038                             int nb_sectors, int *pnum)
4039 {
4040     BlockDriverState *intermediate;
4041     int ret, n = nb_sectors;
4042 
4043     intermediate = top;
4044     while (intermediate && intermediate != base) {
4045         int pnum_inter;
4046         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4047                                 &pnum_inter);
4048         if (ret < 0) {
4049             return ret;
4050         } else if (ret) {
4051             *pnum = pnum_inter;
4052             return 1;
4053         }
4054 
4055         /*
4056          * [sector_num, sector_num + pnum_inter) is unallocated on top but
4057          * the intermediate image might have
4058          *
4059          * [sector_num + x, sector_num + nb_sectors) allocated.
4060          */
4061         if (n > pnum_inter &&
4062             (intermediate == top ||
4063              sector_num + pnum_inter < intermediate->total_sectors)) {
4064             n = pnum_inter;
4065         }
4066 
4067         intermediate = intermediate->backing_hd;
4068     }
4069 
4070     *pnum = n;
4071     return 0;
4072 }
4073 
4074 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4075 {
4076     if (bs->backing_hd && bs->backing_hd->encrypted)
4077         return bs->backing_file;
4078     else if (bs->encrypted)
4079         return bs->filename;
4080     else
4081         return NULL;
4082 }
4083 
4084 void bdrv_get_backing_filename(BlockDriverState *bs,
4085                                char *filename, int filename_size)
4086 {
4087     pstrcpy(filename, filename_size, bs->backing_file);
4088 }
4089 
4090 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4091                           const uint8_t *buf, int nb_sectors)
4092 {
4093     BlockDriver *drv = bs->drv;
4094     if (!drv)
4095         return -ENOMEDIUM;
4096     if (!drv->bdrv_write_compressed)
4097         return -ENOTSUP;
4098     if (bdrv_check_request(bs, sector_num, nb_sectors))
4099         return -EIO;
4100 
4101     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4102 
4103     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4104 }
4105 
4106 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4107 {
4108     BlockDriver *drv = bs->drv;
4109     if (!drv)
4110         return -ENOMEDIUM;
4111     if (!drv->bdrv_get_info)
4112         return -ENOTSUP;
4113     memset(bdi, 0, sizeof(*bdi));
4114     return drv->bdrv_get_info(bs, bdi);
4115 }
4116 
4117 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4118 {
4119     BlockDriver *drv = bs->drv;
4120     if (drv && drv->bdrv_get_specific_info) {
4121         return drv->bdrv_get_specific_info(bs);
4122     }
4123     return NULL;
4124 }
4125 
4126 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4127                       int64_t pos, int size)
4128 {
4129     QEMUIOVector qiov;
4130     struct iovec iov = {
4131         .iov_base   = (void *) buf,
4132         .iov_len    = size,
4133     };
4134 
4135     qemu_iovec_init_external(&qiov, &iov, 1);
4136     return bdrv_writev_vmstate(bs, &qiov, pos);
4137 }
4138 
4139 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4140 {
4141     BlockDriver *drv = bs->drv;
4142 
4143     if (!drv) {
4144         return -ENOMEDIUM;
4145     } else if (drv->bdrv_save_vmstate) {
4146         return drv->bdrv_save_vmstate(bs, qiov, pos);
4147     } else if (bs->file) {
4148         return bdrv_writev_vmstate(bs->file, qiov, pos);
4149     }
4150 
4151     return -ENOTSUP;
4152 }
4153 
4154 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4155                       int64_t pos, int size)
4156 {
4157     BlockDriver *drv = bs->drv;
4158     if (!drv)
4159         return -ENOMEDIUM;
4160     if (drv->bdrv_load_vmstate)
4161         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4162     if (bs->file)
4163         return bdrv_load_vmstate(bs->file, buf, pos, size);
4164     return -ENOTSUP;
4165 }
4166 
4167 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4168 {
4169     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4170         return;
4171     }
4172 
4173     bs->drv->bdrv_debug_event(bs, event);
4174 }
4175 
4176 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4177                           const char *tag)
4178 {
4179     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4180         bs = bs->file;
4181     }
4182 
4183     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4184         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4185     }
4186 
4187     return -ENOTSUP;
4188 }
4189 
4190 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4191 {
4192     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4193         bs = bs->file;
4194     }
4195 
4196     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4197         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4198     }
4199 
4200     return -ENOTSUP;
4201 }
4202 
4203 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4204 {
4205     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4206         bs = bs->file;
4207     }
4208 
4209     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4210         return bs->drv->bdrv_debug_resume(bs, tag);
4211     }
4212 
4213     return -ENOTSUP;
4214 }
4215 
4216 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4217 {
4218     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4219         bs = bs->file;
4220     }
4221 
4222     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4223         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4224     }
4225 
4226     return false;
4227 }
4228 
4229 int bdrv_is_snapshot(BlockDriverState *bs)
4230 {
4231     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4232 }
4233 
4234 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4235  * relative, it must be relative to the chain.  So, passing in bs->filename
4236  * from a BDS as backing_file should not be done, as that may be relative to
4237  * the CWD rather than the chain. */
4238 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4239         const char *backing_file)
4240 {
4241     char *filename_full = NULL;
4242     char *backing_file_full = NULL;
4243     char *filename_tmp = NULL;
4244     int is_protocol = 0;
4245     BlockDriverState *curr_bs = NULL;
4246     BlockDriverState *retval = NULL;
4247 
4248     if (!bs || !bs->drv || !backing_file) {
4249         return NULL;
4250     }
4251 
4252     filename_full     = g_malloc(PATH_MAX);
4253     backing_file_full = g_malloc(PATH_MAX);
4254     filename_tmp      = g_malloc(PATH_MAX);
4255 
4256     is_protocol = path_has_protocol(backing_file);
4257 
4258     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4259 
4260         /* If either of the filename paths is actually a protocol, then
4261          * compare unmodified paths; otherwise make paths relative */
4262         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4263             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4264                 retval = curr_bs->backing_hd;
4265                 break;
4266             }
4267         } else {
4268             /* If not an absolute filename path, make it relative to the current
4269              * image's filename path */
4270             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4271                          backing_file);
4272 
4273             /* We are going to compare absolute pathnames */
4274             if (!realpath(filename_tmp, filename_full)) {
4275                 continue;
4276             }
4277 
4278             /* We need to make sure the backing filename we are comparing against
4279              * is relative to the current image filename (or absolute) */
4280             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4281                          curr_bs->backing_file);
4282 
4283             if (!realpath(filename_tmp, backing_file_full)) {
4284                 continue;
4285             }
4286 
4287             if (strcmp(backing_file_full, filename_full) == 0) {
4288                 retval = curr_bs->backing_hd;
4289                 break;
4290             }
4291         }
4292     }
4293 
4294     g_free(filename_full);
4295     g_free(backing_file_full);
4296     g_free(filename_tmp);
4297     return retval;
4298 }
4299 
4300 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4301 {
4302     if (!bs->drv) {
4303         return 0;
4304     }
4305 
4306     if (!bs->backing_hd) {
4307         return 0;
4308     }
4309 
4310     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4311 }
4312 
4313 /**************************************************************/
4314 /* async I/Os */
4315 
4316 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4317                            QEMUIOVector *qiov, int nb_sectors,
4318                            BlockCompletionFunc *cb, void *opaque)
4319 {
4320     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4321 
4322     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4323                                  cb, opaque, false);
4324 }
4325 
4326 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4327                             QEMUIOVector *qiov, int nb_sectors,
4328                             BlockCompletionFunc *cb, void *opaque)
4329 {
4330     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4331 
4332     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4333                                  cb, opaque, true);
4334 }
4335 
4336 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4337         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4338         BlockCompletionFunc *cb, void *opaque)
4339 {
4340     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4341 
4342     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4343                                  BDRV_REQ_ZERO_WRITE | flags,
4344                                  cb, opaque, true);
4345 }
4346 
4347 
4348 typedef struct MultiwriteCB {
4349     int error;
4350     int num_requests;
4351     int num_callbacks;
4352     struct {
4353         BlockCompletionFunc *cb;
4354         void *opaque;
4355         QEMUIOVector *free_qiov;
4356     } callbacks[];
4357 } MultiwriteCB;
4358 
4359 static void multiwrite_user_cb(MultiwriteCB *mcb)
4360 {
4361     int i;
4362 
4363     for (i = 0; i < mcb->num_callbacks; i++) {
4364         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4365         if (mcb->callbacks[i].free_qiov) {
4366             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4367         }
4368         g_free(mcb->callbacks[i].free_qiov);
4369     }
4370 }
4371 
4372 static void multiwrite_cb(void *opaque, int ret)
4373 {
4374     MultiwriteCB *mcb = opaque;
4375 
4376     trace_multiwrite_cb(mcb, ret);
4377 
4378     if (ret < 0 && !mcb->error) {
4379         mcb->error = ret;
4380     }
4381 
4382     mcb->num_requests--;
4383     if (mcb->num_requests == 0) {
4384         multiwrite_user_cb(mcb);
4385         g_free(mcb);
4386     }
4387 }
4388 
4389 static int multiwrite_req_compare(const void *a, const void *b)
4390 {
4391     const BlockRequest *req1 = a, *req2 = b;
4392 
4393     /*
4394      * Note that we can't simply subtract req2->sector from req1->sector
4395      * here as that could overflow the return value.
4396      */
4397     if (req1->sector > req2->sector) {
4398         return 1;
4399     } else if (req1->sector < req2->sector) {
4400         return -1;
4401     } else {
4402         return 0;
4403     }
4404 }
4405 
4406 /*
4407  * Takes a bunch of requests and tries to merge them. Returns the number of
4408  * requests that remain after merging.
4409  */
4410 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4411     int num_reqs, MultiwriteCB *mcb)
4412 {
4413     int i, outidx;
4414 
4415     // Sort requests by start sector
4416     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4417 
4418     // Check if adjacent requests touch the same clusters. If so, combine them,
4419     // filling up gaps with zero sectors.
4420     outidx = 0;
4421     for (i = 1; i < num_reqs; i++) {
4422         int merge = 0;
4423         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4424 
4425         // Handle exactly sequential writes and overlapping writes.
4426         if (reqs[i].sector <= oldreq_last) {
4427             merge = 1;
4428         }
4429 
4430         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4431             merge = 0;
4432         }
4433 
4434         if (merge) {
4435             size_t size;
4436             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4437             qemu_iovec_init(qiov,
4438                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4439 
4440             // Add the first request to the merged one. If the requests are
4441             // overlapping, drop the last sectors of the first request.
4442             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4443             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4444 
4445             // We should not need to add any zeros between the two requests
4446             assert(reqs[i].sector <= oldreq_last);
4447 
4448             // Add the second request
4449             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4450 
4451             // Add tail of first request, if necessary
4452             if (qiov->size < reqs[outidx].qiov->size) {
4453                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4454                                   reqs[outidx].qiov->size - qiov->size);
4455             }
4456 
4457             reqs[outidx].nb_sectors = qiov->size >> 9;
4458             reqs[outidx].qiov = qiov;
4459 
4460             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4461         } else {
4462             outidx++;
4463             reqs[outidx].sector     = reqs[i].sector;
4464             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4465             reqs[outidx].qiov       = reqs[i].qiov;
4466         }
4467     }
4468 
4469     return outidx + 1;
4470 }
4471 
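/* Worked example with hypothetical requests: {sector 0, 8 sectors} and
 * {sector 8, 8 sectors} are exactly sequential (reqs[1].sector ==
 * oldreq_last), so they merge into one {sector 0, 16 sectors} request
 * whose qiov concatenates both vectors.  An overlapping pair {0, 8} and
 * {4, 8} merges into {0, 12}: only the first 4 sectors of the first
 * request's qiov are kept, followed by all 8 sectors of the second.  If
 * the second request is fully contained in the first, the leftover tail
 * of the first request's qiov is appended again after it.
 */
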
4472 /*
4473  * Submit multiple AIO write requests at once.
4474  *
4475  * On success, the function returns 0 and all requests in the reqs array have
4476  * been submitted. In the error case this function returns -1, and any of the
4477  * requests may or may not have been submitted yet. In particular, this means that the
4478  * callback will be called for some of the requests, for others it won't. The
4479  * caller must check the error field of the BlockRequest to wait for the right
4480  * callbacks (if error != 0, no callback will be called).
4481  *
4482  * The implementation may modify the contents of the reqs array, e.g. to merge
4483  * requests. However, the fields opaque and error are left unmodified as they
4484  * are used to signal failure for a single request to the caller.
4485  */
4486 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4487 {
4488     MultiwriteCB *mcb;
4489     int i;
4490 
4491     /* don't submit writes if we don't have a medium */
4492     if (bs->drv == NULL) {
4493         for (i = 0; i < num_reqs; i++) {
4494             reqs[i].error = -ENOMEDIUM;
4495         }
4496         return -1;
4497     }
4498 
4499     if (num_reqs == 0) {
4500         return 0;
4501     }
4502 
4503     // Create MultiwriteCB structure
4504     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4505     mcb->num_requests = 0;
4506     mcb->num_callbacks = num_reqs;
4507 
4508     for (i = 0; i < num_reqs; i++) {
4509         mcb->callbacks[i].cb = reqs[i].cb;
4510         mcb->callbacks[i].opaque = reqs[i].opaque;
4511     }
4512 
4513     // Check for mergeable requests
4514     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4515 
4516     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4517 
4518     /* Run the aio requests. */
4519     mcb->num_requests = num_reqs;
4520     for (i = 0; i < num_reqs; i++) {
4521         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4522                               reqs[i].nb_sectors, reqs[i].flags,
4523                               multiwrite_cb, mcb,
4524                               true);
4525     }
4526 
4527     return 0;
4528 }
4529 
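/* A caller-side sketch (my_write_done, the qiovs and the req pointers are
 * hypothetical); cb/opaque stay per-request, while merging only rewrites
 * sector/nb_sectors/qiov:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_write_done, .opaque = req0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_write_done, .opaque = req1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // only requests with reqs[i].error == 0 will still get callbacks
 *     }
 */
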
4530 void bdrv_aio_cancel(BlockAIOCB *acb)
4531 {
4532     qemu_aio_ref(acb);
4533     bdrv_aio_cancel_async(acb);
4534     while (acb->refcnt > 1) {
4535         if (acb->aiocb_info->get_aio_context) {
4536             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4537         } else if (acb->bs) {
4538             aio_poll(bdrv_get_aio_context(acb->bs), true);
4539         } else {
4540             abort();
4541         }
4542     }
4543     qemu_aio_unref(acb);
4544 }
4545 
4546 /* Async version of aio cancel. The caller is not blocked if the acb implements
4547  * cancel_async; otherwise we do nothing and let the request complete normally.
4548  * In either case the completion callback must be called. */
4549 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4550 {
4551     if (acb->aiocb_info->cancel_async) {
4552         acb->aiocb_info->cancel_async(acb);
4553     }
4554 }
4555 
4556 /**************************************************************/
4557 /* async block device emulation */
4558 
4559 typedef struct BlockAIOCBSync {
4560     BlockAIOCB common;
4561     QEMUBH *bh;
4562     int ret;
4563     /* vector translation state */
4564     QEMUIOVector *qiov;
4565     uint8_t *bounce;
4566     int is_write;
4567 } BlockAIOCBSync;
4568 
4569 static const AIOCBInfo bdrv_em_aiocb_info = {
4570     .aiocb_size         = sizeof(BlockAIOCBSync),
4571 };
4572 
4573 static void bdrv_aio_bh_cb(void *opaque)
4574 {
4575     BlockAIOCBSync *acb = opaque;
4576 
4577     if (!acb->is_write && acb->ret >= 0) {
4578         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4579     }
4580     qemu_vfree(acb->bounce);
4581     acb->common.cb(acb->common.opaque, acb->ret);
4582     qemu_bh_delete(acb->bh);
4583     acb->bh = NULL;
4584     qemu_aio_unref(acb);
4585 }
4586 
4587 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4588                                       int64_t sector_num,
4589                                       QEMUIOVector *qiov,
4590                                       int nb_sectors,
4591                                       BlockCompletionFunc *cb,
4592                                       void *opaque,
4593                                       int is_write)
4594 
4595 {
4596     BlockAIOCBSync *acb;
4597 
4598     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4599     acb->is_write = is_write;
4600     acb->qiov = qiov;
4601     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4602     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4603 
4604     if (acb->bounce == NULL) {
4605         acb->ret = -ENOMEM;
4606     } else if (is_write) {
4607         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4608         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4609     } else {
4610         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4611     }
4612 
4613     qemu_bh_schedule(acb->bh);
4614 
4615     return &acb->common;
4616 }
4617 
4618 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4619         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4620         BlockCompletionFunc *cb, void *opaque)
4621 {
4622     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4623 }
4624 
4625 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4626         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4627         BlockCompletionFunc *cb, void *opaque)
4628 {
4629     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4630 }
4631 
4632 
4633 typedef struct BlockAIOCBCoroutine {
4634     BlockAIOCB common;
4635     BlockRequest req;
4636     bool is_write;
4637     bool *done;
4638     QEMUBH *bh;
4639 } BlockAIOCBCoroutine;
4640 
4641 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4642     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
4643 };
4644 
4645 static void bdrv_co_em_bh(void *opaque)
4646 {
4647     BlockAIOCBCoroutine *acb = opaque;
4648 
4649     acb->common.cb(acb->common.opaque, acb->req.error);
4650 
4651     qemu_bh_delete(acb->bh);
4652     qemu_aio_unref(acb);
4653 }
4654 
4655 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4656 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4657 {
4658     BlockAIOCBCoroutine *acb = opaque;
4659     BlockDriverState *bs = acb->common.bs;
4660 
4661     if (!acb->is_write) {
4662         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4663             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4664     } else {
4665         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4666             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4667     }
4668 
4669     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4670     qemu_bh_schedule(acb->bh);
4671 }
4672 
4673 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4674                                          int64_t sector_num,
4675                                          QEMUIOVector *qiov,
4676                                          int nb_sectors,
4677                                          BdrvRequestFlags flags,
4678                                          BlockCompletionFunc *cb,
4679                                          void *opaque,
4680                                          bool is_write)
4681 {
4682     Coroutine *co;
4683     BlockAIOCBCoroutine *acb;
4684 
4685     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4686     acb->req.sector = sector_num;
4687     acb->req.nb_sectors = nb_sectors;
4688     acb->req.qiov = qiov;
4689     acb->req.flags = flags;
4690     acb->is_write = is_write;
4691 
4692     co = qemu_coroutine_create(bdrv_co_do_rw);
4693     qemu_coroutine_enter(co, acb);
4694 
4695     return &acb->common;
4696 }
4697 
4698 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4699 {
4700     BlockAIOCBCoroutine *acb = opaque;
4701     BlockDriverState *bs = acb->common.bs;
4702 
4703     acb->req.error = bdrv_co_flush(bs);
4704     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4705     qemu_bh_schedule(acb->bh);
4706 }
4707 
4708 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4709         BlockCompletionFunc *cb, void *opaque)
4710 {
4711     Coroutine *co;
4712     BlockAIOCBCoroutine *acb;
4713 
4714     trace_bdrv_aio_flush(bs, opaque);
4715 
4716     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4717 
4718     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4719     qemu_coroutine_enter(co, acb);
4720 
4721     return &acb->common;
4722 }
4723 
4724 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4725 {
4726     BlockAIOCBCoroutine *acb = opaque;
4727     BlockDriverState *bs = acb->common.bs;
4728 
4729     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4730     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4731     qemu_bh_schedule(acb->bh);
4732 }
4733 
4734 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4735         int64_t sector_num, int nb_sectors,
4736         BlockCompletionFunc *cb, void *opaque)
4737 {
4738     Coroutine *co;
4739     BlockAIOCBCoroutine *acb;
4740 
4741     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4742 
4743     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4744     acb->req.sector = sector_num;
4745     acb->req.nb_sectors = nb_sectors;
4746     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4747     qemu_coroutine_enter(co, acb);
4748 
4749     return &acb->common;
4750 }
4751 
4752 void bdrv_init(void)
4753 {
4754     module_call_init(MODULE_INIT_BLOCK);
4755 }
4756 
4757 void bdrv_init_with_whitelist(void)
4758 {
4759     use_bdrv_whitelist = 1;
4760     bdrv_init();
4761 }
4762 
4763 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4764                    BlockCompletionFunc *cb, void *opaque)
4765 {
4766     BlockAIOCB *acb;
4767 
4768     acb = g_slice_alloc(aiocb_info->aiocb_size);
4769     acb->aiocb_info = aiocb_info;
4770     acb->bs = bs;
4771     acb->cb = cb;
4772     acb->opaque = opaque;
4773     acb->refcnt = 1;
4774     return acb;
4775 }
4776 
4777 void qemu_aio_ref(void *p)
4778 {
4779     BlockAIOCB *acb = p;
4780     acb->refcnt++;
4781 }
4782 
4783 void qemu_aio_unref(void *p)
4784 {
4785     BlockAIOCB *acb = p;
4786     assert(acb->refcnt > 0);
4787     if (--acb->refcnt == 0) {
4788         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4789     }
4790 }
4791 
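/* The three helpers above implement the common AIOCB embedding pattern:
 * an implementation declares its own struct with BlockAIOCB as the first
 * member, plus an AIOCBInfo giving its size (MyAIOCB is a hypothetical
 * illustration; BlockAIOCBSync above is a real user):
 *
 *     typedef struct MyAIOCB {
 *         BlockAIOCB common;
 *         int my_state;
 *     } MyAIOCB;
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 *     ...
 *     qemu_aio_unref(acb);
 */
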
4792 /**************************************************************/
4793 /* Coroutine block device emulation */
4794 
4795 typedef struct CoroutineIOCompletion {
4796     Coroutine *coroutine;
4797     int ret;
4798 } CoroutineIOCompletion;
4799 
4800 static void bdrv_co_io_em_complete(void *opaque, int ret)
4801 {
4802     CoroutineIOCompletion *co = opaque;
4803 
4804     co->ret = ret;
4805     qemu_coroutine_enter(co->coroutine, NULL);
4806 }
4807 
4808 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4809                                       int nb_sectors, QEMUIOVector *iov,
4810                                       bool is_write)
4811 {
4812     CoroutineIOCompletion co = {
4813         .coroutine = qemu_coroutine_self(),
4814     };
4815     BlockAIOCB *acb;
4816 
4817     if (is_write) {
4818         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4819                                        bdrv_co_io_em_complete, &co);
4820     } else {
4821         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4822                                       bdrv_co_io_em_complete, &co);
4823     }
4824 
4825     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4826     if (!acb) {
4827         return -EIO;
4828     }
4829     qemu_coroutine_yield();
4830 
4831     return co.ret;
4832 }
4833 
4834 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4835                                          int64_t sector_num, int nb_sectors,
4836                                          QEMUIOVector *iov)
4837 {
4838     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4839 }
4840 
4841 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4842                                          int64_t sector_num, int nb_sectors,
4843                                          QEMUIOVector *iov)
4844 {
4845     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4846 }
4847 
4848 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4849 {
4850     RwCo *rwco = opaque;
4851 
4852     rwco->ret = bdrv_co_flush(rwco->bs);
4853 }
4854 
4855 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4856 {
4857     int ret;
4858 
4859     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4860         return 0;
4861     }
4862 
4863     /* Write back cached data to the OS even with cache=unsafe */
4864     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4865     if (bs->drv->bdrv_co_flush_to_os) {
4866         ret = bs->drv->bdrv_co_flush_to_os(bs);
4867         if (ret < 0) {
4868             return ret;
4869         }
4870     }
4871 
4872     /* But don't actually force it to the disk with cache=unsafe */
4873     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4874         goto flush_parent;
4875     }
4876 
4877     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4878     if (bs->drv->bdrv_co_flush_to_disk) {
4879         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4880     } else if (bs->drv->bdrv_aio_flush) {
4881         BlockAIOCB *acb;
4882         CoroutineIOCompletion co = {
4883             .coroutine = qemu_coroutine_self(),
4884         };
4885 
4886         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4887         if (acb == NULL) {
4888             ret = -EIO;
4889         } else {
4890             qemu_coroutine_yield();
4891             ret = co.ret;
4892         }
4893     } else {
4894         /*
4895          * Some block drivers always operate in either writethrough or unsafe
4896          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4897          * know how the server works (because the behaviour is hardcoded or
4898          * depends on server-side configuration), so we can't ensure that
4899          * everything is safe on disk. Returning an error doesn't work because
4900          * that would break guests even if the server operates in writethrough
4901          * mode.
4902          *
4903          * Let's hope the user knows what they're doing.
4904          */
4905         ret = 0;
4906     }
4907     if (ret < 0) {
4908         return ret;
4909     }
4910 
4911     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4912      * in the case of cache=unsafe, so there are no useless flushes.
4913      */
4914 flush_parent:
4915     return bdrv_co_flush(bs->file);
4916 }
4917 
4918 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4919 {
4920     Error *local_err = NULL;
4921     int ret;
4922 
4923     if (!bs->drv)  {
4924         return;
4925     }
4926 
4927     if (!(bs->open_flags & BDRV_O_INCOMING)) {
4928         return;
4929     }
4930     bs->open_flags &= ~BDRV_O_INCOMING;
4931 
4932     if (bs->drv->bdrv_invalidate_cache) {
4933         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4934     } else if (bs->file) {
4935         bdrv_invalidate_cache(bs->file, &local_err);
4936     }
4937     if (local_err) {
4938         error_propagate(errp, local_err);
4939         return;
4940     }
4941 
4942     ret = refresh_total_sectors(bs, bs->total_sectors);
4943     if (ret < 0) {
4944         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4945         return;
4946     }
4947 }
4948 
4949 void bdrv_invalidate_cache_all(Error **errp)
4950 {
4951     BlockDriverState *bs;
4952     Error *local_err = NULL;
4953 
4954     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4955         AioContext *aio_context = bdrv_get_aio_context(bs);
4956 
4957         aio_context_acquire(aio_context);
4958         bdrv_invalidate_cache(bs, &local_err);
4959         aio_context_release(aio_context);
4960         if (local_err) {
4961             error_propagate(errp, local_err);
4962             return;
4963         }
4964     }
4965 }
4966 
4967 int bdrv_flush(BlockDriverState *bs)
4968 {
4969     Coroutine *co;
4970     RwCo rwco = {
4971         .bs = bs,
4972         .ret = NOT_DONE,
4973     };
4974 
4975     if (qemu_in_coroutine()) {
4976         /* Fast-path if already in coroutine context */
4977         bdrv_flush_co_entry(&rwco);
4978     } else {
4979         AioContext *aio_context = bdrv_get_aio_context(bs);
4980 
4981         co = qemu_coroutine_create(bdrv_flush_co_entry);
4982         qemu_coroutine_enter(co, &rwco);
4983         while (rwco.ret == NOT_DONE) {
4984             aio_poll(aio_context, true);
4985         }
4986     }
4987 
4988     return rwco.ret;
4989 }
4990 
4991 typedef struct DiscardCo {
4992     BlockDriverState *bs;
4993     int64_t sector_num;
4994     int nb_sectors;
4995     int ret;
4996 } DiscardCo;
4997 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4998 {
4999     DiscardCo *rwco = opaque;
5000 
5001     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5002 }
5003 
5004 /* If no limit is specified in the BlockLimits, use a default
5005  * of 32768 512-byte sectors (16 MiB) per request.
5006  */
5007 #define MAX_DISCARD_DEFAULT 32768
5008 
5009 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5010                                  int nb_sectors)
5011 {
5012     int max_discard;
5013 
5014     if (!bs->drv) {
5015         return -ENOMEDIUM;
5016     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5017         return -EIO;
5018     } else if (bs->read_only) {
5019         return -EROFS;
5020     }
5021 
5022     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5023 
5024     /* Do nothing if disabled.  */
5025     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5026         return 0;
5027     }
5028 
5029     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5030         return 0;
5031     }
5032 
5033     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5034     while (nb_sectors > 0) {
5035         int ret;
5036         int num = nb_sectors;
5037 
5038         /* align request */
5039         if (bs->bl.discard_alignment &&
5040             num >= bs->bl.discard_alignment &&
5041             sector_num % bs->bl.discard_alignment) {
5042             if (num > bs->bl.discard_alignment) {
5043                 num = bs->bl.discard_alignment;
5044             }
5045             num -= sector_num % bs->bl.discard_alignment;
5046         }
5047 
5048         /* limit request size */
5049         if (num > max_discard) {
5050             num = max_discard;
5051         }
5052 
5053         if (bs->drv->bdrv_co_discard) {
5054             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5055         } else {
5056             BlockAIOCB *acb;
5057             CoroutineIOCompletion co = {
5058                 .coroutine = qemu_coroutine_self(),
5059             };
5060 
5061             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5062                                             bdrv_co_io_em_complete, &co);
5063             if (acb == NULL) {
5064                 return -EIO;
5065             } else {
5066                 qemu_coroutine_yield();
5067                 ret = co.ret;
5068             }
5069         }
5070         if (ret && ret != -ENOTSUP) {
5071             return ret;
5072         }
5073 
5074         sector_num += num;
5075         nb_sectors -= num;
5076     }
5077     return 0;
5078 }
5079 
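/* Worked example with hypothetical limits: for bl.discard_alignment = 8,
 * bl.max_discard = 16 and a request of sector_num = 5, nb_sectors = 30,
 * the loop issues three chunks: 3 sectors ([5, 8)) to reach the alignment
 * boundary, then 16 sectors ([8, 24)) capped by max_discard, then the
 * remaining 11 sectors ([24, 35)).
 */
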
5080 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5081 {
5082     Coroutine *co;
5083     DiscardCo rwco = {
5084         .bs = bs,
5085         .sector_num = sector_num,
5086         .nb_sectors = nb_sectors,
5087         .ret = NOT_DONE,
5088     };
5089 
5090     if (qemu_in_coroutine()) {
5091         /* Fast-path if already in coroutine context */
5092         bdrv_discard_co_entry(&rwco);
5093     } else {
5094         AioContext *aio_context = bdrv_get_aio_context(bs);
5095 
5096         co = qemu_coroutine_create(bdrv_discard_co_entry);
5097         qemu_coroutine_enter(co, &rwco);
5098         while (rwco.ret == NOT_DONE) {
5099             aio_poll(aio_context, true);
5100         }
5101     }
5102 
5103     return rwco.ret;
5104 }
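
/* Usage sketch (illustrative, hypothetical caller): discard a 1 MiB region
 * synchronously from outside coroutine context. Note that a return value of
 * 0 does not guarantee that sectors were unmapped: per the checks above, the
 * request is silently ignored when BDRV_O_UNMAP is clear or the driver has
 * no discard callback.
 *
 *     int discard_one_mb(BlockDriverState *bs, int64_t start_sector)
 *     {
 *         int nb_sectors = (1 * 1024 * 1024) >> BDRV_SECTOR_BITS;
 *
 *         return bdrv_discard(bs, start_sector, nb_sectors);
 *     }
 */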
5105 
5106 /**************************************************************/
5107 /* removable device support */
5108 
5109 /**
5110  * Return TRUE if the media is present
5111  */
5112 int bdrv_is_inserted(BlockDriverState *bs)
5113 {
5114     BlockDriver *drv = bs->drv;
5115 
    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
5120     return drv->bdrv_is_inserted(bs);
5121 }
5122 
5123 /**
5124  * Return whether the media changed since the last call to this
5125  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5126  */
5127 int bdrv_media_changed(BlockDriverState *bs)
5128 {
5129     BlockDriver *drv = bs->drv;
5130 
5131     if (drv && drv->bdrv_media_changed) {
5132         return drv->bdrv_media_changed(bs);
5133     }
5134     return -ENOTSUP;
5135 }
5136 
5137 /**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5139  */
5140 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5141 {
5142     BlockDriver *drv = bs->drv;
5143     const char *device_name;
5144 
5145     if (drv && drv->bdrv_eject) {
5146         drv->bdrv_eject(bs, eject_flag);
5147     }
5148 
5149     device_name = bdrv_get_device_name(bs);
5150     if (device_name[0] != '\0') {
5151         qapi_event_send_device_tray_moved(device_name,
5152                                           eject_flag, &error_abort);
5153     }
5154 }
5155 
5156 /**
5157  * Lock or unlock the media (if it is locked, the user won't be able
5158  * to eject it manually).
5159  */
5160 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5161 {
5162     BlockDriver *drv = bs->drv;
5163 
5164     trace_bdrv_lock_medium(bs, locked);
5165 
5166     if (drv && drv->bdrv_lock_medium) {
5167         drv->bdrv_lock_medium(bs, locked);
5168     }
5169 }
5170 
5171 /* needed for generic scsi interface */
5172 
5173 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5174 {
5175     BlockDriver *drv = bs->drv;
5176 
    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
5179     return -ENOTSUP;
5180 }
5181 
5182 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5183         unsigned long int req, void *buf,
5184         BlockCompletionFunc *cb, void *opaque)
5185 {
5186     BlockDriver *drv = bs->drv;
5187 
    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
5190     return NULL;
5191 }
5192 
5193 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5194 {
5195     bs->guest_block_size = align;
5196 }
5197 
5198 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5199 {
5200     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5201 }
5202 
5203 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5204 {
5205     size_t align = bdrv_opt_mem_align(bs);
5206 
5207     /* Ensure that NULL is never returned on success */
5208     assert(align > 0);
5209     if (size == 0) {
5210         size = align;
5211     }
5212 
5213     return qemu_try_memalign(align, size);
5214 }
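
/* Usage sketch (illustrative): allocate a buffer that satisfies the
 * device's memory alignment requirements and release it with qemu_vfree().
 * The _try_ variant returns NULL on allocation failure, whereas
 * qemu_blockalign() aborts.
 *
 *     void *buf = qemu_try_blockalign(bs, len);
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     ...
 *     qemu_vfree(buf);
 */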
5215 
5216 /*
5217  * Check if all memory in this vector is sector aligned.
5218  */
5219 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5220 {
5221     int i;
5222     size_t alignment = bdrv_opt_mem_align(bs);
5223 
5224     for (i = 0; i < qiov->niov; i++) {
5225         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5226             return false;
5227         }
5228         if (qiov->iov[i].iov_len % alignment) {
5229             return false;
5230         }
5231     }
5232 
5233     return true;
5234 }
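
/* Usage sketch (illustrative): a protocol driver that opens its image with
 * O_DIRECT might use this check to choose between zero-copy submission and
 * a bounce-buffer path:
 *
 *     if (bdrv_qiov_is_aligned(bs, qiov)) {
 *         ... submit qiov directly ...
 *     } else {
 *         ... copy through a buffer from qemu_try_blockalign() ...
 *     }
 */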
5235 
5236 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5237                                           Error **errp)
5238 {
5239     int64_t bitmap_size;
5240     BdrvDirtyBitmap *bitmap;
5241 
5242     assert((granularity & (granularity - 1)) == 0);
5243 
5244     granularity >>= BDRV_SECTOR_BITS;
5245     assert(granularity);
5246     bitmap_size = bdrv_nb_sectors(bs);
5247     if (bitmap_size < 0) {
5248         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5249         errno = -bitmap_size;
5250         return NULL;
5251     }
5252     bitmap = g_new0(BdrvDirtyBitmap, 1);
5253     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5254     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5255     return bitmap;
5256 }
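
/* Usage sketch (illustrative): track writes at 64 KiB granularity and drop
 * the bitmap when done. Per the asserts above, granularity is in bytes and
 * must be a power of two no smaller than BDRV_SECTOR_SIZE.
 *
 *     Error *local_err = NULL;
 *     BdrvDirtyBitmap *bitmap;
 *
 *     bitmap = bdrv_create_dirty_bitmap(bs, 65536, &local_err);
 *     if (bitmap == NULL) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 *     ...
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */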
5257 
5258 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5259 {
5260     BdrvDirtyBitmap *bm, *next;
5261     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5262         if (bm == bitmap) {
5263             QLIST_REMOVE(bitmap, list);
5264             hbitmap_free(bitmap->bitmap);
5265             g_free(bitmap);
5266             return;
5267         }
5268     }
5269 }
5270 
5271 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5272 {
5273     BdrvDirtyBitmap *bm;
5274     BlockDirtyInfoList *list = NULL;
5275     BlockDirtyInfoList **plist = &list;
5276 
5277     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5278         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5279         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5280         info->count = bdrv_get_dirty_count(bs, bm);
5281         info->granularity =
5282             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5283         entry->value = info;
5284         *plist = entry;
5285         plist = &entry->next;
5286     }
5287 
5288     return list;
5289 }
5290 
5291 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5292 {
5293     if (bitmap) {
5294         return hbitmap_get(bitmap->bitmap, sector);
5295     } else {
5296         return 0;
5297     }
5298 }
5299 
5300 void bdrv_dirty_iter_init(BlockDriverState *bs,
5301                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5302 {
5303     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5304 }
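
/* Usage sketch (illustrative), modeled on the mirror block job: walk every
 * dirty sector recorded in a bitmap. That hbitmap_iter_next() returns a
 * negative value once the iterator is exhausted is assumed from its use
 * elsewhere in the tree.
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         ... handle the dirty chunk starting at sector ...
 *     }
 */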
5305 
5306 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5307                     int nr_sectors)
5308 {
5309     BdrvDirtyBitmap *bitmap;
5310     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5311         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5312     }
5313 }
5314 
5315 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5316 {
5317     BdrvDirtyBitmap *bitmap;
5318     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5319         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5320     }
5321 }
5322 
5323 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5324 {
5325     return hbitmap_count(bitmap->bitmap);
5326 }
5327 
5328 /* Get a reference to bs */
5329 void bdrv_ref(BlockDriverState *bs)
5330 {
5331     bs->refcnt++;
5332 }
5333 
5334 /* Release a previously grabbed reference to bs.
5335  * If after releasing, reference count is zero, the BlockDriverState is
5336  * deleted. */
5337 void bdrv_unref(BlockDriverState *bs)
5338 {
5339     if (!bs) {
5340         return;
5341     }
5342     assert(bs->refcnt > 0);
5343     if (--bs->refcnt == 0) {
5344         bdrv_delete(bs);
5345     }
5346 }
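
/* Usage sketch (illustrative): take a temporary reference to keep a BDS
 * alive across an operation that might drop the last reference elsewhere.
 *
 *     bdrv_ref(bs);
 *     ... operation that may indirectly call bdrv_unref(bs) ...
 *     bdrv_unref(bs);
 */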
5347 
5348 struct BdrvOpBlocker {
5349     Error *reason;
5350     QLIST_ENTRY(BdrvOpBlocker) list;
5351 };
5352 
5353 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5354 {
5355     BdrvOpBlocker *blocker;
5356     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5357     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5358         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5359         if (errp) {
5360             error_setg(errp, "Device '%s' is busy: %s",
5361                        bdrv_get_device_name(bs),
5362                        error_get_pretty(blocker->reason));
5363         }
5364         return true;
5365     }
5366     return false;
5367 }
5368 
5369 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5370 {
5371     BdrvOpBlocker *blocker;
5372     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5373 
5374     blocker = g_new0(BdrvOpBlocker, 1);
5375     blocker->reason = reason;
5376     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5377 }
5378 
5379 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5380 {
5381     BdrvOpBlocker *blocker, *next;
5382     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5383     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5384         if (blocker->reason == reason) {
5385             QLIST_REMOVE(blocker, list);
5386             g_free(blocker);
5387         }
5388     }
5389 }
5390 
5391 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5392 {
5393     int i;
5394     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5395         bdrv_op_block(bs, i, reason);
5396     }
5397 }
5398 
5399 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5400 {
5401     int i;
5402     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5403         bdrv_op_unblock(bs, i, reason);
5404     }
5405 }
5406 
5407 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5408 {
5409     int i;
5410 
5411     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5412         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5413             return false;
5414         }
5415     }
5416     return true;
5417 }
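
/* Usage sketch (illustrative), following the pattern used by block jobs: a
 * single Error object serves as the shared blocker reason for the duration
 * of an activity.
 *
 *     Error *blocker = NULL;
 *
 *     error_setg(&blocker, "Device is in use by some activity");
 *     bdrv_op_block_all(bs, blocker);
 *     ...
 *     bdrv_op_unblock_all(bs, blocker);
 *     error_free(blocker);
 */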
5418 
5419 void bdrv_iostatus_enable(BlockDriverState *bs)
5420 {
5421     bs->iostatus_enabled = true;
5422     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5423 }
5424 
5425 /* The I/O status is only enabled if the drive explicitly
5426  * enables it _and_ the VM is configured to stop on errors */
5427 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5428 {
5429     return (bs->iostatus_enabled &&
5430            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5431             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5432             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5433 }
5434 
5435 void bdrv_iostatus_disable(BlockDriverState *bs)
5436 {
5437     bs->iostatus_enabled = false;
5438 }
5439 
5440 void bdrv_iostatus_reset(BlockDriverState *bs)
5441 {
5442     if (bdrv_iostatus_is_enabled(bs)) {
5443         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5444         if (bs->job) {
5445             block_job_iostatus_reset(bs->job);
5446         }
5447     }
5448 }
5449 
5450 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5451 {
5452     assert(bdrv_iostatus_is_enabled(bs));
5453     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5454         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5455                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5456     }
5457 }
5458 
5459 void bdrv_img_create(const char *filename, const char *fmt,
5460                      const char *base_filename, const char *base_fmt,
5461                      char *options, uint64_t img_size, int flags,
5462                      Error **errp, bool quiet)
5463 {
5464     QemuOptsList *create_opts = NULL;
5465     QemuOpts *opts = NULL;
5466     const char *backing_fmt, *backing_file;
5467     int64_t size;
5468     BlockDriver *drv, *proto_drv;
5469     BlockDriver *backing_drv = NULL;
5470     Error *local_err = NULL;
5471     int ret = 0;
5472 
5473     /* Find driver and parse its options */
5474     drv = bdrv_find_format(fmt);
5475     if (!drv) {
5476         error_setg(errp, "Unknown file format '%s'", fmt);
5477         return;
5478     }
5479 
5480     proto_drv = bdrv_find_protocol(filename, true);
5481     if (!proto_drv) {
5482         error_setg(errp, "Unknown protocol '%s'", filename);
5483         return;
5484     }
5485 
5486     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5487     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5488 
5489     /* Create parameter list with default values */
5490     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5491     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5492 
5493     /* Parse -o options */
5494     if (options) {
5495         if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5496             error_setg(errp, "Invalid options for file format '%s'", fmt);
5497             goto out;
5498         }
5499     }
5500 
5501     if (base_filename) {
5502         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5503             error_setg(errp, "Backing file not supported for file format '%s'",
5504                        fmt);
5505             goto out;
5506         }
5507     }
5508 
5509     if (base_fmt) {
5510         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5511             error_setg(errp, "Backing file format not supported for file "
5512                              "format '%s'", fmt);
5513             goto out;
5514         }
5515     }
5516 
5517     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5518     if (backing_file) {
5519         if (!strcmp(filename, backing_file)) {
5520             error_setg(errp, "Error: Trying to create an image with the "
5521                              "same filename as the backing file");
5522             goto out;
5523         }
5524     }
5525 
5526     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5527     if (backing_fmt) {
5528         backing_drv = bdrv_find_format(backing_fmt);
5529         if (!backing_drv) {
5530             error_setg(errp, "Unknown backing file format '%s'",
5531                        backing_fmt);
5532             goto out;
5533         }
5534     }
5535 
    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there.
     */
5538     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5539     if (size == -1) {
5540         if (backing_file) {
5541             BlockDriverState *bs;
            int64_t backing_size;
5543             int back_flags;
5544 
5545             /* backing files always opened read-only */
5546             back_flags =
5547                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5548 
5549             bs = NULL;
5550             ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5551                             backing_drv, &local_err);
5552             if (ret < 0) {
5553                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5554                                  backing_file,
5555                                  error_get_pretty(local_err));
5556                 error_free(local_err);
5557                 local_err = NULL;
5558                 goto out;
5559             }
            backing_size = bdrv_getlength(bs);
            if (backing_size < 0) {
                error_setg_errno(errp, -backing_size,
                                 "Could not get size of '%s'", backing_file);
5564                 bdrv_unref(bs);
5565                 goto out;
5566             }
5567 
            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, backing_size);
5569 
5570             bdrv_unref(bs);
5571         } else {
5572             error_setg(errp, "Image creation needs a size parameter");
5573             goto out;
5574         }
5575     }
5576 
5577     if (!quiet) {
5578         printf("Formatting '%s', fmt=%s ", filename, fmt);
5579         qemu_opts_print(opts);
5580         puts("");
5581     }
5582 
5583     ret = bdrv_create(drv, filename, opts, &local_err);
5584 
5585     if (ret == -EFBIG) {
5586         /* This is generally a better message than whatever the driver would
5587          * deliver (especially because of the cluster_size_hint), since that
5588          * is most probably not much different from "image too large". */
5589         const char *cluster_size_hint = "";
5590         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5591             cluster_size_hint = " (try using a larger cluster size)";
5592         }
5593         error_setg(errp, "The image size is too large for file format '%s'"
5594                    "%s", fmt, cluster_size_hint);
5595         error_free(local_err);
5596         local_err = NULL;
5597     }
5598 
5599 out:
5600     qemu_opts_del(opts);
5601     qemu_opts_free(create_opts);
5602     if (local_err) {
5603         error_propagate(errp, local_err);
5604     }
5605 }
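
/* Usage sketch (illustrative): create a 1 GiB qcow2 image with default
 * options and no backing file, roughly what "qemu-img create -f qcow2
 * test.qcow2 1G" boils down to.
 *
 *     Error *local_err = NULL;
 *
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                     1024 * 1024 * 1024, 0, &local_err, false);
 *     if (local_err) {
 *         error_propagate(errp, local_err);
 *     }
 */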
5606 
5607 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5608 {
5609     return bs->aio_context;
5610 }
5611 
5612 void bdrv_detach_aio_context(BlockDriverState *bs)
5613 {
5614     BdrvAioNotifier *baf;
5615 
5616     if (!bs->drv) {
5617         return;
5618     }
5619 
5620     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5621         baf->detach_aio_context(baf->opaque);
5622     }
5623 
5624     if (bs->io_limits_enabled) {
5625         throttle_detach_aio_context(&bs->throttle_state);
5626     }
5627     if (bs->drv->bdrv_detach_aio_context) {
5628         bs->drv->bdrv_detach_aio_context(bs);
5629     }
5630     if (bs->file) {
5631         bdrv_detach_aio_context(bs->file);
5632     }
5633     if (bs->backing_hd) {
5634         bdrv_detach_aio_context(bs->backing_hd);
5635     }
5636 
5637     bs->aio_context = NULL;
5638 }
5639 
5640 void bdrv_attach_aio_context(BlockDriverState *bs,
5641                              AioContext *new_context)
5642 {
5643     BdrvAioNotifier *ban;
5644 
5645     if (!bs->drv) {
5646         return;
5647     }
5648 
5649     bs->aio_context = new_context;
5650 
5651     if (bs->backing_hd) {
5652         bdrv_attach_aio_context(bs->backing_hd, new_context);
5653     }
5654     if (bs->file) {
5655         bdrv_attach_aio_context(bs->file, new_context);
5656     }
5657     if (bs->drv->bdrv_attach_aio_context) {
5658         bs->drv->bdrv_attach_aio_context(bs, new_context);
5659     }
5660     if (bs->io_limits_enabled) {
5661         throttle_attach_aio_context(&bs->throttle_state, new_context);
5662     }
5663 
5664     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5665         ban->attached_aio_context(new_context, ban->opaque);
5666     }
5667 }
5668 
5669 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5670 {
5671     bdrv_drain_all(); /* ensure there are no in-flight requests */
5672 
5673     bdrv_detach_aio_context(bs);
5674 
5675     /* This function executes in the old AioContext so acquire the new one in
5676      * case it runs in a different thread.
5677      */
5678     aio_context_acquire(new_context);
5679     bdrv_attach_aio_context(bs, new_context);
5680     aio_context_release(new_context);
5681 }
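
/* Usage sketch (illustrative): hand a BDS over to an IOThread, as a
 * dataplane-style device would; iothread_get_aio_context() is assumed here
 * from its use by the dataplane code.
 *
 *     AioContext *ctx = iothread_get_aio_context(iothread);
 *
 *     bdrv_set_aio_context(bs, ctx);
 */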
5682 
5683 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5684         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5685         void (*detach_aio_context)(void *opaque), void *opaque)
5686 {
5687     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5688     *ban = (BdrvAioNotifier){
5689         .attached_aio_context = attached_aio_context,
5690         .detach_aio_context   = detach_aio_context,
5691         .opaque               = opaque
5692     };
5693 
5694     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5695 }
5696 
5697 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5698                                       void (*attached_aio_context)(AioContext *,
5699                                                                    void *),
5700                                       void (*detach_aio_context)(void *),
5701                                       void *opaque)
5702 {
5703     BdrvAioNotifier *ban, *ban_next;
5704 
5705     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5706         if (ban->attached_aio_context == attached_aio_context &&
5707             ban->detach_aio_context   == detach_aio_context   &&
5708             ban->opaque               == opaque)
5709         {
5710             QLIST_REMOVE(ban, list);
5711             g_free(ban);
5712 
5713             return;
5714         }
5715     }
5716 
5717     abort();
5718 }
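
/* Usage sketch (illustrative; MyDevice is a hypothetical type): a device
 * model that caches the AioContext can keep its cache coherent with a
 * notifier pair. The same three values passed on registration must be used
 * for removal, or the abort() above will trigger.
 *
 *     static void my_attached_aio_context(AioContext *ctx, void *opaque)
 *     {
 *         MyDevice *d = opaque;
 *
 *         d->ctx = ctx;
 *     }
 *
 *     static void my_detach_aio_context(void *opaque)
 *     {
 *         MyDevice *d = opaque;
 *
 *         d->ctx = NULL;
 *     }
 *
 *     bdrv_add_aio_context_notifier(bs, my_attached_aio_context,
 *                                   my_detach_aio_context, d);
 *     ...
 *     bdrv_remove_aio_context_notifier(bs, my_attached_aio_context,
 *                                      my_detach_aio_context, d);
 */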
5719 
5720 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5721                                     NotifierWithReturn *notifier)
5722 {
5723     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5724 }
5725 
5726 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
5727 {
5728     if (!bs->drv->bdrv_amend_options) {
5729         return -ENOTSUP;
5730     }
5731     return bs->drv->bdrv_amend_options(bs, opts);
5732 }
5733 
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test whether the given bs is the candidate, or to recurse
 * further down the node graph.
 */
5739 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5740                                       BlockDriverState *candidate)
5741 {
    /* return false if the basic checks fail */
5743     if (!bs || !bs->drv) {
5744         return false;
5745     }
5746 
    /* The code has reached a non-filter driver -> check whether this bs is
     * the candidate. This is the recursion's termination condition.
     */
5750     if (!bs->drv->is_filter) {
5751         return bs == candidate;
5752     }
5753     /* Down this path the driver is a block filter driver */
5754 
5755     /* If the block filter recursion method is defined use it to recurse down
5756      * the node graph.
5757      */
5758     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5759         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5760     }
5761 
    /* The driver is a block filter but does not allow recursion -> return
     * false.
     */
5764     return false;
5765 }
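
/* Sketch (illustrative) of how a filter driver with several children, such
 * as quorum, might forward the recursion; MyState, num_children and
 * children[] are hypothetical names:
 *
 *     static bool my_recurse_is_first_non_filter(BlockDriverState *bs,
 *                                                BlockDriverState *candidate)
 *     {
 *         MyState *s = bs->opaque;
 *         int i;
 *
 *         for (i = 0; i < s->num_children; i++) {
 *             if (bdrv_recurse_is_first_non_filter(s->children[i],
 *                                                  candidate)) {
 *                 return true;
 *             }
 *         }
 *         return false;
 *     }
 */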
5766 
/* This function checks whether the candidate is the first non-filter bs down
 * its bs chain. Since we don't have pointers to parents, it explores all bs
 * chains from the top. Some filters can choose not to pass down the recursion.
 */
5771 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5772 {
5773     BlockDriverState *bs;
5774 
5775     /* walk down the bs forest recursively */
5776     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5777         bool perm;
5778 
5779         /* try to recurse in this top level bs */
5780         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5781 
5782         /* candidate is the first non filter */
5783         if (perm) {
5784             return true;
5785         }
5786     }
5787 
5788     return false;
5789 }
5790 
5791 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5792 {
5793     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5794     if (!to_replace_bs) {
5795         error_setg(errp, "Node name '%s' not found", node_name);
5796         return NULL;
5797     }
5798 
5799     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5800         return NULL;
5801     }
5802 
    /* We don't want an arbitrary node of the BDS chain to be replaced, only
     * the topmost non-filter, in order to prevent data corruption.
     * Another benefit is that this test excludes backing files, which are
     * blocked by the backing blockers.
     */
5808     if (!bdrv_is_first_non_filter(to_replace_bs)) {
5809         error_setg(errp, "Only top most non filter can be replaced");
5810         return NULL;
5811     }
5812 
5813     return to_replace_bs;
5814 }
5815 
5816 void bdrv_io_plug(BlockDriverState *bs)
5817 {
5818     BlockDriver *drv = bs->drv;
5819     if (drv && drv->bdrv_io_plug) {
5820         drv->bdrv_io_plug(bs);
5821     } else if (bs->file) {
5822         bdrv_io_plug(bs->file);
5823     }
5824 }
5825 
5826 void bdrv_io_unplug(BlockDriverState *bs)
5827 {
5828     BlockDriver *drv = bs->drv;
5829     if (drv && drv->bdrv_io_unplug) {
5830         drv->bdrv_io_unplug(bs);
5831     } else if (bs->file) {
5832         bdrv_io_unplug(bs->file);
5833     }
5834 }
5835 
5836 void bdrv_flush_io_queue(BlockDriverState *bs)
5837 {
5838     BlockDriver *drv = bs->drv;
5839     if (drv && drv->bdrv_flush_io_queue) {
5840         drv->bdrv_flush_io_queue(bs);
5841     } else if (bs->file) {
5842         bdrv_flush_io_queue(bs->file);
5843     }
5844 }
5845 
5846 static bool append_open_options(QDict *d, BlockDriverState *bs)
5847 {
5848     const QDictEntry *entry;
5849     bool found_any = false;
5850 
5851     for (entry = qdict_first(bs->options); entry;
5852          entry = qdict_next(bs->options, entry))
5853     {
5854         /* Only take options for this level and exclude all non-driver-specific
5855          * options */
5856         if (!strchr(qdict_entry_key(entry), '.') &&
5857             strcmp(qdict_entry_key(entry), "node-name"))
5858         {
5859             qobject_incref(qdict_entry_value(entry));
5860             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5861             found_any = true;
5862         }
5863     }
5864 
5865     return found_any;
5866 }
5867 
5868 /* Updates the following BDS fields:
5869  *  - exact_filename: A filename which may be used for opening a block device
5870  *                    which (mostly) equals the given BDS (even without any
5871  *                    other options; so reading and writing must return the same
5872  *                    results, but caching etc. may be different)
5873  *  - full_open_options: Options which, when given when opening a block device
5874  *                       (without a filename), result in a BDS (mostly)
5875  *                       equalling the given one
5876  *  - filename: If exact_filename is set, it is copied here. Otherwise,
5877  *              full_open_options is converted to a JSON object, prefixed with
5878  *              "json:" (for use through the JSON pseudo protocol) and put here.
5879  */
5880 void bdrv_refresh_filename(BlockDriverState *bs)
5881 {
5882     BlockDriver *drv = bs->drv;
5883     QDict *opts;
5884 
5885     if (!drv) {
5886         return;
5887     }
5888 
5889     /* This BDS's file name will most probably depend on its file's name, so
5890      * refresh that first */
5891     if (bs->file) {
5892         bdrv_refresh_filename(bs->file);
5893     }
5894 
5895     if (drv->bdrv_refresh_filename) {
5896         /* Obsolete information is of no use here, so drop the old file name
5897          * information before refreshing it */
5898         bs->exact_filename[0] = '\0';
5899         if (bs->full_open_options) {
5900             QDECREF(bs->full_open_options);
5901             bs->full_open_options = NULL;
5902         }
5903 
5904         drv->bdrv_refresh_filename(bs);
5905     } else if (bs->file) {
5906         /* Try to reconstruct valid information from the underlying file */
5907         bool has_open_options;
5908 
5909         bs->exact_filename[0] = '\0';
5910         if (bs->full_open_options) {
5911             QDECREF(bs->full_open_options);
5912             bs->full_open_options = NULL;
5913         }
5914 
5915         opts = qdict_new();
5916         has_open_options = append_open_options(opts, bs);
5917 
5918         /* If no specific options have been given for this BDS, the filename of
5919          * the underlying file should suffice for this one as well */
5920         if (bs->file->exact_filename[0] && !has_open_options) {
5921             strcpy(bs->exact_filename, bs->file->exact_filename);
5922         }
5923         /* Reconstructing the full options QDict is simple for most format block
5924          * drivers, as long as the full options are known for the underlying
5925          * file BDS. The full options QDict of that file BDS should somehow
5926          * contain a representation of the filename, therefore the following
5927          * suffices without querying the (exact_)filename of this BDS. */
5928         if (bs->file->full_open_options) {
5929             qdict_put_obj(opts, "driver",
5930                           QOBJECT(qstring_from_str(drv->format_name)));
5931             QINCREF(bs->file->full_open_options);
5932             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
5933 
5934             bs->full_open_options = opts;
5935         } else {
5936             QDECREF(opts);
5937         }
5938     } else if (!bs->full_open_options && qdict_size(bs->options)) {
5939         /* There is no underlying file BDS (at least referenced by BDS.file),
5940          * so the full options QDict should be equal to the options given
5941          * specifically for this block device when it was opened (plus the
5942          * driver specification).
5943          * Because those options don't change, there is no need to update
5944          * full_open_options when it's already set. */
5945 
5946         opts = qdict_new();
5947         append_open_options(opts, bs);
5948         qdict_put_obj(opts, "driver",
5949                       QOBJECT(qstring_from_str(drv->format_name)));
5950 
5951         if (bs->exact_filename[0]) {
5952             /* This may not work for all block protocol drivers (some may
5953              * require this filename to be parsed), but we have to find some
5954              * default solution here, so just include it. If some block driver
5955              * does not support pure options without any filename at all or
5956              * needs some special format of the options QDict, it needs to
5957              * implement the driver-specific bdrv_refresh_filename() function.
5958              */
5959             qdict_put_obj(opts, "filename",
5960                           QOBJECT(qstring_from_str(bs->exact_filename)));
5961         }
5962 
5963         bs->full_open_options = opts;
5964     }
5965 
5966     if (bs->exact_filename[0]) {
5967         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
5968     } else if (bs->full_open_options) {
5969         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
5970         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
5971                  qstring_get_str(json));
5972         QDECREF(json);
5973     }
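
/* Example (illustrative) of the fields this produces for a qcow2 image over
 * a plain file, opened without driver-specific options:
 *
 *     exact_filename:    /tmp/test.qcow2
 *     full_open_options: {"driver": "qcow2",
 *                         "file": {"driver": "file",
 *                                  "filename": "/tmp/test.qcow2"}}
 *     filename:          /tmp/test.qcow2 (or, when exact_filename cannot be
 *                        determined, the "json:{...}" form built above)
 */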
5974 }
5975 
/* The purpose of this accessor function is to allow the device models to
 * access the BlockAcctStats structure embedded inside a BlockDriverState
 * without being aware of the BlockDriverState structure layout.
 * It will go away when the BlockAcctStats structure is moved into the
 * device models.
 */
5982 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
5983 {
5984     return &bs->stats;
5985 }
5986