/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

#define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled IOs */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
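
/* Usage sketch (illustrative only, not part of the original file): per the
 * comment above, bdrv_io_limits_enable() must run before bdrv_set_io_limits()
 * so that throttle_init() has set up the timers that the config will arm.
 * The ThrottleConfig field names below are an assumption based on how
 * throttle_config() is typically fed; real callers fill cfg from user limits.
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;  // ~1 MiB/s cap
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */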

/* This function makes an IO wait if needed
 *
 * @bytes:    the number of bytes of the IO
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this IO have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
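
/* Examples (illustrative, not part of the original file) of how path_combine
 * resolves a backing file name against the path of its overlay:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "base.qcow2");
 *     // dest is now "/images/base.qcow2"
 *
 *     path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "/abs/base.qcow2");
 *     // absolute names are copied through unchanged: "/abs/base.qcow2"
 */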

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;
    int i;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
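
/* Usage sketch (illustrative, not part of the original file): creating a
 * 1 GiB image through the protocol layer. BLOCK_OPT_SIZE is the standard
 * create-option key for the image size; error handling is elided.
 *
 *     BlockDriver *drv = bdrv_find_protocol("/tmp/test.img", true);
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);
 *     bdrv_create(drv, "/tmp/test.img", opts, &local_err);
 *     qemu_opts_del(opts);
 */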

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
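
/* Usage sketch (illustrative, not part of the original file):
 *
 *     char tmp[PATH_MAX];
 *     int ret = get_tmp_filename(tmp, sizeof(tmp));
 *     if (ret < 0) {
 *         // ret is a negative errno value, e.g. -EOVERFLOW
 *     }
 *     // on POSIX hosts tmp now names an empty file such as
 *     // "/var/tmp/vl.a1b2c3"
 */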

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
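
/* Examples (illustrative, not part of the original file) of how filenames
 * resolve, assuming the usual drivers are registered:
 *
 *     bdrv_find_protocol("/tmp/disk.qcow2", true);      // -> "file" driver
 *     bdrv_find_protocol("nbd:localhost:10809", true);  // -> "nbd" driver
 *     bdrv_find_protocol("nbd:localhost:10809", false); // prefix ignored,
 *                                                       //    -> "file" driver
 */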

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
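
/* Summary (added for clarity; mapping taken directly from the code above):
 *
 *     mode           NOCACHE  CACHE_WB  NO_FLUSH
 *     off/none          x        x
 *     directsync        x
 *     writeback                  x
 *     unsafe                     x         x
 *     writethrough   (none set; this is the default)
 */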

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
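
/* Example (illustrative, not part of the original file): a "json:" filename
 * such as
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "/tmp/test.qcow2"}}
 *
 * becomes, after qdict_flatten(), an options QDict with dotted keys:
 *
 *     driver        -> "qcow2"
 *     file.driver   -> "file"
 *     file.filename -> "/tmp/test.qcow2"
 */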

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bs->device_name);
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new("", errp);

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
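
/* Example (illustrative, not part of the original file): with a parent
 * options QDict containing the flattened entries
 *
 *     file.driver   -> "file"
 *     file.filename -> "/tmp/test.img"
 *
 * a caller such as bdrv_open() can attach the protocol layer with
 *
 *     BlockDriverState *file = NULL;
 *     bdrv_open_image(&file, NULL, options, "file",
 *                     bdrv_inherited_flags(flags), true, &local_err);
 *
 * after which the "file.*" keys have been consumed from the parent's
 * options QDict.
 */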

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
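
/* Usage sketch (illustrative, not part of the original file): opening a
 * qcow2 image read/write with format probing disabled by naming the driver
 * explicitly in the options QDict. Error handling is elided.
 *
 *     BlockDriverState *bs = NULL;
 *     QDict *opts = qdict_new();
 *     qdict_put(opts, "driver", qstring_from_str("qcow2"));
 *     ret = bdrv_open(&bs, "/tmp/test.qcow2", NULL, opts,
 *                     BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, &local_err);
 *     // on success bs points at the new BDS; the opts reference now
 *     // belongs to the block layer
 */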

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
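
/* Usage sketch (illustrative, not part of the original file): switching two
 * devices to read-only in one atomic transaction; either both reopen with
 * the new flags or neither does. The flag manipulation here is an assumed
 * typical use, not taken from this file.
 *
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
 *     ret = bdrv_reopen_multiple(queue, &local_err); // also frees the queue
 */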
1701 
1702 
1703 /*
1704  * Prepares a BlockDriverState for reopen. All changes are staged in the
1705  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1706  * the block driver layer .bdrv_reopen_prepare()
1707  *
1708  * bs is the BlockDriverState to reopen
1709  * flags are the new open flags
1710  * queue is the reopen queue
1711  *
1712  * Returns 0 on success, non-zero on error.  On error errp will be set
1713  * as well.
1714  *
1715  * On failure, bdrv_reopen_abort() will be called to clean up any staged data.
1716  * The caller is then responsible for calling bdrv_reopen_abort() or
1717  * bdrv_reopen_commit() for any other BDS left in the prepared state.
1718  *
1719  */
1720 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1721                         Error **errp)
1722 {
1723     int ret = -1;
1724     Error *local_err = NULL;
1725     BlockDriver *drv;
1726 
1727     assert(reopen_state != NULL);
1728     assert(reopen_state->bs->drv != NULL);
1729     drv = reopen_state->bs->drv;
1730 
1731     /* if we are to stay read-only, do not allow permission change
1732      * to r/w */
1733     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1734         reopen_state->flags & BDRV_O_RDWR) {
1735         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1736                   reopen_state->bs->device_name);
1737         goto error;
1738     }
1739 
1740 
1741     ret = bdrv_flush(reopen_state->bs);
1742     if (ret) {
1743         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1744                   strerror(-ret));
1745         goto error;
1746     }
1747 
1748     if (drv->bdrv_reopen_prepare) {
1749         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1750         if (ret) {
1751             if (local_err != NULL) {
1752                 error_propagate(errp, local_err);
1753             } else {
1754                 error_setg(errp, "failed while preparing to reopen image '%s'",
1755                            reopen_state->bs->filename);
1756             }
1757             goto error;
1758         }
1759     } else {
1760         /* It is currently mandatory to have a bdrv_reopen_prepare()
1761          * handler for each supported drv. */
1762         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1763                   drv->format_name, reopen_state->bs->device_name,
1764                  "reopening of file");
1765         ret = -1;
1766         goto error;
1767     }
1768 
1769     ret = 0;
1770 
1771 error:
1772     return ret;
1773 }
1774 
1775 /*
1776  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1777  * makes them final by swapping the staging BlockDriverState contents into
1778  * the active BlockDriverState contents.
1779  */
1780 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1781 {
1782     BlockDriver *drv;
1783 
1784     assert(reopen_state != NULL);
1785     drv = reopen_state->bs->drv;
1786     assert(drv != NULL);
1787 
1788     /* If there are any driver level actions to take */
1789     if (drv->bdrv_reopen_commit) {
1790         drv->bdrv_reopen_commit(reopen_state);
1791     }
1792 
1793     /* set BDS specific flags now */
1794     reopen_state->bs->open_flags         = reopen_state->flags;
1795     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1796                                               BDRV_O_CACHE_WB);
1797     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1798 
1799     bdrv_refresh_limits(reopen_state->bs, NULL);
1800 }
1801 
1802 /*
1803  * Abort the reopen, and delete and free the staged changes in
1804  * reopen_state
1805  */
1806 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1807 {
1808     BlockDriver *drv;
1809 
1810     assert(reopen_state != NULL);
1811     drv = reopen_state->bs->drv;
1812     assert(drv != NULL);
1813 
1814     if (drv->bdrv_reopen_abort) {
1815         drv->bdrv_reopen_abort(reopen_state);
1816     }
1817 }
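
/*
 * A format driver participates in the reopen transaction by implementing
 * .bdrv_reopen_prepare() (currently mandatory, see above) and optionally
 * .bdrv_reopen_commit() / .bdrv_reopen_abort().  A minimal sketch with
 * hypothetical names (BDRVExampleReopenState, example_*):
 *
 *     typedef struct BDRVExampleReopenState {
 *         int new_fd;                        // staged resource
 *     } BDRVExampleReopenState;
 *
 *     static int example_reopen_prepare(BDRVReopenState *state,
 *                                       BlockReopenQueue *queue, Error **errp)
 *     {
 *         BDRVExampleReopenState *rs = g_new0(BDRVExampleReopenState, 1);
 *         state->opaque = rs;
 *         // acquire new resources here; on error free rs, set errp and
 *         // return a negative value
 *         return 0;
 *     }
 *
 *     static void example_reopen_commit(BDRVReopenState *state)
 *     {
 *         // swap the staged resources into bs, close the old ones
 *         g_free(state->opaque);
 *         state->opaque = NULL;
 *     }
 *
 *     static void example_reopen_abort(BDRVReopenState *state)
 *     {
 *         // release the staged resources, leaving bs untouched
 *         g_free(state->opaque);
 *         state->opaque = NULL;
 *     }
 */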
1818 
1819 
1820 void bdrv_close(BlockDriverState *bs)
1821 {
1822     BdrvAioNotifier *ban, *ban_next;
1823 
1824     if (bs->job) {
1825         block_job_cancel_sync(bs->job);
1826     }
1827     bdrv_drain_all(); /* complete I/O */
1828     bdrv_flush(bs);
1829     bdrv_drain_all(); /* in case flush left pending I/O */
1830     notifier_list_notify(&bs->close_notifiers, bs);
1831 
1832     if (bs->drv) {
1833         if (bs->backing_hd) {
1834             BlockDriverState *backing_hd = bs->backing_hd;
1835             bdrv_set_backing_hd(bs, NULL);
1836             bdrv_unref(backing_hd);
1837         }
1838         bs->drv->bdrv_close(bs);
1839         g_free(bs->opaque);
1840         bs->opaque = NULL;
1841         bs->drv = NULL;
1842         bs->copy_on_read = 0;
1843         bs->backing_file[0] = '\0';
1844         bs->backing_format[0] = '\0';
1845         bs->total_sectors = 0;
1846         bs->encrypted = 0;
1847         bs->valid_key = 0;
1848         bs->sg = 0;
1849         bs->growable = 0;
1850         bs->zero_beyond_eof = false;
1851         QDECREF(bs->options);
1852         bs->options = NULL;
1853         QDECREF(bs->full_open_options);
1854         bs->full_open_options = NULL;
1855 
1856         if (bs->file != NULL) {
1857             bdrv_unref(bs->file);
1858             bs->file = NULL;
1859         }
1860     }
1861 
1862     bdrv_dev_change_media_cb(bs, false);
1863 
1864     /* throttling disk I/O limits */
1865     if (bs->io_limits_enabled) {
1866         bdrv_io_limits_disable(bs);
1867     }
1868 
1869     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1870         g_free(ban);
1871     }
1872     QLIST_INIT(&bs->aio_notifiers);
1873 }
1874 
1875 void bdrv_close_all(void)
1876 {
1877     BlockDriverState *bs;
1878 
1879     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1880         AioContext *aio_context = bdrv_get_aio_context(bs);
1881 
1882         aio_context_acquire(aio_context);
1883         bdrv_close(bs);
1884         aio_context_release(aio_context);
1885     }
1886 }
1887 
1888 /* Check if any requests are in-flight (including throttled requests) */
1889 static bool bdrv_requests_pending(BlockDriverState *bs)
1890 {
1891     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1892         return true;
1893     }
1894     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1895         return true;
1896     }
1897     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1898         return true;
1899     }
1900     if (bs->file && bdrv_requests_pending(bs->file)) {
1901         return true;
1902     }
1903     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1904         return true;
1905     }
1906     return false;
1907 }
1908 
1909 /*
1910  * Wait for pending requests to complete across all BlockDriverStates
1911  *
1912  * This function does not flush data to disk, use bdrv_flush_all() for that
1913  * after calling this function.
1914  *
1915  * Note that completion of an asynchronous I/O operation can trigger any
1916  * number of other I/O operations on other devices---for example a coroutine
1917  * can be arbitrarily complex and a constant flow of I/O can come until the
1918  * coroutine is complete.  Because of this, it is not possible to have a
1919  * function to drain a single device's I/O queue.
1920  */
1921 void bdrv_drain_all(void)
1922 {
1923     /* Always run first iteration so any pending completion BHs run */
1924     bool busy = true;
1925     BlockDriverState *bs;
1926 
1927     while (busy) {
1928         busy = false;
1929 
1930         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1931             AioContext *aio_context = bdrv_get_aio_context(bs);
1932             bool bs_busy;
1933 
1934             aio_context_acquire(aio_context);
1935             bdrv_flush_io_queue(bs);
1936             bdrv_start_throttled_reqs(bs);
1937             bs_busy = bdrv_requests_pending(bs);
1938             bs_busy |= aio_poll(aio_context, bs_busy);
1939             aio_context_release(aio_context);
1940 
1941             busy |= bs_busy;
1942         }
1943     }
1944 }
1945 
1946 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
1947  * graph_bdrv_states lists.  Also clear the device_name to prevent a double
1948  * remove */
1949 void bdrv_make_anon(BlockDriverState *bs)
1950 {
1951     if (bs->device_name[0] != '\0') {
1952         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1953     }
1954     bs->device_name[0] = '\0';
1955     if (bs->node_name[0] != '\0') {
1956         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1957     }
1958     bs->node_name[0] = '\0';
1959 }
1960 
1961 static void bdrv_rebind(BlockDriverState *bs)
1962 {
1963     if (bs->drv && bs->drv->bdrv_rebind) {
1964         bs->drv->bdrv_rebind(bs);
1965     }
1966 }
1967 
1968 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1969                                      BlockDriverState *bs_src)
1970 {
1971     /* move some fields that need to stay attached to the device */
1972 
1973     /* dev info */
1974     bs_dest->dev_ops            = bs_src->dev_ops;
1975     bs_dest->dev_opaque         = bs_src->dev_opaque;
1976     bs_dest->dev                = bs_src->dev;
1977     bs_dest->guest_block_size   = bs_src->guest_block_size;
1978     bs_dest->copy_on_read       = bs_src->copy_on_read;
1979 
1980     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1981 
1982     /* i/o throttled req */
1983     memcpy(&bs_dest->throttle_state,
1984            &bs_src->throttle_state,
1985            sizeof(ThrottleState));
1986     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1987     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1988     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1989 
1990     /* r/w error */
1991     bs_dest->on_read_error      = bs_src->on_read_error;
1992     bs_dest->on_write_error     = bs_src->on_write_error;
1993 
1994     /* i/o status */
1995     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1996     bs_dest->iostatus           = bs_src->iostatus;
1997 
1998     /* dirty bitmap */
1999     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2000 
2001     /* reference count */
2002     bs_dest->refcnt             = bs_src->refcnt;
2003 
2004     /* job */
2005     bs_dest->job                = bs_src->job;
2006 
2007     /* keep the same entry in bdrv_states */
2008     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
2009             bs_src->device_name);
2010     bs_dest->device_list = bs_src->device_list;
2011     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2012            sizeof(bs_dest->op_blockers));
2013 }
2014 
2015 /*
2016  * Swap bs contents for two image chains while they are live,
2017  * while keeping required fields on the BlockDriverState that is
2018  * actually attached to a device.
2019  *
2020  * This will modify the BlockDriverState fields, and swap contents
2021  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2022  *
2023  * bs_new is required to be anonymous.
2024  *
2025  * This function does not create any image files.
2026  */
2027 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2028 {
2029     BlockDriverState tmp;
2030 
2031     /* The code needs to swap the node_name, but simply swapping node_list
2032      * won't work; so first remove the nodes from the graph list, do the swap,
2033      * and then insert them back if needed.
2034      */
2035     if (bs_new->node_name[0] != '\0') {
2036         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2037     }
2038     if (bs_old->node_name[0] != '\0') {
2039         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2040     }
2041 
2042     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2043     assert(bs_new->device_name[0] == '\0');
2044     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2045     assert(bs_new->job == NULL);
2046     assert(bs_new->dev == NULL);
2047     assert(bs_new->io_limits_enabled == false);
2048     assert(!throttle_have_timer(&bs_new->throttle_state));
2049 
2050     tmp = *bs_new;
2051     *bs_new = *bs_old;
2052     *bs_old = tmp;
2053 
2054     /* there are some fields that should not be swapped; move them back */
2055     bdrv_move_feature_fields(&tmp, bs_old);
2056     bdrv_move_feature_fields(bs_old, bs_new);
2057     bdrv_move_feature_fields(bs_new, &tmp);
2058 
2059     /* bs_new shouldn't be in bdrv_states even after the swap!  */
2060     assert(bs_new->device_name[0] == '\0');
2061 
2062     /* Check a few fields that should remain attached to the device */
2063     assert(bs_new->dev == NULL);
2064     assert(bs_new->job == NULL);
2065     assert(bs_new->io_limits_enabled == false);
2066     assert(!throttle_have_timer(&bs_new->throttle_state));
2067 
2068     /* insert the nodes back into the graph node list if needed */
2069     if (bs_new->node_name[0] != '\0') {
2070         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2071     }
2072     if (bs_old->node_name[0] != '\0') {
2073         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2074     }
2075 
2076     bdrv_rebind(bs_new);
2077     bdrv_rebind(bs_old);
2078 }
2079 
2080 /*
2081  * Add new bs contents at the top of an image chain while the chain is
2082  * live, while keeping required fields on the top layer.
2083  *
2084  * This will modify the BlockDriverState fields, and swap contents
2085  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2086  *
2087  * bs_new is required to be anonymous.
2088  *
2089  * This function does not create any image files.
2090  */
2091 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2092 {
2093     bdrv_swap(bs_new, bs_top);
2094 
2095     /* After the swap, bs_new holds the old top's contents, so make it
2096      * the backing file of the new top. */
2097     bdrv_set_backing_hd(bs_top, bs_new);
2098 }
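
/*
 * Illustration of bdrv_append(): starting from the chain
 *
 *     base <- top              (the 'top' pointer is attached to the device)
 *
 * bdrv_append(new, top) swaps the contents of 'new' and 'top' and yields
 *
 *     base <- new <- top
 *
 * i.e. the 'top' pointer (still attached to the device) now carries the
 * freshly added contents, while 'new' carries what used to be the top and
 * serves as its backing file.  This is how, for example, live external
 * snapshots install an overlay without re-attaching the guest device.
 */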
2099 
2100 static void bdrv_delete(BlockDriverState *bs)
2101 {
2102     assert(!bs->dev);
2103     assert(!bs->job);
2104     assert(bdrv_op_blocker_is_empty(bs));
2105     assert(!bs->refcnt);
2106     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2107 
2108     bdrv_close(bs);
2109 
2110     /* remove from list, if necessary */
2111     bdrv_make_anon(bs);
2112 
2113     g_free(bs);
2114 }
2115 
2116 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2117 /* TODO change to DeviceState *dev when all users are qdevified */
2118 {
2119     if (bs->dev) {
2120         return -EBUSY;
2121     }
2122     bs->dev = dev;
2123     bdrv_iostatus_reset(bs);
2124 
2125     /* We're expecting I/O from the device so bump up coroutine pool size */
2126     qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION);
2127     return 0;
2128 }
2129 
2130 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2131 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2132 {
2133     if (bdrv_attach_dev(bs, dev) < 0) {
2134         abort();
2135     }
2136 }
2137 
2138 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2139 /* TODO change to DeviceState *dev when all users are qdevified */
2140 {
2141     assert(bs->dev == dev);
2142     bs->dev = NULL;
2143     bs->dev_ops = NULL;
2144     bs->dev_opaque = NULL;
2145     bs->guest_block_size = 512;
2146     qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION);
2147 }
2148 
2149 /* TODO change to return DeviceState * when all users are qdevified */
2150 void *bdrv_get_attached_dev(BlockDriverState *bs)
2151 {
2152     return bs->dev;
2153 }
2154 
2155 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2156                       void *opaque)
2157 {
2158     bs->dev_ops = ops;
2159     bs->dev_opaque = opaque;
2160 }
2161 
2162 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2163 {
2164     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2165         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2166         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2167         if (tray_was_closed) {
2168             /* tray open */
2169             qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2170                                               true, &error_abort);
2171         }
2172         if (load) {
2173             /* tray close */
2174             qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
2175                                               false, &error_abort);
2176         }
2177     }
2178 }
2179 
2180 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2181 {
2182     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2183 }
2184 
2185 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2186 {
2187     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2188         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2189     }
2190 }
2191 
2192 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2193 {
2194     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2195         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2196     }
2197     return false;
2198 }
2199 
2200 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2201 {
2202     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2203         bs->dev_ops->resize_cb(bs->dev_opaque);
2204     }
2205 }
2206 
2207 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2208 {
2209     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2210         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2211     }
2212     return false;
2213 }
2214 
2215 /*
2216  * Run consistency checks on an image
2217  *
2218  * Returns 0 if the check could be completed (it doesn't mean that the image is
2219  * free of errors) or -errno when an internal error occurred. The results of the
2220  * check are stored in res.
2221  */
2222 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2223 {
2224     if (bs->drv == NULL) {
2225         return -ENOMEDIUM;
2226     }
2227     if (bs->drv->bdrv_check == NULL) {
2228         return -ENOTSUP;
2229     }
2230 
2231     memset(res, 0, sizeof(*res));
2232     return bs->drv->bdrv_check(bs, res, fix);
2233 }
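
/*
 * Example use of bdrv_check() (a sketch; 'bs' is assumed to be an open
 * BlockDriverState):
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_ERRORS);
 *
 *     if (ret < 0) {
 *         // the check itself could not run (e.g. -ENOTSUP, -ENOMEDIUM)
 *     } else if (result.corruptions || result.check_errors) {
 *         // image was inspected and problems were found (and possibly
 *         // repaired; see result.corruptions_fixed / result.leaks_fixed)
 *     }
 */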
2234 
2235 #define COMMIT_BUF_SECTORS 2048
2236 
2237 /* commit COW file into the backing image */
2238 int bdrv_commit(BlockDriverState *bs)
2239 {
2240     BlockDriver *drv = bs->drv;
2241     int64_t sector, total_sectors, length, backing_length;
2242     int n, ro, open_flags;
2243     int ret = 0;
2244     uint8_t *buf = NULL;
2245     char filename[PATH_MAX];
2246 
2247     if (!drv)
2248         return -ENOMEDIUM;
2249 
2250     if (!bs->backing_hd) {
2251         return -ENOTSUP;
2252     }
2253 
2254     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2255         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2256         return -EBUSY;
2257     }
2258 
2259     ro = bs->backing_hd->read_only;
2260     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2261     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2262     open_flags = bs->backing_hd->open_flags;
2263 
2264     if (ro) {
2265         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2266             return -EACCES;
2267         }
2268     }
2269 
2270     length = bdrv_getlength(bs);
2271     if (length < 0) {
2272         ret = length;
2273         goto ro_cleanup;
2274     }
2275 
2276     backing_length = bdrv_getlength(bs->backing_hd);
2277     if (backing_length < 0) {
2278         ret = backing_length;
2279         goto ro_cleanup;
2280     }
2281 
2282     /* If our top snapshot is larger than the backing file image,
2283      * grow the backing file image if possible.  If not possible,
2284      * we must return an error */
2285     if (length > backing_length) {
2286         ret = bdrv_truncate(bs->backing_hd, length);
2287         if (ret < 0) {
2288             goto ro_cleanup;
2289         }
2290     }
2291 
2292     total_sectors = length >> BDRV_SECTOR_BITS;
2293 
2294     /* qemu_try_blockalign() for bs will choose an alignment that works for
2295      * bs->backing_hd as well, so no need to compare the alignment manually. */
2296     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2297     if (buf == NULL) {
2298         ret = -ENOMEM;
2299         goto ro_cleanup;
2300     }
2301 
2302     for (sector = 0; sector < total_sectors; sector += n) {
2303         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2304         if (ret < 0) {
2305             goto ro_cleanup;
2306         }
2307         if (ret) {
2308             ret = bdrv_read(bs, sector, buf, n);
2309             if (ret < 0) {
2310                 goto ro_cleanup;
2311             }
2312 
2313             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2314             if (ret < 0) {
2315                 goto ro_cleanup;
2316             }
2317         }
2318     }
2319 
2320     if (drv->bdrv_make_empty) {
2321         ret = drv->bdrv_make_empty(bs);
2322         if (ret < 0) {
2323             goto ro_cleanup;
2324         }
2325         bdrv_flush(bs);
2326     }
2327 
2328     /*
2329      * Make sure all data we wrote to the backing device is actually
2330      * stable on disk.
2331      */
2332     if (bs->backing_hd) {
2333         bdrv_flush(bs->backing_hd);
2334     }
2335 
2336     ret = 0;
2337 ro_cleanup:
2338     qemu_vfree(buf);
2339 
2340     if (ro) {
2341         /* ignoring error return here */
2342         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2343     }
2344 
2345     return ret;
2346 }
2347 
2348 int bdrv_commit_all(void)
2349 {
2350     BlockDriverState *bs;
2351 
2352     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2353         AioContext *aio_context = bdrv_get_aio_context(bs);
2354 
2355         aio_context_acquire(aio_context);
2356         if (bs->drv && bs->backing_hd) {
2357             int ret = bdrv_commit(bs);
2358             if (ret < 0) {
2359                 aio_context_release(aio_context);
2360                 return ret;
2361             }
2362         }
2363         aio_context_release(aio_context);
2364     }
2365     return 0;
2366 }
2367 
2368 /**
2369  * Remove an active request from the tracked requests list
2370  *
2371  * This function should be called when a tracked request is completing.
2372  */
2373 static void tracked_request_end(BdrvTrackedRequest *req)
2374 {
2375     if (req->serialising) {
2376         req->bs->serialising_in_flight--;
2377     }
2378 
2379     QLIST_REMOVE(req, list);
2380     qemu_co_queue_restart_all(&req->wait_queue);
2381 }
2382 
2383 /**
2384  * Add an active request to the tracked requests list
2385  */
2386 static void tracked_request_begin(BdrvTrackedRequest *req,
2387                                   BlockDriverState *bs,
2388                                   int64_t offset,
2389                                   unsigned int bytes, bool is_write)
2390 {
2391     *req = (BdrvTrackedRequest){
2392         .bs = bs,
2393         .offset         = offset,
2394         .bytes          = bytes,
2395         .is_write       = is_write,
2396         .co             = qemu_coroutine_self(),
2397         .serialising    = false,
2398         .overlap_offset = offset,
2399         .overlap_bytes  = bytes,
2400     };
2401 
2402     qemu_co_queue_init(&req->wait_queue);
2403 
2404     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2405 }
2406 
2407 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2408 {
2409     int64_t overlap_offset = req->offset & ~(align - 1);
2410     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2411                                - overlap_offset;
2412 
2413     if (!req->serialising) {
2414         req->bs->serialising_in_flight++;
2415         req->serialising = true;
2416     }
2417 
2418     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2419     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2420 }
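
/*
 * Example: serialising a request with offset = 1536, bytes = 256 at
 * align = 1024 yields overlap_offset = 1536 & ~1023 = 1024 and
 * overlap_bytes = ROUND_UP(1792, 1024) - 1024 = 1024, i.e. the request
 * now conflicts with anything touching the block [1024, 2048).
 */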
2421 
2422 /**
2423  * Round a region to cluster boundaries
2424  */
2425 void bdrv_round_to_clusters(BlockDriverState *bs,
2426                             int64_t sector_num, int nb_sectors,
2427                             int64_t *cluster_sector_num,
2428                             int *cluster_nb_sectors)
2429 {
2430     BlockDriverInfo bdi;
2431 
2432     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2433         *cluster_sector_num = sector_num;
2434         *cluster_nb_sectors = nb_sectors;
2435     } else {
2436         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2437         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2438         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2439                                             nb_sectors, c);
2440     }
2441 }
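
/*
 * Worked example: with a 64 KiB cluster size, c = 65536 / 512 = 128
 * sectors.  Rounding the region sector_num = 200, nb_sectors = 10 gives
 * cluster_sector_num = 128 and cluster_nb_sectors = 128, i.e. the whole
 * cluster [128, 256) that contains sectors [200, 210).
 */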
2442 
2443 static int bdrv_get_cluster_size(BlockDriverState *bs)
2444 {
2445     BlockDriverInfo bdi;
2446     int ret;
2447 
2448     ret = bdrv_get_info(bs, &bdi);
2449     if (ret < 0 || bdi.cluster_size == 0) {
2450         return bs->request_alignment;
2451     } else {
2452         return bdi.cluster_size;
2453     }
2454 }
2455 
2456 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2457                                      int64_t offset, unsigned int bytes)
2458 {
2459     /*        aaaa   bbbb */
2460     if (offset >= req->overlap_offset + req->overlap_bytes) {
2461         return false;
2462     }
2463     /* bbbb   aaaa        */
2464     if (req->overlap_offset >= offset + bytes) {
2465         return false;
2466     }
2467     return true;
2468 }
2469 
2470 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2471 {
2472     BlockDriverState *bs = self->bs;
2473     BdrvTrackedRequest *req;
2474     bool retry;
2475     bool waited = false;
2476 
2477     if (!bs->serialising_in_flight) {
2478         return false;
2479     }
2480 
2481     do {
2482         retry = false;
2483         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2484             if (req == self || (!req->serialising && !self->serialising)) {
2485                 continue;
2486             }
2487             if (tracked_request_overlaps(req, self->overlap_offset,
2488                                          self->overlap_bytes))
2489             {
2490                 /* Hitting this means there was a reentrant request, for
2491                  * example, a block driver issuing nested requests.  This must
2492                  * never happen since it means deadlock.
2493                  */
2494                 assert(qemu_coroutine_self() != req->co);
2495 
2496                 /* If the request is already (indirectly) waiting for us, or
2497                  * will wait for us as soon as it wakes up, then just go on
2498                  * (instead of producing a deadlock in the former case). */
2499                 if (!req->waiting_for) {
2500                     self->waiting_for = req;
2501                     qemu_co_queue_wait(&req->wait_queue);
2502                     self->waiting_for = NULL;
2503                     retry = true;
2504                     waited = true;
2505                     break;
2506                 }
2507             }
2508         }
2509     } while (retry);
2510 
2511     return waited;
2512 }
2513 
2514 /*
2515  * Return values:
2516  * 0        - success
2517  * -EINVAL  - backing format specified, but no file
2518  * -ENOSPC  - can't update the backing file because no space is left in the
2519  *            image file header
2520  * -ENOTSUP - format driver doesn't support changing the backing file
2521  */
2522 int bdrv_change_backing_file(BlockDriverState *bs,
2523     const char *backing_file, const char *backing_fmt)
2524 {
2525     BlockDriver *drv = bs->drv;
2526     int ret;
2527 
2528     /* Backing file format doesn't make sense without a backing file */
2529     if (backing_fmt && !backing_file) {
2530         return -EINVAL;
2531     }
2532 
2533     if (drv->bdrv_change_backing_file != NULL) {
2534         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2535     } else {
2536         ret = -ENOTSUP;
2537     }
2538 
2539     if (ret == 0) {
2540         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2541         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2542     }
2543     return ret;
2544 }
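
/*
 * Example caller handling for the return codes above (a sketch; 'bs',
 * 'new_backing' and 'new_fmt' are hypothetical):
 *
 *     int ret = bdrv_change_backing_file(bs, new_backing, new_fmt);
 *     if (ret == -ENOTSUP) {
 *         // format driver cannot rewrite its header; fall back or report
 *     } else if (ret == -ENOSPC) {
 *         // no room in the image header for the new backing file string
 *     } else if (ret < 0) {
 *         // other error, e.g. -EINVAL for a format without a file
 *     }
 */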
2545 
2546 /*
2547  * Finds the image layer in the chain that has 'bs' as its backing file.
2548  *
2549  * active is the current topmost image.
2550  *
2551  * Returns NULL if bs is not found in active's image chain,
2552  * or if active == bs.
2553  *
2554  * Returns the bottommost base image if bs == NULL.
2555  */
2556 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2557                                     BlockDriverState *bs)
2558 {
2559     while (active && bs != active->backing_hd) {
2560         active = active->backing_hd;
2561     }
2562 
2563     return active;
2564 }
2565 
2566 /* Given a BDS, searches for the base layer. */
2567 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2568 {
2569     return bdrv_find_overlay(bs, NULL);
2570 }
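
/*
 * Example: for the chain  base <- mid <- top  (with 'top' active),
 *
 *     bdrv_find_overlay(top, base) == mid    // the image backed by 'base'
 *     bdrv_find_overlay(top, top)  == NULL   // active == bs
 *     bdrv_find_base(top)          == base   // the bottommost image
 */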
2571 
2572 typedef struct BlkIntermediateStates {
2573     BlockDriverState *bs;
2574     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2575 } BlkIntermediateStates;
2576 
2577 
2578 /*
2579  * Drops images above 'base' up to and including 'top', and sets the image
2580  * above 'top' to have base as its backing file.
2581  *
2582  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2583  * information in top's overlay can be properly updated.
2584  *
2585  * E.g., this will convert the following chain:
2586  * bottom <- base <- intermediate <- top <- active
2587  *
2588  * to
2589  *
2590  * bottom <- base <- active
2591  *
2592  * It is allowed for bottom==base, in which case it converts:
2593  *
2594  * base <- intermediate <- top <- active
2595  *
2596  * to
2597  *
2598  * base <- active
2599  *
2600  * If backing_file_str is non-NULL, it will be used when modifying top's
2601  * overlay image metadata.
2602  *
2603  * Error conditions:
2604  *  if active == top, that is considered an error
2605  *
2606  */
2607 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2608                            BlockDriverState *base, const char *backing_file_str)
2609 {
2610     BlockDriverState *intermediate;
2611     BlockDriverState *base_bs = NULL;
2612     BlockDriverState *new_top_bs = NULL;
2613     BlkIntermediateStates *intermediate_state, *next;
2614     int ret = -EIO;
2615 
2616     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2617     QSIMPLEQ_INIT(&states_to_delete);
2618 
2619     if (!top->drv || !base->drv) {
2620         goto exit;
2621     }
2622 
2623     new_top_bs = bdrv_find_overlay(active, top);
2624 
2625     if (new_top_bs == NULL) {
2626         /* we could not find the image above 'top', this is an error */
2627         goto exit;
2628     }
2629 
2630     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2631      * to do, no intermediate images */
2632     if (new_top_bs->backing_hd == base) {
2633         ret = 0;
2634         goto exit;
2635     }
2636 
2637     intermediate = top;
2638 
2639     /* now we will walk down the backing chain, and add each BDS we find
2640      * into our deletion queue, until we hit the 'base'
2641      */
2642     while (intermediate) {
2643         intermediate_state = g_new0(BlkIntermediateStates, 1);
2644         intermediate_state->bs = intermediate;
2645         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2646 
2647         if (intermediate->backing_hd == base) {
2648             base_bs = intermediate->backing_hd;
2649             break;
2650         }
2651         intermediate = intermediate->backing_hd;
2652     }
2653     if (base_bs == NULL) {
2654         /* something went wrong; we did not end at the base. Safely
2655          * unravel everything, and exit with error */
2656         goto exit;
2657     }
2658 
2659     /* success - we can delete the intermediate states, and link top->base */
2660     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2661     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2662                                    base_bs->drv ? base_bs->drv->format_name : "");
2663     if (ret) {
2664         goto exit;
2665     }
2666     bdrv_set_backing_hd(new_top_bs, base_bs);
2667 
2668     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2669         /* so that bdrv_close() does not recursively close the chain */
2670         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2671         bdrv_unref(intermediate_state->bs);
2672     }
2673     ret = 0;
2674 
2675 exit:
2676     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2677         g_free(intermediate_state);
2678     }
2679     return ret;
2680 }
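
/*
 * Example: for the chain  base <- mid <- top <- active,
 *
 *     bdrv_drop_intermediate(active, top, base, NULL);
 *
 * deletes 'mid' and 'top' and rewrites active's backing file metadata to
 * point at 'base' (using base's filename, since backing_file_str is NULL),
 * leaving  base <- active.
 */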
2681 
2682 
2683 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2684                                    size_t size)
2685 {
2686     int64_t len;
2687 
2688     if (size > INT_MAX) {
2689         return -EIO;
2690     }
2691 
2692     if (!bdrv_is_inserted(bs))
2693         return -ENOMEDIUM;
2694 
2695     if (bs->growable)
2696         return 0;
2697 
2698     len = bdrv_getlength(bs);
2699 
2700     if (offset < 0)
2701         return -EIO;
2702 
2703     if ((offset > len) || (len - offset < size))
2704         return -EIO;
2705 
2706     return 0;
2707 }
2708 
2709 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2710                               int nb_sectors)
2711 {
2712     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2713         return -EIO;
2714     }
2715 
2716     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2717                                    nb_sectors * BDRV_SECTOR_SIZE);
2718 }
2719 
2720 typedef struct RwCo {
2721     BlockDriverState *bs;
2722     int64_t offset;
2723     QEMUIOVector *qiov;
2724     bool is_write;
2725     int ret;
2726     BdrvRequestFlags flags;
2727 } RwCo;
2728 
2729 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2730 {
2731     RwCo *rwco = opaque;
2732 
2733     if (!rwco->is_write) {
2734         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2735                                       rwco->qiov->size, rwco->qiov,
2736                                       rwco->flags);
2737     } else {
2738         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2739                                        rwco->qiov->size, rwco->qiov,
2740                                        rwco->flags);
2741     }
2742 }
2743 
2744 /*
2745  * Process a vectored synchronous request using coroutines
2746  */
2747 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2748                         QEMUIOVector *qiov, bool is_write,
2749                         BdrvRequestFlags flags)
2750 {
2751     Coroutine *co;
2752     RwCo rwco = {
2753         .bs = bs,
2754         .offset = offset,
2755         .qiov = qiov,
2756         .is_write = is_write,
2757         .ret = NOT_DONE,
2758         .flags = flags,
2759     };
2760 
2761     /**
2762      * In a synchronous call context the vcpu is blocked, so the throttling
2763      * timer will never fire; therefore the I/O throttling function has to be
2764      * disabled here if it has been enabled.
2765      */
2766     if (bs->io_limits_enabled) {
2767         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2768                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2769         bdrv_io_limits_disable(bs);
2770     }
2771 
2772     if (qemu_in_coroutine()) {
2773         /* Fast-path if already in coroutine context */
2774         bdrv_rw_co_entry(&rwco);
2775     } else {
2776         AioContext *aio_context = bdrv_get_aio_context(bs);
2777 
2778         co = qemu_coroutine_create(bdrv_rw_co_entry);
2779         qemu_coroutine_enter(co, &rwco);
2780         while (rwco.ret == NOT_DONE) {
2781             aio_poll(aio_context, true);
2782         }
2783     }
2784     return rwco.ret;
2785 }
2786 
2787 /*
2788  * Process a synchronous request using coroutines
2789  */
2790 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2791                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2792 {
2793     QEMUIOVector qiov;
2794     struct iovec iov = {
2795         .iov_base = (void *)buf,
2796         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2797     };
2798 
2799     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2800         return -EINVAL;
2801     }
2802 
2803     qemu_iovec_init_external(&qiov, &iov, 1);
2804     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2805                         &qiov, is_write, flags);
2806 }
2807 
2808 /* return < 0 if error. See bdrv_write() for the return codes */
2809 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2810               uint8_t *buf, int nb_sectors)
2811 {
2812     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2813 }
2814 
2815 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2816 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2817                           uint8_t *buf, int nb_sectors)
2818 {
2819     bool enabled;
2820     int ret;
2821 
2822     enabled = bs->io_limits_enabled;
2823     bs->io_limits_enabled = false;
2824     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2825     bs->io_limits_enabled = enabled;
2826     return ret;
2827 }
2828 
2829 /* Return < 0 if error. Important errors are:
2830   -EIO         generic I/O error (may happen for all errors)
2831   -ENOMEDIUM   No media inserted.
2832   -EINVAL      Invalid sector number or nb_sectors
2833   -EACCES      Trying to write a read-only device
2834 */
2835 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2836                const uint8_t *buf, int nb_sectors)
2837 {
2838     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2839 }
2840 
2841 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2842                       int nb_sectors, BdrvRequestFlags flags)
2843 {
2844     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2845                       BDRV_REQ_ZERO_WRITE | flags);
2846 }
2847 
2848 /*
2849  * Completely zero out a block device with the help of bdrv_write_zeroes.
2850  * The operation is sped up by checking the block status and only writing
2851  * zeroes to the device if they currently do not return zeroes. Optional
2852  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2853  *
2854  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2855  */
2856 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2857 {
2858     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2859     int n;
2860 
2861     target_sectors = bdrv_nb_sectors(bs);
2862     if (target_sectors < 0) {
2863         return target_sectors;
2864     }
2865 
2866     for (;;) {
2867         nb_sectors = target_sectors - sector_num;
2868         if (nb_sectors <= 0) {
2869             return 0;
2870         }
2871         if (nb_sectors > INT_MAX) {
2872             nb_sectors = INT_MAX;
2873         }
2874         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2875         if (ret < 0) {
2876             error_report("error getting block status at sector %" PRId64 ": %s",
2877                          sector_num, strerror(-ret));
2878             return ret;
2879         }
2880         if (ret & BDRV_BLOCK_ZERO) {
2881             sector_num += n;
2882             continue;
2883         }
2884         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2885         if (ret < 0) {
2886             error_report("error writing zeroes at sector %" PRId64 ": %s",
2887                          sector_num, strerror(-ret));
2888             return ret;
2889         }
2890         sector_num += n;
2891     }
2892 }
2893 
2894 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2895 {
2896     QEMUIOVector qiov;
2897     struct iovec iov = {
2898         .iov_base = (void *)buf,
2899         .iov_len = bytes,
2900     };
2901     int ret;
2902 
2903     if (bytes < 0) {
2904         return -EINVAL;
2905     }
2906 
2907     qemu_iovec_init_external(&qiov, &iov, 1);
2908     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2909     if (ret < 0) {
2910         return ret;
2911     }
2912 
2913     return bytes;
2914 }
2915 
2916 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2917 {
2918     int ret;
2919 
2920     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2921     if (ret < 0) {
2922         return ret;
2923     }
2924 
2925     return qiov->size;
2926 }
2927 
2928 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2929                 const void *buf, int bytes)
2930 {
2931     QEMUIOVector qiov;
2932     struct iovec iov = {
2933         .iov_base   = (void *) buf,
2934         .iov_len    = bytes,
2935     };
2936 
2937     if (bytes < 0) {
2938         return -EINVAL;
2939     }
2940 
2941     qemu_iovec_init_external(&qiov, &iov, 1);
2942     return bdrv_pwritev(bs, offset, &qiov);
2943 }
2944 
2945 /*
2946  * Writes to the file and ensures that no writes are reordered across this
2947  * request (acts as a barrier)
2948  *
2949  * Returns 0 on success, -errno in error cases.
2950  */
2951 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2952     const void *buf, int count)
2953 {
2954     int ret;
2955 
2956     ret = bdrv_pwrite(bs, offset, buf, count);
2957     if (ret < 0) {
2958         return ret;
2959     }
2960 
2961     /* No flush needed for cache modes that already do it */
2962     if (bs->enable_write_cache) {
2963         bdrv_flush(bs);
2964     }
2965 
2966     return 0;
2967 }
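
/*
 * Example: flushing an updated image header to disk before any dependent
 * writes are issued (a sketch; 'ExampleHeader' and 'fill_in_header_fields'
 * are hypothetical, 'bs' is an open format-level BlockDriverState):
 *
 *     ExampleHeader header;
 *     int ret;
 *
 *     fill_in_header_fields(&header);      // hypothetical helper
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;                      // header may not be on disk
 *     }
 *     // the barrier guarantees later writes cannot overtake the header
 */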
2968 
2969 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2970         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2971 {
2972     /* Perform I/O through a temporary buffer so that users who scribble over
2973      * their read buffer while the operation is in progress do not end up
2974      * modifying the image file.  This is critical for zero-copy guest I/O
2975      * where anything might happen inside guest memory.
2976      */
2977     void *bounce_buffer;
2978 
2979     BlockDriver *drv = bs->drv;
2980     struct iovec iov;
2981     QEMUIOVector bounce_qiov;
2982     int64_t cluster_sector_num;
2983     int cluster_nb_sectors;
2984     size_t skip_bytes;
2985     int ret;
2986 
2987     /* Cover the entire cluster so no additional backing file I/O is required
2988      * when allocating the cluster in the image file.
2989      */
2990     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2991                            &cluster_sector_num, &cluster_nb_sectors);
2992 
2993     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2994                                    cluster_sector_num, cluster_nb_sectors);
2995 
2996     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2997     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2998     if (bounce_buffer == NULL) {
2999         ret = -ENOMEM;
3000         goto err;
3001     }
3002 
3003     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3004 
3005     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3006                              &bounce_qiov);
3007     if (ret < 0) {
3008         goto err;
3009     }
3010 
3011     if (drv->bdrv_co_write_zeroes &&
3012         buffer_is_zero(bounce_buffer, iov.iov_len)) {
3013         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3014                                       cluster_nb_sectors, 0);
3015     } else {
3016         /* This does not change the data on the disk, so it is not necessary
3017          * to flush even in cache=writethrough mode.
3018          */
3019         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3020                                   &bounce_qiov);
3021     }
3022 
3023     if (ret < 0) {
3024         /* It might be okay to ignore write errors for guest requests.  If this
3025          * is a deliberate copy-on-read then we don't want to ignore the error.
3026          * Simply report it in all cases.
3027          */
3028         goto err;
3029     }
3030 
3031     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3032     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3033                         nb_sectors * BDRV_SECTOR_SIZE);
3034 
3035 err:
3036     qemu_vfree(bounce_buffer);
3037     return ret;
3038 }
3039 
3040 /*
3041  * Forwards an already correctly aligned request to the BlockDriver. This
3042  * handles copy on read and zeroing after EOF; any other features must be
3043  * implemented by the caller.
3044  */
3045 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3046     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3047     int64_t align, QEMUIOVector *qiov, int flags)
3048 {
3049     BlockDriver *drv = bs->drv;
3050     int ret;
3051 
3052     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3053     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3054 
3055     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3056     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3057     assert(!qiov || bytes == qiov->size);
3058 
3059     /* Handle Copy on Read and associated serialisation */
3060     if (flags & BDRV_REQ_COPY_ON_READ) {
3061         /* If we touch the same cluster it counts as an overlap.  This
3062          * guarantees that allocating writes will be serialized and not race
3063          * with each other for the same cluster.  For example, in copy-on-read
3064          * it ensures that the CoR read and write operations are atomic and
3065          * guest writes cannot interleave between them. */
3066         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3067     }
3068 
3069     wait_serialising_requests(req);
3070 
3071     if (flags & BDRV_REQ_COPY_ON_READ) {
3072         int pnum;
3073 
3074         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3075         if (ret < 0) {
3076             goto out;
3077         }
3078 
3079         if (!ret || pnum != nb_sectors) {
3080             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3081             goto out;
3082         }
3083     }
3084 
3085     /* Forward the request to the BlockDriver */
3086     if (!(bs->zero_beyond_eof && bs->growable)) {
3087         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3088     } else {
3089         /* Read zeros after EOF of growable BDSes */
3090         int64_t total_sectors, max_nb_sectors;
3091 
3092         total_sectors = bdrv_nb_sectors(bs);
3093         if (total_sectors < 0) {
3094             ret = total_sectors;
3095             goto out;
3096         }
3097 
3098         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3099                                   align >> BDRV_SECTOR_BITS);
3100         if (max_nb_sectors > 0) {
3101             QEMUIOVector local_qiov;
3102             size_t local_sectors;
3103 
3104             max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX >> BDRV_SECTOR_BITS);
3105             local_sectors = MIN(max_nb_sectors, nb_sectors);
3106 
3107             qemu_iovec_init(&local_qiov, qiov->niov);
3108             qemu_iovec_concat(&local_qiov, qiov, 0,
3109                               local_sectors * BDRV_SECTOR_SIZE);
3110 
3111             ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3112                                      &local_qiov);
3113 
3114             qemu_iovec_destroy(&local_qiov);
3115         } else {
3116             ret = 0;
3117         }
3118 
3119         /* Reading beyond end of file is supposed to produce zeroes */
3120         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3121             uint64_t offset = MAX(0, total_sectors - sector_num);
3122             uint64_t bytes = (sector_num + nb_sectors - offset) *
3123                               BDRV_SECTOR_SIZE;
3124             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3125         }
3126     }
3127 
3128 out:
3129     return ret;
3130 }
3131 
3132 /*
3133  * Handle a read request in coroutine context
3134  */
3135 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3136     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3137     BdrvRequestFlags flags)
3138 {
3139     BlockDriver *drv = bs->drv;
3140     BdrvTrackedRequest req;
3141 
3142     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3143     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3144     uint8_t *head_buf = NULL;
3145     uint8_t *tail_buf = NULL;
3146     QEMUIOVector local_qiov;
3147     bool use_local_qiov = false;
3148     int ret;
3149 
3150     if (!drv) {
3151         return -ENOMEDIUM;
3152     }
3153     if (bdrv_check_byte_request(bs, offset, bytes)) {
3154         return -EIO;
3155     }
3156 
3157     if (bs->copy_on_read) {
3158         flags |= BDRV_REQ_COPY_ON_READ;
3159     }
3160 
3161     /* throttling disk I/O */
3162     if (bs->io_limits_enabled) {
3163         bdrv_io_limits_intercept(bs, bytes, false);
3164     }
3165 
3166     /* Align read if necessary by padding qiov */
3167     if (offset & (align - 1)) {
3168         head_buf = qemu_blockalign(bs, align);
3169         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3170         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3171         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3172         use_local_qiov = true;
3173 
3174         bytes += offset & (align - 1);
3175         offset = offset & ~(align - 1);
3176     }
3177 
3178     if ((offset + bytes) & (align - 1)) {
3179         if (!use_local_qiov) {
3180             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3181             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3182             use_local_qiov = true;
3183         }
3184         tail_buf = qemu_blockalign(bs, align);
3185         qemu_iovec_add(&local_qiov, tail_buf,
3186                        align - ((offset + bytes) & (align - 1)));
3187 
3188         bytes = ROUND_UP(bytes, align);
3189     }
3190 
3191     tracked_request_begin(&req, bs, offset, bytes, false);
3192     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3193                               use_local_qiov ? &local_qiov : qiov,
3194                               flags);
3195     tracked_request_end(&req);
3196 
3197     if (use_local_qiov) {
3198         qemu_iovec_destroy(&local_qiov);
3199         qemu_vfree(head_buf);
3200         qemu_vfree(tail_buf);
3201     }
3202 
3203     return ret;
3204 }
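
/*
 * Worked example for the read alignment above: with align = 512, a request
 * for offset = 1000, bytes = 100 first grows at the head by
 * 1000 & 511 = 488 bytes (offset becomes 512, bytes 588); then the tail is
 * padded by 512 - ((512 + 588) & 511) = 436 bytes and bytes is rounded up
 * to 1024, so the driver finally sees the aligned request [512, 1536).
 */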
3205 
3206 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3207     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3208     BdrvRequestFlags flags)
3209 {
3210     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3211         return -EINVAL;
3212     }
3213 
3214     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3215                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3216 }
3217 
3218 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3219     int nb_sectors, QEMUIOVector *qiov)
3220 {
3221     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3222 
3223     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3224 }
3225 
3226 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3227     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3228 {
3229     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3230 
3231     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3232                             BDRV_REQ_COPY_ON_READ);
3233 }
3234 
3235 /* if no limit is specified in the BlockLimits, use a default
3236  * of 32768 512-byte sectors (16 MiB) per request.
3237  */
3238 #define MAX_WRITE_ZEROES_DEFAULT 32768
3239 
3240 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3241     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3242 {
3243     BlockDriver *drv = bs->drv;
3244     QEMUIOVector qiov;
3245     struct iovec iov = {0};
3246     int ret = 0;
3247 
3248     int max_write_zeroes = bs->bl.max_write_zeroes ?
3249                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3250 
3251     while (nb_sectors > 0 && !ret) {
3252         int num = nb_sectors;
3253 
3254         /* Align request.  Block drivers can expect the "bulk" of the request
3255          * to be aligned.
3256          */
3257         if (bs->bl.write_zeroes_alignment
3258             && num > bs->bl.write_zeroes_alignment) {
3259             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3260                 /* Make a small request up to the first aligned sector.  */
3261                 num = bs->bl.write_zeroes_alignment;
3262                 num -= sector_num % bs->bl.write_zeroes_alignment;
3263             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3264                 /* Shorten the request to the last aligned sector.  num cannot
3265                  * underflow because num > bs->bl.write_zeroes_alignment.
3266                  */
3267                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3268             }
3269         }
3270 
3271         /* limit request size */
3272         if (num > max_write_zeroes) {
3273             num = max_write_zeroes;
3274         }
3275 
3276         ret = -ENOTSUP;
3277         /* First try the efficient write zeroes operation */
3278         if (drv->bdrv_co_write_zeroes) {
3279             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3280         }
3281 
3282         if (ret == -ENOTSUP) {
3283             /* Fall back to bounce buffer if write zeroes is unsupported */
3284             iov.iov_len = num * BDRV_SECTOR_SIZE;
3285             if (iov.iov_base == NULL) {
3286                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3287                 if (iov.iov_base == NULL) {
3288                     ret = -ENOMEM;
3289                     goto fail;
3290                 }
3291                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3292             }
3293             qemu_iovec_init_external(&qiov, &iov, 1);
3294 
3295             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3296 
3297             /* Keep the bounce buffer around if it is big enough for all
3298              * future requests.
3299              */
3300             if (num < max_write_zeroes) {
3301                 qemu_vfree(iov.iov_base);
3302                 iov.iov_base = NULL;
3303             }
3304         }
3305 
3306         sector_num += num;
3307         nb_sectors -= num;
3308     }
3309 
3310 fail:
3311     qemu_vfree(iov.iov_base);
3312     return ret;
3313 }
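
/*
 * Worked example for the alignment logic above: with
 * bs->bl.write_zeroes_alignment = 8 sectors, a request for sector_num = 5,
 * nb_sectors = 25 is split into [5, 8) (a short head up to the first
 * aligned sector), [8, 24) (the aligned bulk) and [24, 30) (an unaligned
 * tail, which no longer exceeds the alignment and is passed through as-is).
 */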
3314 
3315 /*
3316  * Forwards an already correctly aligned write request to the BlockDriver.
3317  */
3318 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3319     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3320     QEMUIOVector *qiov, int flags)
3321 {
3322     BlockDriver *drv = bs->drv;
3323     bool waited;
3324     int ret;
3325 
3326     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3327     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3328 
3329     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3330     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3331     assert(!qiov || bytes == qiov->size);
3332 
3333     waited = wait_serialising_requests(req);
3334     assert(!waited || !req->serialising);
3335     assert(req->overlap_offset <= offset);
3336     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3337 
3338     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3339 
3340     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3341         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3342         qemu_iovec_is_zero(qiov)) {
3343         flags |= BDRV_REQ_ZERO_WRITE;
3344         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3345             flags |= BDRV_REQ_MAY_UNMAP;
3346         }
3347     }
3348 
3349     if (ret < 0) {
3350         /* Do nothing, write notifier decided to fail this request */
3351     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3352         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3353         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3354     } else {
3355         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3356         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3357     }
3358     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3359 
3360     if (ret == 0 && !bs->enable_write_cache) {
3361         ret = bdrv_co_flush(bs);
3362     }
3363 
3364     bdrv_set_dirty(bs, sector_num, nb_sectors);
3365 
3366     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3367         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3368     }
3369     if (bs->growable && ret >= 0) {
3370         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3371     }
3372 
3373     return ret;
3374 }
3375 
3376 /*
3377  * Handle a write request in coroutine context
3378  */
3379 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3380     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3381     BdrvRequestFlags flags)
3382 {
3383     BdrvTrackedRequest req;
3384     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3385     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3386     uint8_t *head_buf = NULL;
3387     uint8_t *tail_buf = NULL;
3388     QEMUIOVector local_qiov;
3389     bool use_local_qiov = false;
3390     int ret;
3391 
3392     if (!bs->drv) {
3393         return -ENOMEDIUM;
3394     }
3395     if (bs->read_only) {
3396         return -EACCES;
3397     }
3398     if (bdrv_check_byte_request(bs, offset, bytes)) {
3399         return -EIO;
3400     }
3401 
3402     /* throttling disk I/O */
3403     if (bs->io_limits_enabled) {
3404         bdrv_io_limits_intercept(bs, bytes, true);
3405     }
3406 
3407     /*
3408      * Align write if necessary by performing a read-modify-write cycle.
3409      * Pad qiov with the read parts and be sure to have a tracked request not
3410      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3411      */
3412     tracked_request_begin(&req, bs, offset, bytes, true);
3413 
3414     if (offset & (align - 1)) {
3415         QEMUIOVector head_qiov;
3416         struct iovec head_iov;
3417 
3418         mark_request_serialising(&req, align);
3419         wait_serialising_requests(&req);
3420 
3421         head_buf = qemu_blockalign(bs, align);
3422         head_iov = (struct iovec) {
3423             .iov_base   = head_buf,
3424             .iov_len    = align,
3425         };
3426         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3427 
3428         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3429         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3430                                   align, &head_qiov, 0);
3431         if (ret < 0) {
3432             goto fail;
3433         }
3434         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3435 
3436         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3437         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3438         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3439         use_local_qiov = true;
3440 
3441         bytes += offset & (align - 1);
3442         offset = offset & ~(align - 1);
3443     }
3444 
3445     if ((offset + bytes) & (align - 1)) {
3446         QEMUIOVector tail_qiov;
3447         struct iovec tail_iov;
3448         size_t tail_bytes;
3449         bool waited;
3450 
3451         mark_request_serialising(&req, align);
3452         waited = wait_serialising_requests(&req);
3453         assert(!waited || !use_local_qiov);
3454 
3455         tail_buf = qemu_blockalign(bs, align);
3456         tail_iov = (struct iovec) {
3457             .iov_base   = tail_buf,
3458             .iov_len    = align,
3459         };
3460         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3461 
3462         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3463         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3464                                   align, &tail_qiov, 0);
3465         if (ret < 0) {
3466             goto fail;
3467         }
3468         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3469 
3470         if (!use_local_qiov) {
3471             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3472             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3473             use_local_qiov = true;
3474         }
3475 
3476         tail_bytes = (offset + bytes) & (align - 1);
3477         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3478 
3479         bytes = ROUND_UP(bytes, align);
3480     }
3481 
3482     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3483                                use_local_qiov ? &local_qiov : qiov,
3484                                flags);
3485 
3486 fail:
3487     tracked_request_end(&req);
3488 
3489     if (use_local_qiov) {
3490         qemu_iovec_destroy(&local_qiov);
3491     }
3492     qemu_vfree(head_buf);
3493     qemu_vfree(tail_buf);
3494 
3495     return ret;
3496 }
3497 
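/*
 * Illustrative example of the RMW padding above (an added note, not part
 * of the original file).  Assume align == 512 and a write of 1000 bytes
 * at offset 300:
 *
 *   - head: offset & (align - 1) == 300, so bytes [0, 512) are read into
 *     head_buf and the first 300 bytes are prepended to the qiov; the
 *     request becomes offset == 0, bytes == 1300;
 *   - tail: (offset + bytes) & (align - 1) == 1300 % 512 == 276, so
 *     [1024, 1536) is read into tail_buf and its last 512 - 276 == 236
 *     bytes are appended; bytes is rounded up to 1536;
 *   - the resulting [0, 1536) request is fully aligned and handed to
 *     bdrv_aligned_pwritev() as a single serialising tracked request.
 */
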
3498 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3499     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3500     BdrvRequestFlags flags)
3501 {
3502     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3503         return -EINVAL;
3504     }
3505 
3506     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3507                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3508 }
3509 
3510 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3511     int nb_sectors, QEMUIOVector *qiov)
3512 {
3513     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3514 
3515     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3516 }
3517 
3518 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3519                                       int64_t sector_num, int nb_sectors,
3520                                       BdrvRequestFlags flags)
3521 {
3522     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3523 
3524     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3525         flags &= ~BDRV_REQ_MAY_UNMAP;
3526     }
3527 
3528     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3529                              BDRV_REQ_ZERO_WRITE | flags);
3530 }
3531 
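/*
 * Usage sketch (illustrative only; first_sector and n_sectors are
 * hypothetical locals): zeroing a region from another coroutine_fn and
 * asking the driver to unmap the sectors where possible:
 *
 *     ret = bdrv_co_write_zeroes(bs, first_sector, n_sectors,
 *                                BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         ... handle the error ...
 *     }
 *
 * Note that the flag is silently dropped above when the image was not
 * opened with BDRV_O_UNMAP, so callers need not check the open flags.
 */
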
3532 /**
3533  * Truncate file to 'offset' bytes (needed only for file protocols)
3534  */
3535 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3536 {
3537     BlockDriver *drv = bs->drv;
3538     int ret;
3539     if (!drv)
3540         return -ENOMEDIUM;
3541     if (!drv->bdrv_truncate)
3542         return -ENOTSUP;
3543     if (bs->read_only)
3544         return -EACCES;
3545 
3546     ret = drv->bdrv_truncate(bs, offset);
3547     if (ret == 0) {
3548         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3549         bdrv_dev_resize_cb(bs);
3550     }
3551     return ret;
3552 }
3553 
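/*
 * Usage sketch (illustrative only): growing an image by one megabyte.
 * Real callers must check both return values:
 *
 *     int64_t old_size = bdrv_getlength(bs);
 *     if (old_size >= 0) {
 *         ret = bdrv_truncate(bs, old_size + 1024 * 1024);
 *     }
 *
 * On success the total sector count is refreshed and the attached device
 * is notified of the resize, as the code above shows.
 */
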
3554 /**
3555  * Length of an allocated file in bytes. Sparse files are counted by actual
3556  * allocated space. Return < 0 on error or if unknown.
3557  */
3558 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3559 {
3560     BlockDriver *drv = bs->drv;
3561     if (!drv) {
3562         return -ENOMEDIUM;
3563     }
3564     if (drv->bdrv_get_allocated_file_size) {
3565         return drv->bdrv_get_allocated_file_size(bs);
3566     }
3567     if (bs->file) {
3568         return bdrv_get_allocated_file_size(bs->file);
3569     }
3570     return -ENOTSUP;
3571 }
3572 
3573 /**
3574  * Return number of sectors on success, -errno on error.
3575  */
3576 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3577 {
3578     BlockDriver *drv = bs->drv;
3579 
3580     if (!drv)
3581         return -ENOMEDIUM;
3582 
3583     if (drv->has_variable_length) {
3584         int ret = refresh_total_sectors(bs, bs->total_sectors);
3585         if (ret < 0) {
3586             return ret;
3587         }
3588     }
3589     return bs->total_sectors;
3590 }
3591 
3592 /**
3593  * Return length in bytes on success, -errno on error.
3594  * The length is always a multiple of BDRV_SECTOR_SIZE.
3595  */
3596 int64_t bdrv_getlength(BlockDriverState *bs)
3597 {
3598     int64_t ret = bdrv_nb_sectors(bs);
3599 
3600     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3601 }
3602 
3603 /* Return 0 as the number of sectors if no device is present or on error */
3604 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3605 {
3606     int64_t nb_sectors = bdrv_nb_sectors(bs);
3607 
3608     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3609 }
3610 
3611 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3612                        BlockdevOnError on_write_error)
3613 {
3614     bs->on_read_error = on_read_error;
3615     bs->on_write_error = on_write_error;
3616 }
3617 
3618 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3619 {
3620     return is_read ? bs->on_read_error : bs->on_write_error;
3621 }
3622 
3623 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3624 {
3625     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3626 
3627     switch (on_err) {
3628     case BLOCKDEV_ON_ERROR_ENOSPC:
3629         return (error == ENOSPC) ?
3630                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3631     case BLOCKDEV_ON_ERROR_STOP:
3632         return BLOCK_ERROR_ACTION_STOP;
3633     case BLOCKDEV_ON_ERROR_REPORT:
3634         return BLOCK_ERROR_ACTION_REPORT;
3635     case BLOCKDEV_ON_ERROR_IGNORE:
3636         return BLOCK_ERROR_ACTION_IGNORE;
3637     default:
3638         abort();
3639     }
3640 }
3641 
3642 /* This is done by device models because, while the block layer knows
3643  * about the error, it does not know whether an operation comes from
3644  * the device or the block layer (from a job, for example).
3645  */
3646 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3647                        bool is_read, int error)
3648 {
3649     assert(error >= 0);
3650 
3651     if (action == BLOCK_ERROR_ACTION_STOP) {
3652         /* First set the iostatus, so that "info block" returns an iostatus
3653          * that matches the events raised so far (an additional error iostatus
3654          * is fine, but not a lost one).
3655          */
3656         bdrv_iostatus_set_err(bs, error);
3657 
3658         /* Then raise the request to stop the VM and the event.
3659          * qemu_system_vmstop_request_prepare has two effects.  First,
3660          * it ensures that the STOP event always comes after the
3661          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3662          * can observe the STOP event and do a "cont" before the STOP
3663          * event is issued, the VM will not stop.  In this case, vm_start()
3664          * also ensures that the STOP/RESUME pair of events is emitted.
3665          */
3666         qemu_system_vmstop_request_prepare();
3667         qapi_event_send_block_io_error(bdrv_get_device_name(bs),
3668                                        is_read ? IO_OPERATION_TYPE_READ :
3669                                        IO_OPERATION_TYPE_WRITE,
3670                                        action, &error_abort);
3671         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3672     } else {
3673         qapi_event_send_block_io_error(bdrv_get_device_name(bs),
3674                                        is_read ? IO_OPERATION_TYPE_READ :
3675                                        IO_OPERATION_TYPE_WRITE,
3676                                        action, &error_abort);
3677     }
3678 }
3679 
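/*
 * Caller sketch (illustrative; handle_rw_error is hypothetical): per the
 * comment above, a device model maps a request's errno to an action,
 * parks the request if the VM is to be stopped, then reports the action:
 *
 *     static void handle_rw_error(BlockDriverState *bs, bool is_read,
 *                                 int error)
 *     {
 *         BlockErrorAction action = bdrv_get_error_action(bs, is_read,
 *                                                         error);
 *         if (action == BLOCK_ERROR_ACTION_STOP) {
 *             ... queue the request so it can be retried after "cont" ...
 *         }
 *         bdrv_error_action(bs, action, is_read, error);
 *     }
 *
 * Note that error must be a positive errno value, per the assertion above.
 */
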
3680 int bdrv_is_read_only(BlockDriverState *bs)
3681 {
3682     return bs->read_only;
3683 }
3684 
3685 int bdrv_is_sg(BlockDriverState *bs)
3686 {
3687     return bs->sg;
3688 }
3689 
3690 int bdrv_enable_write_cache(BlockDriverState *bs)
3691 {
3692     return bs->enable_write_cache;
3693 }
3694 
3695 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3696 {
3697     bs->enable_write_cache = wce;
3698 
3699     /* so a reopen() will preserve wce */
3700     if (wce) {
3701         bs->open_flags |= BDRV_O_CACHE_WB;
3702     } else {
3703         bs->open_flags &= ~BDRV_O_CACHE_WB;
3704     }
3705 }
3706 
3707 int bdrv_is_encrypted(BlockDriverState *bs)
3708 {
3709     if (bs->backing_hd && bs->backing_hd->encrypted)
3710         return 1;
3711     return bs->encrypted;
3712 }
3713 
3714 int bdrv_key_required(BlockDriverState *bs)
3715 {
3716     BlockDriverState *backing_hd = bs->backing_hd;
3717 
3718     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3719         return 1;
3720     return (bs->encrypted && !bs->valid_key);
3721 }
3722 
3723 int bdrv_set_key(BlockDriverState *bs, const char *key)
3724 {
3725     int ret;
3726     if (bs->backing_hd && bs->backing_hd->encrypted) {
3727         ret = bdrv_set_key(bs->backing_hd, key);
3728         if (ret < 0)
3729             return ret;
3730         if (!bs->encrypted)
3731             return 0;
3732     }
3733     if (!bs->encrypted) {
3734         return -EINVAL;
3735     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3736         return -ENOMEDIUM;
3737     }
3738     ret = bs->drv->bdrv_set_key(bs, key);
3739     if (ret < 0) {
3740         bs->valid_key = 0;
3741     } else if (!bs->valid_key) {
3742         bs->valid_key = 1;
3743         /* call the change callback now, we skipped it on open */
3744         bdrv_dev_change_media_cb(bs, true);
3745     }
3746     return ret;
3747 }
3748 
3749 const char *bdrv_get_format_name(BlockDriverState *bs)
3750 {
3751     return bs->drv ? bs->drv->format_name : NULL;
3752 }
3753 
3754 static int qsort_strcmp(const void *a, const void *b)
3755 {
3756     return strcmp(a, b);
3757 }
3758 
3759 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3760                          void *opaque)
3761 {
3762     BlockDriver *drv;
3763     int count = 0;
3764     int i;
3765     const char **formats = NULL;
3766 
3767     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3768         if (drv->format_name) {
3769             bool found = false;
3770             int i = count;
3771             while (formats && i && !found) {
3772                 found = !strcmp(formats[--i], drv->format_name);
3773             }
3774 
3775             if (!found) {
3776                 formats = g_renew(const char *, formats, count + 1);
3777                 formats[count++] = drv->format_name;
3778             }
3779         }
3780     }
3781 
3782     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3783 
3784     for (i = 0; i < count; i++) {
3785         it(opaque, formats[i]);
3786     }
3787 
3788     g_free(formats);
3789 }
3790 
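/*
 * Usage sketch (illustrative only; print_format is a hypothetical
 * callback): printing the sorted, de-duplicated format list produced by
 * bdrv_iterate_format():
 *
 *     static void print_format(void *opaque, const char *name)
 *     {
 *         printf("%s\n", name);
 *     }
 *
 *     bdrv_iterate_format(print_format, NULL);
 */
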
3791 /* Find the block backend (BlockDriverState) with the given device name */
3792 BlockDriverState *bdrv_find(const char *name)
3793 {
3794     BlockDriverState *bs;
3795 
3796     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3797         if (!strcmp(name, bs->device_name)) {
3798             return bs;
3799         }
3800     }
3801     return NULL;
3802 }
3803 
3804 /* Find a node in the BDS graph with the given node name */
3805 BlockDriverState *bdrv_find_node(const char *node_name)
3806 {
3807     BlockDriverState *bs;
3808 
3809     assert(node_name);
3810 
3811     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3812         if (!strcmp(node_name, bs->node_name)) {
3813             return bs;
3814         }
3815     }
3816     return NULL;
3817 }
3818 
3819 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3820 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3821 {
3822     BlockDeviceInfoList *list, *entry;
3823     BlockDriverState *bs;
3824 
3825     list = NULL;
3826     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3827         entry = g_malloc0(sizeof(*entry));
3828         entry->value = bdrv_block_device_info(bs);
3829         entry->next = list;
3830         list = entry;
3831     }
3832 
3833     return list;
3834 }
3835 
3836 BlockDriverState *bdrv_lookup_bs(const char *device,
3837                                  const char *node_name,
3838                                  Error **errp)
3839 {
3840     BlockDriverState *bs = NULL;
3841 
3842     if (device) {
3843         bs = bdrv_find(device);
3844 
3845         if (bs) {
3846             return bs;
3847         }
3848     }
3849 
3850     if (node_name) {
3851         bs = bdrv_find_node(node_name);
3852 
3853         if (bs) {
3854             return bs;
3855         }
3856     }
3857 
3858     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3859                      device ? device : "",
3860                      node_name ? node_name : "");
3861     return NULL;
3862 }
3863 
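/*
 * Usage sketch (illustrative only): resolving a QMP argument pair where
 * either the device name or the node name may be given.  has_device and
 * has_node_name follow the usual QAPI optional-argument convention; the
 * surrounding command handler is assumed:
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs(has_device ? device : NULL,
 *                                           has_node_name ? node_name
 *                                                         : NULL,
 *                                           &local_err);
 *     if (!bs) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */
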
3864 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3865  * return false.  If either argument is NULL, return false. */
3866 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3867 {
3868     while (top && top != base) {
3869         top = top->backing_hd;
3870     }
3871 
3872     return top != NULL;
3873 }
3874 
3875 BlockDriverState *bdrv_next(BlockDriverState *bs)
3876 {
3877     if (!bs) {
3878         return QTAILQ_FIRST(&bdrv_states);
3879     }
3880     return QTAILQ_NEXT(bs, device_list);
3881 }
3882 
3883 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3884 {
3885     BlockDriverState *bs;
3886 
3887     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3888         it(opaque, bs);
3889     }
3890 }
3891 
3892 const char *bdrv_get_device_name(BlockDriverState *bs)
3893 {
3894     return bs->device_name;
3895 }
3896 
3897 int bdrv_get_flags(BlockDriverState *bs)
3898 {
3899     return bs->open_flags;
3900 }
3901 
3902 int bdrv_flush_all(void)
3903 {
3904     BlockDriverState *bs;
3905     int result = 0;
3906 
3907     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3908         AioContext *aio_context = bdrv_get_aio_context(bs);
3909         int ret;
3910 
3911         aio_context_acquire(aio_context);
3912         ret = bdrv_flush(bs);
3913         if (ret < 0 && !result) {
3914             result = ret;
3915         }
3916         aio_context_release(aio_context);
3917     }
3918 
3919     return result;
3920 }
3921 
3922 int bdrv_has_zero_init_1(BlockDriverState *bs)
3923 {
3924     return 1;
3925 }
3926 
3927 int bdrv_has_zero_init(BlockDriverState *bs)
3928 {
3929     assert(bs->drv);
3930 
3931     /* If BS is a copy-on-write image, it is initialized to
3932        the contents of the base image, which may not be zeroes.  */
3933     if (bs->backing_hd) {
3934         return 0;
3935     }
3936     if (bs->drv->bdrv_has_zero_init) {
3937         return bs->drv->bdrv_has_zero_init(bs);
3938     }
3939 
3940     /* safe default */
3941     return 0;
3942 }
3943 
3944 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3945 {
3946     BlockDriverInfo bdi;
3947 
3948     if (bs->backing_hd) {
3949         return false;
3950     }
3951 
3952     if (bdrv_get_info(bs, &bdi) == 0) {
3953         return bdi.unallocated_blocks_are_zero;
3954     }
3955 
3956     return false;
3957 }
3958 
3959 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3960 {
3961     BlockDriverInfo bdi;
3962 
3963     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3964         return false;
3965     }
3966 
3967     if (bdrv_get_info(bs, &bdi) == 0) {
3968         return bdi.can_write_zeroes_with_unmap;
3969     }
3970 
3971     return false;
3972 }
3973 
3974 typedef struct BdrvCoGetBlockStatusData {
3975     BlockDriverState *bs;
3976     BlockDriverState *base;
3977     int64_t sector_num;
3978     int nb_sectors;
3979     int *pnum;
3980     int64_t ret;
3981     bool done;
3982 } BdrvCoGetBlockStatusData;
3983 
3984 /*
3985  * Returns the allocation status (BDRV_BLOCK_* flags) of the specified
3986  * sectors. Drivers not implementing the functionality are assumed to not
3987  * support backing files, hence all their sectors are reported as allocated.
3988  *
3989  * If 'sector_num' is beyond the end of the disk image the return value is 0
3990  * and 'pnum' is set to 0.
3991  *
3992  * 'pnum' is set to the number of sectors (including and immediately following
3993  * the specified sector) that are known to be in the same
3994  * allocated/unallocated state.
3995  *
3996  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3997  * beyond the end of the disk image it will be clamped.
3998  */
3999 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
4000                                                      int64_t sector_num,
4001                                                      int nb_sectors, int *pnum)
4002 {
4003     int64_t total_sectors;
4004     int64_t n;
4005     int64_t ret, ret2;
4006 
4007     total_sectors = bdrv_nb_sectors(bs);
4008     if (total_sectors < 0) {
4009         return total_sectors;
4010     }
4011 
4012     if (sector_num >= total_sectors) {
4013         *pnum = 0;
4014         return 0;
4015     }
4016 
4017     n = total_sectors - sector_num;
4018     if (n < nb_sectors) {
4019         nb_sectors = n;
4020     }
4021 
4022     if (!bs->drv->bdrv_co_get_block_status) {
4023         *pnum = nb_sectors;
4024         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4025         if (bs->drv->protocol_name) {
4026             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4027         }
4028         return ret;
4029     }
4030 
4031     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4032     if (ret < 0) {
4033         *pnum = 0;
4034         return ret;
4035     }
4036 
4037     if (ret & BDRV_BLOCK_RAW) {
4038         assert(ret & BDRV_BLOCK_OFFSET_VALID);
4039         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4040                                      *pnum, pnum);
4041     }
4042 
4043     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4044         ret |= BDRV_BLOCK_ALLOCATED;
4045     }
4046 
4047     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4048         if (bdrv_unallocated_blocks_are_zero(bs)) {
4049             ret |= BDRV_BLOCK_ZERO;
4050         } else if (bs->backing_hd) {
4051             BlockDriverState *bs2 = bs->backing_hd;
4052             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4053             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4054                 ret |= BDRV_BLOCK_ZERO;
4055             }
4056         }
4057     }
4058 
4059     if (bs->file &&
4060         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4061         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4062         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4063                                         *pnum, pnum);
4064         if (ret2 >= 0) {
4065             /* Ignore errors.  This is just providing extra information, it
4066              * is useful but not necessary.
4067              */
4068             ret |= (ret2 & BDRV_BLOCK_ZERO);
4069         }
4070     }
4071 
4072     return ret;
4073 }
4074 
4075 /* Coroutine wrapper for bdrv_get_block_status() */
4076 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4077 {
4078     BdrvCoGetBlockStatusData *data = opaque;
4079     BlockDriverState *bs = data->bs;
4080 
4081     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4082                                          data->pnum);
4083     data->done = true;
4084 }
4085 
4086 /*
4087  * Synchronous wrapper around bdrv_co_get_block_status().
4088  *
4089  * See bdrv_co_get_block_status() for details.
4090  */
4091 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4092                               int nb_sectors, int *pnum)
4093 {
4094     Coroutine *co;
4095     BdrvCoGetBlockStatusData data = {
4096         .bs = bs,
4097         .sector_num = sector_num,
4098         .nb_sectors = nb_sectors,
4099         .pnum = pnum,
4100         .done = false,
4101     };
4102 
4103     if (qemu_in_coroutine()) {
4104         /* Fast-path if already in coroutine context */
4105         bdrv_get_block_status_co_entry(&data);
4106     } else {
4107         AioContext *aio_context = bdrv_get_aio_context(bs);
4108 
4109         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4110         qemu_coroutine_enter(co, &data);
4111         while (!data.done) {
4112             aio_poll(aio_context, true);
4113         }
4114     }
4115     return data.ret;
4116 }
4117 
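/*
 * Usage sketch (illustrative only; dump_map is a hypothetical helper):
 * walking an image's allocation map with the synchronous wrapper.  Per
 * the comment above, nb_sectors is clamped at the end of the image, so
 * passing INT_MAX is safe:
 *
 *     static void dump_map(BlockDriverState *bs)
 *     {
 *         int64_t sector = 0, total = bdrv_nb_sectors(bs);
 *         int n;
 *
 *         while (sector < total) {
 *             int64_t st = bdrv_get_block_status(bs, sector, INT_MAX, &n);
 *             if (st < 0) {
 *                 break;
 *             }
 *             printf("%10" PRId64 " +%-8d data=%d zero=%d\n", sector, n,
 *                    !!(st & BDRV_BLOCK_DATA), !!(st & BDRV_BLOCK_ZERO));
 *             sector += n;
 *         }
 *     }
 */
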
4118 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4119                                    int nb_sectors, int *pnum)
4120 {
4121     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4122     if (ret < 0) {
4123         return ret;
4124     }
4125     return !!(ret & BDRV_BLOCK_ALLOCATED);
4126 }
4127 
4128 /*
4129  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4130  *
4131  * Return true if the given sector is allocated in any image between
4132  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4133  * sector is allocated in any image of the chain.  Return false otherwise.
4134  *
4135  * 'pnum' is set to the number of sectors (including and immediately following
4136  *  the specified sector) that are known to be in the same
4137  *  allocated/unallocated state.
4138  *
4139  */
4140 int bdrv_is_allocated_above(BlockDriverState *top,
4141                             BlockDriverState *base,
4142                             int64_t sector_num,
4143                             int nb_sectors, int *pnum)
4144 {
4145     BlockDriverState *intermediate;
4146     int ret, n = nb_sectors;
4147 
4148     intermediate = top;
4149     while (intermediate && intermediate != base) {
4150         int pnum_inter;
4151         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4152                                 &pnum_inter);
4153         if (ret < 0) {
4154             return ret;
4155         } else if (ret) {
4156             *pnum = pnum_inter;
4157             return 1;
4158         }
4159 
4160         /*
4161          * [sector_num, nb_sectors] is unallocated on top but intermediate
4162          * might have
4163          *
4164          * [sector_num+x, nb_sectors-x] allocated.
4165          */
4166         if (n > pnum_inter &&
4167             (intermediate == top ||
4168              sector_num + pnum_inter < intermediate->total_sectors)) {
4169             n = pnum_inter;
4170         }
4171 
4172         intermediate = intermediate->backing_hd;
4173     }
4174 
4175     *pnum = n;
4176     return 0;
4177 }
4178 
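/*
 * Usage sketch (illustrative only): deciding whether sectors must be
 * copied when collapsing part of a chain base <- ... <- top:
 *
 *     int n;
 *     ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors, &n);
 *     if (ret < 0) {
 *         ... error ...
 *     } else if (ret) {
 *         ... the first n sectors are allocated somewhere between top
 *             and base and must be copied ...
 *     } else {
 *         ... the first n sectors fall through to base and can be
 *             skipped ...
 *     }
 */
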
4179 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4180 {
4181     if (bs->backing_hd && bs->backing_hd->encrypted)
4182         return bs->backing_file;
4183     else if (bs->encrypted)
4184         return bs->filename;
4185     else
4186         return NULL;
4187 }
4188 
4189 void bdrv_get_backing_filename(BlockDriverState *bs,
4190                                char *filename, int filename_size)
4191 {
4192     pstrcpy(filename, filename_size, bs->backing_file);
4193 }
4194 
4195 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4196                           const uint8_t *buf, int nb_sectors)
4197 {
4198     BlockDriver *drv = bs->drv;
4199     if (!drv)
4200         return -ENOMEDIUM;
4201     if (!drv->bdrv_write_compressed)
4202         return -ENOTSUP;
4203     if (bdrv_check_request(bs, sector_num, nb_sectors))
4204         return -EIO;
4205 
4206     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4207 
4208     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4209 }
4210 
4211 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4212 {
4213     BlockDriver *drv = bs->drv;
4214     if (!drv)
4215         return -ENOMEDIUM;
4216     if (!drv->bdrv_get_info)
4217         return -ENOTSUP;
4218     memset(bdi, 0, sizeof(*bdi));
4219     return drv->bdrv_get_info(bs, bdi);
4220 }
4221 
4222 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4223 {
4224     BlockDriver *drv = bs->drv;
4225     if (drv && drv->bdrv_get_specific_info) {
4226         return drv->bdrv_get_specific_info(bs);
4227     }
4228     return NULL;
4229 }
4230 
4231 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4232                       int64_t pos, int size)
4233 {
4234     QEMUIOVector qiov;
4235     struct iovec iov = {
4236         .iov_base   = (void *) buf,
4237         .iov_len    = size,
4238     };
4239 
4240     qemu_iovec_init_external(&qiov, &iov, 1);
4241     return bdrv_writev_vmstate(bs, &qiov, pos);
4242 }
4243 
4244 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4245 {
4246     BlockDriver *drv = bs->drv;
4247 
4248     if (!drv) {
4249         return -ENOMEDIUM;
4250     } else if (drv->bdrv_save_vmstate) {
4251         return drv->bdrv_save_vmstate(bs, qiov, pos);
4252     } else if (bs->file) {
4253         return bdrv_writev_vmstate(bs->file, qiov, pos);
4254     }
4255 
4256     return -ENOTSUP;
4257 }
4258 
4259 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4260                       int64_t pos, int size)
4261 {
4262     BlockDriver *drv = bs->drv;
4263     if (!drv)
4264         return -ENOMEDIUM;
4265     if (drv->bdrv_load_vmstate)
4266         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4267     if (bs->file)
4268         return bdrv_load_vmstate(bs->file, buf, pos, size);
4269     return -ENOTSUP;
4270 }
4271 
4272 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4273 {
4274     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4275         return;
4276     }
4277 
4278     bs->drv->bdrv_debug_event(bs, event);
4279 }
4280 
4281 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4282                           const char *tag)
4283 {
4284     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4285         bs = bs->file;
4286     }
4287 
4288     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4289         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4290     }
4291 
4292     return -ENOTSUP;
4293 }
4294 
4295 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4296 {
4297     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4298         bs = bs->file;
4299     }
4300 
4301     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4302         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4303     }
4304 
4305     return -ENOTSUP;
4306 }
4307 
4308 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4309 {
4310     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4311         bs = bs->file;
4312     }
4313 
4314     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4315         return bs->drv->bdrv_debug_resume(bs, tag);
4316     }
4317 
4318     return -ENOTSUP;
4319 }
4320 
4321 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4322 {
4323     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4324         bs = bs->file;
4325     }
4326 
4327     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4328         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4329     }
4330 
4331     return false;
4332 }
4333 
4334 int bdrv_is_snapshot(BlockDriverState *bs)
4335 {
4336     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4337 }
4338 
4339 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4340  * relative, it must be relative to the chain.  So, passing in bs->filename
4341  * from a BDS as backing_file should not be done, as that may be relative to
4342  * the CWD rather than the chain. */
4343 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4344         const char *backing_file)
4345 {
4346     char *filename_full = NULL;
4347     char *backing_file_full = NULL;
4348     char *filename_tmp = NULL;
4349     int is_protocol = 0;
4350     BlockDriverState *curr_bs = NULL;
4351     BlockDriverState *retval = NULL;
4352 
4353     if (!bs || !bs->drv || !backing_file) {
4354         return NULL;
4355     }
4356 
4357     filename_full     = g_malloc(PATH_MAX);
4358     backing_file_full = g_malloc(PATH_MAX);
4359     filename_tmp      = g_malloc(PATH_MAX);
4360 
4361     is_protocol = path_has_protocol(backing_file);
4362 
4363     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4364 
4365         /* If either of the filename paths is actually a protocol, then
4366          * compare unmodified paths; otherwise make paths relative */
4367         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4368             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4369                 retval = curr_bs->backing_hd;
4370                 break;
4371             }
4372         } else {
4373             /* If not an absolute filename path, make it relative to the current
4374              * image's filename path */
4375             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4376                          backing_file);
4377 
4378             /* We are going to compare absolute pathnames */
4379             if (!realpath(filename_tmp, filename_full)) {
4380                 continue;
4381             }
4382 
4383             /* We need to make sure the backing filename we are comparing against
4384              * is relative to the current image filename (or absolute) */
4385             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4386                          curr_bs->backing_file);
4387 
4388             if (!realpath(filename_tmp, backing_file_full)) {
4389                 continue;
4390             }
4391 
4392             if (strcmp(backing_file_full, filename_full) == 0) {
4393                 retval = curr_bs->backing_hd;
4394                 break;
4395             }
4396         }
4397     }
4398 
4399     g_free(filename_full);
4400     g_free(backing_file_full);
4401     g_free(filename_tmp);
4402     return retval;
4403 }
4404 
4405 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4406 {
4407     if (!bs->drv) {
4408         return 0;
4409     }
4410 
4411     if (!bs->backing_hd) {
4412         return 0;
4413     }
4414 
4415     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4416 }
4417 
4418 /**************************************************************/
4419 /* async I/Os */
4420 
4421 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4422                                  QEMUIOVector *qiov, int nb_sectors,
4423                                  BlockDriverCompletionFunc *cb, void *opaque)
4424 {
4425     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4426 
4427     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4428                                  cb, opaque, false);
4429 }
4430 
4431 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4432                                   QEMUIOVector *qiov, int nb_sectors,
4433                                   BlockDriverCompletionFunc *cb, void *opaque)
4434 {
4435     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4436 
4437     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4438                                  cb, opaque, true);
4439 }
4440 
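/*
 * Usage sketch (illustrative only; read_done and my_state are
 * hypothetical): issuing an asynchronous read and consuming the result
 * from the completion callback:
 *
 *     static void read_done(void *opaque, int ret)
 *     {
 *         struct MyState *s = opaque;
 *         if (ret < 0) {
 *             ... ret is a negative errno value ...
 *         }
 *         ... the data is now in the qiov passed to bdrv_aio_readv() ...
 *     }
 *
 *     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
 *                          read_done, my_state);
 *
 * The qiov and its buffers must remain valid until the callback fires.
 */
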
4441 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4442         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4443         BlockDriverCompletionFunc *cb, void *opaque)
4444 {
4445     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4446 
4447     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4448                                  BDRV_REQ_ZERO_WRITE | flags,
4449                                  cb, opaque, true);
4450 }
4451 
4452 
4453 typedef struct MultiwriteCB {
4454     int error;
4455     int num_requests;
4456     int num_callbacks;
4457     struct {
4458         BlockDriverCompletionFunc *cb;
4459         void *opaque;
4460         QEMUIOVector *free_qiov;
4461     } callbacks[];
4462 } MultiwriteCB;
4463 
4464 static void multiwrite_user_cb(MultiwriteCB *mcb)
4465 {
4466     int i;
4467 
4468     for (i = 0; i < mcb->num_callbacks; i++) {
4469         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4470         if (mcb->callbacks[i].free_qiov) {
4471             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4472         }
4473         g_free(mcb->callbacks[i].free_qiov);
4474     }
4475 }
4476 
4477 static void multiwrite_cb(void *opaque, int ret)
4478 {
4479     MultiwriteCB *mcb = opaque;
4480 
4481     trace_multiwrite_cb(mcb, ret);
4482 
4483     if (ret < 0 && !mcb->error) {
4484         mcb->error = ret;
4485     }
4486 
4487     mcb->num_requests--;
4488     if (mcb->num_requests == 0) {
4489         multiwrite_user_cb(mcb);
4490         g_free(mcb);
4491     }
4492 }
4493 
4494 static int multiwrite_req_compare(const void *a, const void *b)
4495 {
4496     const BlockRequest *req1 = a, *req2 = b;
4497 
4498     /*
4499      * Note that we can't simply subtract req2->sector from req1->sector
4500      * here as that could overflow the return value.
4501      */
4502     if (req1->sector > req2->sector) {
4503         return 1;
4504     } else if (req1->sector < req2->sector) {
4505         return -1;
4506     } else {
4507         return 0;
4508     }
4509 }
4510 
4511 /*
4512  * Takes a bunch of requests and tries to merge them. Returns the number of
4513  * requests that remain after merging.
4514  */
4515 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4516     int num_reqs, MultiwriteCB *mcb)
4517 {
4518     int i, outidx;
4519 
4520     // Sort requests by start sector
4521     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4522 
4523     // Check if adjacent requests touch the same clusters. If so, combine them,
4524     // filling up gaps with zero sectors.
4525     outidx = 0;
4526     for (i = 1; i < num_reqs; i++) {
4527         int merge = 0;
4528         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4529 
4530         // Handle exactly sequential writes and overlapping writes.
4531         if (reqs[i].sector <= oldreq_last) {
4532             merge = 1;
4533         }
4534 
4535         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4536             merge = 0;
4537         }
4538 
4539         if (merge) {
4540             size_t size;
4541             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4542             qemu_iovec_init(qiov,
4543                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4544 
4545             // Add the first request to the merged one. If the requests are
4546             // overlapping, drop the last sectors of the first request.
4547             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4548             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4549 
4550             // We should not need to add any zeros between the two requests
4551             assert (reqs[i].sector <= oldreq_last);
4552 
4553             // Add the second request
4554             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4555 
4556             // Add tail of first request, if necessary
4557             if (qiov->size < reqs[outidx].qiov->size) {
4558                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4559                                   reqs[outidx].qiov->size - qiov->size);
4560             }
4561 
4562             reqs[outidx].nb_sectors = qiov->size >> 9;
4563             reqs[outidx].qiov = qiov;
4564 
4565             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4566         } else {
4567             outidx++;
4568             reqs[outidx].sector     = reqs[i].sector;
4569             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4570             reqs[outidx].qiov       = reqs[i].qiov;
4571         }
4572     }
4573 
4574     return outidx + 1;
4575 }
4576 
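/*
 * Illustrative example of the merge above (an added note, not part of
 * the original file): two overlapping writes, one at sector 0 for 8
 * sectors and one at sector 4 for 8 sectors, are combined into a single
 * request at sector 0 for 12 sectors.  The merged qiov takes the first
 * (4 - 0) << 9 == 2048 bytes from the first request, then the whole
 * second qiov; no tail is appended because the second request extends
 * past the end of the first.
 */
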
4577 /*
4578  * Submit multiple AIO write requests at once.
4579  *
4580  * On success, the function returns 0 and all requests in the reqs array have
4581  * been submitted. On error, this function returns -1, and any of the
4582  * requests may or may not be submitted yet. In particular, this means that the
4583  * callback will be called for some of the requests, for others it won't. The
4584  * caller must check the error field of the BlockRequest to wait for the right
4585  * callbacks (if error != 0, no callback will be called).
4586  *
4587  * The implementation may modify the contents of the reqs array, e.g. to merge
4588  * requests. However, the fields opaque and error are left unmodified as they
4589  * are used to signal failure for a single request to the caller.
4590  */
4591 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4592 {
4593     MultiwriteCB *mcb;
4594     int i;
4595 
4596     /* don't submit writes if we don't have a medium */
4597     if (bs->drv == NULL) {
4598         for (i = 0; i < num_reqs; i++) {
4599             reqs[i].error = -ENOMEDIUM;
4600         }
4601         return -1;
4602     }
4603 
4604     if (num_reqs == 0) {
4605         return 0;
4606     }
4607 
4608     // Create MultiwriteCB structure
4609     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4610     mcb->num_requests = 0;
4611     mcb->num_callbacks = num_reqs;
4612 
4613     for (i = 0; i < num_reqs; i++) {
4614         mcb->callbacks[i].cb = reqs[i].cb;
4615         mcb->callbacks[i].opaque = reqs[i].opaque;
4616     }
4617 
4618     // Check for mergeable requests
4619     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4620 
4621     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4622 
4623     /* Run the aio requests. */
4624     mcb->num_requests = num_reqs;
4625     for (i = 0; i < num_reqs; i++) {
4626         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4627                               reqs[i].nb_sectors, reqs[i].flags,
4628                               multiwrite_cb, mcb,
4629                               true);
4630     }
4631 
4632     return 0;
4633 }
4634 
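/*
 * Usage sketch (illustrative only): submitting two queued writes in one
 * call, as a virtio-blk-style device might.  complete_req, req0/req1 and
 * the qiovs are hypothetical and assumed to be set up by the caller:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = complete_req, .opaque = req0 },
 *         { .sector = 16, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = complete_req, .opaque = req1 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... per the contract above, check reqs[i].error to see which
 *             requests will never get a callback ...
 *     }
 */
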
4635 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4636 {
4637     acb->aiocb_info->cancel(acb);
4638 }
4639 
4640 /**************************************************************/
4641 /* async block device emulation */
4642 
4643 typedef struct BlockDriverAIOCBSync {
4644     BlockDriverAIOCB common;
4645     QEMUBH *bh;
4646     int ret;
4647     /* vector translation state */
4648     QEMUIOVector *qiov;
4649     uint8_t *bounce;
4650     int is_write;
4651 } BlockDriverAIOCBSync;
4652 
4653 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4654 {
4655     BlockDriverAIOCBSync *acb =
4656         container_of(blockacb, BlockDriverAIOCBSync, common);
4657     qemu_bh_delete(acb->bh);
4658     acb->bh = NULL;
4659     qemu_aio_release(acb);
4660 }
4661 
4662 static const AIOCBInfo bdrv_em_aiocb_info = {
4663     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4664     .cancel             = bdrv_aio_cancel_em,
4665 };
4666 
4667 static void bdrv_aio_bh_cb(void *opaque)
4668 {
4669     BlockDriverAIOCBSync *acb = opaque;
4670 
4671     if (!acb->is_write && acb->ret >= 0) {
4672         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4673     }
4674     qemu_vfree(acb->bounce);
4675     acb->common.cb(acb->common.opaque, acb->ret);
4676     qemu_bh_delete(acb->bh);
4677     acb->bh = NULL;
4678     qemu_aio_release(acb);
4679 }
4680 
4681 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4682                                             int64_t sector_num,
4683                                             QEMUIOVector *qiov,
4684                                             int nb_sectors,
4685                                             BlockDriverCompletionFunc *cb,
4686                                             void *opaque,
4687                                             int is_write)
4688 
4689 {
4690     BlockDriverAIOCBSync *acb;
4691 
4692     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4693     acb->is_write = is_write;
4694     acb->qiov = qiov;
4695     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4696     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4697 
4698     if (acb->bounce == NULL) {
4699         acb->ret = -ENOMEM;
4700     } else if (is_write) {
4701         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4702         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4703     } else {
4704         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4705     }
4706 
4707     qemu_bh_schedule(acb->bh);
4708 
4709     return &acb->common;
4710 }
4711 
4712 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4713         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4714         BlockDriverCompletionFunc *cb, void *opaque)
4715 {
4716     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4717 }
4718 
4719 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4720         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4721         BlockDriverCompletionFunc *cb, void *opaque)
4722 {
4723     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4724 }
4725 
4726 
4727 typedef struct BlockDriverAIOCBCoroutine {
4728     BlockDriverAIOCB common;
4729     BlockRequest req;
4730     bool is_write;
4731     bool *done;
4732     QEMUBH* bh;
4733 } BlockDriverAIOCBCoroutine;
4734 
4735 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4736 {
4737     AioContext *aio_context = bdrv_get_aio_context(blockacb->bs);
4738     BlockDriverAIOCBCoroutine *acb =
4739         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4740     bool done = false;
4741 
4742     acb->done = &done;
4743     while (!done) {
4744         aio_poll(aio_context, true);
4745     }
4746 }
4747 
4748 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4749     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4750     .cancel             = bdrv_aio_co_cancel_em,
4751 };
4752 
4753 static void bdrv_co_em_bh(void *opaque)
4754 {
4755     BlockDriverAIOCBCoroutine *acb = opaque;
4756 
4757     acb->common.cb(acb->common.opaque, acb->req.error);
4758 
4759     if (acb->done) {
4760         *acb->done = true;
4761     }
4762 
4763     qemu_bh_delete(acb->bh);
4764     qemu_aio_release(acb);
4765 }
4766 
4767 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4768 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4769 {
4770     BlockDriverAIOCBCoroutine *acb = opaque;
4771     BlockDriverState *bs = acb->common.bs;
4772 
4773     if (!acb->is_write) {
4774         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4775             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4776     } else {
4777         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4778             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4779     }
4780 
4781     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4782     qemu_bh_schedule(acb->bh);
4783 }
4784 
4785 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4786                                                int64_t sector_num,
4787                                                QEMUIOVector *qiov,
4788                                                int nb_sectors,
4789                                                BdrvRequestFlags flags,
4790                                                BlockDriverCompletionFunc *cb,
4791                                                void *opaque,
4792                                                bool is_write)
4793 {
4794     Coroutine *co;
4795     BlockDriverAIOCBCoroutine *acb;
4796 
4797     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4798     acb->req.sector = sector_num;
4799     acb->req.nb_sectors = nb_sectors;
4800     acb->req.qiov = qiov;
4801     acb->req.flags = flags;
4802     acb->is_write = is_write;
4803     acb->done = NULL;
4804 
4805     co = qemu_coroutine_create(bdrv_co_do_rw);
4806     qemu_coroutine_enter(co, acb);
4807 
4808     return &acb->common;
4809 }
4810 
4811 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4812 {
4813     BlockDriverAIOCBCoroutine *acb = opaque;
4814     BlockDriverState *bs = acb->common.bs;
4815 
4816     acb->req.error = bdrv_co_flush(bs);
4817     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4818     qemu_bh_schedule(acb->bh);
4819 }
4820 
4821 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4822         BlockDriverCompletionFunc *cb, void *opaque)
4823 {
4824     trace_bdrv_aio_flush(bs, opaque);
4825 
4826     Coroutine *co;
4827     BlockDriverAIOCBCoroutine *acb;
4828 
4829     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4830     acb->done = NULL;
4831 
4832     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4833     qemu_coroutine_enter(co, acb);
4834 
4835     return &acb->common;
4836 }
4837 
4838 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4839 {
4840     BlockDriverAIOCBCoroutine *acb = opaque;
4841     BlockDriverState *bs = acb->common.bs;
4842 
4843     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4844     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4845     qemu_bh_schedule(acb->bh);
4846 }
4847 
4848 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4849         int64_t sector_num, int nb_sectors,
4850         BlockDriverCompletionFunc *cb, void *opaque)
4851 {
4852     Coroutine *co;
4853     BlockDriverAIOCBCoroutine *acb;
4854 
4855     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4856 
4857     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4858     acb->req.sector = sector_num;
4859     acb->req.nb_sectors = nb_sectors;
4860     acb->done = NULL;
4861     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4862     qemu_coroutine_enter(co, acb);
4863 
4864     return &acb->common;
4865 }
4866 
4867 void bdrv_init(void)
4868 {
4869     module_call_init(MODULE_INIT_BLOCK);
4870 }
4871 
4872 void bdrv_init_with_whitelist(void)
4873 {
4874     use_bdrv_whitelist = 1;
4875     bdrv_init();
4876 }
4877 
4878 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4879                    BlockDriverCompletionFunc *cb, void *opaque)
4880 {
4881     BlockDriverAIOCB *acb;
4882 
4883     acb = g_slice_alloc(aiocb_info->aiocb_size);
4884     acb->aiocb_info = aiocb_info;
4885     acb->bs = bs;
4886     acb->cb = cb;
4887     acb->opaque = opaque;
4888     return acb;
4889 }
4890 
4891 void qemu_aio_release(void *p)
4892 {
4893     BlockDriverAIOCB *acb = p;
4894     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4895 }
4896 
4897 /**************************************************************/
4898 /* Coroutine block device emulation */
4899 
4900 typedef struct CoroutineIOCompletion {
4901     Coroutine *coroutine;
4902     int ret;
4903 } CoroutineIOCompletion;
4904 
4905 static void bdrv_co_io_em_complete(void *opaque, int ret)
4906 {
4907     CoroutineIOCompletion *co = opaque;
4908 
4909     co->ret = ret;
4910     qemu_coroutine_enter(co->coroutine, NULL);
4911 }
4912 
4913 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4914                                       int nb_sectors, QEMUIOVector *iov,
4915                                       bool is_write)
4916 {
4917     CoroutineIOCompletion co = {
4918         .coroutine = qemu_coroutine_self(),
4919     };
4920     BlockDriverAIOCB *acb;
4921 
4922     if (is_write) {
4923         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4924                                        bdrv_co_io_em_complete, &co);
4925     } else {
4926         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4927                                       bdrv_co_io_em_complete, &co);
4928     }
4929 
4930     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4931     if (!acb) {
4932         return -EIO;
4933     }
4934     qemu_coroutine_yield();
4935 
4936     return co.ret;
4937 }
4938 
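/*
 * The helpers above show the canonical AIO-to-coroutine bridge (a short
 * added note, not part of the original file): the coroutine records
 * itself in a CoroutineIOCompletion, starts the driver's AIO call with
 * bdrv_co_io_em_complete as the callback, and yields.  When the request
 * finishes, the callback stores ret and re-enters the coroutine, which
 * simply returns co.ret.  Any driver AIO entry point can be wrapped this
 * way, as bdrv_co_flush() does below for bdrv_aio_flush.
 */
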
4939 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4940                                          int64_t sector_num, int nb_sectors,
4941                                          QEMUIOVector *iov)
4942 {
4943     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4944 }
4945 
4946 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4947                                          int64_t sector_num, int nb_sectors,
4948                                          QEMUIOVector *iov)
4949 {
4950     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4951 }
4952 
4953 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4954 {
4955     RwCo *rwco = opaque;
4956 
4957     rwco->ret = bdrv_co_flush(rwco->bs);
4958 }
4959 
4960 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4961 {
4962     int ret;
4963 
4964     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4965         return 0;
4966     }
4967 
4968     /* Write back cached data to the OS even with cache=unsafe */
4969     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4970     if (bs->drv->bdrv_co_flush_to_os) {
4971         ret = bs->drv->bdrv_co_flush_to_os(bs);
4972         if (ret < 0) {
4973             return ret;
4974         }
4975     }
4976 
4977     /* But don't actually force it to the disk with cache=unsafe */
4978     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4979         goto flush_parent;
4980     }
4981 
4982     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4983     if (bs->drv->bdrv_co_flush_to_disk) {
4984         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4985     } else if (bs->drv->bdrv_aio_flush) {
4986         BlockDriverAIOCB *acb;
4987         CoroutineIOCompletion co = {
4988             .coroutine = qemu_coroutine_self(),
4989         };
4990 
4991         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4992         if (acb == NULL) {
4993             ret = -EIO;
4994         } else {
4995             qemu_coroutine_yield();
4996             ret = co.ret;
4997         }
4998     } else {
4999         /*
5000          * Some block drivers always operate in either writethrough or unsafe
5001          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
5002          * know how the server works (because the behaviour is hardcoded or
5003          * depends on server-side configuration), so we can't ensure that
5004          * everything is safe on disk. Returning an error doesn't work because
5005          * that would break guests even if the server operates in writethrough
5006          * mode.
5007          *
5008          * Let's hope the user knows what he's doing.
5009          */
5010         ret = 0;
5011     }
5012     if (ret < 0) {
5013         return ret;
5014     }
5015 
5016     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
5017      * in the case of cache=unsafe, so there are no useless flushes.
5018      */
5019 flush_parent:
5020     return bdrv_co_flush(bs->file);
5021 }
5022 
5023 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5024 {
5025     Error *local_err = NULL;
5026     int ret;
5027 
5028     if (!bs->drv)  {
5029         return;
5030     }
5031 
5032     if (bs->drv->bdrv_invalidate_cache) {
5033         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5034     } else if (bs->file) {
5035         bdrv_invalidate_cache(bs->file, &local_err);
5036     }
5037     if (local_err) {
5038         error_propagate(errp, local_err);
5039         return;
5040     }
5041 
5042     ret = refresh_total_sectors(bs, bs->total_sectors);
5043     if (ret < 0) {
5044         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5045         return;
5046     }
5047 }
5048 
5049 void bdrv_invalidate_cache_all(Error **errp)
5050 {
5051     BlockDriverState *bs;
5052     Error *local_err = NULL;
5053 
5054     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5055         AioContext *aio_context = bdrv_get_aio_context(bs);
5056 
5057         aio_context_acquire(aio_context);
5058         bdrv_invalidate_cache(bs, &local_err);
5059         aio_context_release(aio_context);
5060         if (local_err) {
5061             error_propagate(errp, local_err);
5062             return;
5063         }
5064     }
5065 }
5066 
5067 void bdrv_clear_incoming_migration_all(void)
5068 {
5069     BlockDriverState *bs;
5070 
5071     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5072         AioContext *aio_context = bdrv_get_aio_context(bs);
5073 
5074         aio_context_acquire(aio_context);
5075         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
5076         aio_context_release(aio_context);
5077     }
5078 }
5079 
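/*
 * Synchronous wrapper around bdrv_co_flush(): when already in coroutine
 * context the entry point runs directly, otherwise a new coroutine is
 * spawned and the BDS's AioContext is polled until rwco.ret signals
 * completion (it is preset to the NOT_DONE marker).
 */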
5080 int bdrv_flush(BlockDriverState *bs)
5081 {
5082     Coroutine *co;
5083     RwCo rwco = {
5084         .bs = bs,
5085         .ret = NOT_DONE,
5086     };
5087 
5088     if (qemu_in_coroutine()) {
5089         /* Fast-path if already in coroutine context */
5090         bdrv_flush_co_entry(&rwco);
5091     } else {
5092         AioContext *aio_context = bdrv_get_aio_context(bs);
5093 
5094         co = qemu_coroutine_create(bdrv_flush_co_entry);
5095         qemu_coroutine_enter(co, &rwco);
5096         while (rwco.ret == NOT_DONE) {
5097             aio_poll(aio_context, true);
5098         }
5099     }
5100 
5101     return rwco.ret;
5102 }
5103 
5104 typedef struct DiscardCo {
5105     BlockDriverState *bs;
5106     int64_t sector_num;
5107     int nb_sectors;
5108     int ret;
5109 } DiscardCo;
5110 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5111 {
5112     DiscardCo *rwco = opaque;
5113 
5114     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5115 }
5116 
5117 /* If no limit is specified in the BlockLimits, use a default
5118  * of 32768 512-byte sectors (16 MiB) per request.
5119  */
5120 #define MAX_DISCARD_DEFAULT 32768
5121 
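/*
 * Discard (unmap) nb_sectors starting at sector_num.  The request is split
 * so that each piece honours the driver's discard_alignment and max_discard
 * limits; if the driver implements neither bdrv_co_discard nor
 * bdrv_aio_discard, or BDRV_O_UNMAP is not set, this is a no-op.
 */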
5122 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5123                                  int nb_sectors)
5124 {
5125     int max_discard;
5126 
5127     if (!bs->drv) {
5128         return -ENOMEDIUM;
5129     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5130         return -EIO;
5131     } else if (bs->read_only) {
5132         return -EROFS;
5133     }
5134 
5135     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5136 
5137     /* Do nothing if disabled.  */
5138     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5139         return 0;
5140     }
5141 
5142     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5143         return 0;
5144     }
5145 
5146     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5147     while (nb_sectors > 0) {
5148         int ret;
5149         int num = nb_sectors;
5150 
5151         /* align request */
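        /* For example (illustrative): with discard_alignment = 128 and
         * sector_num = 130, num is first capped to 128 and then reduced by
         * 130 % 128 = 2, so this piece ends on a 128-sector boundary and
         * the next iteration starts aligned. */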
5152         if (bs->bl.discard_alignment &&
5153             num >= bs->bl.discard_alignment &&
5154             sector_num % bs->bl.discard_alignment) {
5155             if (num > bs->bl.discard_alignment) {
5156                 num = bs->bl.discard_alignment;
5157             }
5158             num -= sector_num % bs->bl.discard_alignment;
5159         }
5160 
5161         /* limit request size */
5162         if (num > max_discard) {
5163             num = max_discard;
5164         }
5165 
5166         if (bs->drv->bdrv_co_discard) {
5167             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5168         } else {
5169             BlockDriverAIOCB *acb;
5170             CoroutineIOCompletion co = {
5171                 .coroutine = qemu_coroutine_self(),
5172             };
5173 
5174             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5175                                             bdrv_co_io_em_complete, &co);
5176             if (acb == NULL) {
5177                 return -EIO;
5178             } else {
5179                 qemu_coroutine_yield();
5180                 ret = co.ret;
5181             }
5182         }
5183         if (ret && ret != -ENOTSUP) {
5184             return ret;
5185         }
5186 
5187         sector_num += num;
5188         nb_sectors -= num;
5189     }
5190     return 0;
5191 }
5192 
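/* Synchronous wrapper around bdrv_co_discard(); uses the same
 * coroutine-dispatch pattern as bdrv_flush() above. */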
5193 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5194 {
5195     Coroutine *co;
5196     DiscardCo rwco = {
5197         .bs = bs,
5198         .sector_num = sector_num,
5199         .nb_sectors = nb_sectors,
5200         .ret = NOT_DONE,
5201     };
5202 
5203     if (qemu_in_coroutine()) {
5204         /* Fast-path if already in coroutine context */
5205         bdrv_discard_co_entry(&rwco);
5206     } else {
5207         AioContext *aio_context = bdrv_get_aio_context(bs);
5208 
5209         co = qemu_coroutine_create(bdrv_discard_co_entry);
5210         qemu_coroutine_enter(co, &rwco);
5211         while (rwco.ret == NOT_DONE) {
5212             aio_poll(aio_context, true);
5213         }
5214     }
5215 
5216     return rwco.ret;
5217 }
5218 
5219 /**************************************************************/
5220 /* removable device support */
5221 
5222 /**
5223  * Return TRUE if the media is present
5224  */
5225 int bdrv_is_inserted(BlockDriverState *bs)
5226 {
5227     BlockDriver *drv = bs->drv;
5228 
5229     if (!drv) {
5230         return 0;
    }
5231     if (!drv->bdrv_is_inserted) {
5232         return 1;
    }
5233     return drv->bdrv_is_inserted(bs);
5234 }
5235 
5236 /**
5237  * Return whether the media changed since the last call to this
5238  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5239  */
5240 int bdrv_media_changed(BlockDriverState *bs)
5241 {
5242     BlockDriver *drv = bs->drv;
5243 
5244     if (drv && drv->bdrv_media_changed) {
5245         return drv->bdrv_media_changed(bs);
5246     }
5247     return -ENOTSUP;
5248 }
5249 
5250 /**
5251  * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5252  */
5253 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5254 {
5255     BlockDriver *drv = bs->drv;
5256 
5257     if (drv && drv->bdrv_eject) {
5258         drv->bdrv_eject(bs, eject_flag);
5259     }
5260 
5261     if (bs->device_name[0] != '\0') {
5262         qapi_event_send_device_tray_moved(bdrv_get_device_name(bs),
5263                                           eject_flag, &error_abort);
5264     }
5265 }
5266 
5267 /**
5268  * Lock or unlock the media (if it is locked, the user won't be able
5269  * to eject it manually).
5270  */
5271 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5272 {
5273     BlockDriver *drv = bs->drv;
5274 
5275     trace_bdrv_lock_medium(bs, locked);
5276 
5277     if (drv && drv->bdrv_lock_medium) {
5278         drv->bdrv_lock_medium(bs, locked);
5279     }
5280 }
5281 
5282 /* needed for generic scsi interface */
5283 
5284 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5285 {
5286     BlockDriver *drv = bs->drv;
5287 
5288     if (drv && drv->bdrv_ioctl) {
5289         return drv->bdrv_ioctl(bs, req, buf);
    }
5290     return -ENOTSUP;
5291 }
5292 
5293 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5294         unsigned long int req, void *buf,
5295         BlockDriverCompletionFunc *cb, void *opaque)
5296 {
5297     BlockDriver *drv = bs->drv;
5298 
5299     if (drv && drv->bdrv_aio_ioctl) {
5300         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
5301     return NULL;
5302 }
5303 
5304 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5305 {
5306     bs->guest_block_size = align;
5307 }
5308 
5309 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5310 {
5311     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5312 }
5313 
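/* Like qemu_blockalign(), but returns NULL on allocation failure instead of
 * aborting. */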
5314 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5315 {
5316     size_t align = bdrv_opt_mem_align(bs);
5317 
5318     /* Ensure that NULL is never returned on success */
5319     assert(align > 0);
5320     if (size == 0) {
5321         size = align;
5322     }
5323 
5324     return qemu_try_memalign(align, size);
5325 }
5326 
5327 /*
5328  * Check if all memory in this vector meets the driver's memory alignment.
5329  */
5330 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5331 {
5332     int i;
5333     size_t alignment = bdrv_opt_mem_align(bs);
5334 
5335     for (i = 0; i < qiov->niov; i++) {
5336         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5337             return false;
5338         }
5339         if (qiov->iov[i].iov_len % alignment) {
5340             return false;
5341         }
5342     }
5343 
5344     return true;
5345 }
5346 
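/*
 * Create a new dirty bitmap for bs.  @granularity is in bytes, must be a
 * power of two and at least BDRV_SECTOR_SIZE; it is converted to a
 * per-sector hbitmap whose granularity is log2(granularity >>
 * BDRV_SECTOR_BITS).
 */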
5347 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5348                                           Error **errp)
5349 {
5350     int64_t bitmap_size;
5351     BdrvDirtyBitmap *bitmap;
5352 
5353     assert((granularity & (granularity - 1)) == 0);
5354 
5355     granularity >>= BDRV_SECTOR_BITS;
5356     assert(granularity);
5357     bitmap_size = bdrv_nb_sectors(bs);
5358     if (bitmap_size < 0) {
5359         error_setg_errno(errp, -bitmap_size, "Could not get length of device");
5360         errno = -bitmap_size;
5361         return NULL;
5362     }
5363     bitmap = g_new0(BdrvDirtyBitmap, 1);
5364     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5365     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5366     return bitmap;
5367 }
5368 
5369 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5370 {
5371     BdrvDirtyBitmap *bm, *next;
5372     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5373         if (bm == bitmap) {
5374             QLIST_REMOVE(bitmap, list);
5375             hbitmap_free(bitmap->bitmap);
5376             g_free(bitmap);
5377             return;
5378         }
5379     }
5380 }
5381 
5382 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5383 {
5384     BdrvDirtyBitmap *bm;
5385     BlockDirtyInfoList *list = NULL;
5386     BlockDirtyInfoList **plist = &list;
5387 
5388     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5389         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5390         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5391         info->count = bdrv_get_dirty_count(bs, bm);
5392         info->granularity =
5393             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5394         entry->value = info;
5395         *plist = entry;
5396         plist = &entry->next;
5397     }
5398 
5399     return list;
5400 }
5401 
5402 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5403 {
5404     if (bitmap) {
5405         return hbitmap_get(bitmap->bitmap, sector);
5406     } else {
5407         return 0;
5408     }
5409 }
5410 
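/*
 * Initialise an iterator over the dirty sectors of a bitmap.  Typical use
 * (illustrative sketch):
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         ... process the dirty sector ...
 *     }
 */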
5411 void bdrv_dirty_iter_init(BlockDriverState *bs,
5412                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5413 {
5414     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5415 }
5416 
5417 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5418                     int nr_sectors)
5419 {
5420     BdrvDirtyBitmap *bitmap;
5421     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5422         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5423     }
5424 }
5425 
5426 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5427 {
5428     BdrvDirtyBitmap *bitmap;
5429     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5430         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5431     }
5432 }
5433 
5434 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5435 {
5436     return hbitmap_count(bitmap->bitmap);
5437 }
5438 
5439 /* Get a reference to bs */
5440 void bdrv_ref(BlockDriverState *bs)
5441 {
5442     bs->refcnt++;
5443 }
5444 
5445 /* Release a previously grabbed reference to bs.
5446  * If after releasing, reference count is zero, the BlockDriverState is
5447  * If the reference count drops to zero after releasing, the
5448  * BlockDriverState is deleted. */
5449 {
5450     if (!bs) {
5451         return;
5452     }
5453     assert(bs->refcnt > 0);
5454     if (--bs->refcnt == 0) {
5455         bdrv_delete(bs);
5456     }
5457 }
5458 
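/* A BdrvOpBlocker records a reason why one category of operations
 * (BlockOpType) is currently forbidden on a BDS.  Several blockers may be
 * installed per category; each is removed by passing the same Error pointer
 * to bdrv_op_unblock(). */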
5459 struct BdrvOpBlocker {
5460     Error *reason;
5461     QLIST_ENTRY(BdrvOpBlocker) list;
5462 };
5463 
5464 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5465 {
5466     BdrvOpBlocker *blocker;
5467     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5468     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5469         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5470         if (errp) {
5471             error_setg(errp, "Device '%s' is busy: %s",
5472                        bs->device_name, error_get_pretty(blocker->reason));
5473         }
5474         return true;
5475     }
5476     return false;
5477 }
5478 
5479 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5480 {
5481     BdrvOpBlocker *blocker;
5482     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5483 
5484     blocker = g_new0(BdrvOpBlocker, 1);
5485     blocker->reason = reason;
5486     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5487 }
5488 
5489 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5490 {
5491     BdrvOpBlocker *blocker, *next;
5492     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5493     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5494         if (blocker->reason == reason) {
5495             QLIST_REMOVE(blocker, list);
5496             g_free(blocker);
5497         }
5498     }
5499 }
5500 
5501 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5502 {
5503     int i;
5504     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5505         bdrv_op_block(bs, i, reason);
5506     }
5507 }
5508 
5509 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5510 {
5511     int i;
5512     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5513         bdrv_op_unblock(bs, i, reason);
5514     }
5515 }
5516 
5517 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5518 {
5519     int i;
5520 
5521     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5522         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5523             return false;
5524         }
5525     }
5526     return true;
5527 }
5528 
5529 void bdrv_iostatus_enable(BlockDriverState *bs)
5530 {
5531     bs->iostatus_enabled = true;
5532     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5533 }
5534 
5535 /* The I/O status is only enabled if the drive explicitly
5536  * enables it _and_ the VM is configured to stop on errors */
5537 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5538 {
5539     return (bs->iostatus_enabled &&
5540            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5541             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5542             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5543 }
5544 
5545 void bdrv_iostatus_disable(BlockDriverState *bs)
5546 {
5547     bs->iostatus_enabled = false;
5548 }
5549 
5550 void bdrv_iostatus_reset(BlockDriverState *bs)
5551 {
5552     if (bdrv_iostatus_is_enabled(bs)) {
5553         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5554         if (bs->job) {
5555             block_job_iostatus_reset(bs->job);
5556         }
5557     }
5558 }
5559 
5560 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5561 {
5562     assert(bdrv_iostatus_is_enabled(bs));
5563     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5564         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5565                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5566     }
5567 }
5568 
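/* I/O accounting: bdrv_acct_start() stamps a cookie with the request size,
 * type and start time; bdrv_acct_done() folds the cookie into the BDS's
 * per-type byte, operation and latency counters. */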
5569 void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
5570                      int64_t bytes, enum BlockAcctType type)
5572 {
5573     assert(type < BDRV_MAX_IOTYPE);
5574 
5575     cookie->bytes = bytes;
5576     cookie->start_time_ns = get_clock();
5577     cookie->type = type;
5578 }
5579 
5580 void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5582 {
5583     assert(cookie->type < BDRV_MAX_IOTYPE);
5584 
5585     bs->nr_bytes[cookie->type] += cookie->bytes;
5586     bs->nr_ops[cookie->type]++;
5587     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5588 }
5589 
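/*
 * Create an image file: look up the format and protocol drivers, apply the
 * -o style @options string, and, if no size is given, inherit it from the
 * backing file.  A call equivalent to
 * "qemu-img create -f qcow2 -b base.img overlay.qcow2 1G" might look like
 * (illustrative sketch, file names hypothetical):
 *
 *     Error *err = NULL;
 *     bdrv_img_create("overlay.qcow2", "qcow2", "base.img", NULL, NULL,
 *                     (uint64_t)1 << 30, 0, &err, true);
 */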
5590 void bdrv_img_create(const char *filename, const char *fmt,
5591                      const char *base_filename, const char *base_fmt,
5592                      char *options, uint64_t img_size, int flags,
5593                      Error **errp, bool quiet)
5594 {
5595     QemuOptsList *create_opts = NULL;
5596     QemuOpts *opts = NULL;
5597     const char *backing_fmt, *backing_file;
5598     int64_t size;
5599     BlockDriver *drv, *proto_drv;
5600     BlockDriver *backing_drv = NULL;
5601     Error *local_err = NULL;
5602     int ret = 0;
5603 
5604     /* Find driver and parse its options */
5605     drv = bdrv_find_format(fmt);
5606     if (!drv) {
5607         error_setg(errp, "Unknown file format '%s'", fmt);
5608         return;
5609     }
5610 
5611     proto_drv = bdrv_find_protocol(filename, true);
5612     if (!proto_drv) {
5613         error_setg(errp, "Unknown protocol '%s'", filename);
5614         return;
5615     }
5616 
5617     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5618     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5619 
5620     /* Create parameter list with default values */
5621     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5622     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5623 
5624     /* Parse -o options */
5625     if (options) {
5626         if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5627             error_setg(errp, "Invalid options for file format '%s'", fmt);
5628             goto out;
5629         }
5630     }
5631 
5632     if (base_filename) {
5633         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5634             error_setg(errp, "Backing file not supported for file format '%s'",
5635                        fmt);
5636             goto out;
5637         }
5638     }
5639 
5640     if (base_fmt) {
5641         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5642             error_setg(errp, "Backing file format not supported for file "
5643                              "format '%s'", fmt);
5644             goto out;
5645         }
5646     }
5647 
5648     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5649     if (backing_file) {
5650         if (!strcmp(filename, backing_file)) {
5651             error_setg(errp, "Trying to create an image with the "
5652                              "same filename as the backing file");
5653             goto out;
5654         }
5655     }
5656 
5657     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5658     if (backing_fmt) {
5659         backing_drv = bdrv_find_format(backing_fmt);
5660         if (!backing_drv) {
5661             error_setg(errp, "Unknown backing file format '%s'",
5662                        backing_fmt);
5663             goto out;
5664         }
5665     }
5666 
5667     /* The size for the image must always be specified, with one exception:
5668      * if we are using a backing file, we can obtain the size from there. */
5669     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5670     if (size == -1) {
5671         if (backing_file) {
5672             BlockDriverState *bs;
5674             int back_flags;
5675 
5676             /* backing files always opened read-only */
5677             back_flags =
5678                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5679 
5680             bs = NULL;
5681             ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5682                             backing_drv, &local_err);
5683             if (ret < 0) {
5684                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5685                                  backing_file,
5686                                  error_get_pretty(local_err));
5687                 error_free(local_err);
5688                 local_err = NULL;
5689                 goto out;
5690             }
5691             size = bdrv_getlength(bs);
5692             if (size < 0) {
5693                 error_setg_errno(errp, -size, "Could not get size of '%s'",
5694                                  backing_file);
5695                 bdrv_unref(bs);
5696                 goto out;
5697             }
5698 
5699             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5700 
5701             bdrv_unref(bs);
5702         } else {
5703             error_setg(errp, "Image creation needs a size parameter");
5704             goto out;
5705         }
5706     }
5707 
5708     if (!quiet) {
5709         printf("Formatting '%s', fmt=%s ", filename, fmt);
5710         qemu_opts_print(opts);
5711         puts("");
5712     }
5713 
5714     ret = bdrv_create(drv, filename, opts, &local_err);
5715 
5716     if (ret == -EFBIG) {
5717         /* This is generally a better message than whatever the driver would
5718          * deliver (especially because of the cluster_size_hint), since that
5719          * is most probably not much different from "image too large". */
5720         const char *cluster_size_hint = "";
5721         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5722             cluster_size_hint = " (try using a larger cluster size)";
5723         }
5724         error_setg(errp, "The image size is too large for file format '%s'"
5725                    "%s", fmt, cluster_size_hint);
5726         error_free(local_err);
5727         local_err = NULL;
5728     }
5729 
5730 out:
5731     qemu_opts_del(opts);
5732     qemu_opts_free(create_opts);
5733     if (local_err) {
5734         error_propagate(errp, local_err);
5735     }
5736 }
5737 
5738 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5739 {
5740     return bs->aio_context;
5741 }
5742 
5743 void bdrv_detach_aio_context(BlockDriverState *bs)
5744 {
5745     BdrvAioNotifier *baf;
5746 
5747     if (!bs->drv) {
5748         return;
5749     }
5750 
5751     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5752         baf->detach_aio_context(baf->opaque);
5753     }
5754 
5755     if (bs->io_limits_enabled) {
5756         throttle_detach_aio_context(&bs->throttle_state);
5757     }
5758     if (bs->drv->bdrv_detach_aio_context) {
5759         bs->drv->bdrv_detach_aio_context(bs);
5760     }
5761     if (bs->file) {
5762         bdrv_detach_aio_context(bs->file);
5763     }
5764     if (bs->backing_hd) {
5765         bdrv_detach_aio_context(bs->backing_hd);
5766     }
5767 
5768     bs->aio_context = NULL;
5769 }
5770 
5771 void bdrv_attach_aio_context(BlockDriverState *bs,
5772                              AioContext *new_context)
5773 {
5774     BdrvAioNotifier *ban;
5775 
5776     if (!bs->drv) {
5777         return;
5778     }
5779 
5780     bs->aio_context = new_context;
5781 
5782     if (bs->backing_hd) {
5783         bdrv_attach_aio_context(bs->backing_hd, new_context);
5784     }
5785     if (bs->file) {
5786         bdrv_attach_aio_context(bs->file, new_context);
5787     }
5788     if (bs->drv->bdrv_attach_aio_context) {
5789         bs->drv->bdrv_attach_aio_context(bs, new_context);
5790     }
5791     if (bs->io_limits_enabled) {
5792         throttle_attach_aio_context(&bs->throttle_state, new_context);
5793     }
5794 
5795     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5796         ban->attached_aio_context(new_context, ban->opaque);
5797     }
5798 }
5799 
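/* Move bs to new_context.  All in-flight requests are drained first; the
 * caller is expected to run in the context that currently owns bs. */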
5800 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5801 {
5802     bdrv_drain_all(); /* ensure there are no in-flight requests */
5803 
5804     bdrv_detach_aio_context(bs);
5805 
5806     /* This function executes in the old AioContext so acquire the new one in
5807      * case it runs in a different thread.
5808      */
5809     aio_context_acquire(new_context);
5810     bdrv_attach_aio_context(bs, new_context);
5811     aio_context_release(new_context);
5812 }
5813 
5814 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5815         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5816         void (*detach_aio_context)(void *opaque), void *opaque)
5817 {
5818     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5819     *ban = (BdrvAioNotifier){
5820         .attached_aio_context = attached_aio_context,
5821         .detach_aio_context   = detach_aio_context,
5822         .opaque               = opaque
5823     };
5824 
5825     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5826 }
5827 
5828 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5829                                       void (*attached_aio_context)(AioContext *,
5830                                                                    void *),
5831                                       void (*detach_aio_context)(void *),
5832                                       void *opaque)
5833 {
5834     BdrvAioNotifier *ban, *ban_next;
5835 
5836     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5837         if (ban->attached_aio_context == attached_aio_context &&
5838             ban->detach_aio_context   == detach_aio_context   &&
5839             ban->opaque               == opaque)
5840         {
5841             QLIST_REMOVE(ban, list);
5842             g_free(ban);
5843 
5844             return;
5845         }
5846     }
5847 
5848     abort();
5849 }
5850 
5851 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5852                                     NotifierWithReturn *notifier)
5853 {
5854     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5855 }
5856 
5857 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts)
5858 {
5859     if (!bs->drv->bdrv_amend_options) {
5860         return -ENOTSUP;
5861     }
5862     return bs->drv->bdrv_amend_options(bs, opts);
5863 }
5864 
5865 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5866  * of block filters and by bdrv_is_first_non_filter.
5867  * It is used to test whether the given bs is the candidate, or to recurse
5868  * further down the node graph.
5869  */
5870 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5871                                       BlockDriverState *candidate)
5872 {
5873     /* return false if basic checks fail */
5874     if (!bs || !bs->drv) {
5875         return false;
5876     }
5877 
5878     /* the code reached a non-filter block driver -> check if the bs is
5879      * the same as the candidate. It's the recursion termination condition.
5880      */
5881     if (!bs->drv->is_filter) {
5882         return bs == candidate;
5883     }
5884     /* Down this path the driver is a block filter driver */
5885 
5886     /* If the block filter recursion method is defined, use it to recurse down
5887      * the node graph.
5888      */
5889     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5890         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5891     }
5892 
5893     /* the driver is a block filter but doesn't allow recursion -> return false
5894      */
5895     return false;
5896 }
5897 
5898 /* This function checks if the candidate is the first non-filter bs down its
5899  * bs chain. Since we don't have pointers to parents, it explores all bs
5900  * chains from the top. Some filters can choose not to pass down the recursion.
5901  */
5902 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5903 {
5904     BlockDriverState *bs;
5905 
5906     /* walk down the bs forest recursively */
5907     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5908         bool perm;
5909 
5910         /* try to recurse in this top level bs */
5911         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5912 
5913         /* candidate is the first non filter */
5914         if (perm) {
5915             return true;
5916         }
5917     }
5918 
5919     return false;
5920 }
5921 
5922 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5923 {
5924     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5925     if (!to_replace_bs) {
5926         error_setg(errp, "Node name '%s' not found", node_name);
5927         return NULL;
5928     }
5929 
5930     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5931         return NULL;
5932     }
5933 
5934     /* We don't want an arbitrary node of the BDS chain to be replaced, only
5935      * the topmost non-filter, in order to prevent data corruption.
5936      * Another benefit is that this test excludes backing files, which are
5937      * blocked by the backing blockers.
5938      */
5939     if (!bdrv_is_first_non_filter(to_replace_bs)) {
5940         error_setg(errp, "Only the topmost non-filter can be replaced");
5941         return NULL;
5942     }
5943 
5944     return to_replace_bs;
5945 }
5946 
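/* Hint that subsequently submitted requests may be batched by the driver
 * until bdrv_io_unplug() is called; forwarded to the protocol layer when the
 * format driver has no implementation of its own. */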
5947 void bdrv_io_plug(BlockDriverState *bs)
5948 {
5949     BlockDriver *drv = bs->drv;
5950     if (drv && drv->bdrv_io_plug) {
5951         drv->bdrv_io_plug(bs);
5952     } else if (bs->file) {
5953         bdrv_io_plug(bs->file);
5954     }
5955 }
5956 
5957 void bdrv_io_unplug(BlockDriverState *bs)
5958 {
5959     BlockDriver *drv = bs->drv;
5960     if (drv && drv->bdrv_io_unplug) {
5961         drv->bdrv_io_unplug(bs);
5962     } else if (bs->file) {
5963         bdrv_io_unplug(bs->file);
5964     }
5965 }
5966 
5967 void bdrv_flush_io_queue(BlockDriverState *bs)
5968 {
5969     BlockDriver *drv = bs->drv;
5970     if (drv && drv->bdrv_flush_io_queue) {
5971         drv->bdrv_flush_io_queue(bs);
5972     } else if (bs->file) {
5973         bdrv_flush_io_queue(bs->file);
5974     }
5975 }
5976 
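/* Copy the driver-specific options of this BDS level (top-level keys without
 * a '.' separator, excluding "node-name") into @d; returns true if any
 * option was copied. */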
5977 static bool append_open_options(QDict *d, BlockDriverState *bs)
5978 {
5979     const QDictEntry *entry;
5980     bool found_any = false;
5981 
5982     for (entry = qdict_first(bs->options); entry;
5983          entry = qdict_next(bs->options, entry))
5984     {
5985         /* Only take options for this level and exclude all non-driver-specific
5986          * options */
5987         if (!strchr(qdict_entry_key(entry), '.') &&
5988             strcmp(qdict_entry_key(entry), "node-name"))
5989         {
5990             qobject_incref(qdict_entry_value(entry));
5991             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5992             found_any = true;
5993         }
5994     }
5995 
5996     return found_any;
5997 }
5998 
5999 /* Updates the following BDS fields:
6000  *  - exact_filename: A filename which may be used for opening a block device
6001  *                    which (mostly) equals the given BDS (even without any
6002  *                    other options; so reading and writing must return the same
6003  *                    results, but caching etc. may be different)
6004  *  - full_open_options: Options which, when given when opening a block device
6005  *                       (without a filename), result in a BDS (mostly)
6006  *                       equalling the given one
6007  *  - filename: If exact_filename is set, it is copied here. Otherwise,
6008  *              full_open_options is converted to a JSON object, prefixed with
6009  *              "json:" (for use through the JSON pseudo protocol) and put here.
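 *
 *              For example (illustrative), a raw BDS on top of a file might
 *              end up with
 *              json:{"driver": "raw", "file": {"driver": "file",
 *                    "filename": "/tmp/test.img"}}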
6010  */
6011 void bdrv_refresh_filename(BlockDriverState *bs)
6012 {
6013     BlockDriver *drv = bs->drv;
6014     QDict *opts;
6015 
6016     if (!drv) {
6017         return;
6018     }
6019 
6020     /* This BDS's file name will most probably depend on its file's name, so
6021      * refresh that first */
6022     if (bs->file) {
6023         bdrv_refresh_filename(bs->file);
6024     }
6025 
6026     if (drv->bdrv_refresh_filename) {
6027         /* Obsolete information is of no use here, so drop the old file name
6028          * information before refreshing it */
6029         bs->exact_filename[0] = '\0';
6030         if (bs->full_open_options) {
6031             QDECREF(bs->full_open_options);
6032             bs->full_open_options = NULL;
6033         }
6034 
6035         drv->bdrv_refresh_filename(bs);
6036     } else if (bs->file) {
6037         /* Try to reconstruct valid information from the underlying file */
6038         bool has_open_options;
6039 
6040         bs->exact_filename[0] = '\0';
6041         if (bs->full_open_options) {
6042             QDECREF(bs->full_open_options);
6043             bs->full_open_options = NULL;
6044         }
6045 
6046         opts = qdict_new();
6047         has_open_options = append_open_options(opts, bs);
6048 
6049         /* If no specific options have been given for this BDS, the filename of
6050          * the underlying file should suffice for this one as well */
6051         if (bs->file->exact_filename[0] && !has_open_options) {
6052             strcpy(bs->exact_filename, bs->file->exact_filename);
6053         }
6054         /* Reconstructing the full options QDict is simple for most format block
6055          * drivers, as long as the full options are known for the underlying
6056          * file BDS. The full options QDict of that file BDS should somehow
6057          * contain a representation of the filename, therefore the following
6058          * suffices without querying the (exact_)filename of this BDS. */
6059         if (bs->file->full_open_options) {
6060             qdict_put_obj(opts, "driver",
6061                           QOBJECT(qstring_from_str(drv->format_name)));
6062             QINCREF(bs->file->full_open_options);
6063             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6064 
6065             bs->full_open_options = opts;
6066         } else {
6067             QDECREF(opts);
6068         }
6069     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6070         /* There is no underlying file BDS (at least referenced by BDS.file),
6071          * so the full options QDict should be equal to the options given
6072          * specifically for this block device when it was opened (plus the
6073          * driver specification).
6074          * Because those options don't change, there is no need to update
6075          * full_open_options when it's already set. */
6076 
6077         opts = qdict_new();
6078         append_open_options(opts, bs);
6079         qdict_put_obj(opts, "driver",
6080                       QOBJECT(qstring_from_str(drv->format_name)));
6081 
6082         if (bs->exact_filename[0]) {
6083             /* This may not work for all block protocol drivers (some may
6084              * require this filename to be parsed), but we have to find some
6085              * default solution here, so just include it. If some block driver
6086              * does not support pure options without any filename at all or
6087              * needs some special format of the options QDict, it needs to
6088              * implement the driver-specific bdrv_refresh_filename() function.
6089              */
6090             qdict_put_obj(opts, "filename",
6091                           QOBJECT(qstring_from_str(bs->exact_filename)));
6092         }
6093 
6094         bs->full_open_options = opts;
6095     }
6096 
6097     if (bs->exact_filename[0]) {
6098         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6099     } else if (bs->full_open_options) {
6100         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6101         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6102                  qstring_get_str(json));
6103         QDECREF(json);
6104     }
6105 }
6106