/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

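/* A dirty bitmap tracks which regions of a BDS have been written; each BDS
 * keeps a list of them so that multiple users (e.g. block jobs, migration)
 * can track writes independently. */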
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:      the number of bytes of the I/O
 * @is_write:   whether the I/O is a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * throttle the I/O by queueing it */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

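/* Return the memory alignment required for optimal I/O buffer allocation on
 * this BDS; falls back to a safe 4k default when no driver is attached. */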
size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

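/* Check whether a path is absolute; on Windows this also accepts
 * drive-letter ("c:") and UNC-style ("\\.\...") prefixes. */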
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

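/* Build the full path of bs's backing file into dest, resolving it relative
 * to the image's own filename unless it is absolute or protocol-prefixed. */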
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

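/* Register a block driver with the block layer, installing coroutine and
 * AIO emulation wrappers for any interface the driver does not implement. */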
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

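/* Allocate a new BDS and add it to the global bdrv_states list of root
 * (device-level) BlockDriverStates. */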
BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

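/* Look up a registered block driver by format name; returns NULL if no such
 * driver has been registered. */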
BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

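/* Convenience wrapper around bdrv_create() that picks the protocol driver
 * (e.g. "file") from the filename. */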
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return &bdrv_file;
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf         contains the image's first @buf_size bytes.
 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 *              but can be smaller if the image file is smaller)
 * @filename    is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                            const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

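/* Read the image header and probe all drivers to choose a format driver;
 * scsi-generic devices and empty drives always get the raw driver. */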
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        *pdrv = &bdrv_raw;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

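/* Compute the flags actually passed to the driver's open function from the
 * BDRV_O_* flags requested for this BDS. */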
static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

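/* Validate a user-supplied node name, reject clashes with existing device
 * and node names, and install it on the BDS. */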
static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

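/* Parse a "json:{...}" pseudo-protocol filename into a flattened options
 * QDict; returns NULL and sets errp on malformed input. */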
static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

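/* Install (or, if backing_hd is NULL, remove) the backing file of a BDS,
 * updating op blockers and the cached backing filename/format. */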
void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put(options, "driver", qstring_from_str(bs->backing_format));
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), NULL, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

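/* For snapshot=on: create a temporary qcow2 overlay backed by bs and insert
 * it above bs in the graph, so that guest writes go to the throw-away
 * overlay instead of the original image. */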
int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, &bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or alternatively may be NULL, in which
 * case a new BlockReopenQueue will be created and initialized. This newly
 * created BlockReopenQueue should be passed back in for subsequent calls
 * that are intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

 * Prepares a BlockDriverState for reopen.  All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is allocated and used by
 * the block driver's .bdrv_reopen_prepare() callback.
1718  *
1719  * bs is the BlockDriverState to reopen
1720  * flags are the new open flags
1721  * queue is the reopen queue
1722  *
 * Returns 0 on success, non-zero on error.  On error, errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call abort() or
 * commit() for any other BDS that have been left in a prepare() state.
1729  *
1730  */
1731 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1732                         Error **errp)
1733 {
1734     int ret = -1;
1735     Error *local_err = NULL;
1736     BlockDriver *drv;
1737 
1738     assert(reopen_state != NULL);
1739     assert(reopen_state->bs->drv != NULL);
1740     drv = reopen_state->bs->drv;
1741 
1742     /* if we are to stay read-only, do not allow permission change
1743      * to r/w */
1744     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1745         reopen_state->flags & BDRV_O_RDWR) {
1746         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1747                   bdrv_get_device_name(reopen_state->bs));
1748         goto error;
1749     }
1752     ret = bdrv_flush(reopen_state->bs);
1753     if (ret) {
1754         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1755                   strerror(-ret));
1756         goto error;
1757     }
1758 
1759     if (drv->bdrv_reopen_prepare) {
1760         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1761         if (ret) {
1762             if (local_err != NULL) {
1763                 error_propagate(errp, local_err);
1764             } else {
1765                 error_setg(errp, "failed while preparing to reopen image '%s'",
1766                            reopen_state->bs->filename);
1767             }
1768             goto error;
1769         }
1770     } else {
1771         /* It is currently mandatory to have a bdrv_reopen_prepare()
1772          * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, bdrv_get_device_name(reopen_state->bs),
                  "reopening of file");
1776         ret = -1;
1777         goto error;
1778     }
1779 
1780     ret = 0;
1781 
1782 error:
1783     return ret;
1784 }
1785 
1786 /*
1787  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1788  * makes them final by swapping the staging BlockDriverState contents into
1789  * the active BlockDriverState contents.
1790  */
1791 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1792 {
1793     BlockDriver *drv;
1794 
1795     assert(reopen_state != NULL);
1796     drv = reopen_state->bs->drv;
1797     assert(drv != NULL);
1798 
1799     /* If there are any driver level actions to take */
1800     if (drv->bdrv_reopen_commit) {
1801         drv->bdrv_reopen_commit(reopen_state);
1802     }
1803 
1804     /* set BDS specific flags now */
1805     reopen_state->bs->open_flags         = reopen_state->flags;
1806     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1807                                               BDRV_O_CACHE_WB);
1808     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1809 
1810     bdrv_refresh_limits(reopen_state->bs, NULL);
1811 }
1812 
1813 /*
1814  * Abort the reopen, and delete and free the staged changes in
1815  * reopen_state
1816  */
1817 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1818 {
1819     BlockDriver *drv;
1820 
1821     assert(reopen_state != NULL);
1822     drv = reopen_state->bs->drv;
1823     assert(drv != NULL);
1824 
1825     if (drv->bdrv_reopen_abort) {
1826         drv->bdrv_reopen_abort(reopen_state);
1827     }
1828 }
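
/*
 * Example (hypothetical sketch, not taken from any real driver): the shape
 * of the prepare/commit/abort trio a format driver provides.  The names
 * HypoReopenState and hypo_reopen_* are invented for illustration; real
 * drivers stage whatever state they need in reopen_state->opaque.
 *
 *     static int hypo_reopen_prepare(BDRVReopenState *state,
 *                                    BlockReopenQueue *queue, Error **errp)
 *     {
 *         state->opaque = g_new0(HypoReopenState, 1);
 *         // stage new resources (fds, settings, ...) in state->opaque;
 *         // on failure set errp, free the staging data and return -1
 *         return 0;
 *     }
 *
 *     static void hypo_reopen_commit(BDRVReopenState *state)
 *     {
 *         // swap the staged resources into state->bs->opaque,
 *         // release the old ones, then free the staging data
 *         g_free(state->opaque);
 *     }
 *
 *     static void hypo_reopen_abort(BDRVReopenState *state)
 *     {
 *         // drop the staged resources without touching state->bs
 *         g_free(state->opaque);
 *     }
 */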
1829 
1830 
1831 void bdrv_close(BlockDriverState *bs)
1832 {
1833     BdrvAioNotifier *ban, *ban_next;
1834 
1835     if (bs->job) {
1836         block_job_cancel_sync(bs->job);
1837     }
1838     bdrv_drain_all(); /* complete I/O */
1839     bdrv_flush(bs);
1840     bdrv_drain_all(); /* in case flush left pending I/O */
1841     notifier_list_notify(&bs->close_notifiers, bs);
1842 
1843     if (bs->drv) {
1844         if (bs->backing_hd) {
1845             BlockDriverState *backing_hd = bs->backing_hd;
1846             bdrv_set_backing_hd(bs, NULL);
1847             bdrv_unref(backing_hd);
1848         }
1849         bs->drv->bdrv_close(bs);
1850         g_free(bs->opaque);
1851         bs->opaque = NULL;
1852         bs->drv = NULL;
1853         bs->copy_on_read = 0;
1854         bs->backing_file[0] = '\0';
1855         bs->backing_format[0] = '\0';
1856         bs->total_sectors = 0;
1857         bs->encrypted = 0;
1858         bs->valid_key = 0;
1859         bs->sg = 0;
1860         bs->growable = 0;
1861         bs->zero_beyond_eof = false;
1862         QDECREF(bs->options);
1863         bs->options = NULL;
1864         QDECREF(bs->full_open_options);
1865         bs->full_open_options = NULL;
1866 
1867         if (bs->file != NULL) {
1868             bdrv_unref(bs->file);
1869             bs->file = NULL;
1870         }
1871     }
1872 
1873     if (bs->blk) {
1874         blk_dev_change_media_cb(bs->blk, false);
1875     }
1876 
    /* throttling disk I/O limits */
1878     if (bs->io_limits_enabled) {
1879         bdrv_io_limits_disable(bs);
1880     }
1881 
1882     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1883         g_free(ban);
1884     }
1885     QLIST_INIT(&bs->aio_notifiers);
1886 }
1887 
1888 void bdrv_close_all(void)
1889 {
1890     BlockDriverState *bs;
1891 
1892     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1893         AioContext *aio_context = bdrv_get_aio_context(bs);
1894 
1895         aio_context_acquire(aio_context);
1896         bdrv_close(bs);
1897         aio_context_release(aio_context);
1898     }
1899 }
1900 
1901 /* Check if any requests are in-flight (including throttled requests) */
1902 static bool bdrv_requests_pending(BlockDriverState *bs)
1903 {
1904     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1905         return true;
1906     }
1907     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1908         return true;
1909     }
1910     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1911         return true;
1912     }
1913     if (bs->file && bdrv_requests_pending(bs->file)) {
1914         return true;
1915     }
1916     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1917         return true;
1918     }
1919     return false;
1920 }
1921 
1922 static bool bdrv_drain_one(BlockDriverState *bs)
1923 {
1924     bool bs_busy;
1925 
1926     bdrv_flush_io_queue(bs);
1927     bdrv_start_throttled_reqs(bs);
1928     bs_busy = bdrv_requests_pending(bs);
1929     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1930     return bs_busy;
1931 }
1932 
1933 /*
1934  * Wait for pending requests to complete on a single BlockDriverState subtree
1935  *
1936  * See the warning in bdrv_drain_all().  This function can only be called if
1937  * you are sure nothing can generate I/O because you have op blockers
1938  * installed.
1939  *
 * Note that unlike bdrv_drain_all(), the caller must hold the
 * BlockDriverState's AioContext.
1942  */
1943 void bdrv_drain(BlockDriverState *bs)
1944 {
1945     while (bdrv_drain_one(bs)) {
1946         /* Keep iterating */
1947     }
1948 }
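
/*
 * Example (illustrative sketch): draining one BDS from code that does not
 * already run in its AioContext.  As required above, the caller must hold
 * the BlockDriverState's AioContext and must have ensured (e.g. via op
 * blockers) that no new I/O can be generated meanwhile.
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);
 *     aio_context_release(ctx);
 */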
1949 
1950 /*
1951  * Wait for pending requests to complete across all BlockDriverStates
1952  *
1953  * This function does not flush data to disk, use bdrv_flush_all() for that
1954  * after calling this function.
1955  *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example, a coroutine
 * can be arbitrarily complex, and a constant flow of I/O can arrive until the
 * coroutine completes.  Because of this, it is not possible to drain a single
 * device's I/O queue in isolation.
1961  */
1962 void bdrv_drain_all(void)
1963 {
1964     /* Always run first iteration so any pending completion BHs run */
1965     bool busy = true;
1966     BlockDriverState *bs;
1967 
1968     while (busy) {
1969         busy = false;
1970 
1971         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1972             AioContext *aio_context = bdrv_get_aio_context(bs);
1973 
1974             aio_context_acquire(aio_context);
1975             busy |= bdrv_drain_one(bs);
1976             aio_context_release(aio_context);
1977         }
1978     }
1979 }
1980 
/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.  Also, NUL-terminate the node_name to prevent a
 * double remove. */
1984 void bdrv_make_anon(BlockDriverState *bs)
1985 {
1986     /*
1987      * Take care to remove bs from bdrv_states only when it's actually
1988      * in it.  Note that bs->device_list.tqe_prev is initially null,
1989      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
1990      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1991      * resetting it to null on remove.
1992      */
1993     if (bs->device_list.tqe_prev) {
1994         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1995         bs->device_list.tqe_prev = NULL;
1996     }
1997     if (bs->node_name[0] != '\0') {
1998         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1999     }
2000     bs->node_name[0] = '\0';
2001 }
2002 
2003 static void bdrv_rebind(BlockDriverState *bs)
2004 {
2005     if (bs->drv && bs->drv->bdrv_rebind) {
2006         bs->drv->bdrv_rebind(bs);
2007     }
2008 }
2009 
2010 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2011                                      BlockDriverState *bs_src)
2012 {
2013     /* move some fields that need to stay attached to the device */
2014 
2015     /* dev info */
2016     bs_dest->guest_block_size   = bs_src->guest_block_size;
2017     bs_dest->copy_on_read       = bs_src->copy_on_read;
2018 
2019     bs_dest->enable_write_cache = bs_src->enable_write_cache;
2020 
2021     /* i/o throttled req */
2022     memcpy(&bs_dest->throttle_state,
2023            &bs_src->throttle_state,
2024            sizeof(ThrottleState));
2025     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
2026     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
2027     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
2028 
2029     /* r/w error */
2030     bs_dest->on_read_error      = bs_src->on_read_error;
2031     bs_dest->on_write_error     = bs_src->on_write_error;
2032 
2033     /* i/o status */
2034     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
2035     bs_dest->iostatus           = bs_src->iostatus;
2036 
2037     /* dirty bitmap */
2038     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2039 
2040     /* reference count */
2041     bs_dest->refcnt             = bs_src->refcnt;
2042 
2043     /* job */
2044     bs_dest->job                = bs_src->job;
2045 
2046     /* keep the same entry in bdrv_states */
2047     bs_dest->device_list = bs_src->device_list;
2048     bs_dest->blk = bs_src->blk;
2049 
2050     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2051            sizeof(bs_dest->op_blockers));
2052 }
2053 
2054 /*
 * Swap bs contents for two image chains while they are live,
 * keeping the required fields on the BlockDriverState that is
 * actually attached to a device.
2058  *
2059  * This will modify the BlockDriverState fields, and swap contents
2060  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2061  *
2062  * bs_new must not be attached to a BlockBackend.
2063  *
2064  * This function does not create any image files.
2065  */
2066 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2067 {
2068     BlockDriverState tmp;
2069 
    /* The code needs to swap the node_name, but simply swapping node_list
     * won't work, so first remove the nodes from the graph list, do the
     * swap, and then insert them back if needed.
     */
2074     if (bs_new->node_name[0] != '\0') {
2075         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2076     }
2077     if (bs_old->node_name[0] != '\0') {
2078         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2079     }
2080 
2081     /* bs_new must be unattached and shouldn't have anything fancy enabled */
2082     assert(!bs_new->blk);
2083     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2084     assert(bs_new->job == NULL);
2085     assert(bs_new->io_limits_enabled == false);
2086     assert(!throttle_have_timer(&bs_new->throttle_state));
2087 
2088     tmp = *bs_new;
2089     *bs_new = *bs_old;
2090     *bs_old = tmp;
2091 
2092     /* there are some fields that should not be swapped, move them back */
2093     bdrv_move_feature_fields(&tmp, bs_old);
2094     bdrv_move_feature_fields(bs_old, bs_new);
2095     bdrv_move_feature_fields(bs_new, &tmp);
2096 
2097     /* bs_new must remain unattached */
2098     assert(!bs_new->blk);
2099 
2100     /* Check a few fields that should remain attached to the device */
2101     assert(bs_new->job == NULL);
2102     assert(bs_new->io_limits_enabled == false);
2103     assert(!throttle_have_timer(&bs_new->throttle_state));
2104 
2105     /* insert the nodes back into the graph node list if needed */
2106     if (bs_new->node_name[0] != '\0') {
2107         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2108     }
2109     if (bs_old->node_name[0] != '\0') {
2110         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2111     }
2112 
2113     bdrv_rebind(bs_new);
2114     bdrv_rebind(bs_old);
2115 }
2116 
2117 /*
 * Add new bs contents at the top of an image chain while the chain is
 * live, keeping the required fields on the top layer.
2120  *
2121  * This will modify the BlockDriverState fields, and swap contents
2122  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2123  *
2124  * bs_new must not be attached to a BlockBackend.
2125  *
2126  * This function does not create any image files.
2127  */
2128 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2129 {
2130     bdrv_swap(bs_new, bs_top);
2131 
    /* bs_new now holds what used to be bs_top's contents, since bdrv_swap()
     * exchanged the two; make it the backing file of the new top. */
2134     bdrv_set_backing_hd(bs_top, bs_new);
2135 }
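
/*
 * Example (illustrative sketch): the typical bdrv_append() use when taking
 * an external snapshot.  new_bs is assumed to be a freshly opened, still
 * unattached overlay image; active_bs is the device's current top layer.
 *
 *     bdrv_append(new_bs, active_bs);
 *
 * Afterwards active_bs holds the new overlay's contents and remains the BDS
 * attached to the device, while new_bs holds the old top layer and has
 * become active_bs's backing file.
 */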
2136 
2137 static void bdrv_delete(BlockDriverState *bs)
2138 {
2139     assert(!bs->job);
2140     assert(bdrv_op_blocker_is_empty(bs));
2141     assert(!bs->refcnt);
2142     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2143 
2144     bdrv_close(bs);
2145 
2146     /* remove from list, if necessary */
2147     bdrv_make_anon(bs);
2148 
2149     g_free(bs);
2150 }
2151 
2152 /*
2153  * Run consistency checks on an image
2154  *
2155  * Returns 0 if the check could be completed (it doesn't mean that the image is
2156  * free of errors) or -errno when an internal error occurred. The results of the
2157  * check are stored in res.
2158  */
2159 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2160 {
2161     if (bs->drv == NULL) {
2162         return -ENOMEDIUM;
2163     }
2164     if (bs->drv->bdrv_check == NULL) {
2165         return -ENOTSUP;
2166     }
2167 
2168     memset(res, 0, sizeof(*res));
2169     return bs->drv->bdrv_check(bs, res, fix);
2170 }
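
/*
 * Example (illustrative sketch): running a repairing consistency check.
 * The BdrvCheckMode combination below is assumed to match what
 * "qemu-img check -r all" requests.
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);
 *
 *     if (ret < 0) {
 *         // the check itself could not be completed (e.g. -ENOTSUP)
 *     } else if (result.corruptions || result.leaks) {
 *         // problems remain; result also carries the *_fixed counters
 *     }
 */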
2171 
2172 #define COMMIT_BUF_SECTORS 2048
2173 
/* Commit the COW image contents into its backing file */
2175 int bdrv_commit(BlockDriverState *bs)
2176 {
2177     BlockDriver *drv = bs->drv;
2178     int64_t sector, total_sectors, length, backing_length;
2179     int n, ro, open_flags;
2180     int ret = 0;
2181     uint8_t *buf = NULL;
2182     char filename[PATH_MAX];
2183 
    if (!drv) {
        return -ENOMEDIUM;
    }
2186 
2187     if (!bs->backing_hd) {
2188         return -ENOTSUP;
2189     }
2190 
2191     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2192         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2193         return -EBUSY;
2194     }
2195 
2196     ro = bs->backing_hd->read_only;
2197     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2198     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;
2200 
2201     if (ro) {
2202         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2203             return -EACCES;
2204         }
2205     }
2206 
2207     length = bdrv_getlength(bs);
2208     if (length < 0) {
2209         ret = length;
2210         goto ro_cleanup;
2211     }
2212 
2213     backing_length = bdrv_getlength(bs->backing_hd);
2214     if (backing_length < 0) {
2215         ret = backing_length;
2216         goto ro_cleanup;
2217     }
2218 
    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error. */
2222     if (length > backing_length) {
2223         ret = bdrv_truncate(bs->backing_hd, length);
2224         if (ret < 0) {
2225             goto ro_cleanup;
2226         }
2227     }
2228 
2229     total_sectors = length >> BDRV_SECTOR_BITS;
2230 
2231     /* qemu_try_blockalign() for bs will choose an alignment that works for
2232      * bs->backing_hd as well, so no need to compare the alignment manually. */
2233     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2234     if (buf == NULL) {
2235         ret = -ENOMEM;
2236         goto ro_cleanup;
2237     }
2238 
2239     for (sector = 0; sector < total_sectors; sector += n) {
2240         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2241         if (ret < 0) {
2242             goto ro_cleanup;
2243         }
2244         if (ret) {
2245             ret = bdrv_read(bs, sector, buf, n);
2246             if (ret < 0) {
2247                 goto ro_cleanup;
2248             }
2249 
2250             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2251             if (ret < 0) {
2252                 goto ro_cleanup;
2253             }
2254         }
2255     }
2256 
2257     if (drv->bdrv_make_empty) {
2258         ret = drv->bdrv_make_empty(bs);
2259         if (ret < 0) {
2260             goto ro_cleanup;
2261         }
2262         bdrv_flush(bs);
2263     }
2264 
2265     /*
2266      * Make sure all data we wrote to the backing device is actually
2267      * stable on disk.
2268      */
2269     if (bs->backing_hd) {
2270         bdrv_flush(bs->backing_hd);
2271     }
2272 
2273     ret = 0;
2274 ro_cleanup:
2275     qemu_vfree(buf);
2276 
2277     if (ro) {
2278         /* ignoring error return here */
2279         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2280     }
2281 
2282     return ret;
2283 }
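
/*
 * Example (illustrative sketch): merging a COW overlay into its backing
 * file and interpreting the more common error codes.
 *
 *     int ret = bdrv_commit(bs);
 *
 *     if (ret == -ENOTSUP) {
 *         // bs has no backing file to commit into
 *     } else if (ret == -EBUSY) {
 *         // an op blocker (e.g. a running block job) forbids the commit
 *     }
 */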
2284 
2285 int bdrv_commit_all(void)
2286 {
2287     BlockDriverState *bs;
2288 
2289     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2290         AioContext *aio_context = bdrv_get_aio_context(bs);
2291 
2292         aio_context_acquire(aio_context);
2293         if (bs->drv && bs->backing_hd) {
2294             int ret = bdrv_commit(bs);
2295             if (ret < 0) {
2296                 aio_context_release(aio_context);
2297                 return ret;
2298             }
2299         }
2300         aio_context_release(aio_context);
2301     }
2302     return 0;
2303 }
2304 
2305 /**
2306  * Remove an active request from the tracked requests list
2307  *
2308  * This function should be called when a tracked request is completing.
2309  */
2310 static void tracked_request_end(BdrvTrackedRequest *req)
2311 {
2312     if (req->serialising) {
2313         req->bs->serialising_in_flight--;
2314     }
2315 
2316     QLIST_REMOVE(req, list);
2317     qemu_co_queue_restart_all(&req->wait_queue);
2318 }
2319 
2320 /**
2321  * Add an active request to the tracked requests list
2322  */
2323 static void tracked_request_begin(BdrvTrackedRequest *req,
2324                                   BlockDriverState *bs,
2325                                   int64_t offset,
2326                                   unsigned int bytes, bool is_write)
2327 {
2328     *req = (BdrvTrackedRequest){
2329         .bs = bs,
2330         .offset         = offset,
2331         .bytes          = bytes,
2332         .is_write       = is_write,
2333         .co             = qemu_coroutine_self(),
2334         .serialising    = false,
2335         .overlap_offset = offset,
2336         .overlap_bytes  = bytes,
2337     };
2338 
2339     qemu_co_queue_init(&req->wait_queue);
2340 
2341     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2342 }
2343 
2344 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2345 {
2346     int64_t overlap_offset = req->offset & ~(align - 1);
2347     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2348                                - overlap_offset;
2349 
2350     if (!req->serialising) {
2351         req->bs->serialising_in_flight++;
2352         req->serialising = true;
2353     }
2354 
2355     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2356     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2357 }
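
/*
 * Worked example for mark_request_serialising(): with req->offset == 1536,
 * req->bytes == 1024 and align == 4096, the serialised window grows to
 * cluster granularity:
 *
 *     overlap_offset = 1536 & ~4095             = 0
 *     overlap_bytes  = ROUND_UP(2560, 4096) - 0 = 4096
 *
 * so any other request touching byte range [0, 4096) counts as overlapping
 * and has to wait.
 */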
2358 
2359 /**
2360  * Round a region to cluster boundaries
2361  */
2362 void bdrv_round_to_clusters(BlockDriverState *bs,
2363                             int64_t sector_num, int nb_sectors,
2364                             int64_t *cluster_sector_num,
2365                             int *cluster_nb_sectors)
2366 {
2367     BlockDriverInfo bdi;
2368 
2369     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2370         *cluster_sector_num = sector_num;
2371         *cluster_nb_sectors = nb_sectors;
2372     } else {
2373         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2374         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2375         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2376                                             nb_sectors, c);
2377     }
2378 }
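
/*
 * Worked example for bdrv_round_to_clusters(): with a 64 KiB cluster size
 * (128 sectors of 512 bytes), a request for sectors [100, 160) is widened
 * to whole clusters:
 *
 *     c                   = 65536 / 512                  = 128
 *     *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)    = 0
 *     *cluster_nb_sectors = QEMU_ALIGN_UP(100 + 60, 128) = 256
 *
 * i.e. the rounded region covers sectors [0, 256).
 */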
2379 
2380 static int bdrv_get_cluster_size(BlockDriverState *bs)
2381 {
2382     BlockDriverInfo bdi;
2383     int ret;
2384 
2385     ret = bdrv_get_info(bs, &bdi);
2386     if (ret < 0 || bdi.cluster_size == 0) {
2387         return bs->request_alignment;
2388     } else {
2389         return bdi.cluster_size;
2390     }
2391 }
2392 
2393 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2394                                      int64_t offset, unsigned int bytes)
2395 {
2396     /*        aaaa   bbbb */
2397     if (offset >= req->overlap_offset + req->overlap_bytes) {
2398         return false;
2399     }
2400     /* bbbb   aaaa        */
2401     if (req->overlap_offset >= offset + bytes) {
2402         return false;
2403     }
2404     return true;
2405 }
2406 
2407 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2408 {
2409     BlockDriverState *bs = self->bs;
2410     BdrvTrackedRequest *req;
2411     bool retry;
2412     bool waited = false;
2413 
2414     if (!bs->serialising_in_flight) {
2415         return false;
2416     }
2417 
2418     do {
2419         retry = false;
2420         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2421             if (req == self || (!req->serialising && !self->serialising)) {
2422                 continue;
2423             }
2424             if (tracked_request_overlaps(req, self->overlap_offset,
2425                                          self->overlap_bytes))
2426             {
2427                 /* Hitting this means there was a reentrant request, for
2428                  * example, a block driver issuing nested requests.  This must
2429                  * never happen since it means deadlock.
2430                  */
2431                 assert(qemu_coroutine_self() != req->co);
2432 
2433                 /* If the request is already (indirectly) waiting for us, or
2434                  * will wait for us as soon as it wakes up, then just go on
2435                  * (instead of producing a deadlock in the former case). */
2436                 if (!req->waiting_for) {
2437                     self->waiting_for = req;
2438                     qemu_co_queue_wait(&req->wait_queue);
2439                     self->waiting_for = NULL;
2440                     retry = true;
2441                     waited = true;
2442                     break;
2443                 }
2444             }
2445         }
2446     } while (retry);
2447 
2448     return waited;
2449 }
2450 
2451 /*
2452  * Return values:
2453  * 0        - success
2454  * -EINVAL  - backing format specified, but no file
2455  * -ENOSPC  - can't update the backing file because no space is left in the
2456  *            image file header
2457  * -ENOTSUP - format driver doesn't support changing the backing file
2458  */
2459 int bdrv_change_backing_file(BlockDriverState *bs,
2460     const char *backing_file, const char *backing_fmt)
2461 {
2462     BlockDriver *drv = bs->drv;
2463     int ret;
2464 
2465     /* Backing file format doesn't make sense without a backing file */
2466     if (backing_fmt && !backing_file) {
2467         return -EINVAL;
2468     }
2469 
2470     if (drv->bdrv_change_backing_file != NULL) {
2471         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2472     } else {
2473         ret = -ENOTSUP;
2474     }
2475 
2476     if (ret == 0) {
2477         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2478         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2479     }
2480     return ret;
2481 }
2482 
2483 /*
2484  * Finds the image layer in the chain that has 'bs' as its backing file.
2485  *
2486  * active is the current topmost image.
2487  *
2488  * Returns NULL if bs is not found in active's image chain,
2489  * or if active == bs.
2490  *
2491  * Returns the bottommost base image if bs == NULL.
2492  */
2493 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2494                                     BlockDriverState *bs)
2495 {
2496     while (active && bs != active->backing_hd) {
2497         active = active->backing_hd;
2498     }
2499 
2500     return active;
2501 }
2502 
2503 /* Given a BDS, searches for the base layer. */
2504 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2505 {
2506     return bdrv_find_overlay(bs, NULL);
2507 }
2508 
2509 typedef struct BlkIntermediateStates {
2510     BlockDriverState *bs;
2511     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2512 } BlkIntermediateStates;
2513 
2514 
2515 /*
2516  * Drops images above 'base' up to and including 'top', and sets the image
2517  * above 'top' to have base as its backing file.
2518  *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in top's overlay can be properly updated.
2521  *
2522  * E.g., this will convert the following chain:
2523  * bottom <- base <- intermediate <- top <- active
2524  *
2525  * to
2526  *
2527  * bottom <- base <- active
2528  *
2529  * It is allowed for bottom==base, in which case it converts:
2530  *
2531  * base <- intermediate <- top <- active
2532  *
2533  * to
2534  *
2535  * base <- active
2536  *
2537  * If backing_file_str is non-NULL, it will be used when modifying top's
2538  * overlay image metadata.
2539  *
2540  * Error conditions:
2541  *  if active == top, that is considered an error
2542  *
2543  */
2544 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2545                            BlockDriverState *base, const char *backing_file_str)
2546 {
2547     BlockDriverState *intermediate;
2548     BlockDriverState *base_bs = NULL;
2549     BlockDriverState *new_top_bs = NULL;
2550     BlkIntermediateStates *intermediate_state, *next;
2551     int ret = -EIO;
2552 
2553     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2554     QSIMPLEQ_INIT(&states_to_delete);
2555 
2556     if (!top->drv || !base->drv) {
2557         goto exit;
2558     }
2559 
2560     new_top_bs = bdrv_find_overlay(active, top);
2561 
2562     if (new_top_bs == NULL) {
2563         /* we could not find the image above 'top', this is an error */
2564         goto exit;
2565     }
2566 
2567     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2568      * to do, no intermediate images */
2569     if (new_top_bs->backing_hd == base) {
2570         ret = 0;
2571         goto exit;
2572     }
2573 
2574     intermediate = top;
2575 
2576     /* now we will go down through the list, and add each BDS we find
2577      * into our deletion queue, until we hit the 'base'
2578      */
2579     while (intermediate) {
2580         intermediate_state = g_new0(BlkIntermediateStates, 1);
2581         intermediate_state->bs = intermediate;
2582         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2583 
2584         if (intermediate->backing_hd == base) {
2585             base_bs = intermediate->backing_hd;
2586             break;
2587         }
2588         intermediate = intermediate->backing_hd;
2589     }
2590     if (base_bs == NULL) {
        /* Something went wrong; we did not end at the base.  Safely
         * unravel everything, and exit with an error. */
2593         goto exit;
2594     }
2595 
2596     /* success - we can delete the intermediate states, and link top->base */
2597     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2598     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2599                                    base_bs->drv ? base_bs->drv->format_name : "");
2600     if (ret) {
2601         goto exit;
2602     }
2603     bdrv_set_backing_hd(new_top_bs, base_bs);
2604 
2605     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2606         /* so that bdrv_close() does not recursively close the chain */
2607         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2608         bdrv_unref(intermediate_state->bs);
2609     }
2610     ret = 0;
2611 
2612 exit:
2613     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2614         g_free(intermediate_state);
2615     }
2616     return ret;
2617 }
2618 
2619 
2620 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2621                                    size_t size)
2622 {
2623     int64_t len;
2624 
2625     if (size > INT_MAX) {
2626         return -EIO;
2627     }
2628 
    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }
2642 
2643     return 0;
2644 }
2645 
2646 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2647                               int nb_sectors)
2648 {
2649     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2650         return -EIO;
2651     }
2652 
2653     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2654                                    nb_sectors * BDRV_SECTOR_SIZE);
2655 }
2656 
2657 typedef struct RwCo {
2658     BlockDriverState *bs;
2659     int64_t offset;
2660     QEMUIOVector *qiov;
2661     bool is_write;
2662     int ret;
2663     BdrvRequestFlags flags;
2664 } RwCo;
2665 
2666 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2667 {
2668     RwCo *rwco = opaque;
2669 
2670     if (!rwco->is_write) {
2671         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2672                                       rwco->qiov->size, rwco->qiov,
2673                                       rwco->flags);
2674     } else {
2675         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2676                                        rwco->qiov->size, rwco->qiov,
2677                                        rwco->flags);
2678     }
2679 }
2680 
2681 /*
2682  * Process a vectored synchronous request using coroutines
2683  */
2684 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2685                         QEMUIOVector *qiov, bool is_write,
2686                         BdrvRequestFlags flags)
2687 {
2688     Coroutine *co;
2689     RwCo rwco = {
2690         .bs = bs,
2691         .offset = offset,
2692         .qiov = qiov,
2693         .is_write = is_write,
2694         .ret = NOT_DONE,
2695         .flags = flags,
2696     };
2697 
    /**
     * In a synchronous call context, while the vcpu is blocked, the
     * throttling timer will not fire; so I/O throttling has to be disabled
     * here if it has been enabled.
     */
2703     if (bs->io_limits_enabled) {
2704         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2705                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2706         bdrv_io_limits_disable(bs);
2707     }
2708 
2709     if (qemu_in_coroutine()) {
2710         /* Fast-path if already in coroutine context */
2711         bdrv_rw_co_entry(&rwco);
2712     } else {
2713         AioContext *aio_context = bdrv_get_aio_context(bs);
2714 
2715         co = qemu_coroutine_create(bdrv_rw_co_entry);
2716         qemu_coroutine_enter(co, &rwco);
2717         while (rwco.ret == NOT_DONE) {
2718             aio_poll(aio_context, true);
2719         }
2720     }
2721     return rwco.ret;
2722 }
2723 
2724 /*
2725  * Process a synchronous request using coroutines
2726  */
2727 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2728                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2729 {
2730     QEMUIOVector qiov;
2731     struct iovec iov = {
2732         .iov_base = (void *)buf,
2733         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2734     };
2735 
2736     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2737         return -EINVAL;
2738     }
2739 
2740     qemu_iovec_init_external(&qiov, &iov, 1);
2741     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2742                         &qiov, is_write, flags);
2743 }
2744 
2745 /* return < 0 if error. See bdrv_write() for the return codes */
2746 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2747               uint8_t *buf, int nb_sectors)
2748 {
2749     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2750 }
2751 
2752 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2753 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2754                           uint8_t *buf, int nb_sectors)
2755 {
2756     bool enabled;
2757     int ret;
2758 
2759     enabled = bs->io_limits_enabled;
2760     bs->io_limits_enabled = false;
2761     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2762     bs->io_limits_enabled = enabled;
2763     return ret;
2764 }
2765 
/* Return < 0 if error. Important errors are:
 * -EIO         generic I/O error (may happen for all errors)
 * -ENOMEDIUM   No media inserted.
 * -EINVAL      Invalid sector number or nb_sectors
 * -EACCES      Trying to write to a read-only device
 */
2772 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2773                const uint8_t *buf, int nb_sectors)
2774 {
2775     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2776 }
2777 
2778 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2779                       int nb_sectors, BdrvRequestFlags flags)
2780 {
2781     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2782                       BDRV_REQ_ZERO_WRITE | flags);
2783 }
2784 
2785 /*
2786  * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to regions that do not already read back as zeroes.  Optional
2789  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2790  *
2791  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2792  */
2793 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2794 {
2795     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2796     int n;
2797 
2798     target_sectors = bdrv_nb_sectors(bs);
2799     if (target_sectors < 0) {
2800         return target_sectors;
2801     }
2802 
2803     for (;;) {
2804         nb_sectors = target_sectors - sector_num;
2805         if (nb_sectors <= 0) {
2806             return 0;
2807         }
2808         if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2809             nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2810         }
2811         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2812         if (ret < 0) {
2813             error_report("error getting block status at sector %" PRId64 ": %s",
2814                          sector_num, strerror(-ret));
2815             return ret;
2816         }
2817         if (ret & BDRV_BLOCK_ZERO) {
2818             sector_num += n;
2819             continue;
2820         }
2821         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2822         if (ret < 0) {
2823             error_report("error writing zeroes at sector %" PRId64 ": %s",
2824                          sector_num, strerror(-ret));
2825             return ret;
2826         }
2827         sector_num += n;
2828     }
2829 }
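
/*
 * Example (illustrative sketch): zeroing out a whole device while allowing
 * the driver to unmap/discard blocks instead of allocating zeroed ones.
 *
 *     int ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *
 *     if (ret < 0) {
 *         // see bdrv_write() for the possible error codes
 *     }
 */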
2830 
2831 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2832 {
2833     QEMUIOVector qiov;
2834     struct iovec iov = {
2835         .iov_base = (void *)buf,
2836         .iov_len = bytes,
2837     };
2838     int ret;
2839 
2840     if (bytes < 0) {
2841         return -EINVAL;
2842     }
2843 
2844     qemu_iovec_init_external(&qiov, &iov, 1);
2845     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2846     if (ret < 0) {
2847         return ret;
2848     }
2849 
2850     return bytes;
2851 }
2852 
2853 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2854 {
2855     int ret;
2856 
2857     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2858     if (ret < 0) {
2859         return ret;
2860     }
2861 
2862     return qiov->size;
2863 }
2864 
2865 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2866                 const void *buf, int bytes)
2867 {
2868     QEMUIOVector qiov;
2869     struct iovec iov = {
2870         .iov_base   = (void *) buf,
2871         .iov_len    = bytes,
2872     };
2873 
2874     if (bytes < 0) {
2875         return -EINVAL;
2876     }
2877 
2878     qemu_iovec_init_external(&qiov, &iov, 1);
2879     return bdrv_pwritev(bs, offset, &qiov);
2880 }
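
/*
 * Example (illustrative sketch): byte-based synchronous access to an image
 * header.  Unlike bdrv_read()/bdrv_write(), these helpers take a byte
 * offset and length rather than sector numbers.
 *
 *     uint8_t header[512];
 *
 *     if (bdrv_pread(bs, 0, header, sizeof(header)) < 0) {
 *         // read error
 *     }
 *     // ... modify header ...
 *     if (bdrv_pwrite(bs, 0, header, sizeof(header)) < 0) {
 *         // write error
 *     }
 */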
2881 
2882 /*
2883  * Writes to the file and ensures that no writes are reordered across this
2884  * request (acts as a barrier)
2885  *
2886  * Returns 0 on success, -errno in error cases.
2887  */
2888 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2889     const void *buf, int count)
2890 {
2891     int ret;
2892 
2893     ret = bdrv_pwrite(bs, offset, buf, count);
2894     if (ret < 0) {
2895         return ret;
2896     }
2897 
2898     /* No flush needed for cache modes that already do it */
2899     if (bs->enable_write_cache) {
2900         bdrv_flush(bs);
2901     }
2902 
2903     return 0;
2904 }
2905 
2906 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2907         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2908 {
2909     /* Perform I/O through a temporary buffer so that users who scribble over
2910      * their read buffer while the operation is in progress do not end up
2911      * modifying the image file.  This is critical for zero-copy guest I/O
2912      * where anything might happen inside guest memory.
2913      */
2914     void *bounce_buffer;
2915 
2916     BlockDriver *drv = bs->drv;
2917     struct iovec iov;
2918     QEMUIOVector bounce_qiov;
2919     int64_t cluster_sector_num;
2920     int cluster_nb_sectors;
2921     size_t skip_bytes;
2922     int ret;
2923 
    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating a cluster in the image file.
     */
2927     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2928                            &cluster_sector_num, &cluster_nb_sectors);
2929 
2930     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2931                                    cluster_sector_num, cluster_nb_sectors);
2932 
2933     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2934     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2935     if (bounce_buffer == NULL) {
2936         ret = -ENOMEM;
2937         goto err;
2938     }
2939 
2940     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2941 
2942     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2943                              &bounce_qiov);
2944     if (ret < 0) {
2945         goto err;
2946     }
2947 
2948     if (drv->bdrv_co_write_zeroes &&
2949         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2950         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2951                                       cluster_nb_sectors, 0);
2952     } else {
        /* This does not change the data on the disk, so it is not necessary
         * to flush even in cache=writethrough mode.
         */
2956         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2957                                   &bounce_qiov);
2958     }
2959 
2960     if (ret < 0) {
2961         /* It might be okay to ignore write errors for guest requests.  If this
2962          * is a deliberate copy-on-read then we don't want to ignore the error.
2963          * Simply report it in all cases.
2964          */
2965         goto err;
2966     }
2967 
2968     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2969     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2970                         nb_sectors * BDRV_SECTOR_SIZE);
2971 
2972 err:
2973     qemu_vfree(bounce_buffer);
2974     return ret;
2975 }
2976 
2977 /*
2978  * Forwards an already correctly aligned request to the BlockDriver. This
2979  * handles copy on read and zeroing after EOF; any other features must be
2980  * implemented by the caller.
2981  */
2982 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2983     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2984     int64_t align, QEMUIOVector *qiov, int flags)
2985 {
2986     BlockDriver *drv = bs->drv;
2987     int ret;
2988 
2989     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2990     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2991 
2992     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2993     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2994     assert(!qiov || bytes == qiov->size);
2995 
2996     /* Handle Copy on Read and associated serialisation */
2997     if (flags & BDRV_REQ_COPY_ON_READ) {
2998         /* If we touch the same cluster it counts as an overlap.  This
2999          * guarantees that allocating writes will be serialized and not race
3000          * with each other for the same cluster.  For example, in copy-on-read
3001          * it ensures that the CoR read and write operations are atomic and
3002          * guest writes cannot interleave between them. */
3003         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3004     }
3005 
3006     wait_serialising_requests(req);
3007 
3008     if (flags & BDRV_REQ_COPY_ON_READ) {
3009         int pnum;
3010 
3011         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3012         if (ret < 0) {
3013             goto out;
3014         }
3015 
3016         if (!ret || pnum != nb_sectors) {
3017             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3018             goto out;
3019         }
3020     }
3021 
3022     /* Forward the request to the BlockDriver */
3023     if (!(bs->zero_beyond_eof && bs->growable)) {
3024         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3025     } else {
        /* Read zeroes after EOF of growable BDSes */
3027         int64_t total_sectors, max_nb_sectors;
3028 
3029         total_sectors = bdrv_nb_sectors(bs);
3030         if (total_sectors < 0) {
3031             ret = total_sectors;
3032             goto out;
3033         }
3034 
3035         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3036                                   align >> BDRV_SECTOR_BITS);
3037         if (max_nb_sectors > 0) {
3038             QEMUIOVector local_qiov;
3039             size_t local_sectors;
3040 
            max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3042             local_sectors = MIN(max_nb_sectors, nb_sectors);
3043 
3044             qemu_iovec_init(&local_qiov, qiov->niov);
3045             qemu_iovec_concat(&local_qiov, qiov, 0,
3046                               local_sectors * BDRV_SECTOR_SIZE);
3047 
3048             ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3049                                      &local_qiov);
3050 
3051             qemu_iovec_destroy(&local_qiov);
3052         } else {
3053             ret = 0;
3054         }
3055 
3056         /* Reading beyond end of file is supposed to produce zeroes */
3057         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3058             uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (nb_sectors - offset) * BDRV_SECTOR_SIZE;
3061             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3062         }
3063     }
3064 
3065 out:
3066     return ret;
3067 }
3068 
3069 /*
3070  * Handle a read request in coroutine context
3071  */
3072 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3073     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3074     BdrvRequestFlags flags)
3075 {
3076     BlockDriver *drv = bs->drv;
3077     BdrvTrackedRequest req;
3078 
3079     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3080     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3081     uint8_t *head_buf = NULL;
3082     uint8_t *tail_buf = NULL;
3083     QEMUIOVector local_qiov;
3084     bool use_local_qiov = false;
3085     int ret;
3086 
3087     if (!drv) {
3088         return -ENOMEDIUM;
3089     }
3090     if (bdrv_check_byte_request(bs, offset, bytes)) {
3091         return -EIO;
3092     }
3093 
3094     if (bs->copy_on_read) {
3095         flags |= BDRV_REQ_COPY_ON_READ;
3096     }
3097 
3098     /* throttling disk I/O */
3099     if (bs->io_limits_enabled) {
3100         bdrv_io_limits_intercept(bs, bytes, false);
3101     }
3102 
3103     /* Align read if necessary by padding qiov */
3104     if (offset & (align - 1)) {
3105         head_buf = qemu_blockalign(bs, align);
3106         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3107         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3108         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3109         use_local_qiov = true;
3110 
3111         bytes += offset & (align - 1);
3112         offset = offset & ~(align - 1);
3113     }
3114 
3115     if ((offset + bytes) & (align - 1)) {
3116         if (!use_local_qiov) {
3117             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3118             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3119             use_local_qiov = true;
3120         }
3121         tail_buf = qemu_blockalign(bs, align);
3122         qemu_iovec_add(&local_qiov, tail_buf,
3123                        align - ((offset + bytes) & (align - 1)));
3124 
3125         bytes = ROUND_UP(bytes, align);
3126     }
3127 
3128     tracked_request_begin(&req, bs, offset, bytes, false);
3129     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3130                               use_local_qiov ? &local_qiov : qiov,
3131                               flags);
3132     tracked_request_end(&req);
3133 
3134     if (use_local_qiov) {
3135         qemu_iovec_destroy(&local_qiov);
3136         qemu_vfree(head_buf);
3137         qemu_vfree(tail_buf);
3138     }
3139 
3140     return ret;
3141 }
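
/*
 * Worked example for the alignment code above: with a request_alignment of
 * 512 bytes, a 100-byte read at offset 1000 is padded on both sides before
 * being passed to bdrv_aligned_preadv():
 *
 *     head pad = 1000 & 511                = 488 bytes
 *     offset   = 1000 & ~511               = 512
 *     bytes    = ROUND_UP(100 + 488, 512)  = 1024
 *     tail pad = 512 - ((512 + 588) & 511) = 436 bytes
 *
 * The driver thus sees a single aligned 1024-byte read at offset 512, of
 * which only the middle 100 bytes reach the caller's qiov.
 */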
3142 
3143 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3144     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3145     BdrvRequestFlags flags)
3146 {
3147     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3148         return -EINVAL;
3149     }
3150 
3151     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3152                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3153 }
3154 
3155 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3156     int nb_sectors, QEMUIOVector *qiov)
3157 {
3158     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3159 
3160     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3161 }
3162 
3163 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3164     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3165 {
3166     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3167 
3168     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3169                             BDRV_REQ_COPY_ON_READ);
3170 }
3171 
/* If no limit is specified in the BlockLimits, use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
3175 #define MAX_WRITE_ZEROES_DEFAULT 32768
3176 
3177 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3178     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3179 {
3180     BlockDriver *drv = bs->drv;
3181     QEMUIOVector qiov;
3182     struct iovec iov = {0};
3183     int ret = 0;
3184 
3185     int max_write_zeroes = bs->bl.max_write_zeroes ?
3186                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3187 
3188     while (nb_sectors > 0 && !ret) {
3189         int num = nb_sectors;
3190 
3191         /* Align request.  Block drivers can expect the "bulk" of the request
3192          * to be aligned.
3193          */
3194         if (bs->bl.write_zeroes_alignment
3195             && num > bs->bl.write_zeroes_alignment) {
3196             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3197                 /* Make a small request up to the first aligned sector.  */
3198                 num = bs->bl.write_zeroes_alignment;
3199                 num -= sector_num % bs->bl.write_zeroes_alignment;
3200             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3201                 /* Shorten the request to the last aligned sector.  num cannot
3202                  * underflow because num > bs->bl.write_zeroes_alignment.
3203                  */
3204                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3205             }
3206         }
3207 
3208         /* limit request size */
3209         if (num > max_write_zeroes) {
3210             num = max_write_zeroes;
3211         }
3212 
3213         ret = -ENOTSUP;
3214         /* First try the efficient write zeroes operation */
3215         if (drv->bdrv_co_write_zeroes) {
3216             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3217         }
3218 
3219         if (ret == -ENOTSUP) {
3220             /* Fall back to bounce buffer if write zeroes is unsupported */
3221             iov.iov_len = num * BDRV_SECTOR_SIZE;
3222             if (iov.iov_base == NULL) {
3223                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3224                 if (iov.iov_base == NULL) {
3225                     ret = -ENOMEM;
3226                     goto fail;
3227                 }
3228                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3229             }
3230             qemu_iovec_init_external(&qiov, &iov, 1);
3231 
3232             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3233 
            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
3237             if (num < max_write_zeroes) {
3238                 qemu_vfree(iov.iov_base);
3239                 iov.iov_base = NULL;
3240             }
3241         }
3242 
3243         sector_num += num;
3244         nb_sectors -= num;
3245     }
3246 
3247 fail:
3248     qemu_vfree(iov.iov_base);
3249     return ret;
3250 }
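
/*
 * Worked example for the splitting logic above: with
 * bs->bl.write_zeroes_alignment == 8, a request for sectors [5, 25) is
 * issued as three pieces:
 *
 *     [5, 8)    head fragment up to the first aligned sector (num = 3)
 *     [8, 24)   aligned bulk, shortened to the last aligned sector (num = 16)
 *     [24, 25)  tail fragment (num = 1)
 */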
3251 
3252 /*
3253  * Forwards an already correctly aligned write request to the BlockDriver.
3254  */
3255 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3256     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3257     QEMUIOVector *qiov, int flags)
3258 {
3259     BlockDriver *drv = bs->drv;
3260     bool waited;
3261     int ret;
3262 
3263     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3264     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3265 
3266     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3267     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3268     assert(!qiov || bytes == qiov->size);
3269 
3270     waited = wait_serialising_requests(req);
3271     assert(!waited || !req->serialising);
3272     assert(req->overlap_offset <= offset);
3273     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3274 
3275     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3276 
3277     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3278         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3279         qemu_iovec_is_zero(qiov)) {
3280         flags |= BDRV_REQ_ZERO_WRITE;
3281         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3282             flags |= BDRV_REQ_MAY_UNMAP;
3283         }
3284     }
3285 
3286     if (ret < 0) {
3287         /* Do nothing, write notifier decided to fail this request */
3288     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3289         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3290         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3291     } else {
3292         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3293         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3294     }
3295     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3296 
3297     if (ret == 0 && !bs->enable_write_cache) {
3298         ret = bdrv_co_flush(bs);
3299     }
3300 
3301     bdrv_set_dirty(bs, sector_num, nb_sectors);
3302 
3303     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3304 
3305     if (bs->growable && ret >= 0) {
3306         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3307     }
3308 
3309     return ret;
3310 }
3311 
3312 /*
3313  * Handle a write request in coroutine context
3314  */
3315 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3316     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3317     BdrvRequestFlags flags)
3318 {
3319     BdrvTrackedRequest req;
3320     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3321     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3322     uint8_t *head_buf = NULL;
3323     uint8_t *tail_buf = NULL;
3324     QEMUIOVector local_qiov;
3325     bool use_local_qiov = false;
3326     int ret;
3327 
3328     if (!bs->drv) {
3329         return -ENOMEDIUM;
3330     }
3331     if (bs->read_only) {
3332         return -EACCES;
3333     }
3334     if (bdrv_check_byte_request(bs, offset, bytes)) {
3335         return -EIO;
3336     }
3337 
3338     /* throttling disk I/O */
3339     if (bs->io_limits_enabled) {
3340         bdrv_io_limits_intercept(bs, bytes, true);
3341     }
3342 
3343     /*
3344      * Align write if necessary by performing a read-modify-write cycle.
3345      * Pad qiov with the read parts and be sure to have a tracked request not
3346      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3347      */
3348     tracked_request_begin(&req, bs, offset, bytes, true);
3349 
3350     if (offset & (align - 1)) {
3351         QEMUIOVector head_qiov;
3352         struct iovec head_iov;
3353 
3354         mark_request_serialising(&req, align);
3355         wait_serialising_requests(&req);
3356 
3357         head_buf = qemu_blockalign(bs, align);
3358         head_iov = (struct iovec) {
3359             .iov_base   = head_buf,
3360             .iov_len    = align,
3361         };
3362         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3363 
3364         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3365         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3366                                   align, &head_qiov, 0);
3367         if (ret < 0) {
3368             goto fail;
3369         }
3370         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3371 
3372         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3373         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3374         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3375         use_local_qiov = true;
3376 
3377         bytes += offset & (align - 1);
3378         offset = offset & ~(align - 1);
3379     }
3380 
3381     if ((offset + bytes) & (align - 1)) {
3382         QEMUIOVector tail_qiov;
3383         struct iovec tail_iov;
3384         size_t tail_bytes;
3385         bool waited;
3386 
3387         mark_request_serialising(&req, align);
3388         waited = wait_serialising_requests(&req);
3389         assert(!waited || !use_local_qiov);
3390 
3391         tail_buf = qemu_blockalign(bs, align);
3392         tail_iov = (struct iovec) {
3393             .iov_base   = tail_buf,
3394             .iov_len    = align,
3395         };
3396         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3397 
3398         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3399         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3400                                   align, &tail_qiov, 0);
3401         if (ret < 0) {
3402             goto fail;
3403         }
3404         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3405 
3406         if (!use_local_qiov) {
3407             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3408             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3409             use_local_qiov = true;
3410         }
3411 
3412         tail_bytes = (offset + bytes) & (align - 1);
3413         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3414 
3415         bytes = ROUND_UP(bytes, align);
3416     }
3417 
3418     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3419                                use_local_qiov ? &local_qiov : qiov,
3420                                flags);
3421 
3422 fail:
3423     tracked_request_end(&req);
3424 
3425     if (use_local_qiov) {
3426         qemu_iovec_destroy(&local_qiov);
3427     }
3428     qemu_vfree(head_buf);
3429     qemu_vfree(tail_buf);
3430 
3431     return ret;
3432 }
3433 
3434 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3435     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3436     BdrvRequestFlags flags)
3437 {
3438     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3439         return -EINVAL;
3440     }
3441 
3442     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3443                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3444 }
3445 
3446 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3447     int nb_sectors, QEMUIOVector *qiov)
3448 {
3449     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3450 
3451     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3452 }
3453 
3454 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3455                                       int64_t sector_num, int nb_sectors,
3456                                       BdrvRequestFlags flags)
3457 {
3458     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3459 
3460     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3461         flags &= ~BDRV_REQ_MAY_UNMAP;
3462     }
3463 
3464     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3465                              BDRV_REQ_ZERO_WRITE | flags);
3466 }
3467 
3468 /**
3469  * Truncate file to 'offset' bytes (needed only for file protocols)
3470  */
3471 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3472 {
3473     BlockDriver *drv = bs->drv;
3474     int ret;
3475     if (!drv)
3476         return -ENOMEDIUM;
3477     if (!drv->bdrv_truncate)
3478         return -ENOTSUP;
3479     if (bs->read_only)
3480         return -EACCES;
3481 
3482     ret = drv->bdrv_truncate(bs, offset);
3483     if (ret == 0) {
3484         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3485         if (bs->blk) {
3486             blk_dev_resize_cb(bs->blk);
3487         }
3488     }
3489     return ret;
3490 }
3491 
3492 /**
3493  * Length of an allocated file in bytes. Sparse files are counted by their
3494  * actually allocated space. Return < 0 on error or if unknown.
3495  */
3496 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3497 {
3498     BlockDriver *drv = bs->drv;
3499     if (!drv) {
3500         return -ENOMEDIUM;
3501     }
3502     if (drv->bdrv_get_allocated_file_size) {
3503         return drv->bdrv_get_allocated_file_size(bs);
3504     }
3505     if (bs->file) {
3506         return bdrv_get_allocated_file_size(bs->file);
3507     }
3508     return -ENOTSUP;
3509 }
3510 
3511 /**
3512  * Return number of sectors on success, -errno on error.
3513  */
3514 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3515 {
3516     BlockDriver *drv = bs->drv;
3517 
3518     if (!drv)
3519         return -ENOMEDIUM;
3520 
3521     if (drv->has_variable_length) {
3522         int ret = refresh_total_sectors(bs, bs->total_sectors);
3523         if (ret < 0) {
3524             return ret;
3525         }
3526     }
3527     return bs->total_sectors;
3528 }
3529 
3530 /**
3531  * Return length in bytes on success, -errno on error.
3532  * The length is always a multiple of BDRV_SECTOR_SIZE.
3533  */
3534 int64_t bdrv_getlength(BlockDriverState *bs)
3535 {
3536     int64_t ret = bdrv_nb_sectors(bs);
3537 
3538     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3539 }
3540 
3541 /* Return 0 as the number of sectors if no device is present or on error */
3542 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3543 {
3544     int64_t nb_sectors = bdrv_nb_sectors(bs);
3545 
3546     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3547 }
3548 
3549 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3550                        BlockdevOnError on_write_error)
3551 {
3552     bs->on_read_error = on_read_error;
3553     bs->on_write_error = on_write_error;
3554 }
3555 
3556 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3557 {
3558     return is_read ? bs->on_read_error : bs->on_write_error;
3559 }
3560 
3561 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3562 {
3563     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3564 
3565     switch (on_err) {
3566     case BLOCKDEV_ON_ERROR_ENOSPC:
3567         return (error == ENOSPC) ?
3568                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3569     case BLOCKDEV_ON_ERROR_STOP:
3570         return BLOCK_ERROR_ACTION_STOP;
3571     case BLOCKDEV_ON_ERROR_REPORT:
3572         return BLOCK_ERROR_ACTION_REPORT;
3573     case BLOCKDEV_ON_ERROR_IGNORE:
3574         return BLOCK_ERROR_ACTION_IGNORE;
3575     default:
3576         abort();
3577     }
3578 }
3579 
3580 static void send_qmp_error_event(BlockDriverState *bs,
3581                                  BlockErrorAction action,
3582                                  bool is_read, int error)
3583 {
3584     IoOperationType optype;
3585 
3586     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3587     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3588                                    bdrv_iostatus_is_enabled(bs),
3589                                    error == ENOSPC, strerror(error),
3590                                    &error_abort);
3591 }
3592 
3593 /* This is done by device models because, while the block layer knows
3594  * about the error, it does not know whether an operation comes from
3595  * the device or the block layer (from a job, for example).
3596  */
3597 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3598                        bool is_read, int error)
3599 {
3600     assert(error >= 0);
3601 
3602     if (action == BLOCK_ERROR_ACTION_STOP) {
3603         /* First set the iostatus, so that "info block" returns an iostatus
3604          * that matches the events raised so far (an additional error iostatus
3605          * is fine, but not a lost one).
3606          */
3607         bdrv_iostatus_set_err(bs, error);
3608 
3609         /* Then raise the request to stop the VM and the event.
3610          * qemu_system_vmstop_request_prepare has two effects.  First,
3611          * it ensures that the STOP event always comes after the
3612          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3613          * can observe the BLOCK_IO_ERROR event and do a "cont" before the STOP
3614          * event is issued, the VM will not stop.  In this case, vm_start()
3615          * also ensures that the STOP/RESUME pair of events is emitted.
3616          */
3617         qemu_system_vmstop_request_prepare();
3618         send_qmp_error_event(bs, action, is_read, error);
3619         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3620     } else {
3621         send_qmp_error_event(bs, action, is_read, error);
3622     }
3623 }
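/* Usage sketch (editor's note, illustrative only -- the helper name and its
 * surroundings are assumptions, not part of this file): a device model first
 * maps the errno to an action, queues the request for retry if the VM will
 * stop, and then reports the action through bdrv_error_action():
 *
 *     static bool handle_rw_error(BlockDriverState *bs, bool is_read, int error)
 *     {
 *         BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);
 *
 *         if (action == BLOCK_ERROR_ACTION_STOP) {
 *             // keep the request so it can be retried after "cont"
 *         }
 *         bdrv_error_action(bs, action, is_read, error);
 *         return action != BLOCK_ERROR_ACTION_IGNORE;
 *     }
 */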
3624 
3625 int bdrv_is_read_only(BlockDriverState *bs)
3626 {
3627     return bs->read_only;
3628 }
3629 
3630 int bdrv_is_sg(BlockDriverState *bs)
3631 {
3632     return bs->sg;
3633 }
3634 
3635 int bdrv_enable_write_cache(BlockDriverState *bs)
3636 {
3637     return bs->enable_write_cache;
3638 }
3639 
3640 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3641 {
3642     bs->enable_write_cache = wce;
3643 
3644     /* so a reopen() will preserve wce */
3645     if (wce) {
3646         bs->open_flags |= BDRV_O_CACHE_WB;
3647     } else {
3648         bs->open_flags &= ~BDRV_O_CACHE_WB;
3649     }
3650 }
3651 
3652 int bdrv_is_encrypted(BlockDriverState *bs)
3653 {
3654     if (bs->backing_hd && bs->backing_hd->encrypted)
3655         return 1;
3656     return bs->encrypted;
3657 }
3658 
3659 int bdrv_key_required(BlockDriverState *bs)
3660 {
3661     BlockDriverState *backing_hd = bs->backing_hd;
3662 
3663     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3664         return 1;
3665     return (bs->encrypted && !bs->valid_key);
3666 }
3667 
3668 int bdrv_set_key(BlockDriverState *bs, const char *key)
3669 {
3670     int ret;
3671     if (bs->backing_hd && bs->backing_hd->encrypted) {
3672         ret = bdrv_set_key(bs->backing_hd, key);
3673         if (ret < 0)
3674             return ret;
3675         if (!bs->encrypted)
3676             return 0;
3677     }
3678     if (!bs->encrypted) {
3679         return -EINVAL;
3680     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3681         return -ENOMEDIUM;
3682     }
3683     ret = bs->drv->bdrv_set_key(bs, key);
3684     if (ret < 0) {
3685         bs->valid_key = 0;
3686     } else if (!bs->valid_key) {
3687         bs->valid_key = 1;
3688         if (bs->blk) {
3689             /* call the change callback now, we skipped it on open */
3690             blk_dev_change_media_cb(bs->blk, true);
3691         }
3692     }
3693     return ret;
3694 }
3695 
3696 const char *bdrv_get_format_name(BlockDriverState *bs)
3697 {
3698     return bs->drv ? bs->drv->format_name : NULL;
3699 }
3700 
3701 static int qsort_strcmp(const void *a, const void *b)
3702 {
3703     return strcmp(*(char * const *)a, *(char * const *)b);
3704 }
3705 
3706 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3707                          void *opaque)
3708 {
3709     BlockDriver *drv;
3710     int count = 0;
3711     int i;
3712     const char **formats = NULL;
3713 
3714     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3715         if (drv->format_name) {
3716             bool found = false;
3717             int j = count;
3718             while (formats && j && !found) {
3719                 found = !strcmp(formats[--j], drv->format_name);
3720             }
3721 
3722             if (!found) {
3723                 formats = g_renew(const char *, formats, count + 1);
3724                 formats[count++] = drv->format_name;
3725             }
3726         }
3727     }
3728 
3729     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3730 
3731     for (i = 0; i < count; i++) {
3732         it(opaque, formats[i]);
3733     }
3734 
3735     g_free(formats);
3736 }
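/* Usage sketch (editor's note, illustrative only): the callback is invoked
 * once per format name, already de-duplicated and sorted, e.g. to print the
 * list of supported formats:
 *
 *     static void print_format(void *opaque, const char *name)
 *     {
 *         printf(" %s", name);
 *     }
 *
 *     bdrv_iterate_format(print_format, NULL);
 */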
3737 
3738 /* Find the BlockDriverState of the block backend with the given name */
3739 /* TODO convert callers to blk_by_name(), then remove */
3740 BlockDriverState *bdrv_find(const char *name)
3741 {
3742     BlockBackend *blk = blk_by_name(name);
3743 
3744     return blk ? blk_bs(blk) : NULL;
3745 }
3746 
3747 /* Find a node by name in the graph of BlockDriverStates */
3748 BlockDriverState *bdrv_find_node(const char *node_name)
3749 {
3750     BlockDriverState *bs;
3751 
3752     assert(node_name);
3753 
3754     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3755         if (!strcmp(node_name, bs->node_name)) {
3756             return bs;
3757         }
3758     }
3759     return NULL;
3760 }
3761 
3762 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3763 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3764 {
3765     BlockDeviceInfoList *list, *entry;
3766     BlockDriverState *bs;
3767 
3768     list = NULL;
3769     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3770         entry = g_malloc0(sizeof(*entry));
3771         entry->value = bdrv_block_device_info(bs);
3772         entry->next = list;
3773         list = entry;
3774     }
3775 
3776     return list;
3777 }
3778 
3779 BlockDriverState *bdrv_lookup_bs(const char *device,
3780                                  const char *node_name,
3781                                  Error **errp)
3782 {
3783     BlockBackend *blk;
3784     BlockDriverState *bs;
3785 
3786     if (device) {
3787         blk = blk_by_name(device);
3788 
3789         if (blk) {
3790             return blk_bs(blk);
3791         }
3792     }
3793 
3794     if (node_name) {
3795         bs = bdrv_find_node(node_name);
3796 
3797         if (bs) {
3798             return bs;
3799         }
3800     }
3801 
3802     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3803                      device ? device : "",
3804                      node_name ? node_name : "");
3805     return NULL;
3806 }
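/* Usage sketch (editor's note, illustrative only): QMP handlers typically
 * accept either identifier and let bdrv_lookup_bs() resolve it.  Note that
 * the device name is tried first; the node name is only consulted if no
 * backend matched:
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs(has_device ? device : NULL,
 *                                           has_node_name ? node_name : NULL,
 *                                           &local_err);
 *     if (!bs) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */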
3807 
3808 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3809  * return false.  If either argument is NULL, return false. */
3810 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3811 {
3812     while (top && top != base) {
3813         top = top->backing_hd;
3814     }
3815 
3816     return top != NULL;
3817 }
3818 
3819 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3820 {
3821     if (!bs) {
3822         return QTAILQ_FIRST(&graph_bdrv_states);
3823     }
3824     return QTAILQ_NEXT(bs, node_list);
3825 }
3826 
3827 BlockDriverState *bdrv_next(BlockDriverState *bs)
3828 {
3829     if (!bs) {
3830         return QTAILQ_FIRST(&bdrv_states);
3831     }
3832     return QTAILQ_NEXT(bs, device_list);
3833 }
3834 
3835 const char *bdrv_get_node_name(const BlockDriverState *bs)
3836 {
3837     return bs->node_name;
3838 }
3839 
3840 /* TODO check what callers really want: bs->node_name or blk_name() */
3841 const char *bdrv_get_device_name(const BlockDriverState *bs)
3842 {
3843     return bs->blk ? blk_name(bs->blk) : "";
3844 }
3845 
3846 int bdrv_get_flags(BlockDriverState *bs)
3847 {
3848     return bs->open_flags;
3849 }
3850 
3851 int bdrv_flush_all(void)
3852 {
3853     BlockDriverState *bs;
3854     int result = 0;
3855 
3856     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3857         AioContext *aio_context = bdrv_get_aio_context(bs);
3858         int ret;
3859 
3860         aio_context_acquire(aio_context);
3861         ret = bdrv_flush(bs);
3862         if (ret < 0 && !result) {
3863             result = ret;
3864         }
3865         aio_context_release(aio_context);
3866     }
3867 
3868     return result;
3869 }
3870 
3871 int bdrv_has_zero_init_1(BlockDriverState *bs)
3872 {
3873     return 1;
3874 }
3875 
3876 int bdrv_has_zero_init(BlockDriverState *bs)
3877 {
3878     assert(bs->drv);
3879 
3880     /* If BS is a copy-on-write image, it is initialized to
3881        the contents of the base image, which may not be zeroes.  */
3882     if (bs->backing_hd) {
3883         return 0;
3884     }
3885     if (bs->drv->bdrv_has_zero_init) {
3886         return bs->drv->bdrv_has_zero_init(bs);
3887     }
3888 
3889     /* safe default */
3890     return 0;
3891 }
3892 
3893 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3894 {
3895     BlockDriverInfo bdi;
3896 
3897     if (bs->backing_hd) {
3898         return false;
3899     }
3900 
3901     if (bdrv_get_info(bs, &bdi) == 0) {
3902         return bdi.unallocated_blocks_are_zero;
3903     }
3904 
3905     return false;
3906 }
3907 
3908 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3909 {
3910     BlockDriverInfo bdi;
3911 
3912     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3913         return false;
3914     }
3915 
3916     if (bdrv_get_info(bs, &bdi) == 0) {
3917         return bdi.can_write_zeroes_with_unmap;
3918     }
3919 
3920     return false;
3921 }
3922 
3923 typedef struct BdrvCoGetBlockStatusData {
3924     BlockDriverState *bs;
3925     BlockDriverState *base;
3926     int64_t sector_num;
3927     int nb_sectors;
3928     int *pnum;
3929     int64_t ret;
3930     bool done;
3931 } BdrvCoGetBlockStatusData;
3932 
3933 /*
3934  * Returns the allocation status of the specified sectors.
3935  * Drivers not implementing the functionality are assumed to not support
3936  * backing files, hence all their sectors are reported as allocated.
3937  *
3938  * If 'sector_num' is beyond the end of the disk image the return value is 0
3939  * and 'pnum' is set to 0.
3940  *
3941  * 'pnum' is set to the number of sectors (including and immediately following
3942  * the specified sector) that are known to be in the same
3943  * allocated/unallocated state.
3944  *
3945  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3946  * beyond the end of the disk image it will be clamped.
3947  */
3948 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3949                                                      int64_t sector_num,
3950                                                      int nb_sectors, int *pnum)
3951 {
3952     int64_t total_sectors;
3953     int64_t n;
3954     int64_t ret, ret2;
3955 
3956     total_sectors = bdrv_nb_sectors(bs);
3957     if (total_sectors < 0) {
3958         return total_sectors;
3959     }
3960 
3961     if (sector_num >= total_sectors) {
3962         *pnum = 0;
3963         return 0;
3964     }
3965 
3966     n = total_sectors - sector_num;
3967     if (n < nb_sectors) {
3968         nb_sectors = n;
3969     }
3970 
3971     if (!bs->drv->bdrv_co_get_block_status) {
3972         *pnum = nb_sectors;
3973         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3974         if (bs->drv->protocol_name) {
3975             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3976         }
3977         return ret;
3978     }
3979 
3980     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3981     if (ret < 0) {
3982         *pnum = 0;
3983         return ret;
3984     }
3985 
3986     if (ret & BDRV_BLOCK_RAW) {
3987         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3988         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3989                                      *pnum, pnum);
3990     }
3991 
3992     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3993         ret |= BDRV_BLOCK_ALLOCATED;
3994     }
3995 
3996     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3997         if (bdrv_unallocated_blocks_are_zero(bs)) {
3998             ret |= BDRV_BLOCK_ZERO;
3999         } else if (bs->backing_hd) {
4000             BlockDriverState *bs2 = bs->backing_hd;
4001             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4002             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4003                 ret |= BDRV_BLOCK_ZERO;
4004             }
4005         }
4006     }
4007 
4008     if (bs->file &&
4009         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4010         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4011         int file_pnum;
4012 
4013         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4014                                         *pnum, &file_pnum);
4015         if (ret2 >= 0) {
4016             /* Ignore errors.  This just provides extra information; it
4017              * is useful but not necessary.
4018              */
4019             if (!file_pnum) {
4020                 /* !file_pnum indicates an offset at or beyond the EOF; it is
4021                  * perfectly valid for the format block driver to point to such
4022                  * offsets, so catch it and mark everything as zero */
4023                 ret |= BDRV_BLOCK_ZERO;
4024             } else {
4025                 /* Limit request to the range reported by the protocol driver */
4026                 *pnum = file_pnum;
4027                 ret |= (ret2 & BDRV_BLOCK_ZERO);
4028             }
4029         }
4030     }
4031 
4032     return ret;
4033 }
4034 
4035 /* Coroutine wrapper for bdrv_get_block_status() */
4036 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4037 {
4038     BdrvCoGetBlockStatusData *data = opaque;
4039     BlockDriverState *bs = data->bs;
4040 
4041     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4042                                          data->pnum);
4043     data->done = true;
4044 }
4045 
4046 /*
4047  * Synchronous wrapper around bdrv_co_get_block_status().
4048  *
4049  * See bdrv_co_get_block_status() for details.
4050  */
4051 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4052                               int nb_sectors, int *pnum)
4053 {
4054     Coroutine *co;
4055     BdrvCoGetBlockStatusData data = {
4056         .bs = bs,
4057         .sector_num = sector_num,
4058         .nb_sectors = nb_sectors,
4059         .pnum = pnum,
4060         .done = false,
4061     };
4062 
4063     if (qemu_in_coroutine()) {
4064         /* Fast-path if already in coroutine context */
4065         bdrv_get_block_status_co_entry(&data);
4066     } else {
4067         AioContext *aio_context = bdrv_get_aio_context(bs);
4068 
4069         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4070         qemu_coroutine_enter(co, &data);
4071         while (!data.done) {
4072             aio_poll(aio_context, true);
4073         }
4074     }
4075     return data.ret;
4076 }
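/* Usage sketch (editor's note, illustrative only): 'pnum' lets callers walk
 * an image in maximal runs of uniform allocation state, as mapping code in
 * the style of qemu-img might do:
 *
 *     int64_t sector = 0, total = bdrv_nb_sectors(bs);
 *     while (sector < total) {
 *         int n;
 *         int64_t ret = bdrv_get_block_status(bs, sector,
 *                                             MIN(total - sector, INT_MAX),
 *                                             &n);
 *         if (ret < 0 || n == 0) {
 *             break;
 *         }
 *         // ret & BDRV_BLOCK_ZERO:      the run reads as zeroes
 *         // ret & BDRV_BLOCK_ALLOCATED: the run is allocated in this layer
 *         sector += n;
 *     }
 */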
4077 
4078 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4079                                    int nb_sectors, int *pnum)
4080 {
4081     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4082     if (ret < 0) {
4083         return ret;
4084     }
4085     return !!(ret & BDRV_BLOCK_ALLOCATED);
4086 }
4087 
4088 /*
4089  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4090  *
4091  * Return true if the given sector is allocated in any image between
4092  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4093  * sector is allocated in any image of the chain.  Return false otherwise.
4094  *
4095  * 'pnum' is set to the number of sectors (including and immediately following
4096  *  the specified sector) that are known to be in the same
4097  *  allocated/unallocated state.
4098  *
4099  */
4100 int bdrv_is_allocated_above(BlockDriverState *top,
4101                             BlockDriverState *base,
4102                             int64_t sector_num,
4103                             int nb_sectors, int *pnum)
4104 {
4105     BlockDriverState *intermediate;
4106     int ret, n = nb_sectors;
4107 
4108     intermediate = top;
4109     while (intermediate && intermediate != base) {
4110         int pnum_inter;
4111         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4112                                 &pnum_inter);
4113         if (ret < 0) {
4114             return ret;
4115         } else if (ret) {
4116             *pnum = pnum_inter;
4117             return 1;
4118         }
4119 
4120         /*
4121          * [sector_num, nb_sectors] is unallocated on top but an intermediate
4122          * image might have
4123          *
4124          * [sector_num+x, nb_sectors] allocated.
4125          */
4126         if (n > pnum_inter &&
4127             (intermediate == top ||
4128              sector_num + pnum_inter < intermediate->total_sectors)) {
4129             n = pnum_inter;
4130         }
4131 
4132         intermediate = intermediate->backing_hd;
4133     }
4134 
4135     *pnum = n;
4136     return 0;
4137 }
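/* Usage sketch (editor's note, illustrative only): commit/stream style code
 * can use the result to decide whether a run of sectors must be copied from
 * the TOP..BASE sub-chain:
 *
 *     int n;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors, &n);
 *     if (ret < 0) {
 *         return ret;
 *     } else if (ret) {
 *         // copy [sector_num, sector_num + n) down from top's chain
 *     }
 *     sector_num += n;
 */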
4138 
4139 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4140 {
4141     if (bs->backing_hd && bs->backing_hd->encrypted)
4142         return bs->backing_file;
4143     else if (bs->encrypted)
4144         return bs->filename;
4145     else
4146         return NULL;
4147 }
4148 
4149 void bdrv_get_backing_filename(BlockDriverState *bs,
4150                                char *filename, int filename_size)
4151 {
4152     pstrcpy(filename, filename_size, bs->backing_file);
4153 }
4154 
4155 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4156                           const uint8_t *buf, int nb_sectors)
4157 {
4158     BlockDriver *drv = bs->drv;
4159     if (!drv)
4160         return -ENOMEDIUM;
4161     if (!drv->bdrv_write_compressed)
4162         return -ENOTSUP;
4163     if (bdrv_check_request(bs, sector_num, nb_sectors))
4164         return -EIO;
4165 
4166     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4167 
4168     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4169 }
4170 
4171 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4172 {
4173     BlockDriver *drv = bs->drv;
4174     if (!drv)
4175         return -ENOMEDIUM;
4176     if (!drv->bdrv_get_info)
4177         return -ENOTSUP;
4178     memset(bdi, 0, sizeof(*bdi));
4179     return drv->bdrv_get_info(bs, bdi);
4180 }
4181 
4182 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4183 {
4184     BlockDriver *drv = bs->drv;
4185     if (drv && drv->bdrv_get_specific_info) {
4186         return drv->bdrv_get_specific_info(bs);
4187     }
4188     return NULL;
4189 }
4190 
4191 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4192                       int64_t pos, int size)
4193 {
4194     QEMUIOVector qiov;
4195     struct iovec iov = {
4196         .iov_base   = (void *) buf,
4197         .iov_len    = size,
4198     };
4199 
4200     qemu_iovec_init_external(&qiov, &iov, 1);
4201     return bdrv_writev_vmstate(bs, &qiov, pos);
4202 }
4203 
4204 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4205 {
4206     BlockDriver *drv = bs->drv;
4207 
4208     if (!drv) {
4209         return -ENOMEDIUM;
4210     } else if (drv->bdrv_save_vmstate) {
4211         return drv->bdrv_save_vmstate(bs, qiov, pos);
4212     } else if (bs->file) {
4213         return bdrv_writev_vmstate(bs->file, qiov, pos);
4214     }
4215 
4216     return -ENOTSUP;
4217 }
4218 
4219 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4220                       int64_t pos, int size)
4221 {
4222     BlockDriver *drv = bs->drv;
4223     if (!drv)
4224         return -ENOMEDIUM;
4225     if (drv->bdrv_load_vmstate)
4226         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4227     if (bs->file)
4228         return bdrv_load_vmstate(bs->file, buf, pos, size);
4229     return -ENOTSUP;
4230 }
4231 
4232 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4233 {
4234     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4235         return;
4236     }
4237 
4238     bs->drv->bdrv_debug_event(bs, event);
4239 }
4240 
4241 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4242                           const char *tag)
4243 {
4244     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4245         bs = bs->file;
4246     }
4247 
4248     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4249         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4250     }
4251 
4252     return -ENOTSUP;
4253 }
4254 
4255 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4256 {
4257     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4258         bs = bs->file;
4259     }
4260 
4261     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4262         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4263     }
4264 
4265     return -ENOTSUP;
4266 }
4267 
4268 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4269 {
4270     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4271         bs = bs->file;
4272     }
4273 
4274     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4275         return bs->drv->bdrv_debug_resume(bs, tag);
4276     }
4277 
4278     return -ENOTSUP;
4279 }
4280 
4281 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4282 {
4283     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4284         bs = bs->file;
4285     }
4286 
4287     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4288         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4289     }
4290 
4291     return false;
4292 }
4293 
4294 int bdrv_is_snapshot(BlockDriverState *bs)
4295 {
4296     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4297 }
4298 
4299 /* backing_file can be relative, absolute, or a protocol.  If it is
4300  * relative, it must be relative to the chain.  So, passing in bs->filename
4301  * from a BDS as backing_file should not be done, as that may be relative to
4302  * the CWD rather than the chain. */
4303 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4304         const char *backing_file)
4305 {
4306     char *filename_full = NULL;
4307     char *backing_file_full = NULL;
4308     char *filename_tmp = NULL;
4309     int is_protocol = 0;
4310     BlockDriverState *curr_bs = NULL;
4311     BlockDriverState *retval = NULL;
4312 
4313     if (!bs || !bs->drv || !backing_file) {
4314         return NULL;
4315     }
4316 
4317     filename_full     = g_malloc(PATH_MAX);
4318     backing_file_full = g_malloc(PATH_MAX);
4319     filename_tmp      = g_malloc(PATH_MAX);
4320 
4321     is_protocol = path_has_protocol(backing_file);
4322 
4323     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4324 
4325         /* If either of the filename paths is actually a protocol, then
4326          * compare unmodified paths; otherwise make paths relative */
4327         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4328             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4329                 retval = curr_bs->backing_hd;
4330                 break;
4331             }
4332         } else {
4333             /* If not an absolute filename path, make it relative to the current
4334              * image's filename path */
4335             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4336                          backing_file);
4337 
4338             /* We are going to compare absolute pathnames */
4339             if (!realpath(filename_tmp, filename_full)) {
4340                 continue;
4341             }
4342 
4343             /* We need to make sure the backing filename we are comparing against
4344              * is relative to the current image filename (or absolute) */
4345             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4346                          curr_bs->backing_file);
4347 
4348             if (!realpath(filename_tmp, backing_file_full)) {
4349                 continue;
4350             }
4351 
4352             if (strcmp(backing_file_full, filename_full) == 0) {
4353                 retval = curr_bs->backing_hd;
4354                 break;
4355             }
4356         }
4357     }
4358 
4359     g_free(filename_full);
4360     g_free(backing_file_full);
4361     g_free(filename_tmp);
4362     return retval;
4363 }
4364 
4365 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4366 {
4367     if (!bs->drv) {
4368         return 0;
4369     }
4370 
4371     if (!bs->backing_hd) {
4372         return 0;
4373     }
4374 
4375     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4376 }
4377 
4378 /**************************************************************/
4379 /* async I/Os */
4380 
4381 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4382                            QEMUIOVector *qiov, int nb_sectors,
4383                            BlockCompletionFunc *cb, void *opaque)
4384 {
4385     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4386 
4387     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4388                                  cb, opaque, false);
4389 }
4390 
4391 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4392                             QEMUIOVector *qiov, int nb_sectors,
4393                             BlockCompletionFunc *cb, void *opaque)
4394 {
4395     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4396 
4397     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4398                                  cb, opaque, true);
4399 }
4400 
4401 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4402         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4403         BlockCompletionFunc *cb, void *opaque)
4404 {
4405     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4406 
4407     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4408                                  BDRV_REQ_ZERO_WRITE | flags,
4409                                  cb, opaque, true);
4410 }
4411 
4412 
4413 typedef struct MultiwriteCB {
4414     int error;
4415     int num_requests;
4416     int num_callbacks;
4417     struct {
4418         BlockCompletionFunc *cb;
4419         void *opaque;
4420         QEMUIOVector *free_qiov;
4421     } callbacks[];
4422 } MultiwriteCB;
4423 
4424 static void multiwrite_user_cb(MultiwriteCB *mcb)
4425 {
4426     int i;
4427 
4428     for (i = 0; i < mcb->num_callbacks; i++) {
4429         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4430         if (mcb->callbacks[i].free_qiov) {
4431             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4432         }
4433         g_free(mcb->callbacks[i].free_qiov);
4434     }
4435 }
4436 
4437 static void multiwrite_cb(void *opaque, int ret)
4438 {
4439     MultiwriteCB *mcb = opaque;
4440 
4441     trace_multiwrite_cb(mcb, ret);
4442 
4443     if (ret < 0 && !mcb->error) {
4444         mcb->error = ret;
4445     }
4446 
4447     mcb->num_requests--;
4448     if (mcb->num_requests == 0) {
4449         multiwrite_user_cb(mcb);
4450         g_free(mcb);
4451     }
4452 }
4453 
4454 static int multiwrite_req_compare(const void *a, const void *b)
4455 {
4456     const BlockRequest *req1 = a, *req2 = b;
4457 
4458     /*
4459      * Note that we can't simply subtract req2->sector from req1->sector
4460      * here as that could overflow the return value.
4461      */
4462     if (req1->sector > req2->sector) {
4463         return 1;
4464     } else if (req1->sector < req2->sector) {
4465         return -1;
4466     } else {
4467         return 0;
4468     }
4469 }
4470 
4471 /*
4472  * Takes a bunch of requests and tries to merge them. Returns the number of
4473  * requests that remain after merging.
4474  */
4475 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4476     int num_reqs, MultiwriteCB *mcb)
4477 {
4478     int i, outidx;
4479 
4480     // Sort requests by start sector
4481     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4482 
4483     // Check if adjacent requests touch the same clusters. If so, combine them,
4484     // filling up gaps with zero sectors.
4485     outidx = 0;
4486     for (i = 1; i < num_reqs; i++) {
4487         int merge = 0;
4488         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4489 
4490         // Handle exactly sequential writes and overlapping writes.
4491         if (reqs[i].sector <= oldreq_last) {
4492             merge = 1;
4493         }
4494 
4495         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4496             merge = 0;
4497         }
4498 
4499         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4500             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4501             merge = 0;
4502         }
4503 
4504         if (merge) {
4505             size_t size;
4506             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4507             qemu_iovec_init(qiov,
4508                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4509 
4510             // Add the first request to the merged one. If the requests are
4511             // overlapping, drop the last sectors of the first request.
4512             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4513             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4514 
4515             // We shouldn't need to add any zeros between the two requests
4516             assert(reqs[i].sector <= oldreq_last);
4517 
4518             // Add the second request
4519             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4520 
4521             // Add tail of first request, if necessary
4522             if (qiov->size < reqs[outidx].qiov->size) {
4523                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4524                                   reqs[outidx].qiov->size - qiov->size);
4525             }
4526 
4527             reqs[outidx].nb_sectors = qiov->size >> 9;
4528             reqs[outidx].qiov = qiov;
4529 
4530             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4531         } else {
4532             outidx++;
4533             reqs[outidx].sector     = reqs[i].sector;
4534             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4535             reqs[outidx].qiov       = reqs[i].qiov;
4536         }
4537     }
4538 
4539     return outidx + 1;
4540 }
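/* Worked example (editor's note): after sorting, a request for sectors
 * [0, 8) followed by one for [8, 16) is exactly sequential (8 <= 8), so both
 * are combined into a single 16-sector request.  For an overlapping pair
 * [0, 8) and [4, 12), only the first 4 sectors of the first qiov are kept
 * before the second request's 8 sectors are appended, yielding one request
 * covering [0, 12).
 */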
4541 
4542 /*
4543  * Submit multiple AIO write requests at once.
4544  *
4545  * On success, the function returns 0 and all requests in the reqs array have
4546  * been submitted. On error, this function returns -1, and each of the
4547  * requests may or may not have been submitted yet. This means that the
4548  * callback will be called for some of the requests, for others it won't. The
4549  * caller must check the error field of the BlockRequest to wait for the right
4550  * callbacks (if error != 0, no callback will be called).
4551  *
4552  * The implementation may modify the contents of the reqs array, e.g. to merge
4553  * requests. However, the fields opaque and error are left unmodified as they
4554  * are used to signal failure for a single request to the caller.
4555  */
4556 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4557 {
4558     MultiwriteCB *mcb;
4559     int i;
4560 
4561     /* don't submit writes if we don't have a medium */
4562     if (bs->drv == NULL) {
4563         for (i = 0; i < num_reqs; i++) {
4564             reqs[i].error = -ENOMEDIUM;
4565         }
4566         return -1;
4567     }
4568 
4569     if (num_reqs == 0) {
4570         return 0;
4571     }
4572 
4573     // Create MultiwriteCB structure
4574     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4575     mcb->num_requests = 0;
4576     mcb->num_callbacks = num_reqs;
4577 
4578     for (i = 0; i < num_reqs; i++) {
4579         mcb->callbacks[i].cb = reqs[i].cb;
4580         mcb->callbacks[i].opaque = reqs[i].opaque;
4581     }
4582 
4583     // Check for mergeable requests
4584     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4585 
4586     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4587 
4588     /* Run the aio requests. */
4589     mcb->num_requests = num_reqs;
4590     for (i = 0; i < num_reqs; i++) {
4591         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4592                               reqs[i].nb_sectors, reqs[i].flags,
4593                               multiwrite_cb, mcb,
4594                               true);
4595     }
4596 
4597     return 0;
4598 }
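/* Usage sketch (editor's note, illustrative only -- qiov0/qiov1, write_done
 * and the opaque pointers are assumptions): callers fill one BlockRequest
 * per write and check each request's error field when the submission as a
 * whole fails:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = write_done, .opaque = req0 },
 *         { .sector = 16, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = write_done, .opaque = req1 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // only requests with reqs[i].error == 0 will see a callback
 *     }
 */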
4599 
4600 void bdrv_aio_cancel(BlockAIOCB *acb)
4601 {
4602     qemu_aio_ref(acb);
4603     bdrv_aio_cancel_async(acb);
4604     while (acb->refcnt > 1) {
4605         if (acb->aiocb_info->get_aio_context) {
4606             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4607         } else if (acb->bs) {
4608             aio_poll(bdrv_get_aio_context(acb->bs), true);
4609         } else {
4610             abort();
4611         }
4612     }
4613     qemu_aio_unref(acb);
4614 }
4615 
4616 /* Async version of aio cancel. The caller is not blocked if the acb implements
4617  * cancel_async; otherwise we do nothing and let the request complete normally.
4618  * In either case the completion callback must be called. */
4619 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4620 {
4621     if (acb->aiocb_info->cancel_async) {
4622         acb->aiocb_info->cancel_async(acb);
4623     }
4624 }
4625 
4626 /**************************************************************/
4627 /* async block device emulation */
4628 
4629 typedef struct BlockAIOCBSync {
4630     BlockAIOCB common;
4631     QEMUBH *bh;
4632     int ret;
4633     /* vector translation state */
4634     QEMUIOVector *qiov;
4635     uint8_t *bounce;
4636     int is_write;
4637 } BlockAIOCBSync;
4638 
4639 static const AIOCBInfo bdrv_em_aiocb_info = {
4640     .aiocb_size         = sizeof(BlockAIOCBSync),
4641 };
4642 
4643 static void bdrv_aio_bh_cb(void *opaque)
4644 {
4645     BlockAIOCBSync *acb = opaque;
4646 
4647     if (!acb->is_write && acb->ret >= 0) {
4648         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4649     }
4650     qemu_vfree(acb->bounce);
4651     acb->common.cb(acb->common.opaque, acb->ret);
4652     qemu_bh_delete(acb->bh);
4653     acb->bh = NULL;
4654     qemu_aio_unref(acb);
4655 }
4656 
4657 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4658                                       int64_t sector_num,
4659                                       QEMUIOVector *qiov,
4660                                       int nb_sectors,
4661                                       BlockCompletionFunc *cb,
4662                                       void *opaque,
4663                                       int is_write)
4665 {
4666     BlockAIOCBSync *acb;
4667 
4668     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4669     acb->is_write = is_write;
4670     acb->qiov = qiov;
4671     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4672     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4673 
4674     if (acb->bounce == NULL) {
4675         acb->ret = -ENOMEM;
4676     } else if (is_write) {
4677         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4678         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4679     } else {
4680         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4681     }
4682 
4683     qemu_bh_schedule(acb->bh);
4684 
4685     return &acb->common;
4686 }
4687 
4688 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4689         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4690         BlockCompletionFunc *cb, void *opaque)
4691 {
4692     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4693 }
4694 
4695 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4696         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4697         BlockCompletionFunc *cb, void *opaque)
4698 {
4699     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4700 }
4701 
4702 
4703 typedef struct BlockAIOCBCoroutine {
4704     BlockAIOCB common;
4705     BlockRequest req;
4706     bool is_write;
4707     bool *done;
4708     QEMUBH* bh;
4709 } BlockAIOCBCoroutine;
4710 
4711 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4712     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
4713 };
4714 
4715 static void bdrv_co_em_bh(void *opaque)
4716 {
4717     BlockAIOCBCoroutine *acb = opaque;
4718 
4719     acb->common.cb(acb->common.opaque, acb->req.error);
4720 
4721     qemu_bh_delete(acb->bh);
4722     qemu_aio_unref(acb);
4723 }
4724 
4725 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4726 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4727 {
4728     BlockAIOCBCoroutine *acb = opaque;
4729     BlockDriverState *bs = acb->common.bs;
4730 
4731     if (!acb->is_write) {
4732         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4733             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4734     } else {
4735         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4736             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4737     }
4738 
4739     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4740     qemu_bh_schedule(acb->bh);
4741 }
4742 
4743 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4744                                          int64_t sector_num,
4745                                          QEMUIOVector *qiov,
4746                                          int nb_sectors,
4747                                          BdrvRequestFlags flags,
4748                                          BlockCompletionFunc *cb,
4749                                          void *opaque,
4750                                          bool is_write)
4751 {
4752     Coroutine *co;
4753     BlockAIOCBCoroutine *acb;
4754 
4755     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4756     acb->req.sector = sector_num;
4757     acb->req.nb_sectors = nb_sectors;
4758     acb->req.qiov = qiov;
4759     acb->req.flags = flags;
4760     acb->is_write = is_write;
4761 
4762     co = qemu_coroutine_create(bdrv_co_do_rw);
4763     qemu_coroutine_enter(co, acb);
4764 
4765     return &acb->common;
4766 }
4767 
4768 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4769 {
4770     BlockAIOCBCoroutine *acb = opaque;
4771     BlockDriverState *bs = acb->common.bs;
4772 
4773     acb->req.error = bdrv_co_flush(bs);
4774     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4775     qemu_bh_schedule(acb->bh);
4776 }
4777 
4778 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4779         BlockCompletionFunc *cb, void *opaque)
4780 {
4781     Coroutine *co;
4782     BlockAIOCBCoroutine *acb;
4783 
4784     trace_bdrv_aio_flush(bs, opaque);
4785 
4786     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4787 
4788     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4789     qemu_coroutine_enter(co, acb);
4790 
4791     return &acb->common;
4792 }
4793 
4794 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4795 {
4796     BlockAIOCBCoroutine *acb = opaque;
4797     BlockDriverState *bs = acb->common.bs;
4798 
4799     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4800     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4801     qemu_bh_schedule(acb->bh);
4802 }
4803 
4804 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4805         int64_t sector_num, int nb_sectors,
4806         BlockCompletionFunc *cb, void *opaque)
4807 {
4808     Coroutine *co;
4809     BlockAIOCBCoroutine *acb;
4810 
4811     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4812 
4813     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4814     acb->req.sector = sector_num;
4815     acb->req.nb_sectors = nb_sectors;
4816     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4817     qemu_coroutine_enter(co, acb);
4818 
4819     return &acb->common;
4820 }
4821 
4822 void bdrv_init(void)
4823 {
4824     module_call_init(MODULE_INIT_BLOCK);
4825 }
4826 
4827 void bdrv_init_with_whitelist(void)
4828 {
4829     use_bdrv_whitelist = 1;
4830     bdrv_init();
4831 }
4832 
4833 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4834                    BlockCompletionFunc *cb, void *opaque)
4835 {
4836     BlockAIOCB *acb;
4837 
4838     acb = g_slice_alloc(aiocb_info->aiocb_size);
4839     acb->aiocb_info = aiocb_info;
4840     acb->bs = bs;
4841     acb->cb = cb;
4842     acb->opaque = opaque;
4843     acb->refcnt = 1;
4844     return acb;
4845 }
4846 
4847 void qemu_aio_ref(void *p)
4848 {
4849     BlockAIOCB *acb = p;
4850     acb->refcnt++;
4851 }
4852 
4853 void qemu_aio_unref(void *p)
4854 {
4855     BlockAIOCB *acb = p;
4856     assert(acb->refcnt > 0);
4857     if (--acb->refcnt == 0) {
4858         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4859     }
4860 }
4861 
4862 /**************************************************************/
4863 /* Coroutine block device emulation */
4864 
4865 typedef struct CoroutineIOCompletion {
4866     Coroutine *coroutine;
4867     int ret;
4868 } CoroutineIOCompletion;
4869 
4870 static void bdrv_co_io_em_complete(void *opaque, int ret)
4871 {
4872     CoroutineIOCompletion *co = opaque;
4873 
4874     co->ret = ret;
4875     qemu_coroutine_enter(co->coroutine, NULL);
4876 }
4877 
4878 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4879                                       int nb_sectors, QEMUIOVector *iov,
4880                                       bool is_write)
4881 {
4882     CoroutineIOCompletion co = {
4883         .coroutine = qemu_coroutine_self(),
4884     };
4885     BlockAIOCB *acb;
4886 
4887     if (is_write) {
4888         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4889                                        bdrv_co_io_em_complete, &co);
4890     } else {
4891         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4892                                       bdrv_co_io_em_complete, &co);
4893     }
4894 
4895     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4896     if (!acb) {
4897         return -EIO;
4898     }
4899     qemu_coroutine_yield();
4900 
4901     return co.ret;
4902 }
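/* Editor's note: this is the standard aio-to-coroutine bridge -- the
 * coroutine submits the AIO request with bdrv_co_io_em_complete() as its
 * callback and yields; the callback stores the return value and re-enters
 * the coroutine, which then resumes after qemu_coroutine_yield().  The same
 * pattern recurs below in bdrv_co_flush() and bdrv_co_discard(). */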
4903 
4904 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4905                                          int64_t sector_num, int nb_sectors,
4906                                          QEMUIOVector *iov)
4907 {
4908     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4909 }
4910 
4911 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4912                                          int64_t sector_num, int nb_sectors,
4913                                          QEMUIOVector *iov)
4914 {
4915     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4916 }
4917 
4918 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4919 {
4920     RwCo *rwco = opaque;
4921 
4922     rwco->ret = bdrv_co_flush(rwco->bs);
4923 }
4924 
4925 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4926 {
4927     int ret;
4928 
4929     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4930         return 0;
4931     }
4932 
4933     /* Write back cached data to the OS even with cache=unsafe */
4934     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4935     if (bs->drv->bdrv_co_flush_to_os) {
4936         ret = bs->drv->bdrv_co_flush_to_os(bs);
4937         if (ret < 0) {
4938             return ret;
4939         }
4940     }
4941 
4942     /* But don't actually force it to the disk with cache=unsafe */
4943     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4944         goto flush_parent;
4945     }
4946 
4947     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4948     if (bs->drv->bdrv_co_flush_to_disk) {
4949         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4950     } else if (bs->drv->bdrv_aio_flush) {
4951         BlockAIOCB *acb;
4952         CoroutineIOCompletion co = {
4953             .coroutine = qemu_coroutine_self(),
4954         };
4955 
4956         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4957         if (acb == NULL) {
4958             ret = -EIO;
4959         } else {
4960             qemu_coroutine_yield();
4961             ret = co.ret;
4962         }
4963     } else {
4964         /*
4965          * Some block drivers always operate in either writethrough or unsafe
4966          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4967          * know how the server works (because the behaviour is hardcoded or
4968          * depends on server-side configuration), so we can't ensure that
4969          * everything is safe on disk. Returning an error doesn't work because
4970          * that would break guests even if the server operates in writethrough
4971          * mode.
4972          *
4973          * Let's hope the user knows what he's doing.
4974          */
4975         ret = 0;
4976     }
4977     if (ret < 0) {
4978         return ret;
4979     }
4980 
4981     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4982      * in the case of cache=unsafe, so there are no useless flushes.
4983      */
4984 flush_parent:
4985     return bdrv_co_flush(bs->file);
4986 }
4987 
4988 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4989 {
4990     Error *local_err = NULL;
4991     int ret;
4992 
4993     if (!bs->drv) {
4994         return;
4995     }
4996 
4997     if (!(bs->open_flags & BDRV_O_INCOMING)) {
4998         return;
4999     }
5000     bs->open_flags &= ~BDRV_O_INCOMING;
5001 
5002     if (bs->drv->bdrv_invalidate_cache) {
5003         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5004     } else if (bs->file) {
5005         bdrv_invalidate_cache(bs->file, &local_err);
5006     }
5007     if (local_err) {
5008         error_propagate(errp, local_err);
5009         return;
5010     }
5011 
5012     ret = refresh_total_sectors(bs, bs->total_sectors);
5013     if (ret < 0) {
5014         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5015         return;
5016     }
5017 }
5018 
5019 void bdrv_invalidate_cache_all(Error **errp)
5020 {
5021     BlockDriverState *bs;
5022     Error *local_err = NULL;
5023 
5024     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5025         AioContext *aio_context = bdrv_get_aio_context(bs);
5026 
5027         aio_context_acquire(aio_context);
5028         bdrv_invalidate_cache(bs, &local_err);
5029         aio_context_release(aio_context);
5030         if (local_err) {
5031             error_propagate(errp, local_err);
5032             return;
5033         }
5034     }
5035 }
5036 
5037 int bdrv_flush(BlockDriverState *bs)
5038 {
5039     Coroutine *co;
5040     RwCo rwco = {
5041         .bs = bs,
5042         .ret = NOT_DONE,
5043     };
5044 
5045     if (qemu_in_coroutine()) {
5046         /* Fast-path if already in coroutine context */
5047         bdrv_flush_co_entry(&rwco);
5048     } else {
5049         AioContext *aio_context = bdrv_get_aio_context(bs);
5050 
5051         co = qemu_coroutine_create(bdrv_flush_co_entry);
5052         qemu_coroutine_enter(co, &rwco);
5053         while (rwco.ret == NOT_DONE) {
5054             aio_poll(aio_context, true);
5055         }
5056     }
5057 
5058     return rwco.ret;
5059 }
5060 
5061 typedef struct DiscardCo {
5062     BlockDriverState *bs;
5063     int64_t sector_num;
5064     int nb_sectors;
5065     int ret;
5066 } DiscardCo;
5067 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5068 {
5069     DiscardCo *rwco = opaque;
5070 
5071     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5072 }
5073 
5074 /* if no limit is specified in the BlockLimits use a default
5075  * of 32768 512-byte sectors (16 MiB) per request.
5076  */
5077 #define MAX_DISCARD_DEFAULT 32768
5078 
5079 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5080                                  int nb_sectors)
5081 {
5082     int max_discard;
5083 
5084     if (!bs->drv) {
5085         return -ENOMEDIUM;
5086     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5087         return -EIO;
5088     } else if (bs->read_only) {
5089         return -EROFS;
5090     }
5091 
5092     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5093 
5094     /* Do nothing if disabled.  */
5095     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5096         return 0;
5097     }
5098 
5099     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5100         return 0;
5101     }
5102 
5103     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5104     while (nb_sectors > 0) {
5105         int ret;
5106         int num = nb_sectors;
5107 
5108         /* align request */
5109         if (bs->bl.discard_alignment &&
5110             num >= bs->bl.discard_alignment &&
5111             sector_num % bs->bl.discard_alignment) {
5112             if (num > bs->bl.discard_alignment) {
5113                 num = bs->bl.discard_alignment;
5114             }
5115             num -= sector_num % bs->bl.discard_alignment;
5116         }
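
        /* Worked example (assumed values): with discard_alignment == 8,
         * sector_num == 5 and num == 20, num is first clamped to 8 and then
         * reduced by 5 % 8 == 5, so this iteration discards 3 sectors and
         * the next iteration starts aligned at sector 8. */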
5117 
5118         /* limit request size */
5119         if (num > max_discard) {
5120             num = max_discard;
5121         }
5122 
5123         if (bs->drv->bdrv_co_discard) {
5124             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5125         } else {
5126             BlockAIOCB *acb;
5127             CoroutineIOCompletion co = {
5128                 .coroutine = qemu_coroutine_self(),
5129             };
5130 
5131             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5132                                             bdrv_co_io_em_complete, &co);
5133             if (acb == NULL) {
5134                 return -EIO;
5135             } else {
5136                 qemu_coroutine_yield();
5137                 ret = co.ret;
5138             }
5139         }
5140         if (ret && ret != -ENOTSUP) {
5141             return ret;
5142         }
5143 
5144         sector_num += num;
5145         nb_sectors -= num;
5146     }
5147     return 0;
5148 }
5149 
5150 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5151 {
5152     Coroutine *co;
5153     DiscardCo rwco = {
5154         .bs = bs,
5155         .sector_num = sector_num,
5156         .nb_sectors = nb_sectors,
5157         .ret = NOT_DONE,
5158     };
5159 
5160     if (qemu_in_coroutine()) {
5161         /* Fast-path if already in coroutine context */
5162         bdrv_discard_co_entry(&rwco);
5163     } else {
5164         AioContext *aio_context = bdrv_get_aio_context(bs);
5165 
5166         co = qemu_coroutine_create(bdrv_discard_co_entry);
5167         qemu_coroutine_enter(co, &rwco);
5168         while (rwco.ret == NOT_DONE) {
5169             aio_poll(aio_context, true);
5170         }
5171     }
5172 
5173     return rwco.ret;
5174 }
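
/* Usage sketch (hypothetical helper, not called anywhere): discard a byte
 * range by converting it to 512-byte sectors first.  Assumes offset and bytes
 * are multiples of BDRV_SECTOR_SIZE and that the sector count fits in an int.
 */
static int __attribute__((unused)) example_discard_bytes(BlockDriverState *bs,
                                                         int64_t offset,
                                                         int64_t bytes)
{
    return bdrv_discard(bs, offset >> BDRV_SECTOR_BITS,
                        (int)(bytes >> BDRV_SECTOR_BITS));
}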
5175 
5176 /**************************************************************/
5177 /* removable device support */
5178 
5179 /**
5180  * Return TRUE if the media is present
5181  */
5182 int bdrv_is_inserted(BlockDriverState *bs)
5183 {
5184     BlockDriver *drv = bs->drv;
5185 
5186     if (!drv) {
5187         return 0;
         }
5188     if (!drv->bdrv_is_inserted) {
5189         return 1;
         }
5190     return drv->bdrv_is_inserted(bs);
5191 }
5192 
5193 /**
5194  * Return whether the media changed since the last call to this
5195  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5196  */
5197 int bdrv_media_changed(BlockDriverState *bs)
5198 {
5199     BlockDriver *drv = bs->drv;
5200 
5201     if (drv && drv->bdrv_media_changed) {
5202         return drv->bdrv_media_changed(bs);
5203     }
5204     return -ENOTSUP;
5205 }
5206 
5207 /**
5208  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5209  */
5210 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5211 {
5212     BlockDriver *drv = bs->drv;
5213     const char *device_name;
5214 
5215     if (drv && drv->bdrv_eject) {
5216         drv->bdrv_eject(bs, eject_flag);
5217     }
5218 
5219     device_name = bdrv_get_device_name(bs);
5220     if (device_name[0] != '\0') {
5221         qapi_event_send_device_tray_moved(device_name,
5222                                           eject_flag, &error_abort);
5223     }
5224 }
5225 
5226 /**
5227  * Lock or unlock the media (if it is locked, the user won't be able
5228  * to eject it manually).
5229  */
5230 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5231 {
5232     BlockDriver *drv = bs->drv;
5233 
5234     trace_bdrv_lock_medium(bs, locked);
5235 
5236     if (drv && drv->bdrv_lock_medium) {
5237         drv->bdrv_lock_medium(bs, locked);
5238     }
5239 }
5240 
5241 /* needed for generic scsi interface */
5242 
5243 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5244 {
5245     BlockDriver *drv = bs->drv;
5246 
5247     if (drv && drv->bdrv_ioctl) {
5248         return drv->bdrv_ioctl(bs, req, buf);
         }
5249     return -ENOTSUP;
5250 }
5251 
5252 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5253         unsigned long int req, void *buf,
5254         BlockCompletionFunc *cb, void *opaque)
5255 {
5256     BlockDriver *drv = bs->drv;
5257 
5258     if (drv && drv->bdrv_aio_ioctl) {
5259         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
         }
5260     return NULL;
5261 }
5262 
5263 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5264 {
5265     bs->guest_block_size = align;
5266 }
5267 
5268 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5269 {
5270     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5271 }
5272 
5273 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5274 {
5275     return memset(qemu_blockalign(bs, size), 0, size);
5276 }
5277 
5278 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5279 {
5280     size_t align = bdrv_opt_mem_align(bs);
5281 
5282     /* Ensure that NULL is never returned on success */
5283     assert(align > 0);
5284     if (size == 0) {
5285         size = align;
5286     }
5287 
5288     return qemu_try_memalign(align, size);
5289 }
5290 
5291 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5292 {
5293     void *mem = qemu_try_blockalign(bs, size);
5294 
5295     if (mem) {
5296         memset(mem, 0, size);
5297     }
5298 
5299     return mem;
5300 }
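
/* Usage sketch (hypothetical, not called anywhere): allocate a zeroed,
 * properly aligned bounce buffer and release it with qemu_vfree(), the
 * matching free function for qemu_memalign()-based allocations. */
static int __attribute__((unused)) example_bounce_buffer(BlockDriverState *bs,
                                                         size_t size)
{
    void *buf = qemu_try_blockalign0(bs, size);

    if (buf == NULL) {
        return -ENOMEM;
    }
    /* ... fill the buffer and issue I/O with it ... */
    qemu_vfree(buf);
    return 0;
}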
5301 
5302 /*
5303  * Check if all memory in this vector is sector aligned.
5304  */
5305 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5306 {
5307     int i;
5308     size_t alignment = bdrv_opt_mem_align(bs);
5309 
5310     for (i = 0; i < qiov->niov; i++) {
5311         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5312             return false;
5313         }
5314         if (qiov->iov[i].iov_len % alignment) {
5315             return false;
5316         }
5317     }
5318 
5319     return true;
5320 }
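
/* Usage sketch (hypothetical, not called anywhere): wrap a buffer in a
 * single-element vector and run the alignment check above; a blockalign'ed
 * buffer passes as long as its length is a multiple of the alignment. */
static bool __attribute__((unused)) example_check_alignment(
    BlockDriverState *bs, void *buf, size_t len)
{
    QEMUIOVector qiov;
    bool aligned;

    qemu_iovec_init(&qiov, 1);
    qemu_iovec_add(&qiov, buf, len);
    aligned = bdrv_qiov_is_aligned(bs, &qiov);
    qemu_iovec_destroy(&qiov);
    return aligned;
}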
5321 
5322 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5323                                           Error **errp)
5324 {
5325     int64_t bitmap_size;
5326     BdrvDirtyBitmap *bitmap;
5327 
5328     assert((granularity & (granularity - 1)) == 0);
5329 
5330     granularity >>= BDRV_SECTOR_BITS;
5331     assert(granularity);
5332     bitmap_size = bdrv_nb_sectors(bs);
5333     if (bitmap_size < 0) {
5334         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5335         errno = -bitmap_size;
5336         return NULL;
5337     }
5338     bitmap = g_new0(BdrvDirtyBitmap, 1);
5339     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5340     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5341     return bitmap;
5342 }
5343 
5344 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5345 {
5346     BdrvDirtyBitmap *bm, *next;
5347     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5348         if (bm == bitmap) {
5349             QLIST_REMOVE(bitmap, list);
5350             hbitmap_free(bitmap->bitmap);
5351             g_free(bitmap);
5352             return;
5353         }
5354     }
5355 }
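
/* Usage sketch (hypothetical, not called anywhere): track guest writes with
 * a dirty bitmap at 64 KiB granularity.  The granularity must be a power of
 * two and at least BDRV_SECTOR_SIZE, as the assertions above enforce. */
static void __attribute__((unused)) example_dirty_bitmap(BlockDriverState *bs)
{
    Error *err = NULL;
    BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, &err);

    if (bitmap == NULL) {
        error_free(err);
        return;
    }
    /* ... writes mark sectors dirty via bdrv_set_dirty() ... */
    bdrv_release_dirty_bitmap(bs, bitmap);
}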
5356 
5357 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5358 {
5359     BdrvDirtyBitmap *bm;
5360     BlockDirtyInfoList *list = NULL;
5361     BlockDirtyInfoList **plist = &list;
5362 
5363     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5364         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5365         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5366         info->count = bdrv_get_dirty_count(bs, bm);
5367         info->granularity =
5368             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5369         entry->value = info;
5370         *plist = entry;
5371         plist = &entry->next;
5372     }
5373 
5374     return list;
5375 }
5376 
5377 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5378 {
5379     if (bitmap) {
5380         return hbitmap_get(bitmap->bitmap, sector);
5381     } else {
5382         return 0;
5383     }
5384 }
5385 
5386 void bdrv_dirty_iter_init(BlockDriverState *bs,
5387                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5388 {
5389     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5390 }
5391 
5392 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5393                     int nr_sectors)
5394 {
5395     BdrvDirtyBitmap *bitmap;
5396     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5397         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5398     }
5399 }
5400 
5401 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5402 {
5403     BdrvDirtyBitmap *bitmap;
5404     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5405         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5406     }
5407 }
5408 
5409 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5410 {
5411     return hbitmap_count(bitmap->bitmap);
5412 }
5413 
5414 /* Get a reference to bs */
5415 void bdrv_ref(BlockDriverState *bs)
5416 {
5417     bs->refcnt++;
5418 }
5419 
5420 /* Release a previously grabbed reference to bs.
5421  * If the reference count drops to zero after releasing, the
5422  * BlockDriverState is deleted. */
5423 void bdrv_unref(BlockDriverState *bs)
5424 {
5425     if (!bs) {
5426         return;
5427     }
5428     assert(bs->refcnt > 0);
5429     if (--bs->refcnt == 0) {
5430         bdrv_delete(bs);
5431     }
5432 }
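
/* Usage sketch (hypothetical, not called anywhere): hold an extra reference
 * across an operation that may drop other references to the BDS. */
static void __attribute__((unused)) example_keep_alive(BlockDriverState *bs)
{
    bdrv_ref(bs);
    /* ... operations that may reach bdrv_unref() internally ... */
    bdrv_unref(bs); /* may delete bs if this was the last reference */
}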
5433 
5434 struct BdrvOpBlocker {
5435     Error *reason;
5436     QLIST_ENTRY(BdrvOpBlocker) list;
5437 };
5438 
5439 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5440 {
5441     BdrvOpBlocker *blocker;
5442     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5443     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5444         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5445         if (errp) {
5446             error_setg(errp, "Device '%s' is busy: %s",
5447                        bdrv_get_device_name(bs),
5448                        error_get_pretty(blocker->reason));
5449         }
5450         return true;
5451     }
5452     return false;
5453 }
5454 
5455 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5456 {
5457     BdrvOpBlocker *blocker;
5458     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5459 
5460     blocker = g_new0(BdrvOpBlocker, 1);
5461     blocker->reason = reason;
5462     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5463 }
5464 
5465 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5466 {
5467     BdrvOpBlocker *blocker, *next;
5468     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5469     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5470         if (blocker->reason == reason) {
5471             QLIST_REMOVE(blocker, list);
5472             g_free(blocker);
5473         }
5474     }
5475 }
5476 
5477 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5478 {
5479     int i;
5480     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5481         bdrv_op_block(bs, i, reason);
5482     }
5483 }
5484 
5485 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5486 {
5487     int i;
5488     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5489         bdrv_op_unblock(bs, i, reason);
5490     }
5491 }
5492 
5493 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5494 {
5495     int i;
5496 
5497     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5498         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5499             return false;
5500         }
5501     }
5502     return true;
5503 }
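
/* Usage sketch (hypothetical, not called anywhere): block all operations on
 * a BDS while it is owned by some job, using a single Error object as the
 * blocker token that bdrv_op_is_blocked() reports to the user. */
static void __attribute__((unused)) example_op_blocker(BlockDriverState *bs)
{
    Error *blocker = NULL;

    error_setg(&blocker, "block device is in use by an example job");
    bdrv_op_block_all(bs, blocker);
    /* ... the device is busy; bdrv_op_is_blocked() returns true ... */
    bdrv_op_unblock_all(bs, blocker);
    error_free(blocker);
}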
5504 
5505 void bdrv_iostatus_enable(BlockDriverState *bs)
5506 {
5507     bs->iostatus_enabled = true;
5508     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5509 }
5510 
5511 /* The I/O status is only enabled if the drive explicitly
5512  * enables it _and_ the VM is configured to stop on errors */
5513 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5514 {
5515     return (bs->iostatus_enabled &&
5516            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5517             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5518             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5519 }
5520 
5521 void bdrv_iostatus_disable(BlockDriverState *bs)
5522 {
5523     bs->iostatus_enabled = false;
5524 }
5525 
5526 void bdrv_iostatus_reset(BlockDriverState *bs)
5527 {
5528     if (bdrv_iostatus_is_enabled(bs)) {
5529         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5530         if (bs->job) {
5531             block_job_iostatus_reset(bs->job);
5532         }
5533     }
5534 }
5535 
5536 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5537 {
5538     assert(bdrv_iostatus_is_enabled(bs));
5539     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5540         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5541                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5542     }
5543 }
5544 
5545 void bdrv_img_create(const char *filename, const char *fmt,
5546                      const char *base_filename, const char *base_fmt,
5547                      char *options, uint64_t img_size, int flags,
5548                      Error **errp, bool quiet)
5549 {
5550     QemuOptsList *create_opts = NULL;
5551     QemuOpts *opts = NULL;
5552     const char *backing_fmt, *backing_file;
5553     int64_t size;
5554     BlockDriver *drv, *proto_drv;
5555     BlockDriver *backing_drv = NULL;
5556     Error *local_err = NULL;
5557     int ret = 0;
5558 
5559     /* Find driver and parse its options */
5560     drv = bdrv_find_format(fmt);
5561     if (!drv) {
5562         error_setg(errp, "Unknown file format '%s'", fmt);
5563         return;
5564     }
5565 
5566     proto_drv = bdrv_find_protocol(filename, true);
5567     if (!proto_drv) {
5568         error_setg(errp, "Unknown protocol '%s'", filename);
5569         return;
5570     }
5571 
5572     if (!drv->create_opts) {
5573         error_setg(errp, "Format driver '%s' does not support image creation",
5574                    drv->format_name);
5575         return;
5576     }
5577 
5578     if (!proto_drv->create_opts) {
5579         error_setg(errp, "Protocol driver '%s' does not support image creation",
5580                    proto_drv->format_name);
5581         return;
5582     }
5583 
5584     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5585     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5586 
5587     /* Create parameter list with default values */
5588     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5589     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5590 
5591     /* Parse -o options */
5592     if (options) {
5593         if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5594             error_setg(errp, "Invalid options for file format '%s'", fmt);
5595             goto out;
5596         }
5597     }
5598 
5599     if (base_filename) {
5600         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5601             error_setg(errp, "Backing file not supported for file format '%s'",
5602                        fmt);
5603             goto out;
5604         }
5605     }
5606 
5607     if (base_fmt) {
5608         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5609             error_setg(errp, "Backing file format not supported for file "
5610                              "format '%s'", fmt);
5611             goto out;
5612         }
5613     }
5614 
5615     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5616     if (backing_file) {
5617         if (!strcmp(filename, backing_file)) {
5618             error_setg(errp, "Error: Trying to create an image with the "
5619                              "same filename as the backing file");
5620             goto out;
5621         }
5622     }
5623 
5624     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5625     if (backing_fmt) {
5626         backing_drv = bdrv_find_format(backing_fmt);
5627         if (!backing_drv) {
5628             error_setg(errp, "Unknown backing file format '%s'",
5629                        backing_fmt);
5630             goto out;
5631         }
5632     }
5633 
5634     /* The size for the image must always be specified, with one exception:
5635      * if we are using a backing file, we can obtain the size from there. */
5636     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5637     if (size == -1) {
5638         if (backing_file) {
5639             BlockDriverState *bs;
5640             int64_t size;
5641             int back_flags;
5642 
5643             /* backing files always opened read-only */
5644             back_flags =
5645                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5646 
5647             bs = NULL;
5648             ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5649                             backing_drv, &local_err);
5650             if (ret < 0) {
5651                 goto out;
5652             }
5653             size = bdrv_getlength(bs);
5654             if (size < 0) {
5655                 error_setg_errno(errp, -size, "Could not get size of '%s'",
5656                                  backing_file);
5657                 bdrv_unref(bs);
5658                 goto out;
5659             }
5660 
5661             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5662 
5663             bdrv_unref(bs);
5664         } else {
5665             error_setg(errp, "Image creation needs a size parameter");
5666             goto out;
5667         }
5668     }
5669 
5670     if (!quiet) {
5671         printf("Formatting '%s', fmt=%s", filename, fmt);
5672         qemu_opts_print(opts, " ");
5673         puts("");
5674     }
5675 
5676     ret = bdrv_create(drv, filename, opts, &local_err);
5677 
5678     if (ret == -EFBIG) {
5679         /* This is generally a better message than whatever the driver would
5680          * deliver (especially because of the cluster_size_hint), since that
5681          * is most probably not much different from "image too large". */
5682         const char *cluster_size_hint = "";
5683         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5684             cluster_size_hint = " (try using a larger cluster size)";
5685         }
5686         error_setg(errp, "The image size is too large for file format '%s'"
5687                    "%s", fmt, cluster_size_hint);
5688         error_free(local_err);
5689         local_err = NULL;
5690     }
5691 
5692 out:
5693     qemu_opts_del(opts);
5694     qemu_opts_free(create_opts);
5695     if (local_err) {
5696         error_propagate(errp, local_err);
5697     }
5698 }
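
/* Usage sketch (hypothetical values): create a 1 GiB qcow2 image.  The
 * options string is parsed against the combined format and protocol
 * create_opts, just like "qemu-img create -o ...". */
static void __attribute__((unused)) example_img_create(Error **errp)
{
    char options[] = "cluster_size=65536";

    bdrv_img_create("/tmp/example.qcow2", "qcow2", NULL, NULL,
                    options, 1024 * 1024 * 1024, 0, errp, true);
}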
5699 
5700 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5701 {
5702     return bs->aio_context;
5703 }
5704 
5705 void bdrv_detach_aio_context(BlockDriverState *bs)
5706 {
5707     BdrvAioNotifier *baf;
5708 
5709     if (!bs->drv) {
5710         return;
5711     }
5712 
5713     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5714         baf->detach_aio_context(baf->opaque);
5715     }
5716 
5717     if (bs->io_limits_enabled) {
5718         throttle_detach_aio_context(&bs->throttle_state);
5719     }
5720     if (bs->drv->bdrv_detach_aio_context) {
5721         bs->drv->bdrv_detach_aio_context(bs);
5722     }
5723     if (bs->file) {
5724         bdrv_detach_aio_context(bs->file);
5725     }
5726     if (bs->backing_hd) {
5727         bdrv_detach_aio_context(bs->backing_hd);
5728     }
5729 
5730     bs->aio_context = NULL;
5731 }
5732 
5733 void bdrv_attach_aio_context(BlockDriverState *bs,
5734                              AioContext *new_context)
5735 {
5736     BdrvAioNotifier *ban;
5737 
5738     if (!bs->drv) {
5739         return;
5740     }
5741 
5742     bs->aio_context = new_context;
5743 
5744     if (bs->backing_hd) {
5745         bdrv_attach_aio_context(bs->backing_hd, new_context);
5746     }
5747     if (bs->file) {
5748         bdrv_attach_aio_context(bs->file, new_context);
5749     }
5750     if (bs->drv->bdrv_attach_aio_context) {
5751         bs->drv->bdrv_attach_aio_context(bs, new_context);
5752     }
5753     if (bs->io_limits_enabled) {
5754         throttle_attach_aio_context(&bs->throttle_state, new_context);
5755     }
5756 
5757     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5758         ban->attached_aio_context(new_context, ban->opaque);
5759     }
5760 }
5761 
5762 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5763 {
5764     bdrv_drain_all(); /* ensure there are no in-flight requests */
5765 
5766     bdrv_detach_aio_context(bs);
5767 
5768     /* This function executes in the old AioContext so acquire the new one in
5769      * case it runs in a different thread.
5770      */
5771     aio_context_acquire(new_context);
5772     bdrv_attach_aio_context(bs, new_context);
5773     aio_context_release(new_context);
5774 }
5775 
5776 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5777         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5778         void (*detach_aio_context)(void *opaque), void *opaque)
5779 {
5780     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5781     *ban = (BdrvAioNotifier){
5782         .attached_aio_context = attached_aio_context,
5783         .detach_aio_context   = detach_aio_context,
5784         .opaque               = opaque
5785     };
5786 
5787     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5788 }
5789 
5790 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5791                                       void (*attached_aio_context)(AioContext *,
5792                                                                    void *),
5793                                       void (*detach_aio_context)(void *),
5794                                       void *opaque)
5795 {
5796     BdrvAioNotifier *ban, *ban_next;
5797 
5798     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5799         if (ban->attached_aio_context == attached_aio_context &&
5800             ban->detach_aio_context   == detach_aio_context   &&
5801             ban->opaque               == opaque)
5802         {
5803             QLIST_REMOVE(ban, list);
5804             g_free(ban);
5805 
5806             return;
5807         }
5808     }
5809 
5810     abort();
5811 }
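
/* Usage sketch (hypothetical callbacks, not called anywhere): follow a BDS
 * across AioContext changes, e.g. to move timers or bottom halves along with
 * it.  Removal must pass the exact same triple that was registered, or the
 * abort() above triggers. */
static void example_attached(AioContext *new_context, void *opaque)
{
    /* re-create timers/bottom halves in new_context */
}

static void example_detach(void *opaque)
{
    /* tear down resources tied to the old context */
}

static void __attribute__((unused)) example_watch_context(BlockDriverState *bs)
{
    bdrv_add_aio_context_notifier(bs, example_attached, example_detach, NULL);
    /* ... later, with identical arguments ... */
    bdrv_remove_aio_context_notifier(bs, example_attached, example_detach,
                                     NULL);
}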
5812 
5813 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5814                                     NotifierWithReturn *notifier)
5815 {
5816     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5817 }
5818 
5819 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5820                        BlockDriverAmendStatusCB *status_cb)
5821 {
5822     if (!bs->drv->bdrv_amend_options) {
5823         return -ENOTSUP;
5824     }
5825     return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5826 }
5827 
5828 /* This function is called by the bdrv_recurse_is_first_non_filter method
5829  * of block filters and by bdrv_is_first_non_filter.
5830  * It tests whether the given bs is the candidate, or recurses further down
5831  * the node graph.
5832  */
5833 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5834                                       BlockDriverState *candidate)
5835 {
5836     /* return false if the basic checks fail */
5837     if (!bs || !bs->drv) {
5838         return false;
5839     }
5840 
5841     /* the code reached a non-filter driver -> check whether bs is the
5842      * same as the candidate. This is the recursion termination condition.
5843      */
5844     if (!bs->drv->is_filter) {
5845         return bs == candidate;
5846     }
5847     /* Down this path the driver is a block filter driver */
5848 
5849     /* If the block filter recursion method is defined use it to recurse down
5850      * the node graph.
5851      */
5852     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5853         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5854     }
5855 
5856     /* the driver is a block filter but does not allow recursion -> return
5857      * false */
5858     return false;
5859 }
5860 
5861 /* This function checks whether the candidate is the first non-filter bs down
5862  * its bs chain. Since we don't have pointers to parents, it explores all bs
5863  * chains from the top. Some filters can choose not to pass down the recursion.
5864  */
5865 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5866 {
5867     BlockDriverState *bs;
5868 
5869     /* walk down the bs forest recursively */
5870     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5871         bool perm;
5872 
5873         /* try to recurse in this top level bs */
5874         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5875 
5876         /* candidate is the first non filter */
5877         if (perm) {
5878             return true;
5879         }
5880     }
5881 
5882     return false;
5883 }
5884 
5885 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5886 {
5887     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5888     AioContext *aio_context;
5889 
5890     if (!to_replace_bs) {
5891         error_setg(errp, "Node name '%s' not found", node_name);
5892         return NULL;
5893     }
5894 
5895     aio_context = bdrv_get_aio_context(to_replace_bs);
5896     aio_context_acquire(aio_context);
5897 
5898     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5899         to_replace_bs = NULL;
5900         goto out;
5901     }
5902 
5903     /* To prevent data corruption, we don't want an arbitrary node of the
5904      * BDS chain to be replaced, only the topmost non-filter node.
5905      * Another benefit is that this test excludes backing files, which are
5906      * blocked by the backing blockers.
5907      */
5908     if (!bdrv_is_first_non_filter(to_replace_bs)) {
5909         error_setg(errp, "Only the topmost non-filter node can be replaced");
5910         to_replace_bs = NULL;
5911         goto out;
5912     }
5913 
5914 out:
5915     aio_context_release(aio_context);
5916     return to_replace_bs;
5917 }
5918 
5919 void bdrv_io_plug(BlockDriverState *bs)
5920 {
5921     BlockDriver *drv = bs->drv;
5922     if (drv && drv->bdrv_io_plug) {
5923         drv->bdrv_io_plug(bs);
5924     } else if (bs->file) {
5925         bdrv_io_plug(bs->file);
5926     }
5927 }
5928 
5929 void bdrv_io_unplug(BlockDriverState *bs)
5930 {
5931     BlockDriver *drv = bs->drv;
5932     if (drv && drv->bdrv_io_unplug) {
5933         drv->bdrv_io_unplug(bs);
5934     } else if (bs->file) {
5935         bdrv_io_unplug(bs->file);
5936     }
5937 }
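
/* Usage sketch (hypothetical, not called anywhere): batch several AIO
 * submissions so drivers with plug/unplug support can submit them in one go.
 */
static void __attribute__((unused)) example_plugged_io(BlockDriverState *bs)
{
    bdrv_io_plug(bs);
    /* ... queue several bdrv_aio_readv()/bdrv_aio_writev() calls ... */
    bdrv_io_unplug(bs);
}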
5938 
5939 void bdrv_flush_io_queue(BlockDriverState *bs)
5940 {
5941     BlockDriver *drv = bs->drv;
5942     if (drv && drv->bdrv_flush_io_queue) {
5943         drv->bdrv_flush_io_queue(bs);
5944     } else if (bs->file) {
5945         bdrv_flush_io_queue(bs->file);
5946     }
5947 }
5948 
5949 static bool append_open_options(QDict *d, BlockDriverState *bs)
5950 {
5951     const QDictEntry *entry;
5952     bool found_any = false;
5953 
5954     for (entry = qdict_first(bs->options); entry;
5955          entry = qdict_next(bs->options, entry))
5956     {
5957         /* Only take options for this level and exclude all non-driver-specific
5958          * options */
5959         if (!strchr(qdict_entry_key(entry), '.') &&
5960             strcmp(qdict_entry_key(entry), "node-name"))
5961         {
5962             qobject_incref(qdict_entry_value(entry));
5963             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5964             found_any = true;
5965         }
5966     }
5967 
5968     return found_any;
5969 }
5970 
5971 /* Updates the following BDS fields:
5972  *  - exact_filename: A filename which may be used for opening a block device
5973  *                    which (mostly) equals the given BDS (even without any
5974  *                    other options; so reading and writing must return the same
5975  *                    results, but caching etc. may be different)
5976  *  - full_open_options: Options which, when given when opening a block device
5977  *                       (without a filename), result in a BDS (mostly)
5978  *                       equalling the given one
5979  *  - filename: If exact_filename is set, it is copied here. Otherwise,
5980  *              full_open_options is converted to a JSON object, prefixed with
5981  *              "json:" (for use through the JSON pseudo protocol) and put here.
5982  */
5983 void bdrv_refresh_filename(BlockDriverState *bs)
5984 {
5985     BlockDriver *drv = bs->drv;
5986     QDict *opts;
5987 
5988     if (!drv) {
5989         return;
5990     }
5991 
5992     /* This BDS's file name will most probably depend on its file's name, so
5993      * refresh that first */
5994     if (bs->file) {
5995         bdrv_refresh_filename(bs->file);
5996     }
5997 
5998     if (drv->bdrv_refresh_filename) {
5999         /* Obsolete information is of no use here, so drop the old file name
6000          * information before refreshing it */
6001         bs->exact_filename[0] = '\0';
6002         if (bs->full_open_options) {
6003             QDECREF(bs->full_open_options);
6004             bs->full_open_options = NULL;
6005         }
6006 
6007         drv->bdrv_refresh_filename(bs);
6008     } else if (bs->file) {
6009         /* Try to reconstruct valid information from the underlying file */
6010         bool has_open_options;
6011 
6012         bs->exact_filename[0] = '\0';
6013         if (bs->full_open_options) {
6014             QDECREF(bs->full_open_options);
6015             bs->full_open_options = NULL;
6016         }
6017 
6018         opts = qdict_new();
6019         has_open_options = append_open_options(opts, bs);
6020 
6021         /* If no specific options have been given for this BDS, the filename of
6022          * the underlying file should suffice for this one as well */
6023         if (bs->file->exact_filename[0] && !has_open_options) {
6024             strcpy(bs->exact_filename, bs->file->exact_filename);
6025         }
6026         /* Reconstructing the full options QDict is simple for most format block
6027          * drivers, as long as the full options are known for the underlying
6028          * file BDS. The full options QDict of that file BDS should somehow
6029          * contain a representation of the filename, therefore the following
6030          * suffices without querying the (exact_)filename of this BDS. */
6031         if (bs->file->full_open_options) {
6032             qdict_put_obj(opts, "driver",
6033                           QOBJECT(qstring_from_str(drv->format_name)));
6034             QINCREF(bs->file->full_open_options);
6035             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6036 
6037             bs->full_open_options = opts;
6038         } else {
6039             QDECREF(opts);
6040         }
6041     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6042         /* There is no underlying file BDS (at least referenced by BDS.file),
6043          * so the full options QDict should be equal to the options given
6044          * specifically for this block device when it was opened (plus the
6045          * driver specification).
6046          * Because those options don't change, there is no need to update
6047          * full_open_options when it's already set. */
6048 
6049         opts = qdict_new();
6050         append_open_options(opts, bs);
6051         qdict_put_obj(opts, "driver",
6052                       QOBJECT(qstring_from_str(drv->format_name)));
6053 
6054         if (bs->exact_filename[0]) {
6055             /* This may not work for all block protocol drivers (some may
6056              * require this filename to be parsed), but we have to find some
6057              * default solution here, so just include it. If some block driver
6058              * does not support pure options without any filename at all or
6059              * needs some special format of the options QDict, it needs to
6060              * implement the driver-specific bdrv_refresh_filename() function.
6061              */
6062             qdict_put_obj(opts, "filename",
6063                           QOBJECT(qstring_from_str(bs->exact_filename)));
6064         }
6065 
6066         bs->full_open_options = opts;
6067     }
6068 
6069     if (bs->exact_filename[0]) {
6070         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6071     } else if (bs->full_open_options) {
6072         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6073         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6074                  qstring_get_str(json));
6075         QDECREF(json);
6076     }
6077 }
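
/* Illustrative result (assumed setup): a qcow2 BDS whose own exact_filename
 * could not be determined, sitting on top of a raw file BDS, ends up with a
 * filename such as:
 *
 *   json:{"driver": "qcow2", "file": {"driver": "file",
 *                                     "filename": "/tmp/test.qcow2"}}
 */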
6078 
6079 /* The purpose of this accessor function is to allow device models to access
6080  * the BlockAcctStats structure embedded inside a BlockDriverState without
6081  * being aware of the BlockDriverState structure layout.
6082  * It will go away once the BlockAcctStats structure is moved inside the
6083  * device models.
6084  */
6085 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6086 {
6087     return &bs->stats;
6088 }
6089