xref: /openbmc/qemu/block.c (revision 27215a22)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "block/block_int.h"
28 #include "block/blockjob.h"
29 #include "qemu/module.h"
30 #include "qapi/qmp/qjson.h"
31 #include "sysemu/block-backend.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 #include "qapi-event.h"
39 
40 #ifdef CONFIG_BSD
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/ioctl.h>
44 #include <sys/queue.h>
45 #ifndef __DragonFly__
46 #include <sys/disk.h>
47 #endif
48 #endif
49 
50 #ifdef _WIN32
51 #include <windows.h>
52 #endif
53 
/*
 * A dirty bitmap that can be chained into a QLIST through @list.
 * NOTE(review): presumably these are the elements of
 * BlockDriverState.dirty_bitmaps (initialised in bdrv_new()) -- confirm.
 */
struct BdrvDirtyBitmap {
    HBitmap *bitmap;                    /* the underlying bitmap */
    QLIST_ENTRY(BdrvDirtyBitmap) list;  /* list linkage */
};
58 
/*
 * Sentinel stored in a result field before an operation is started; the
 * caller polls until the field holds something else (see bdrv_create()).
 */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
60 
/*
 * Forward declarations of file-local helpers.  They are static, so their
 * definitions live elsewhere in this translation unit.
 */

/* AIO emulation; installed by bdrv_register() for drivers without AIO */
static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
/* Coroutine emulation; installed by bdrv_register() when a driver has none */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
/* Byte-granularity vectored read/write helpers (offset and bytes) */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
/* Sector-based AIO request with completion callback; @is_write picks direction */
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90 
/* Root block device states; bdrv_new_root() appends here via device_list */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* Named graph nodes; bdrv_assign_node_name() appends here via node_list */
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

/* Every driver registered through bdrv_register() */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 
/* Forward declarations: mark or clear a run of sectors as dirty */
static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                           int nr_sectors);
static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                             int nr_sectors);
/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
106 
107 #ifdef _WIN32
/*
 * Return 1 if @filename starts with an ASCII letter immediately followed
 * by ':' (e.g. "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    int is_ascii_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* only look at the second byte once the first one qualified */
    if (!is_ascii_letter) {
        return 0;
    }
    return filename[1] == ':';
}
114 
115 int is_windows_drive(const char *filename)
116 {
117     if (is_windows_drive_prefix(filename) &&
118         filename[2] == '\0')
119         return 1;
120     if (strstart(filename, "\\\\.\\", NULL) ||
121         strstart(filename, "//./", NULL))
122         return 1;
123     return 0;
124 }
125 #endif
126 
127 /* throttling disk I/O limits */
128 void bdrv_set_io_limits(BlockDriverState *bs,
129                         ThrottleConfig *cfg)
130 {
131     int i;
132 
133     throttle_config(&bs->throttle_state, cfg);
134 
135     for (i = 0; i < 2; i++) {
136         qemu_co_enter_next(&bs->throttled_reqs[i]);
137     }
138 }
139 
140 /* this function drain all the throttled IOs */
141 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
142 {
143     bool drained = false;
144     bool enabled = bs->io_limits_enabled;
145     int i;
146 
147     bs->io_limits_enabled = false;
148 
149     for (i = 0; i < 2; i++) {
150         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
151             drained = true;
152         }
153     }
154 
155     bs->io_limits_enabled = enabled;
156 
157     return drained;
158 }
159 
160 void bdrv_io_limits_disable(BlockDriverState *bs)
161 {
162     bs->io_limits_enabled = false;
163 
164     bdrv_start_throttled_reqs(bs);
165 
166     throttle_destroy(&bs->throttle_state);
167 }
168 
169 static void bdrv_throttle_read_timer_cb(void *opaque)
170 {
171     BlockDriverState *bs = opaque;
172     qemu_co_enter_next(&bs->throttled_reqs[0]);
173 }
174 
175 static void bdrv_throttle_write_timer_cb(void *opaque)
176 {
177     BlockDriverState *bs = opaque;
178     qemu_co_enter_next(&bs->throttled_reqs[1]);
179 }
180 
181 /* should be called before bdrv_set_io_limits if a limit is set */
182 void bdrv_io_limits_enable(BlockDriverState *bs)
183 {
184     assert(!bs->io_limits_enabled);
185     throttle_init(&bs->throttle_state,
186                   bdrv_get_aio_context(bs),
187                   QEMU_CLOCK_VIRTUAL,
188                   bdrv_throttle_read_timer_cb,
189                   bdrv_throttle_write_timer_cb,
190                   bs);
191     bs->io_limits_enabled = true;
192 }
193 
194 /* This function makes an IO wait if needed
195  *
196  * @nb_sectors: the number of sectors of the IO
197  * @is_write:   is the IO a write
198  */
199 static void bdrv_io_limits_intercept(BlockDriverState *bs,
200                                      unsigned int bytes,
201                                      bool is_write)
202 {
203     /* does this io must wait */
204     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
205 
206     /* if must wait or any request of this type throttled queue the IO */
207     if (must_wait ||
208         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
209         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
210     }
211 
212     /* the IO will be executed, do the accounting */
213     throttle_account(&bs->throttle_state, is_write, bytes);
214 
215 
216     /* if the next request must wait -> do nothing */
217     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
218         return;
219     }
220 
221     /* else queue next request for execution */
222     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
223 }
224 
225 size_t bdrv_opt_mem_align(BlockDriverState *bs)
226 {
227     if (!bs || !bs->drv) {
228         /* 4k should be on the safe side */
229         return 4096;
230     }
231 
232     return bs->bl.opt_mem_alignment;
233 }
234 
/*
 * Check whether @path starts with "<protocol>:", i.e. whether a ':' occurs
 * before any path separator.  On Windows, drive specifications are never
 * treated as protocols.
 *
 * Returns 1 if a protocol prefix is present, 0 otherwise.
 */
int path_has_protocol(const char *path)
{
#ifdef _WIN32
    const char *stop_chars = ":/\\";

    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    const char *stop_chars = ":/";
#endif

    /* look at the first ':' or separator; the end of string counts as "no" */
    return path[strcspn(path, stop_chars)] == ':';
}
252 
/*
 * Return 1 if @path is absolute, 0 otherwise.
 *
 * A leading '/' always makes a path absolute.  On Windows a leading '\\'
 * does too, and so does any drive specification (this covers names like
 * "\\.\d:").
 */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (*path == '\\') {
        return 1;
    }
#endif
    return *path == '/';
}
265 
/*
 * Build a path to @filename in @dest (at most @dest_size bytes including
 * the terminator).
 *
 * If @filename is absolute it is copied as is.  Otherwise it is appended to
 * the leading part of @base_path, which extends up to whichever comes
 * later: the character after the first ':' or the character after the last
 * path separator.  URLs are supported.
 *
 * Nothing is written when @dest_size is not positive.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon, *after_sep, *prefix_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past the first ':' (start of string if there is none) */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* position just past the last separator (start of string if none) */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep the longer of the two prefixes, truncated to fit @dest */
    prefix_end = after_sep > after_colon ? after_sep : after_colon;
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
309 
310 void bdrv_get_full_backing_filename_from_filename(const char *backed,
311                                                   const char *backing,
312                                                   char *dest, size_t sz,
313                                                   Error **errp)
314 {
315     if (backing[0] == '\0' || path_has_protocol(backing) ||
316         path_is_absolute(backing))
317     {
318         pstrcpy(dest, sz, backing);
319     } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
320         error_setg(errp, "Cannot use relative backing file names for '%s'",
321                    backed);
322     } else {
323         path_combine(dest, sz, backed, backing);
324     }
325 }
326 
327 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz,
328                                     Error **errp)
329 {
330     char *backed = bs->exact_filename[0] ? bs->exact_filename : bs->filename;
331 
332     bdrv_get_full_backing_filename_from_filename(backed, bs->backing_file,
333                                                  dest, sz, errp);
334 }
335 
336 void bdrv_register(BlockDriver *bdrv)
337 {
338     /* Block drivers without coroutine functions need emulation */
339     if (!bdrv->bdrv_co_readv) {
340         bdrv->bdrv_co_readv = bdrv_co_readv_em;
341         bdrv->bdrv_co_writev = bdrv_co_writev_em;
342 
343         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
344          * the block driver lacks aio we need to emulate that too.
345          */
346         if (!bdrv->bdrv_aio_readv) {
347             /* add AIO emulation layer */
348             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
349             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
350         }
351     }
352 
353     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
354 }
355 
356 BlockDriverState *bdrv_new_root(void)
357 {
358     BlockDriverState *bs = bdrv_new();
359 
360     QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
361     return bs;
362 }
363 
364 BlockDriverState *bdrv_new(void)
365 {
366     BlockDriverState *bs;
367     int i;
368 
369     bs = g_new0(BlockDriverState, 1);
370     QLIST_INIT(&bs->dirty_bitmaps);
371     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
372         QLIST_INIT(&bs->op_blockers[i]);
373     }
374     bdrv_iostatus_disable(bs);
375     notifier_list_init(&bs->close_notifiers);
376     notifier_with_return_list_init(&bs->before_write_notifiers);
377     qemu_co_queue_init(&bs->throttled_reqs[0]);
378     qemu_co_queue_init(&bs->throttled_reqs[1]);
379     bs->refcnt = 1;
380     bs->aio_context = qemu_get_aio_context();
381 
382     return bs;
383 }
384 
385 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
386 {
387     notifier_list_add(&bs->close_notifiers, notify);
388 }
389 
390 BlockDriver *bdrv_find_format(const char *format_name)
391 {
392     BlockDriver *drv1;
393     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
394         if (!strcmp(drv1->format_name, format_name)) {
395             return drv1;
396         }
397     }
398     return NULL;
399 }
400 
401 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
402 {
403     static const char *whitelist_rw[] = {
404         CONFIG_BDRV_RW_WHITELIST
405     };
406     static const char *whitelist_ro[] = {
407         CONFIG_BDRV_RO_WHITELIST
408     };
409     const char **p;
410 
411     if (!whitelist_rw[0] && !whitelist_ro[0]) {
412         return 1;               /* no whitelist, anything goes */
413     }
414 
415     for (p = whitelist_rw; *p; p++) {
416         if (!strcmp(drv->format_name, *p)) {
417             return 1;
418         }
419     }
420     if (read_only) {
421         for (p = whitelist_ro; *p; p++) {
422             if (!strcmp(drv->format_name, *p)) {
423                 return 1;
424             }
425         }
426     }
427     return 0;
428 }
429 
430 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
431                                           bool read_only)
432 {
433     BlockDriver *drv = bdrv_find_format(format_name);
434     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
435 }
436 
/* Arguments and results handed to bdrv_create_co_entry() by bdrv_create() */
typedef struct CreateCo {
    BlockDriver *drv;   /* driver whose bdrv_create callback is invoked */
    char *filename;     /* private copy of the name; freed by bdrv_create() */
    QemuOpts *opts;     /* creation options passed through to the driver */
    int ret;            /* NOT_DONE until the callback's return value is stored */
    Error *err;         /* error reported by the callback, if any */
} CreateCo;
444 
445 static void coroutine_fn bdrv_create_co_entry(void *opaque)
446 {
447     Error *local_err = NULL;
448     int ret;
449 
450     CreateCo *cco = opaque;
451     assert(cco->drv);
452 
453     ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
454     if (local_err) {
455         error_propagate(&cco->err, local_err);
456     }
457     cco->ret = ret;
458 }
459 
460 int bdrv_create(BlockDriver *drv, const char* filename,
461                 QemuOpts *opts, Error **errp)
462 {
463     int ret;
464 
465     Coroutine *co;
466     CreateCo cco = {
467         .drv = drv,
468         .filename = g_strdup(filename),
469         .opts = opts,
470         .ret = NOT_DONE,
471         .err = NULL,
472     };
473 
474     if (!drv->bdrv_create) {
475         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
476         ret = -ENOTSUP;
477         goto out;
478     }
479 
480     if (qemu_in_coroutine()) {
481         /* Fast-path if already in coroutine context */
482         bdrv_create_co_entry(&cco);
483     } else {
484         co = qemu_coroutine_create(bdrv_create_co_entry);
485         qemu_coroutine_enter(co, &cco);
486         while (cco.ret == NOT_DONE) {
487             aio_poll(qemu_get_aio_context(), true);
488         }
489     }
490 
491     ret = cco.ret;
492     if (ret < 0) {
493         if (cco.err) {
494             error_propagate(errp, cco.err);
495         } else {
496             error_setg_errno(errp, -ret, "Could not create image");
497         }
498     }
499 
500 out:
501     g_free(cco.filename);
502     return ret;
503 }
504 
505 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
506 {
507     BlockDriver *drv;
508     Error *local_err = NULL;
509     int ret;
510 
511     drv = bdrv_find_protocol(filename, true, errp);
512     if (drv == NULL) {
513         return -ENOENT;
514     }
515 
516     ret = bdrv_create(drv, filename, opts, &local_err);
517     if (local_err) {
518         error_propagate(errp, local_err);
519     }
520     return ret;
521 }
522 
523 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
524 {
525     BlockDriver *drv = bs->drv;
526     Error *local_err = NULL;
527 
528     memset(&bs->bl, 0, sizeof(bs->bl));
529 
530     if (!drv) {
531         return;
532     }
533 
534     /* Take some limits from the children as a default */
535     if (bs->file) {
536         bdrv_refresh_limits(bs->file, &local_err);
537         if (local_err) {
538             error_propagate(errp, local_err);
539             return;
540         }
541         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
542         bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
543         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
544     } else {
545         bs->bl.opt_mem_alignment = 512;
546     }
547 
548     if (bs->backing_hd) {
549         bdrv_refresh_limits(bs->backing_hd, &local_err);
550         if (local_err) {
551             error_propagate(errp, local_err);
552             return;
553         }
554         bs->bl.opt_transfer_length =
555             MAX(bs->bl.opt_transfer_length,
556                 bs->backing_hd->bl.opt_transfer_length);
557         bs->bl.max_transfer_length =
558             MIN_NON_ZERO(bs->bl.max_transfer_length,
559                          bs->backing_hd->bl.max_transfer_length);
560         bs->bl.opt_mem_alignment =
561             MAX(bs->bl.opt_mem_alignment,
562                 bs->backing_hd->bl.opt_mem_alignment);
563     }
564 
565     /* Then let the driver override it */
566     if (drv->bdrv_refresh_limits) {
567         drv->bdrv_refresh_limits(bs, errp);
568     }
569 }
570 
/*
 * Create a uniquely-named empty temporary file.
 *
 * @filename: buffer receiving the name of the created file
 * @size:     size of @filename in bytes; on Windows it must be at least
 *            MAX_PATH
 *
 * On non-Windows hosts the file is created in $TMPDIR, or in /var/tmp when
 * TMPDIR is not set.
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW if
 * the name does not fit into @filename).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int saved_errno;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may overwrite errno, so remember why close() failed */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
606 
607 /*
608  * Detect host devices. By convention, /dev/cdrom[N] is always
609  * recognized as a host CDROM.
610  */
611 static BlockDriver *find_hdev_driver(const char *filename)
612 {
613     int score_max = 0, score;
614     BlockDriver *drv = NULL, *d;
615 
616     QLIST_FOREACH(d, &bdrv_drivers, list) {
617         if (d->bdrv_probe_device) {
618             score = d->bdrv_probe_device(filename);
619             if (score > score_max) {
620                 score_max = score;
621                 drv = d;
622             }
623         }
624     }
625 
626     return drv;
627 }
628 
629 BlockDriver *bdrv_find_protocol(const char *filename,
630                                 bool allow_protocol_prefix,
631                                 Error **errp)
632 {
633     BlockDriver *drv1;
634     char protocol[128];
635     int len;
636     const char *p;
637 
638     /* TODO Drivers without bdrv_file_open must be specified explicitly */
639 
640     /*
641      * XXX(hch): we really should not let host device detection
642      * override an explicit protocol specification, but moving this
643      * later breaks access to device names with colons in them.
644      * Thanks to the brain-dead persistent naming schemes on udev-
645      * based Linux systems those actually are quite common.
646      */
647     drv1 = find_hdev_driver(filename);
648     if (drv1) {
649         return drv1;
650     }
651 
652     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
653         return &bdrv_file;
654     }
655 
656     p = strchr(filename, ':');
657     assert(p != NULL);
658     len = p - filename;
659     if (len > sizeof(protocol) - 1)
660         len = sizeof(protocol) - 1;
661     memcpy(protocol, filename, len);
662     protocol[len] = '\0';
663     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
664         if (drv1->protocol_name &&
665             !strcmp(drv1->protocol_name, protocol)) {
666             return drv1;
667         }
668     }
669 
670     error_setg(errp, "Unknown protocol '%s'", protocol);
671     return NULL;
672 }
673 
674 /*
675  * Guess image format by probing its contents.
676  * This is not a good idea when your image is raw (CVE-2008-2004), but
677  * we do it anyway for backward compatibility.
678  *
679  * @buf         contains the image's first @buf_size bytes.
680  * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
681  *              but can be smaller if the image file is smaller)
682  * @filename    is its filename.
683  *
684  * For all block drivers, call the bdrv_probe() method to get its
685  * probing score.
686  * Return the first block driver with the highest probing score.
687  */
688 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
689                             const char *filename)
690 {
691     int score_max = 0, score;
692     BlockDriver *drv = NULL, *d;
693 
694     QLIST_FOREACH(d, &bdrv_drivers, list) {
695         if (d->bdrv_probe) {
696             score = d->bdrv_probe(buf, buf_size, filename);
697             if (score > score_max) {
698                 score_max = score;
699                 drv = d;
700             }
701         }
702     }
703 
704     return drv;
705 }
706 
707 static int find_image_format(BlockDriverState *bs, const char *filename,
708                              BlockDriver **pdrv, Error **errp)
709 {
710     BlockDriver *drv;
711     uint8_t buf[BLOCK_PROBE_BUF_SIZE];
712     int ret = 0;
713 
714     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
715     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
716         *pdrv = &bdrv_raw;
717         return ret;
718     }
719 
720     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
721     if (ret < 0) {
722         error_setg_errno(errp, -ret, "Could not read image for determining its "
723                          "format");
724         *pdrv = NULL;
725         return ret;
726     }
727 
728     drv = bdrv_probe_all(buf, ret, filename);
729     if (!drv) {
730         error_setg(errp, "Could not determine image format: No compatible "
731                    "driver found");
732         ret = -ENOENT;
733     }
734     *pdrv = drv;
735     return ret;
736 }
737 
738 /**
739  * Set the current 'total_sectors' value
740  * Return 0 on success, -errno on error.
741  */
742 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
743 {
744     BlockDriver *drv = bs->drv;
745 
746     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
747     if (bs->sg)
748         return 0;
749 
750     /* query actual device if possible, otherwise just trust the hint */
751     if (drv->bdrv_getlength) {
752         int64_t length = drv->bdrv_getlength(bs);
753         if (length < 0) {
754             return length;
755         }
756         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
757     }
758 
759     bs->total_sectors = hint;
760     return 0;
761 }
762 
763 /**
764  * Set open flags for a given discard mode
765  *
766  * Return 0 on success, -1 if the discard mode was invalid.
767  */
768 int bdrv_parse_discard_flags(const char *mode, int *flags)
769 {
770     *flags &= ~BDRV_O_UNMAP;
771 
772     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
773         /* do nothing */
774     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
775         *flags |= BDRV_O_UNMAP;
776     } else {
777         return -1;
778     }
779 
780     return 0;
781 }
782 
783 /**
784  * Set open flags for a given cache mode
785  *
786  * Return 0 on success, -1 if the cache mode was invalid.
787  */
788 int bdrv_parse_cache_flags(const char *mode, int *flags)
789 {
790     *flags &= ~BDRV_O_CACHE_MASK;
791 
792     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
793         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
794     } else if (!strcmp(mode, "directsync")) {
795         *flags |= BDRV_O_NOCACHE;
796     } else if (!strcmp(mode, "writeback")) {
797         *flags |= BDRV_O_CACHE_WB;
798     } else if (!strcmp(mode, "unsafe")) {
799         *flags |= BDRV_O_CACHE_WB;
800         *flags |= BDRV_O_NO_FLUSH;
801     } else if (!strcmp(mode, "writethrough")) {
802         /* this is the default */
803     } else {
804         return -1;
805     }
806 
807     return 0;
808 }
809 
810 /**
811  * The copy-on-read flag is actually a reference count so multiple users may
812  * use the feature without worrying about clobbering its previous state.
813  * Copy-on-read stays enabled until all users have called to disable it.
814  */
815 void bdrv_enable_copy_on_read(BlockDriverState *bs)
816 {
817     bs->copy_on_read++;
818 }
819 
820 void bdrv_disable_copy_on_read(BlockDriverState *bs)
821 {
822     assert(bs->copy_on_read > 0);
823     bs->copy_on_read--;
824 }
825 
826 /*
827  * Returns the flags that a temporary snapshot should get, based on the
828  * originally requested flags (the originally requested image will have flags
829  * like a backing file)
830  */
831 static int bdrv_temp_snapshot_flags(int flags)
832 {
833     return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
834 }
835 
836 /*
837  * Returns the flags that bs->file should get, based on the given flags for
838  * the parent BDS
839  */
840 static int bdrv_inherited_flags(int flags)
841 {
842     /* Enable protocol handling, disable format probing for bs->file */
843     flags |= BDRV_O_PROTOCOL;
844 
845     /* Our block drivers take care to send flushes and respect unmap policy,
846      * so we can enable both unconditionally on lower layers. */
847     flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
848 
849     /* Clear flags that only apply to the top layer */
850     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
851 
852     return flags;
853 }
854 
855 /*
856  * Returns the flags that bs->backing_hd should get, based on the given flags
857  * for the parent BDS
858  */
859 static int bdrv_backing_flags(int flags)
860 {
861     /* backing files always opened read-only */
862     flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);
863 
864     /* snapshot=on is handled on the top layer */
865     flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);
866 
867     return flags;
868 }
869 
870 static int bdrv_open_flags(BlockDriverState *bs, int flags)
871 {
872     int open_flags = flags | BDRV_O_CACHE_WB;
873 
874     /*
875      * Clear flags that are internal to the block layer before opening the
876      * image.
877      */
878     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
879 
880     /*
881      * Snapshots should be writable.
882      */
883     if (flags & BDRV_O_TEMPORARY) {
884         open_flags |= BDRV_O_RDWR;
885     }
886 
887     return open_flags;
888 }
889 
890 static void bdrv_assign_node_name(BlockDriverState *bs,
891                                   const char *node_name,
892                                   Error **errp)
893 {
894     if (!node_name) {
895         return;
896     }
897 
898     /* Check for empty string or invalid characters */
899     if (!id_wellformed(node_name)) {
900         error_setg(errp, "Invalid node name");
901         return;
902     }
903 
904     /* takes care of avoiding namespaces collisions */
905     if (blk_by_name(node_name)) {
906         error_setg(errp, "node-name=%s is conflicting with a device id",
907                    node_name);
908         return;
909     }
910 
911     /* takes care of avoiding duplicates node names */
912     if (bdrv_find_node(node_name)) {
913         error_setg(errp, "Duplicate node name");
914         return;
915     }
916 
917     /* copy node name into the bs and insert it into the graph list */
918     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
919     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
920 }
921 
922 /*
923  * Common part for opening disk images and files
924  *
925  * Removes all processed options from *options.
926  */
927 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
928     QDict *options, int flags, BlockDriver *drv, Error **errp)
929 {
930     int ret, open_flags;
931     const char *filename;
932     const char *node_name = NULL;
933     Error *local_err = NULL;
934 
935     assert(drv != NULL);
936     assert(bs->file == NULL);
937     assert(options != NULL && bs->options != options);
938 
939     if (file != NULL) {
940         filename = file->filename;
941     } else {
942         filename = qdict_get_try_str(options, "filename");
943     }
944 
945     if (drv->bdrv_needs_filename && !filename) {
946         error_setg(errp, "The '%s' block driver requires a file name",
947                    drv->format_name);
948         return -EINVAL;
949     }
950 
951     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
952 
953     node_name = qdict_get_try_str(options, "node-name");
954     bdrv_assign_node_name(bs, node_name, &local_err);
955     if (local_err) {
956         error_propagate(errp, local_err);
957         return -EINVAL;
958     }
959     qdict_del(options, "node-name");
960 
961     /* bdrv_open() with directly using a protocol as drv. This layer is already
962      * opened, so assign it to bs (while file becomes a closed BlockDriverState)
963      * and return immediately. */
964     if (file != NULL && drv->bdrv_file_open) {
965         bdrv_swap(file, bs);
966         return 0;
967     }
968 
969     bs->open_flags = flags;
970     bs->guest_block_size = 512;
971     bs->request_alignment = 512;
972     bs->zero_beyond_eof = true;
973     open_flags = bdrv_open_flags(bs, flags);
974     bs->read_only = !(open_flags & BDRV_O_RDWR);
975 
976     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
977         error_setg(errp,
978                    !bs->read_only && bdrv_is_whitelisted(drv, true)
979                         ? "Driver '%s' can only be used for read-only devices"
980                         : "Driver '%s' is not whitelisted",
981                    drv->format_name);
982         return -ENOTSUP;
983     }
984 
985     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
986     if (flags & BDRV_O_COPY_ON_READ) {
987         if (!bs->read_only) {
988             bdrv_enable_copy_on_read(bs);
989         } else {
990             error_setg(errp, "Can't use copy-on-read on read-only device");
991             return -EINVAL;
992         }
993     }
994 
995     if (filename != NULL) {
996         pstrcpy(bs->filename, sizeof(bs->filename), filename);
997     } else {
998         bs->filename[0] = '\0';
999     }
1000     pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
1001 
1002     bs->drv = drv;
1003     bs->opaque = g_malloc0(drv->instance_size);
1004 
1005     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
1006 
1007     /* Open the image, either directly or using a protocol */
1008     if (drv->bdrv_file_open) {
1009         assert(file == NULL);
1010         assert(!drv->bdrv_needs_filename || filename != NULL);
1011         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
1012     } else {
1013         if (file == NULL) {
1014             error_setg(errp, "Can't use '%s' as a block driver for the "
1015                        "protocol level", drv->format_name);
1016             ret = -EINVAL;
1017             goto free_and_fail;
1018         }
1019         bs->file = file;
1020         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
1021     }
1022 
1023     if (ret < 0) {
1024         if (local_err) {
1025             error_propagate(errp, local_err);
1026         } else if (bs->filename[0]) {
1027             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1028         } else {
1029             error_setg_errno(errp, -ret, "Could not open image");
1030         }
1031         goto free_and_fail;
1032     }
1033 
1034     ret = refresh_total_sectors(bs, bs->total_sectors);
1035     if (ret < 0) {
1036         error_setg_errno(errp, -ret, "Could not refresh total sector count");
1037         goto free_and_fail;
1038     }
1039 
1040     bdrv_refresh_limits(bs, &local_err);
1041     if (local_err) {
1042         error_propagate(errp, local_err);
1043         ret = -EINVAL;
1044         goto free_and_fail;
1045     }
1046 
1047     assert(bdrv_opt_mem_align(bs) != 0);
1048     assert((bs->request_alignment != 0) || bs->sg);
1049     return 0;
1050 
1051 free_and_fail:
1052     bs->file = NULL;
1053     g_free(bs->opaque);
1054     bs->opaque = NULL;
1055     bs->drv = NULL;
1056     return ret;
1057 }
1058 
1059 static QDict *parse_json_filename(const char *filename, Error **errp)
1060 {
1061     QObject *options_obj;
1062     QDict *options;
1063     int ret;
1064 
1065     ret = strstart(filename, "json:", &filename);
1066     assert(ret);
1067 
1068     options_obj = qobject_from_json(filename);
1069     if (!options_obj) {
1070         error_setg(errp, "Could not parse the JSON options");
1071         return NULL;
1072     }
1073 
1074     if (qobject_type(options_obj) != QTYPE_QDICT) {
1075         qobject_decref(options_obj);
1076         error_setg(errp, "Invalid JSON object given");
1077         return NULL;
1078     }
1079 
1080     options = qobject_to_qdict(options_obj);
1081     qdict_flatten(options);
1082 
1083     return options;
1084 }
1085 
1086 /*
1087  * Fills in default options for opening images and converts the legacy
1088  * filename/flags pair to option QDict entries.
1089  */
1090 static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
1091                              BlockDriver *drv, Error **errp)
1092 {
1093     const char *filename = *pfilename;
1094     const char *drvname;
1095     bool protocol = flags & BDRV_O_PROTOCOL;
1096     bool parse_filename = false;
1097     Error *local_err = NULL;
1098 
1099     /* Parse json: pseudo-protocol */
1100     if (filename && g_str_has_prefix(filename, "json:")) {
1101         QDict *json_options = parse_json_filename(filename, &local_err);
1102         if (local_err) {
1103             error_propagate(errp, local_err);
1104             return -EINVAL;
1105         }
1106 
1107         /* Options given in the filename have lower priority than options
1108          * specified directly */
1109         qdict_join(*options, json_options, false);
1110         QDECREF(json_options);
1111         *pfilename = filename = NULL;
1112     }
1113 
1114     /* Fetch the file name from the options QDict if necessary */
1115     if (protocol && filename) {
1116         if (!qdict_haskey(*options, "filename")) {
1117             qdict_put(*options, "filename", qstring_from_str(filename));
1118             parse_filename = true;
1119         } else {
1120             error_setg(errp, "Can't specify 'file' and 'filename' options at "
1121                              "the same time");
1122             return -EINVAL;
1123         }
1124     }
1125 
1126     /* Find the right block driver */
1127     filename = qdict_get_try_str(*options, "filename");
1128     drvname = qdict_get_try_str(*options, "driver");
1129 
1130     if (drv) {
1131         if (drvname) {
1132             error_setg(errp, "Driver specified twice");
1133             return -EINVAL;
1134         }
1135         drvname = drv->format_name;
1136         qdict_put(*options, "driver", qstring_from_str(drvname));
1137     } else {
1138         if (!drvname && protocol) {
1139             if (filename) {
1140                 drv = bdrv_find_protocol(filename, parse_filename, errp);
1141                 if (!drv) {
1142                     return -EINVAL;
1143                 }
1144 
1145                 drvname = drv->format_name;
1146                 qdict_put(*options, "driver", qstring_from_str(drvname));
1147             } else {
1148                 error_setg(errp, "Must specify either driver or file");
1149                 return -EINVAL;
1150             }
1151         } else if (drvname) {
1152             drv = bdrv_find_format(drvname);
1153             if (!drv) {
1154                 error_setg(errp, "Unknown driver '%s'", drvname);
1155                 return -ENOENT;
1156             }
1157         }
1158     }
1159 
1160     assert(drv || !protocol);
1161 
1162     /* Driver-specific filename parsing */
1163     if (drv && drv->bdrv_parse_filename && parse_filename) {
1164         drv->bdrv_parse_filename(filename, *options, &local_err);
1165         if (local_err) {
1166             error_propagate(errp, local_err);
1167             return -EINVAL;
1168         }
1169 
1170         if (!drv->bdrv_needs_filename) {
1171             qdict_del(*options, "filename");
1172         }
1173     }
1174 
1175     return 0;
1176 }
1177 
1178 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
1179 {
1180 
1181     if (bs->backing_hd) {
1182         assert(bs->backing_blocker);
1183         bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
1184     } else if (backing_hd) {
1185         error_setg(&bs->backing_blocker,
1186                    "device is used as backing hd of '%s'",
1187                    bdrv_get_device_name(bs));
1188     }
1189 
1190     bs->backing_hd = backing_hd;
1191     if (!backing_hd) {
1192         error_free(bs->backing_blocker);
1193         bs->backing_blocker = NULL;
1194         goto out;
1195     }
1196     bs->open_flags &= ~BDRV_O_NO_BACKING;
1197     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
1198     pstrcpy(bs->backing_format, sizeof(bs->backing_format),
1199             backing_hd->drv ? backing_hd->drv->format_name : "");
1200 
1201     bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
1202     /* Otherwise we won't be able to commit due to check in bdrv_commit */
1203     bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1204                     bs->backing_blocker);
1205 out:
1206     bdrv_refresh_limits(bs, NULL);
1207 }
1208 
1209 /*
1210  * Opens the backing file for a BlockDriverState if not yet open
1211  *
1212  * options is a QDict of options to pass to the block drivers, or NULL for an
1213  * empty set of options. The reference to the QDict is transferred to this
1214  * function (even on failure), so if the caller intends to reuse the dictionary,
1215  * it needs to use QINCREF() before calling bdrv_file_open.
1216  */
1217 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1218 {
1219     char *backing_filename = g_malloc0(PATH_MAX);
1220     int ret = 0;
1221     BlockDriverState *backing_hd;
1222     Error *local_err = NULL;
1223 
1224     if (bs->backing_hd != NULL) {
1225         QDECREF(options);
1226         goto free_exit;
1227     }
1228 
1229     /* NULL means an empty set of options */
1230     if (options == NULL) {
1231         options = qdict_new();
1232     }
1233 
1234     bs->open_flags &= ~BDRV_O_NO_BACKING;
1235     if (qdict_haskey(options, "file.filename")) {
1236         backing_filename[0] = '\0';
1237     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1238         QDECREF(options);
1239         goto free_exit;
1240     } else {
1241         bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX,
1242                                        &local_err);
1243         if (local_err) {
1244             ret = -EINVAL;
1245             error_propagate(errp, local_err);
1246             QDECREF(options);
1247             goto free_exit;
1248         }
1249     }
1250 
1251     if (!bs->drv || !bs->drv->supports_backing) {
1252         ret = -EINVAL;
1253         error_setg(errp, "Driver doesn't support backing files");
1254         QDECREF(options);
1255         goto free_exit;
1256     }
1257 
1258     backing_hd = bdrv_new();
1259 
1260     if (bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
1261         qdict_put(options, "driver", qstring_from_str(bs->backing_format));
1262     }
1263 
1264     assert(bs->backing_hd == NULL);
1265     ret = bdrv_open(&backing_hd,
1266                     *backing_filename ? backing_filename : NULL, NULL, options,
1267                     bdrv_backing_flags(bs->open_flags), NULL, &local_err);
1268     if (ret < 0) {
1269         bdrv_unref(backing_hd);
1270         backing_hd = NULL;
1271         bs->open_flags |= BDRV_O_NO_BACKING;
1272         error_setg(errp, "Could not open backing file: %s",
1273                    error_get_pretty(local_err));
1274         error_free(local_err);
1275         goto free_exit;
1276     }
1277     bdrv_set_backing_hd(bs, backing_hd);
1278 
1279 free_exit:
1280     g_free(backing_filename);
1281     return ret;
1282 }
1283 
1284 /*
1285  * Opens a disk image whose options are given as BlockdevRef in another block
1286  * device's options.
1287  *
1288  * If allow_none is true, no image will be opened if filename is false and no
1289  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1290  *
1291  * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1292  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1293  * itself, all options starting with "${bdref_key}." are considered part of the
1294  * BlockdevRef.
1295  *
1296  * The BlockdevRef will be removed from the options QDict.
1297  *
1298  * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1299  */
1300 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1301                     QDict *options, const char *bdref_key, int flags,
1302                     bool allow_none, Error **errp)
1303 {
1304     QDict *image_options;
1305     int ret;
1306     char *bdref_key_dot;
1307     const char *reference;
1308 
1309     assert(pbs);
1310     assert(*pbs == NULL);
1311 
1312     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1313     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1314     g_free(bdref_key_dot);
1315 
1316     reference = qdict_get_try_str(options, bdref_key);
1317     if (!filename && !reference && !qdict_size(image_options)) {
1318         if (allow_none) {
1319             ret = 0;
1320         } else {
1321             error_setg(errp, "A block device must be specified for \"%s\"",
1322                        bdref_key);
1323             ret = -EINVAL;
1324         }
1325         QDECREF(image_options);
1326         goto done;
1327     }
1328 
1329     ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1330 
1331 done:
1332     qdict_del(options, bdref_key);
1333     return ret;
1334 }
1335 
1336 int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
1337 {
1338     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1339     char *tmp_filename = g_malloc0(PATH_MAX + 1);
1340     int64_t total_size;
1341     QemuOpts *opts = NULL;
1342     QDict *snapshot_options;
1343     BlockDriverState *bs_snapshot;
1344     Error *local_err;
1345     int ret;
1346 
1347     /* if snapshot, we create a temporary backing file and open it
1348        instead of opening 'filename' directly */
1349 
1350     /* Get the required size from the image */
1351     total_size = bdrv_getlength(bs);
1352     if (total_size < 0) {
1353         ret = total_size;
1354         error_setg_errno(errp, -total_size, "Could not get image size");
1355         goto out;
1356     }
1357 
1358     /* Create the temporary image */
1359     ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
1360     if (ret < 0) {
1361         error_setg_errno(errp, -ret, "Could not get temporary filename");
1362         goto out;
1363     }
1364 
1365     opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
1366                             &error_abort);
1367     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
1368     ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, &local_err);
1369     qemu_opts_del(opts);
1370     if (ret < 0) {
1371         error_setg_errno(errp, -ret, "Could not create temporary overlay "
1372                          "'%s': %s", tmp_filename,
1373                          error_get_pretty(local_err));
1374         error_free(local_err);
1375         goto out;
1376     }
1377 
1378     /* Prepare a new options QDict for the temporary file */
1379     snapshot_options = qdict_new();
1380     qdict_put(snapshot_options, "file.driver",
1381               qstring_from_str("file"));
1382     qdict_put(snapshot_options, "file.filename",
1383               qstring_from_str(tmp_filename));
1384 
1385     bs_snapshot = bdrv_new();
1386 
1387     ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1388                     flags, &bdrv_qcow2, &local_err);
1389     if (ret < 0) {
1390         error_propagate(errp, local_err);
1391         goto out;
1392     }
1393 
1394     bdrv_append(bs_snapshot, bs);
1395 
1396 out:
1397     g_free(tmp_filename);
1398     return ret;
1399 }
1400 
1401 /*
1402  * Opens a disk image (raw, qcow2, vmdk, ...)
1403  *
1404  * options is a QDict of options to pass to the block drivers, or NULL for an
1405  * empty set of options. The reference to the QDict belongs to the block layer
1406  * after the call (even on failure), so if the caller intends to reuse the
1407  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1408  *
1409  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1410  * If it is not NULL, the referenced BDS will be reused.
1411  *
1412  * The reference parameter may be used to specify an existing block device which
1413  * should be opened. If specified, neither options nor a filename may be given,
1414  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1415  */
1416 int bdrv_open(BlockDriverState **pbs, const char *filename,
1417               const char *reference, QDict *options, int flags,
1418               BlockDriver *drv, Error **errp)
1419 {
1420     int ret;
1421     BlockDriverState *file = NULL, *bs;
1422     const char *drvname;
1423     Error *local_err = NULL;
1424     int snapshot_flags = 0;
1425 
1426     assert(pbs);
1427 
1428     if (reference) {
1429         bool options_non_empty = options ? qdict_size(options) : false;
1430         QDECREF(options);
1431 
1432         if (*pbs) {
1433             error_setg(errp, "Cannot reuse an existing BDS when referencing "
1434                        "another block device");
1435             return -EINVAL;
1436         }
1437 
1438         if (filename || options_non_empty) {
1439             error_setg(errp, "Cannot reference an existing block device with "
1440                        "additional options or a new filename");
1441             return -EINVAL;
1442         }
1443 
1444         bs = bdrv_lookup_bs(reference, reference, errp);
1445         if (!bs) {
1446             return -ENODEV;
1447         }
1448         bdrv_ref(bs);
1449         *pbs = bs;
1450         return 0;
1451     }
1452 
1453     if (*pbs) {
1454         bs = *pbs;
1455     } else {
1456         bs = bdrv_new();
1457     }
1458 
1459     /* NULL means an empty set of options */
1460     if (options == NULL) {
1461         options = qdict_new();
1462     }
1463 
1464     ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
1465     if (local_err) {
1466         goto fail;
1467     }
1468 
1469     /* Find the right image format driver */
1470     drv = NULL;
1471     drvname = qdict_get_try_str(options, "driver");
1472     if (drvname) {
1473         drv = bdrv_find_format(drvname);
1474         qdict_del(options, "driver");
1475         if (!drv) {
1476             error_setg(errp, "Unknown driver: '%s'", drvname);
1477             ret = -EINVAL;
1478             goto fail;
1479         }
1480     }
1481 
1482     assert(drvname || !(flags & BDRV_O_PROTOCOL));
1483     if (drv && !drv->bdrv_file_open) {
1484         /* If the user explicitly wants a format driver here, we'll need to add
1485          * another layer for the protocol in bs->file */
1486         flags &= ~BDRV_O_PROTOCOL;
1487     }
1488 
1489     bs->options = options;
1490     options = qdict_clone_shallow(options);
1491 
1492     /* Open image file without format layer */
1493     if ((flags & BDRV_O_PROTOCOL) == 0) {
1494         if (flags & BDRV_O_RDWR) {
1495             flags |= BDRV_O_ALLOW_RDWR;
1496         }
1497         if (flags & BDRV_O_SNAPSHOT) {
1498             snapshot_flags = bdrv_temp_snapshot_flags(flags);
1499             flags = bdrv_backing_flags(flags);
1500         }
1501 
1502         assert(file == NULL);
1503         ret = bdrv_open_image(&file, filename, options, "file",
1504                               bdrv_inherited_flags(flags),
1505                               true, &local_err);
1506         if (ret < 0) {
1507             goto fail;
1508         }
1509     }
1510 
1511     /* Image format probing */
1512     bs->probed = !drv;
1513     if (!drv && file) {
1514         ret = find_image_format(file, filename, &drv, &local_err);
1515         if (ret < 0) {
1516             goto fail;
1517         }
1518     } else if (!drv) {
1519         error_setg(errp, "Must specify either driver or file");
1520         ret = -EINVAL;
1521         goto fail;
1522     }
1523 
1524     /* Open the image */
1525     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1526     if (ret < 0) {
1527         goto fail;
1528     }
1529 
1530     if (file && (bs->file != file)) {
1531         bdrv_unref(file);
1532         file = NULL;
1533     }
1534 
1535     /* If there is a backing file, use it */
1536     if ((flags & BDRV_O_NO_BACKING) == 0) {
1537         QDict *backing_options;
1538 
1539         qdict_extract_subqdict(options, &backing_options, "backing.");
1540         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1541         if (ret < 0) {
1542             goto close_and_fail;
1543         }
1544     }
1545 
1546     bdrv_refresh_filename(bs);
1547 
1548     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
1549      * temporary snapshot afterwards. */
1550     if (snapshot_flags) {
1551         ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
1552         if (local_err) {
1553             goto close_and_fail;
1554         }
1555     }
1556 
1557     /* Check if any unknown options were used */
1558     if (options && (qdict_size(options) != 0)) {
1559         const QDictEntry *entry = qdict_first(options);
1560         if (flags & BDRV_O_PROTOCOL) {
1561             error_setg(errp, "Block protocol '%s' doesn't support the option "
1562                        "'%s'", drv->format_name, entry->key);
1563         } else {
1564             error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1565                        "support the option '%s'", drv->format_name,
1566                        bdrv_get_device_name(bs), entry->key);
1567         }
1568 
1569         ret = -EINVAL;
1570         goto close_and_fail;
1571     }
1572 
1573     if (!bdrv_key_required(bs)) {
1574         if (bs->blk) {
1575             blk_dev_change_media_cb(bs->blk, true);
1576         }
1577     } else if (!runstate_check(RUN_STATE_PRELAUNCH)
1578                && !runstate_check(RUN_STATE_INMIGRATE)
1579                && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
1580         error_setg(errp,
1581                    "Guest must be stopped for opening of encrypted image");
1582         ret = -EBUSY;
1583         goto close_and_fail;
1584     }
1585 
1586     QDECREF(options);
1587     *pbs = bs;
1588     return 0;
1589 
1590 fail:
1591     if (file != NULL) {
1592         bdrv_unref(file);
1593     }
1594     QDECREF(bs->options);
1595     QDECREF(options);
1596     bs->options = NULL;
1597     if (!*pbs) {
1598         /* If *pbs is NULL, a new BDS has been created in this function and
1599            needs to be freed now. Otherwise, it does not need to be closed,
1600            since it has not really been opened yet. */
1601         bdrv_unref(bs);
1602     }
1603     if (local_err) {
1604         error_propagate(errp, local_err);
1605     }
1606     return ret;
1607 
1608 close_and_fail:
1609     /* See fail path, but now the BDS has to be always closed */
1610     if (*pbs) {
1611         bdrv_close(bs);
1612     } else {
1613         bdrv_unref(bs);
1614     }
1615     QDECREF(options);
1616     if (local_err) {
1617         error_propagate(errp, local_err);
1618     }
1619     return ret;
1620 }
1621 
1622 typedef struct BlockReopenQueueEntry {
1623      bool prepared;
1624      BDRVReopenState state;
1625      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1626 } BlockReopenQueueEntry;
1627 
1628 /*
1629  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1630  * reopen of multiple devices.
1631  *
1632  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1633  * already performed, or alternatively may be NULL a new BlockReopenQueue will
1634  * be created and initialized. This newly created BlockReopenQueue should be
1635  * passed back in for subsequent calls that are intended to be of the same
1636  * atomic 'set'.
1637  *
1638  * bs is the BlockDriverState to add to the reopen queue.
1639  *
1640  * flags contains the open flags for the associated bs
1641  *
1642  * returns a pointer to bs_queue, which is either the newly allocated
1643  * bs_queue, or the existing bs_queue being used.
1644  *
1645  */
1646 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1647                                     BlockDriverState *bs, int flags)
1648 {
1649     assert(bs != NULL);
1650 
1651     BlockReopenQueueEntry *bs_entry;
1652     if (bs_queue == NULL) {
1653         bs_queue = g_new0(BlockReopenQueue, 1);
1654         QSIMPLEQ_INIT(bs_queue);
1655     }
1656 
1657     /* bdrv_open() masks this flag out */
1658     flags &= ~BDRV_O_PROTOCOL;
1659 
1660     if (bs->file) {
1661         bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
1662     }
1663 
1664     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1665     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1666 
1667     bs_entry->state.bs = bs;
1668     bs_entry->state.flags = flags;
1669 
1670     return bs_queue;
1671 }
1672 
1673 /*
1674  * Reopen multiple BlockDriverStates atomically & transactionally.
1675  *
1676  * The queue passed in (bs_queue) must have been built up previous
1677  * via bdrv_reopen_queue().
1678  *
1679  * Reopens all BDS specified in the queue, with the appropriate
1680  * flags.  All devices are prepared for reopen, and failure of any
1681  * device will cause all device changes to be abandonded, and intermediate
1682  * data cleaned up.
1683  *
1684  * If all devices prepare successfully, then the changes are committed
1685  * to all devices.
1686  *
1687  */
1688 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1689 {
1690     int ret = -1;
1691     BlockReopenQueueEntry *bs_entry, *next;
1692     Error *local_err = NULL;
1693 
1694     assert(bs_queue != NULL);
1695 
1696     bdrv_drain_all();
1697 
1698     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1699         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1700             error_propagate(errp, local_err);
1701             goto cleanup;
1702         }
1703         bs_entry->prepared = true;
1704     }
1705 
1706     /* If we reach this point, we have success and just need to apply the
1707      * changes
1708      */
1709     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1710         bdrv_reopen_commit(&bs_entry->state);
1711     }
1712 
1713     ret = 0;
1714 
1715 cleanup:
1716     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1717         if (ret && bs_entry->prepared) {
1718             bdrv_reopen_abort(&bs_entry->state);
1719         }
1720         g_free(bs_entry);
1721     }
1722     g_free(bs_queue);
1723     return ret;
1724 }
1725 
1726 
1727 /* Reopen a single BlockDriverState with the specified flags. */
1728 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1729 {
1730     int ret = -1;
1731     Error *local_err = NULL;
1732     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1733 
1734     ret = bdrv_reopen_multiple(queue, &local_err);
1735     if (local_err != NULL) {
1736         error_propagate(errp, local_err);
1737     }
1738     return ret;
1739 }
1740 
1741 
1742 /*
1743  * Prepares a BlockDriverState for reopen. All changes are staged in the
1744  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1745  * the block driver layer .bdrv_reopen_prepare()
1746  *
1747  * bs is the BlockDriverState to reopen
1748  * flags are the new open flags
1749  * queue is the reopen queue
1750  *
1751  * Returns 0 on success, non-zero on error.  On error errp will be set
1752  * as well.
1753  *
1754  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1755  * It is the responsibility of the caller to then call the abort() or
1756  * commit() for any other BDS that have been left in a prepare() state
1757  *
1758  */
1759 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1760                         Error **errp)
1761 {
1762     int ret = -1;
1763     Error *local_err = NULL;
1764     BlockDriver *drv;
1765 
1766     assert(reopen_state != NULL);
1767     assert(reopen_state->bs->drv != NULL);
1768     drv = reopen_state->bs->drv;
1769 
1770     /* if we are to stay read-only, do not allow permission change
1771      * to r/w */
1772     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1773         reopen_state->flags & BDRV_O_RDWR) {
1774         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1775                   bdrv_get_device_name(reopen_state->bs));
1776         goto error;
1777     }
1778 
1779 
1780     ret = bdrv_flush(reopen_state->bs);
1781     if (ret) {
1782         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1783                   strerror(-ret));
1784         goto error;
1785     }
1786 
1787     if (drv->bdrv_reopen_prepare) {
1788         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1789         if (ret) {
1790             if (local_err != NULL) {
1791                 error_propagate(errp, local_err);
1792             } else {
1793                 error_setg(errp, "failed while preparing to reopen image '%s'",
1794                            reopen_state->bs->filename);
1795             }
1796             goto error;
1797         }
1798     } else {
1799         /* It is currently mandatory to have a bdrv_reopen_prepare()
1800          * handler for each supported drv. */
1801         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1802                   drv->format_name, bdrv_get_device_name(reopen_state->bs),
1803                  "reopening of file");
1804         ret = -1;
1805         goto error;
1806     }
1807 
1808     ret = 0;
1809 
1810 error:
1811     return ret;
1812 }
1813 
1814 /*
1815  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1816  * makes them final by swapping the staging BlockDriverState contents into
1817  * the active BlockDriverState contents.
1818  */
1819 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1820 {
1821     BlockDriver *drv;
1822 
1823     assert(reopen_state != NULL);
1824     drv = reopen_state->bs->drv;
1825     assert(drv != NULL);
1826 
1827     /* If there are any driver level actions to take */
1828     if (drv->bdrv_reopen_commit) {
1829         drv->bdrv_reopen_commit(reopen_state);
1830     }
1831 
1832     /* set BDS specific flags now */
1833     reopen_state->bs->open_flags         = reopen_state->flags;
1834     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1835                                               BDRV_O_CACHE_WB);
1836     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1837 
1838     bdrv_refresh_limits(reopen_state->bs, NULL);
1839 }
1840 
1841 /*
1842  * Abort the reopen, and delete and free the staged changes in
1843  * reopen_state
1844  */
1845 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1846 {
1847     BlockDriver *drv;
1848 
1849     assert(reopen_state != NULL);
1850     drv = reopen_state->bs->drv;
1851     assert(drv != NULL);
1852 
1853     if (drv->bdrv_reopen_abort) {
1854         drv->bdrv_reopen_abort(reopen_state);
1855     }
1856 }
1857 
1858 
1859 void bdrv_close(BlockDriverState *bs)
1860 {
1861     BdrvAioNotifier *ban, *ban_next;
1862 
1863     if (bs->job) {
1864         block_job_cancel_sync(bs->job);
1865     }
1866     bdrv_drain_all(); /* complete I/O */
1867     bdrv_flush(bs);
1868     bdrv_drain_all(); /* in case flush left pending I/O */
1869     notifier_list_notify(&bs->close_notifiers, bs);
1870 
1871     if (bs->drv) {
1872         if (bs->backing_hd) {
1873             BlockDriverState *backing_hd = bs->backing_hd;
1874             bdrv_set_backing_hd(bs, NULL);
1875             bdrv_unref(backing_hd);
1876         }
1877         bs->drv->bdrv_close(bs);
1878         g_free(bs->opaque);
1879         bs->opaque = NULL;
1880         bs->drv = NULL;
1881         bs->copy_on_read = 0;
1882         bs->backing_file[0] = '\0';
1883         bs->backing_format[0] = '\0';
1884         bs->total_sectors = 0;
1885         bs->encrypted = 0;
1886         bs->valid_key = 0;
1887         bs->sg = 0;
1888         bs->zero_beyond_eof = false;
1889         QDECREF(bs->options);
1890         bs->options = NULL;
1891         QDECREF(bs->full_open_options);
1892         bs->full_open_options = NULL;
1893 
1894         if (bs->file != NULL) {
1895             bdrv_unref(bs->file);
1896             bs->file = NULL;
1897         }
1898     }
1899 
1900     if (bs->blk) {
1901         blk_dev_change_media_cb(bs->blk, false);
1902     }
1903 
1904     /*throttling disk I/O limits*/
1905     if (bs->io_limits_enabled) {
1906         bdrv_io_limits_disable(bs);
1907     }
1908 
1909     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1910         g_free(ban);
1911     }
1912     QLIST_INIT(&bs->aio_notifiers);
1913 }
1914 
1915 void bdrv_close_all(void)
1916 {
1917     BlockDriverState *bs;
1918 
1919     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1920         AioContext *aio_context = bdrv_get_aio_context(bs);
1921 
1922         aio_context_acquire(aio_context);
1923         bdrv_close(bs);
1924         aio_context_release(aio_context);
1925     }
1926 }
1927 
1928 /* Check if any requests are in-flight (including throttled requests) */
1929 static bool bdrv_requests_pending(BlockDriverState *bs)
1930 {
1931     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1932         return true;
1933     }
1934     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1935         return true;
1936     }
1937     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1938         return true;
1939     }
1940     if (bs->file && bdrv_requests_pending(bs->file)) {
1941         return true;
1942     }
1943     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1944         return true;
1945     }
1946     return false;
1947 }
1948 
1949 static bool bdrv_drain_one(BlockDriverState *bs)
1950 {
1951     bool bs_busy;
1952 
1953     bdrv_flush_io_queue(bs);
1954     bdrv_start_throttled_reqs(bs);
1955     bs_busy = bdrv_requests_pending(bs);
1956     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1957     return bs_busy;
1958 }
1959 
1960 /*
1961  * Wait for pending requests to complete on a single BlockDriverState subtree
1962  *
1963  * See the warning in bdrv_drain_all().  This function can only be called if
1964  * you are sure nothing can generate I/O because you have op blockers
1965  * installed.
1966  *
1967  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1968  * AioContext.
1969  */
1970 void bdrv_drain(BlockDriverState *bs)
1971 {
1972     while (bdrv_drain_one(bs)) {
1973         /* Keep iterating */
1974     }
1975 }
1976 
1977 /*
1978  * Wait for pending requests to complete across all BlockDriverStates
1979  *
1980  * This function does not flush data to disk, use bdrv_flush_all() for that
1981  * after calling this function.
1982  *
1983  * Note that completion of an asynchronous I/O operation can trigger any
1984  * number of other I/O operations on other devices---for example a coroutine
1985  * can be arbitrarily complex and a constant flow of I/O can come until the
1986  * coroutine is complete.  Because of this, it is not possible to have a
1987  * function to drain a single device's I/O queue.
1988  */
1989 void bdrv_drain_all(void)
1990 {
1991     /* Always run first iteration so any pending completion BHs run */
1992     bool busy = true;
1993     BlockDriverState *bs;
1994 
1995     while (busy) {
1996         busy = false;
1997 
1998         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1999             AioContext *aio_context = bdrv_get_aio_context(bs);
2000 
2001             aio_context_acquire(aio_context);
2002             busy |= bdrv_drain_one(bs);
2003             aio_context_release(aio_context);
2004         }
2005     }
2006 }
2007 
2008 /* make a BlockDriverState anonymous by removing from bdrv_state and
2009  * graph_bdrv_state list.
2010    Also, NULL terminate the device_name to prevent double remove */
2011 void bdrv_make_anon(BlockDriverState *bs)
2012 {
2013     /*
2014      * Take care to remove bs from bdrv_states only when it's actually
2015      * in it.  Note that bs->device_list.tqe_prev is initially null,
2016      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
2017      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
2018      * resetting it to null on remove.
2019      */
2020     if (bs->device_list.tqe_prev) {
2021         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
2022         bs->device_list.tqe_prev = NULL;
2023     }
2024     if (bs->node_name[0] != '\0') {
2025         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2026     }
2027     bs->node_name[0] = '\0';
2028 }
2029 
2030 static void bdrv_rebind(BlockDriverState *bs)
2031 {
2032     if (bs->drv && bs->drv->bdrv_rebind) {
2033         bs->drv->bdrv_rebind(bs);
2034     }
2035 }
2036 
2037 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2038                                      BlockDriverState *bs_src)
2039 {
2040     /* move some fields that need to stay attached to the device */
2041 
2042     /* dev info */
2043     bs_dest->guest_block_size   = bs_src->guest_block_size;
2044     bs_dest->copy_on_read       = bs_src->copy_on_read;
2045 
2046     bs_dest->enable_write_cache = bs_src->enable_write_cache;
2047 
2048     /* i/o throttled req */
2049     memcpy(&bs_dest->throttle_state,
2050            &bs_src->throttle_state,
2051            sizeof(ThrottleState));
2052     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
2053     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
2054     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
2055 
2056     /* r/w error */
2057     bs_dest->on_read_error      = bs_src->on_read_error;
2058     bs_dest->on_write_error     = bs_src->on_write_error;
2059 
2060     /* i/o status */
2061     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
2062     bs_dest->iostatus           = bs_src->iostatus;
2063 
2064     /* dirty bitmap */
2065     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2066 
2067     /* reference count */
2068     bs_dest->refcnt             = bs_src->refcnt;
2069 
2070     /* job */
2071     bs_dest->job                = bs_src->job;
2072 
2073     /* keep the same entry in bdrv_states */
2074     bs_dest->device_list = bs_src->device_list;
2075     bs_dest->blk = bs_src->blk;
2076 
2077     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2078            sizeof(bs_dest->op_blockers));
2079 }
2080 
/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new must not be attached to a BlockBackend, and must not have dirty
 * bitmaps, a block job, I/O limits or a throttling timer (all asserted
 * below).
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be unattached and shouldn't have anything fancy enabled */
    assert(!bs_new->blk);
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Swap the two structures wholesale */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back.
     * The three calls exchange the feature fields of bs_old and bs_new
     * through tmp, undoing the structure swap above for exactly those
     * fields. */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new must remain unattached */
    assert(!bs_new->blk);

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->job == NULL);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    /* Give both drivers the chance to react via their optional
     * bdrv_rebind() callback */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
2143 
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new must not be attached to a BlockBackend.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* After the swap, bs_top carries the contents that bs_new had and
     * bs_new carries the contents of the previous top image, so bs_new
     * becomes the backing file of bs_top. */
    bdrv_set_backing_hd(bs_top, bs_new);
}
2163 
/*
 * Destroy @bs: close it, remove it from the global lists and free the
 * structure.
 *
 * The caller must ensure that @bs has no block job, no op blockers, a zero
 * reference count and no dirty bitmaps; each of these is asserted.
 */
static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->job);
    assert(bdrv_op_blocker_is_empty(bs));
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}
2178 
2179 /*
2180  * Run consistency checks on an image
2181  *
2182  * Returns 0 if the check could be completed (it doesn't mean that the image is
2183  * free of errors) or -errno when an internal error occurred. The results of the
2184  * check are stored in res.
2185  */
2186 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2187 {
2188     if (bs->drv == NULL) {
2189         return -ENOMEDIUM;
2190     }
2191     if (bs->drv->bdrv_check == NULL) {
2192         return -ENOTSUP;
2193     }
2194 
2195     memset(res, 0, sizeof(*res));
2196     return bs->drv->bdrv_check(bs, res, fix);
2197 }
2198 
/* Size, in sectors, of the bounce buffer used by bdrv_commit() */
#define COMMIT_BUF_SECTORS 2048

/*
 * commit COW file into the raw image
 *
 * Copies every allocated range of @bs into bs->backing_hd, growing the
 * backing file first if @bs is larger.  A read-only backing file is reopened
 * read-write for the duration of the copy and reopened with its original
 * flags afterwards.  If the driver provides bdrv_make_empty(), @bs is emptied
 * once the copy has finished.
 *
 * Returns 0 on success or a negative errno:
 *   -ENOMEDIUM  @bs has no driver
 *   -ENOTSUP    @bs has no backing file
 *   -EBUSY      a commit op blocker is set on @bs or on its backing file
 *   -EACCES     the backing file could not be reopened read-write
 *   -ENOMEM     the bounce buffer could not be allocated
 * plus any error returned by the length, truncate, allocation-status, read,
 * write or make_empty calls.
 */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
        return -EBUSY;
    }

    /* Remember the backing file's original mode so it can be restored */
    ro = bs->backing_hd->read_only;
    open_flags =  bs->backing_hd->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;

    /* qemu_try_blockalign() for bs will choose an alignment that works for
     * bs->backing_hd as well, so no need to compare the alignment manually. */
    buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    /* Walk the image in chunks of at most COMMIT_BUF_SECTORS; n is set by
     * bdrv_is_allocated() to the length of the run that shares the same
     * allocation state, and the loop advances by that amount. */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            /* Allocated in the top image: copy it down */
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        /* NOTE(review): the result of this flush is not checked */
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     * NOTE(review): the result of this flush is not checked either.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    /* buf is NULL on the early error paths, where it was never allocated */
    qemu_vfree(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2308 
2309 int bdrv_commit_all(void)
2310 {
2311     BlockDriverState *bs;
2312 
2313     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2314         AioContext *aio_context = bdrv_get_aio_context(bs);
2315 
2316         aio_context_acquire(aio_context);
2317         if (bs->drv && bs->backing_hd) {
2318             int ret = bdrv_commit(bs);
2319             if (ret < 0) {
2320                 aio_context_release(aio_context);
2321                 return ret;
2322             }
2323         }
2324         aio_context_release(aio_context);
2325     }
2326     return 0;
2327 }
2328 
2329 /**
2330  * Remove an active request from the tracked requests list
2331  *
2332  * This function should be called when a tracked request is completing.
2333  */
2334 static void tracked_request_end(BdrvTrackedRequest *req)
2335 {
2336     if (req->serialising) {
2337         req->bs->serialising_in_flight--;
2338     }
2339 
2340     QLIST_REMOVE(req, list);
2341     qemu_co_queue_restart_all(&req->wait_queue);
2342 }
2343 
2344 /**
2345  * Add an active request to the tracked requests list
2346  */
2347 static void tracked_request_begin(BdrvTrackedRequest *req,
2348                                   BlockDriverState *bs,
2349                                   int64_t offset,
2350                                   unsigned int bytes, bool is_write)
2351 {
2352     *req = (BdrvTrackedRequest){
2353         .bs = bs,
2354         .offset         = offset,
2355         .bytes          = bytes,
2356         .is_write       = is_write,
2357         .co             = qemu_coroutine_self(),
2358         .serialising    = false,
2359         .overlap_offset = offset,
2360         .overlap_bytes  = bytes,
2361     };
2362 
2363     qemu_co_queue_init(&req->wait_queue);
2364 
2365     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2366 }
2367 
2368 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2369 {
2370     int64_t overlap_offset = req->offset & ~(align - 1);
2371     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2372                                - overlap_offset;
2373 
2374     if (!req->serialising) {
2375         req->bs->serialising_in_flight++;
2376         req->serialising = true;
2377     }
2378 
2379     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2380     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2381 }
2382 
2383 /**
2384  * Round a region to cluster boundaries
2385  */
2386 void bdrv_round_to_clusters(BlockDriverState *bs,
2387                             int64_t sector_num, int nb_sectors,
2388                             int64_t *cluster_sector_num,
2389                             int *cluster_nb_sectors)
2390 {
2391     BlockDriverInfo bdi;
2392 
2393     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2394         *cluster_sector_num = sector_num;
2395         *cluster_nb_sectors = nb_sectors;
2396     } else {
2397         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2398         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2399         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2400                                             nb_sectors, c);
2401     }
2402 }
2403 
2404 static int bdrv_get_cluster_size(BlockDriverState *bs)
2405 {
2406     BlockDriverInfo bdi;
2407     int ret;
2408 
2409     ret = bdrv_get_info(bs, &bdi);
2410     if (ret < 0 || bdi.cluster_size == 0) {
2411         return bs->request_alignment;
2412     } else {
2413         return bdi.cluster_size;
2414     }
2415 }
2416 
2417 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2418                                      int64_t offset, unsigned int bytes)
2419 {
2420     /*        aaaa   bbbb */
2421     if (offset >= req->overlap_offset + req->overlap_bytes) {
2422         return false;
2423     }
2424     /* bbbb   aaaa        */
2425     if (req->overlap_offset >= offset + bytes) {
2426         return false;
2427     }
2428     return true;
2429 }
2430 
/*
 * Wait for conflicting requests on self->bs to finish.
 *
 * Another tracked request conflicts with @self when at least one of the two
 * is serialising and its overlap range intersects that of @self.  For each
 * such request that is not itself waiting for something, the calling
 * coroutine sleeps on that request's wait queue and rescans the list from
 * the start once woken.
 *
 * Returns true if the coroutine had to wait at least once, false otherwise
 * (including the fast path where no serialising request is in flight).
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: nothing on this node is serialising */
    if (!bs->serialising_in_flight) {
        return false;
    }

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Skip ourselves and pairs where neither side is serialising */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    /* The list may have changed while we slept: rescan it */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2474 
2475 /*
2476  * Return values:
2477  * 0        - success
2478  * -EINVAL  - backing format specified, but no file
2479  * -ENOSPC  - can't update the backing file because no space is left in the
2480  *            image file header
2481  * -ENOTSUP - format driver doesn't support changing the backing file
2482  */
2483 int bdrv_change_backing_file(BlockDriverState *bs,
2484     const char *backing_file, const char *backing_fmt)
2485 {
2486     BlockDriver *drv = bs->drv;
2487     int ret;
2488 
2489     /* Backing file format doesn't make sense without a backing file */
2490     if (backing_fmt && !backing_file) {
2491         return -EINVAL;
2492     }
2493 
2494     if (drv->bdrv_change_backing_file != NULL) {
2495         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2496     } else {
2497         ret = -ENOTSUP;
2498     }
2499 
2500     if (ret == 0) {
2501         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2502         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2503     }
2504     return ret;
2505 }
2506 
2507 /*
2508  * Finds the image layer in the chain that has 'bs' as its backing file.
2509  *
2510  * active is the current topmost image.
2511  *
2512  * Returns NULL if bs is not found in active's image chain,
2513  * or if active == bs.
2514  *
2515  * Returns the bottommost base image if bs == NULL.
2516  */
2517 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2518                                     BlockDriverState *bs)
2519 {
2520     while (active && bs != active->backing_hd) {
2521         active = active->backing_hd;
2522     }
2523 
2524     return active;
2525 }
2526 
/*
 * Given a BDS, searches for the base layer: the image at the bottom of the
 * backing chain of @bs, found by looking for the overlay of "no backing
 * file" (NULL).  Returns NULL if @bs is NULL.
 */
BlockDriverState *bdrv_find_base(BlockDriverState *bs)
{
    return bdrv_find_overlay(bs, NULL);
}
2532 
/*
 * Queue element used by bdrv_drop_intermediate() to remember the images
 * that are to be dropped from a backing chain.
 */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;                         /* image to drop */
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;  /* queue linkage */
} BlkIntermediateStates;
2537 
2538 
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in that overlay can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.
 *
 * Returns 0 on success, -EIO if 'top' or 'base' has no driver, if no overlay
 * of 'top' is found below 'active', or if 'base' is not reached from 'top';
 * otherwise the error returned by bdrv_change_backing_file().
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base, const char *backing_file_str)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    /* The image directly above 'top'; it will end up directly above 'base' */
    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_new0(BlkIntermediateStates, 1);
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
    ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    bdrv_set_backing_hd(new_top_bs, base_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        bdrv_set_backing_hd(intermediate_state->bs, NULL);
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* Free the queue elements on every path; the images they point to are
     * only unreferenced on the success path above. */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2642 
2643 
2644 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2645                                    size_t size)
2646 {
2647     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
2648         return -EIO;
2649     }
2650 
2651     if (!bdrv_is_inserted(bs)) {
2652         return -ENOMEDIUM;
2653     }
2654 
2655     if (offset < 0) {
2656         return -EIO;
2657     }
2658 
2659     return 0;
2660 }
2661 
2662 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2663                               int nb_sectors)
2664 {
2665     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2666         return -EIO;
2667     }
2668 
2669     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2670                                    nb_sectors * BDRV_SECTOR_SIZE);
2671 }
2672 
/*
 * Argument/result bundle handed to bdrv_rw_co_entry() when a synchronous
 * read or write is executed inside a coroutine.
 */
typedef struct RwCo {
    BlockDriverState *bs;   /* node the request is issued on */
    int64_t offset;         /* byte offset of the request */
    QEMUIOVector *qiov;     /* data buffers; qiov->size is the request length */
    bool is_write;          /* true for a write, false for a read */
    int ret;                /* result; NOT_DONE until the request completes */
    BdrvRequestFlags flags; /* flags passed through to the request */
} RwCo;
2681 
2682 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2683 {
2684     RwCo *rwco = opaque;
2685 
2686     if (!rwco->is_write) {
2687         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2688                                       rwco->qiov->size, rwco->qiov,
2689                                       rwco->flags);
2690     } else {
2691         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2692                                        rwco->qiov->size, rwco->qiov,
2693                                        rwco->flags);
2694     }
2695 }
2696 
2697 /*
2698  * Process a vectored synchronous request using coroutines
2699  */
2700 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2701                         QEMUIOVector *qiov, bool is_write,
2702                         BdrvRequestFlags flags)
2703 {
2704     Coroutine *co;
2705     RwCo rwco = {
2706         .bs = bs,
2707         .offset = offset,
2708         .qiov = qiov,
2709         .is_write = is_write,
2710         .ret = NOT_DONE,
2711         .flags = flags,
2712     };
2713 
2714     /**
2715      * In sync call context, when the vcpu is blocked, this throttling timer
2716      * will not fire; so the I/O throttling function has to be disabled here
2717      * if it has been enabled.
2718      */
2719     if (bs->io_limits_enabled) {
2720         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2721                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2722         bdrv_io_limits_disable(bs);
2723     }
2724 
2725     if (qemu_in_coroutine()) {
2726         /* Fast-path if already in coroutine context */
2727         bdrv_rw_co_entry(&rwco);
2728     } else {
2729         AioContext *aio_context = bdrv_get_aio_context(bs);
2730 
2731         co = qemu_coroutine_create(bdrv_rw_co_entry);
2732         qemu_coroutine_enter(co, &rwco);
2733         while (rwco.ret == NOT_DONE) {
2734             aio_poll(aio_context, true);
2735         }
2736     }
2737     return rwco.ret;
2738 }
2739 
2740 /*
2741  * Process a synchronous request using coroutines
2742  */
2743 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2744                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2745 {
2746     QEMUIOVector qiov;
2747     struct iovec iov = {
2748         .iov_base = (void *)buf,
2749         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2750     };
2751 
2752     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
2753         return -EINVAL;
2754     }
2755 
2756     qemu_iovec_init_external(&qiov, &iov, 1);
2757     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2758                         &qiov, is_write, flags);
2759 }
2760 
2761 /* return < 0 if error. See bdrv_write() for the return codes */
2762 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2763               uint8_t *buf, int nb_sectors)
2764 {
2765     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2766 }
2767 
2768 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2769 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2770                           uint8_t *buf, int nb_sectors)
2771 {
2772     bool enabled;
2773     int ret;
2774 
2775     enabled = bs->io_limits_enabled;
2776     bs->io_limits_enabled = false;
2777     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2778     bs->io_limits_enabled = enabled;
2779     return ret;
2780 }
2781 
2782 /* Return < 0 if error. Important errors are:
2783   -EIO         generic I/O error (may happen for all errors)
2784   -ENOMEDIUM   No media inserted.
2785   -EINVAL      Invalid sector number or nb_sectors
2786   -EACCES      Trying to write a read-only device
2787 */
2788 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2789                const uint8_t *buf, int nb_sectors)
2790 {
2791     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2792 }
2793 
2794 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2795                       int nb_sectors, BdrvRequestFlags flags)
2796 {
2797     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2798                       BDRV_REQ_ZERO_WRITE | flags);
2799 }
2800 
2801 /*
2802  * Completely zero out a block device with the help of bdrv_write_zeroes.
2803  * The operation is sped up by checking the block status and only writing
2804  * zeroes to the device if they currently do not return zeroes. Optional
2805  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2806  *
2807  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2808  */
2809 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2810 {
2811     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2812     int n;
2813 
2814     target_sectors = bdrv_nb_sectors(bs);
2815     if (target_sectors < 0) {
2816         return target_sectors;
2817     }
2818 
2819     for (;;) {
2820         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
2821         if (nb_sectors <= 0) {
2822             return 0;
2823         }
2824         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2825         if (ret < 0) {
2826             error_report("error getting block status at sector %" PRId64 ": %s",
2827                          sector_num, strerror(-ret));
2828             return ret;
2829         }
2830         if (ret & BDRV_BLOCK_ZERO) {
2831             sector_num += n;
2832             continue;
2833         }
2834         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2835         if (ret < 0) {
2836             error_report("error writing zeroes at sector %" PRId64 ": %s",
2837                          sector_num, strerror(-ret));
2838             return ret;
2839         }
2840         sector_num += n;
2841     }
2842 }
2843 
2844 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2845 {
2846     QEMUIOVector qiov;
2847     struct iovec iov = {
2848         .iov_base = (void *)buf,
2849         .iov_len = bytes,
2850     };
2851     int ret;
2852 
2853     if (bytes < 0) {
2854         return -EINVAL;
2855     }
2856 
2857     qemu_iovec_init_external(&qiov, &iov, 1);
2858     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2859     if (ret < 0) {
2860         return ret;
2861     }
2862 
2863     return bytes;
2864 }
2865 
2866 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2867 {
2868     int ret;
2869 
2870     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2871     if (ret < 0) {
2872         return ret;
2873     }
2874 
2875     return qiov->size;
2876 }
2877 
2878 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2879                 const void *buf, int bytes)
2880 {
2881     QEMUIOVector qiov;
2882     struct iovec iov = {
2883         .iov_base   = (void *) buf,
2884         .iov_len    = bytes,
2885     };
2886 
2887     if (bytes < 0) {
2888         return -EINVAL;
2889     }
2890 
2891     qemu_iovec_init_external(&qiov, &iov, 1);
2892     return bdrv_pwritev(bs, offset, &qiov);
2893 }
2894 
2895 /*
2896  * Writes to the file and ensures that no writes are reordered across this
2897  * request (acts as a barrier)
2898  *
2899  * Returns 0 on success, -errno in error cases.
2900  */
2901 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2902     const void *buf, int count)
2903 {
2904     int ret;
2905 
2906     ret = bdrv_pwrite(bs, offset, buf, count);
2907     if (ret < 0) {
2908         return ret;
2909     }
2910 
2911     /* No flush needed for cache modes that already do it */
2912     if (bs->enable_write_cache) {
2913         bdrv_flush(bs);
2914     }
2915 
2916     return 0;
2917 }
2918 
/*
 * Copy-on-read helper.
 *
 * Reads the cluster-rounded range that covers [sector_num, sector_num +
 * nb_sectors) from the driver into a bounce buffer, writes that same range
 * back through the driver (as a write-zeroes request when the data is all
 * zero and the driver implements bdrv_co_write_zeroes), and finally copies
 * the originally requested part of the bounce buffer into @qiov.
 *
 * Returns -ENOMEM if the bounce buffer cannot be allocated, the negative
 * result of the failing driver read/write, or otherwise the result of the
 * write-back step.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    /* The bounce buffer holds the whole cluster-rounded range. */
    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Write the data just read back to bs; all-zero data goes through the
     * write-zeroes path when the driver provides one.
     */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Hand back only the sectors the caller asked for, skipping the part of
     * the cluster that precedes sector_num.
     */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    /* bounce_buffer is NULL here when the allocation above failed. */
    qemu_vfree(bounce_buffer);
    return ret;
}
2989 
2990 /*
2991  * Forwards an already correctly aligned request to the BlockDriver. This
2992  * handles copy on read and zeroing after EOF; any other features must be
2993  * implemented by the caller.
2994  */
2995 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2996     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2997     int64_t align, QEMUIOVector *qiov, int flags)
2998 {
2999     BlockDriver *drv = bs->drv;
3000     int ret;
3001 
3002     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3003     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3004 
3005     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3006     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3007     assert(!qiov || bytes == qiov->size);
3008 
3009     /* Handle Copy on Read and associated serialisation */
3010     if (flags & BDRV_REQ_COPY_ON_READ) {
3011         /* If we touch the same cluster it counts as an overlap.  This
3012          * guarantees that allocating writes will be serialized and not race
3013          * with each other for the same cluster.  For example, in copy-on-read
3014          * it ensures that the CoR read and write operations are atomic and
3015          * guest writes cannot interleave between them. */
3016         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3017     }
3018 
3019     wait_serialising_requests(req);
3020 
3021     if (flags & BDRV_REQ_COPY_ON_READ) {
3022         int pnum;
3023 
3024         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3025         if (ret < 0) {
3026             goto out;
3027         }
3028 
3029         if (!ret || pnum != nb_sectors) {
3030             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3031             goto out;
3032         }
3033     }
3034 
3035     /* Forward the request to the BlockDriver */
3036     if (!bs->zero_beyond_eof) {
3037         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3038     } else {
3039         /* Read zeros after EOF */
3040         int64_t total_sectors, max_nb_sectors;
3041 
3042         total_sectors = bdrv_nb_sectors(bs);
3043         if (total_sectors < 0) {
3044             ret = total_sectors;
3045             goto out;
3046         }
3047 
3048         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3049                                   align >> BDRV_SECTOR_BITS);
3050         if (nb_sectors < max_nb_sectors) {
3051             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3052         } else if (max_nb_sectors > 0) {
3053             QEMUIOVector local_qiov;
3054 
3055             qemu_iovec_init(&local_qiov, qiov->niov);
3056             qemu_iovec_concat(&local_qiov, qiov, 0,
3057                               max_nb_sectors * BDRV_SECTOR_SIZE);
3058 
3059             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
3060                                      &local_qiov);
3061 
3062             qemu_iovec_destroy(&local_qiov);
3063         } else {
3064             ret = 0;
3065         }
3066 
3067         /* Reading beyond end of file is supposed to produce zeroes */
3068         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3069             uint64_t offset = MAX(0, total_sectors - sector_num);
3070             uint64_t bytes = (sector_num + nb_sectors - offset) *
3071                               BDRV_SECTOR_SIZE;
3072             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3073         }
3074     }
3075 
3076 out:
3077     return ret;
3078 }
3079 
/*
 * Handle a read request in coroutine context
 *
 * Reads @bytes bytes at byte @offset of @bs into @qiov.  Requests that are
 * not aligned to MAX(BDRV_SECTOR_SIZE, bs->request_alignment) are widened to
 * that alignment by adding scratch head/tail buffers to a local I/O vector
 * before the request is passed to bdrv_aligned_preadv().
 *
 * Returns -ENOMEDIUM if @bs has no driver, the negative result of
 * bdrv_check_byte_request() if the request is rejected, and otherwise the
 * result of bdrv_aligned_preadv().
 */
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* A device-wide copy-on-read setting applies to every read. */
    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, false);
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        /* Prepend (offset % align) scratch bytes, then the caller's vector;
         * the "+ 2" leaves room for the head and a possible tail element.
         */
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        /* Append scratch bytes up to the next alignment boundary. */
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    /* The tracked request covers the widened (aligned) range. */
    tracked_request_begin(&req, bs, offset, bytes, false);
    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);

    /* head_buf/tail_buf are only ever allocated when use_local_qiov is set. */
    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
3155 
3156 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3157     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3158     BdrvRequestFlags flags)
3159 {
3160     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3161         return -EINVAL;
3162     }
3163 
3164     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3165                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3166 }
3167 
3168 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3169     int nb_sectors, QEMUIOVector *qiov)
3170 {
3171     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3172 
3173     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3174 }
3175 
3176 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3177     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3178 {
3179     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3180 
3181     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3182                             BDRV_REQ_COPY_ON_READ);
3183 }
3184 
/* Upper bound used for the bounce-buffer fallback below; it is combined with
 * bs->bl.max_transfer_length and compared against a sector count.
 */
#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

/*
 * Write zeroes to @nb_sectors sectors of @bs starting at @sector_num.
 *
 * The range is processed in pieces: each piece is shortened so that it ends
 * on a bs->bl.write_zeroes_alignment boundary where possible and is capped
 * at the write-zeroes size limit.  For each piece the driver's
 * bdrv_co_write_zeroes callback is tried first; if the driver has none or it
 * returns -ENOTSUP, a zero-filled bounce buffer is written with
 * bdrv_co_writev instead.
 *
 * Returns 0 when everything was written (or @nb_sectors was not positive),
 * -ENOMEM if the bounce buffer cannot be allocated, or the first non-zero
 * result returned by the driver.
 */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
                                        BDRV_REQUEST_MAX_SECTORS);

    /* Stop at the first piece that yields a non-zero result. */
    while (nb_sectors > 0 && !ret) {
        int num = nb_sectors;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned.
         */
        if (bs->bl.write_zeroes_alignment
            && num > bs->bl.write_zeroes_alignment) {
            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
                /* Make a small request up to the first aligned sector.  */
                num = bs->bl.write_zeroes_alignment;
                num -= sector_num % bs->bl.write_zeroes_alignment;
            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
                /* Shorten the request to the last aligned sector.  num cannot
                 * underflow because num > bs->bl.write_zeroes_alignment.
                 */
                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
            }
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        /* Preset so that a driver without the callback takes the fallback. */
        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_write_zeroes) {
            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
            num = MIN(num, max_xfer_len);
            iov.iov_len = num * BDRV_SECTOR_SIZE;
            /* Allocate and zero the buffer unless one was kept from an
             * earlier iteration.
             */
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);

            /* Keep bounce buffer around if it is big enough for
             * all future requests.
             */
            if (num < max_xfer_len) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        sector_num += num;
        nb_sectors -= num;
    }

fail:
    /* iov.iov_base is NULL here unless a bounce buffer was kept. */
    qemu_vfree(iov.iov_base);
    return ret;
}
3264 
/*
 * Forwards an already correctly aligned write request to the BlockDriver.
 *
 * @offset and @bytes must be multiples of BDRV_SECTOR_SIZE and lie within
 * the overlap range recorded in @req (all asserted); when @qiov is non-NULL,
 * @bytes must equal qiov->size.
 *
 * After waiting for serialising requests, the before-write notifiers are
 * run; a negative notifier result fails the request without touching the
 * driver.  Otherwise the data is written either as zeroes (when
 * BDRV_REQ_ZERO_WRITE is set or zero detection promotes the request) or
 * through the driver's bdrv_co_writev.  A successful write is followed by a
 * flush when the write cache is disabled.
 *
 * Returns 0 on success or the negative result of the failing step.
 */
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(!qiov || bytes == qiov->size);

    /* A request already marked serialising is expected not to wait here. */
    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    /* Zero detection: turn an all-zero data write into a write-zeroes
     * request when enabled and the driver supports it.
     */
    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
    } else {
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }
    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);

    /* With the write cache disabled every successful write is flushed. */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    /* Dirty tracking and accounting are updated regardless of ret. */
    bdrv_set_dirty(bs, sector_num, nb_sectors);

    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);

    /* A successful write past the known end grows the cached size. */
    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
    }

    return ret;
}
3324 
/*
 * Handle a write request in coroutine context
 *
 * Writes @bytes bytes from @qiov at byte @offset of @bs.  A request whose
 * start or end is not aligned to MAX(BDRV_SECTOR_SIZE,
 * bs->request_alignment) is widened with a read-modify-write cycle: the
 * aligned block containing the unaligned edge is read first and the part of
 * it outside the request is added to a local I/O vector around the caller's
 * data.
 *
 * Returns -ENOMEDIUM if @bs has no driver, -EACCES if it is read-only, the
 * negative result of bdrv_check_byte_request() or of a failed RMW read, and
 * otherwise the result of bdrv_aligned_pwritev().
 */
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BdrvTrackedRequest req;
    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* throttling disk I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, bytes, true);
    }

    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, true);

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        /* Read the whole aligned block that contains the request start. */
        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base   = head_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* Prepend the bytes before the request start, then the caller's
         * data; "+ 2" leaves room for the head and a possible tail element.
         */
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        /* Waiting here is only expected when no head RMW has been done. */
        assert(!waited || !use_local_qiov);

        /* Read the whole aligned block that contains the request end. */
        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base   = tail_buf,
            .iov_len    = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                  align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append the part of the tail block that follows the request end. */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:
    tracked_request_end(&req);

    /* local_qiov is only initialised once use_local_qiov has been set. */
    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);

    return ret;
}
3448 
3449 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3450     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3451     BdrvRequestFlags flags)
3452 {
3453     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
3454         return -EINVAL;
3455     }
3456 
3457     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3458                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3459 }
3460 
3461 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3462     int nb_sectors, QEMUIOVector *qiov)
3463 {
3464     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3465 
3466     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3467 }
3468 
3469 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3470                                       int64_t sector_num, int nb_sectors,
3471                                       BdrvRequestFlags flags)
3472 {
3473     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3474 
3475     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3476         flags &= ~BDRV_REQ_MAY_UNMAP;
3477     }
3478 
3479     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3480                              BDRV_REQ_ZERO_WRITE | flags);
3481 }
3482 
3483 /**
3484  * Truncate file to 'offset' bytes (needed only for file protocols)
3485  */
3486 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3487 {
3488     BlockDriver *drv = bs->drv;
3489     int ret;
3490     if (!drv)
3491         return -ENOMEDIUM;
3492     if (!drv->bdrv_truncate)
3493         return -ENOTSUP;
3494     if (bs->read_only)
3495         return -EACCES;
3496 
3497     ret = drv->bdrv_truncate(bs, offset);
3498     if (ret == 0) {
3499         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3500         if (bs->blk) {
3501             blk_dev_resize_cb(bs->blk);
3502         }
3503     }
3504     return ret;
3505 }
3506 
3507 /**
3508  * Length of a allocated file in bytes. Sparse files are counted by actual
3509  * allocated space. Return < 0 if error or unknown.
3510  */
3511 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3512 {
3513     BlockDriver *drv = bs->drv;
3514     if (!drv) {
3515         return -ENOMEDIUM;
3516     }
3517     if (drv->bdrv_get_allocated_file_size) {
3518         return drv->bdrv_get_allocated_file_size(bs);
3519     }
3520     if (bs->file) {
3521         return bdrv_get_allocated_file_size(bs->file);
3522     }
3523     return -ENOTSUP;
3524 }
3525 
3526 /**
3527  * Return number of sectors on success, -errno on error.
3528  */
3529 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3530 {
3531     BlockDriver *drv = bs->drv;
3532 
3533     if (!drv)
3534         return -ENOMEDIUM;
3535 
3536     if (drv->has_variable_length) {
3537         int ret = refresh_total_sectors(bs, bs->total_sectors);
3538         if (ret < 0) {
3539             return ret;
3540         }
3541     }
3542     return bs->total_sectors;
3543 }
3544 
3545 /**
3546  * Return length in bytes on success, -errno on error.
3547  * The length is always a multiple of BDRV_SECTOR_SIZE.
3548  */
3549 int64_t bdrv_getlength(BlockDriverState *bs)
3550 {
3551     int64_t ret = bdrv_nb_sectors(bs);
3552 
3553     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3554 }
3555 
3556 /* return 0 as number of sectors if no device present or error */
3557 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3558 {
3559     int64_t nb_sectors = bdrv_nb_sectors(bs);
3560 
3561     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3562 }
3563 
3564 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3565                        BlockdevOnError on_write_error)
3566 {
3567     bs->on_read_error = on_read_error;
3568     bs->on_write_error = on_write_error;
3569 }
3570 
3571 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3572 {
3573     return is_read ? bs->on_read_error : bs->on_write_error;
3574 }
3575 
3576 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3577 {
3578     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3579 
3580     switch (on_err) {
3581     case BLOCKDEV_ON_ERROR_ENOSPC:
3582         return (error == ENOSPC) ?
3583                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3584     case BLOCKDEV_ON_ERROR_STOP:
3585         return BLOCK_ERROR_ACTION_STOP;
3586     case BLOCKDEV_ON_ERROR_REPORT:
3587         return BLOCK_ERROR_ACTION_REPORT;
3588     case BLOCKDEV_ON_ERROR_IGNORE:
3589         return BLOCK_ERROR_ACTION_IGNORE;
3590     default:
3591         abort();
3592     }
3593 }
3594 
3595 static void send_qmp_error_event(BlockDriverState *bs,
3596                                  BlockErrorAction action,
3597                                  bool is_read, int error)
3598 {
3599     IoOperationType optype;
3600 
3601     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3602     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3603                                    bdrv_iostatus_is_enabled(bs),
3604                                    error == ENOSPC, strerror(error),
3605                                    &error_abort);
3606 }
3607 
3608 /* This is done by device models because, while the block layer knows
3609  * about the error, it does not know whether an operation comes from
3610  * the device or the block layer (from a job, for example).
3611  */
3612 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3613                        bool is_read, int error)
3614 {
3615     assert(error >= 0);
3616 
3617     if (action == BLOCK_ERROR_ACTION_STOP) {
3618         /* First set the iostatus, so that "info block" returns an iostatus
3619          * that matches the events raised so far (an additional error iostatus
3620          * is fine, but not a lost one).
3621          */
3622         bdrv_iostatus_set_err(bs, error);
3623 
3624         /* Then raise the request to stop the VM and the event.
3625          * qemu_system_vmstop_request_prepare has two effects.  First,
3626          * it ensures that the STOP event always comes after the
3627          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3628          * can observe the STOP event and do a "cont" before the STOP
3629          * event is issued, the VM will not stop.  In this case, vm_start()
3630          * also ensures that the STOP/RESUME pair of events is emitted.
3631          */
3632         qemu_system_vmstop_request_prepare();
3633         send_qmp_error_event(bs, action, is_read, error);
3634         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3635     } else {
3636         send_qmp_error_event(bs, action, is_read, error);
3637     }
3638 }
3639 
3640 int bdrv_is_read_only(BlockDriverState *bs)
3641 {
3642     return bs->read_only;
3643 }
3644 
3645 int bdrv_is_sg(BlockDriverState *bs)
3646 {
3647     return bs->sg;
3648 }
3649 
3650 int bdrv_enable_write_cache(BlockDriverState *bs)
3651 {
3652     return bs->enable_write_cache;
3653 }
3654 
3655 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3656 {
3657     bs->enable_write_cache = wce;
3658 
3659     /* so a reopen() will preserve wce */
3660     if (wce) {
3661         bs->open_flags |= BDRV_O_CACHE_WB;
3662     } else {
3663         bs->open_flags &= ~BDRV_O_CACHE_WB;
3664     }
3665 }
3666 
3667 int bdrv_is_encrypted(BlockDriverState *bs)
3668 {
3669     if (bs->backing_hd && bs->backing_hd->encrypted)
3670         return 1;
3671     return bs->encrypted;
3672 }
3673 
3674 int bdrv_key_required(BlockDriverState *bs)
3675 {
3676     BlockDriverState *backing_hd = bs->backing_hd;
3677 
3678     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3679         return 1;
3680     return (bs->encrypted && !bs->valid_key);
3681 }
3682 
3683 int bdrv_set_key(BlockDriverState *bs, const char *key)
3684 {
3685     int ret;
3686     if (bs->backing_hd && bs->backing_hd->encrypted) {
3687         ret = bdrv_set_key(bs->backing_hd, key);
3688         if (ret < 0)
3689             return ret;
3690         if (!bs->encrypted)
3691             return 0;
3692     }
3693     if (!bs->encrypted) {
3694         return -EINVAL;
3695     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3696         return -ENOMEDIUM;
3697     }
3698     ret = bs->drv->bdrv_set_key(bs, key);
3699     if (ret < 0) {
3700         bs->valid_key = 0;
3701     } else if (!bs->valid_key) {
3702         bs->valid_key = 1;
3703         if (bs->blk) {
3704             /* call the change callback now, we skipped it on open */
3705             blk_dev_change_media_cb(bs->blk, true);
3706         }
3707     }
3708     return ret;
3709 }
3710 
3711 /*
3712  * Provide an encryption key for @bs.
3713  * If @key is non-null:
3714  *     If @bs is not encrypted, fail.
3715  *     Else if the key is invalid, fail.
3716  *     Else set @bs's key to @key, replacing the existing key, if any.
3717  * If @key is null:
3718  *     If @bs is encrypted and still lacks a key, fail.
3719  *     Else do nothing.
3720  * On failure, store an error object through @errp if non-null.
3721  */
3722 void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
3723 {
3724     if (key) {
3725         if (!bdrv_is_encrypted(bs)) {
3726             error_setg(errp, "Device '%s' is not encrypted",
3727                       bdrv_get_device_name(bs));
3728         } else if (bdrv_set_key(bs, key) < 0) {
3729             error_set(errp, QERR_INVALID_PASSWORD);
3730         }
3731     } else {
3732         if (bdrv_key_required(bs)) {
3733             error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
3734                       "'%s' (%s) is encrypted",
3735                       bdrv_get_device_name(bs),
3736                       bdrv_get_encrypted_filename(bs));
3737         }
3738     }
3739 }
3740 
3741 const char *bdrv_get_format_name(BlockDriverState *bs)
3742 {
3743     return bs->drv ? bs->drv->format_name : NULL;
3744 }
3745 
/*
 * qsort() comparator for an array of C strings.
 *
 * qsort() hands the comparator pointers to the array elements, so @a and @b
 * each point to a string pointer and must be dereferenced before the
 * strings can be compared.
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *lhs = *(const char *const *)a;
    const char *rhs = *(const char *const *)b;

    return strcmp(lhs, rhs);
}
3750 
3751 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3752                          void *opaque)
3753 {
3754     BlockDriver *drv;
3755     int count = 0;
3756     int i;
3757     const char **formats = NULL;
3758 
3759     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3760         if (drv->format_name) {
3761             bool found = false;
3762             int i = count;
3763             while (formats && i && !found) {
3764                 found = !strcmp(formats[--i], drv->format_name);
3765             }
3766 
3767             if (!found) {
3768                 formats = g_renew(const char *, formats, count + 1);
3769                 formats[count++] = drv->format_name;
3770             }
3771         }
3772     }
3773 
3774     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3775 
3776     for (i = 0; i < count; i++) {
3777         it(opaque, formats[i]);
3778     }
3779 
3780     g_free(formats);
3781 }
3782 
3783 /* This function is to find block backend bs */
3784 /* TODO convert callers to blk_by_name(), then remove */
3785 BlockDriverState *bdrv_find(const char *name)
3786 {
3787     BlockBackend *blk = blk_by_name(name);
3788 
3789     return blk ? blk_bs(blk) : NULL;
3790 }
3791 
3792 /* This function is to find a node in the bs graph */
3793 BlockDriverState *bdrv_find_node(const char *node_name)
3794 {
3795     BlockDriverState *bs;
3796 
3797     assert(node_name);
3798 
3799     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3800         if (!strcmp(node_name, bs->node_name)) {
3801             return bs;
3802         }
3803     }
3804     return NULL;
3805 }
3806 
3807 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3808 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3809 {
3810     BlockDeviceInfoList *list, *entry;
3811     BlockDriverState *bs;
3812 
3813     list = NULL;
3814     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3815         entry = g_malloc0(sizeof(*entry));
3816         entry->value = bdrv_block_device_info(bs);
3817         entry->next = list;
3818         list = entry;
3819     }
3820 
3821     return list;
3822 }
3823 
3824 BlockDriverState *bdrv_lookup_bs(const char *device,
3825                                  const char *node_name,
3826                                  Error **errp)
3827 {
3828     BlockBackend *blk;
3829     BlockDriverState *bs;
3830 
3831     if (device) {
3832         blk = blk_by_name(device);
3833 
3834         if (blk) {
3835             return blk_bs(blk);
3836         }
3837     }
3838 
3839     if (node_name) {
3840         bs = bdrv_find_node(node_name);
3841 
3842         if (bs) {
3843             return bs;
3844         }
3845     }
3846 
3847     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3848                      device ? device : "",
3849                      node_name ? node_name : "");
3850     return NULL;
3851 }
3852 
3853 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3854  * return false.  If either argument is NULL, return false. */
3855 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3856 {
3857     while (top && top != base) {
3858         top = top->backing_hd;
3859     }
3860 
3861     return top != NULL;
3862 }
3863 
3864 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3865 {
3866     if (!bs) {
3867         return QTAILQ_FIRST(&graph_bdrv_states);
3868     }
3869     return QTAILQ_NEXT(bs, node_list);
3870 }
3871 
3872 BlockDriverState *bdrv_next(BlockDriverState *bs)
3873 {
3874     if (!bs) {
3875         return QTAILQ_FIRST(&bdrv_states);
3876     }
3877     return QTAILQ_NEXT(bs, device_list);
3878 }
3879 
3880 const char *bdrv_get_node_name(const BlockDriverState *bs)
3881 {
3882     return bs->node_name;
3883 }
3884 
3885 /* TODO check what callers really want: bs->node_name or blk_name() */
3886 const char *bdrv_get_device_name(const BlockDriverState *bs)
3887 {
3888     return bs->blk ? blk_name(bs->blk) : "";
3889 }
3890 
3891 int bdrv_get_flags(BlockDriverState *bs)
3892 {
3893     return bs->open_flags;
3894 }
3895 
3896 int bdrv_flush_all(void)
3897 {
3898     BlockDriverState *bs;
3899     int result = 0;
3900 
3901     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3902         AioContext *aio_context = bdrv_get_aio_context(bs);
3903         int ret;
3904 
3905         aio_context_acquire(aio_context);
3906         ret = bdrv_flush(bs);
3907         if (ret < 0 && !result) {
3908             result = ret;
3909         }
3910         aio_context_release(aio_context);
3911     }
3912 
3913     return result;
3914 }
3915 
3916 int bdrv_has_zero_init_1(BlockDriverState *bs)
3917 {
3918     return 1;
3919 }
3920 
3921 int bdrv_has_zero_init(BlockDriverState *bs)
3922 {
3923     assert(bs->drv);
3924 
3925     /* If BS is a copy on write image, it is initialized to
3926        the contents of the base image, which may not be zeroes.  */
3927     if (bs->backing_hd) {
3928         return 0;
3929     }
3930     if (bs->drv->bdrv_has_zero_init) {
3931         return bs->drv->bdrv_has_zero_init(bs);
3932     }
3933 
3934     /* safe default */
3935     return 0;
3936 }
3937 
3938 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3939 {
3940     BlockDriverInfo bdi;
3941 
3942     if (bs->backing_hd) {
3943         return false;
3944     }
3945 
3946     if (bdrv_get_info(bs, &bdi) == 0) {
3947         return bdi.unallocated_blocks_are_zero;
3948     }
3949 
3950     return false;
3951 }
3952 
3953 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3954 {
3955     BlockDriverInfo bdi;
3956 
3957     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3958         return false;
3959     }
3960 
3961     if (bdrv_get_info(bs, &bdi) == 0) {
3962         return bdi.can_write_zeroes_with_unmap;
3963     }
3964 
3965     return false;
3966 }
3967 
3968 typedef struct BdrvCoGetBlockStatusData {
3969     BlockDriverState *bs;
3970     BlockDriverState *base;
3971     int64_t sector_num;
3972     int nb_sectors;
3973     int *pnum;
3974     int64_t ret;
3975     bool done;
3976 } BdrvCoGetBlockStatusData;
3977 
3978 /*
3979  * Returns the allocation status of the specified sectors.
3980  * Drivers not implementing the functionality are assumed to not support
3981  * backing files, hence all their sectors are reported as allocated.
3982  *
3983  * If 'sector_num' is beyond the end of the disk image the return value is 0
3984  * and 'pnum' is set to 0.
3985  *
3986  * 'pnum' is set to the number of sectors (including and immediately following
3987  * the specified sector) that are known to be in the same
3988  * allocated/unallocated state.
3989  *
3990  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3991  * beyond the end of the disk image it will be clamped.
3992  */
3993 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3994                                                      int64_t sector_num,
3995                                                      int nb_sectors, int *pnum)
3996 {
3997     int64_t total_sectors;
3998     int64_t n;
3999     int64_t ret, ret2;
4000 
4001     total_sectors = bdrv_nb_sectors(bs);
4002     if (total_sectors < 0) {
4003         return total_sectors;
4004     }
4005 
4006     if (sector_num >= total_sectors) {
4007         *pnum = 0;
4008         return 0;
4009     }
4010 
4011     n = total_sectors - sector_num;
4012     if (n < nb_sectors) {
4013         nb_sectors = n;
4014     }
4015 
4016     if (!bs->drv->bdrv_co_get_block_status) {
4017         *pnum = nb_sectors;
4018         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
4019         if (bs->drv->protocol_name) {
4020             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
4021         }
4022         return ret;
4023     }
4024 
4025     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
4026     if (ret < 0) {
4027         *pnum = 0;
4028         return ret;
4029     }
4030 
4031     if (ret & BDRV_BLOCK_RAW) {
4032         assert(ret & BDRV_BLOCK_OFFSET_VALID);
4033         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4034                                      *pnum, pnum);
4035     }
4036 
4037     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4038         ret |= BDRV_BLOCK_ALLOCATED;
4039     }
4040 
4041     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4042         if (bdrv_unallocated_blocks_are_zero(bs)) {
4043             ret |= BDRV_BLOCK_ZERO;
4044         } else if (bs->backing_hd) {
4045             BlockDriverState *bs2 = bs->backing_hd;
4046             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4047             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4048                 ret |= BDRV_BLOCK_ZERO;
4049             }
4050         }
4051     }
4052 
4053     if (bs->file &&
4054         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4055         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4056         int file_pnum;
4057 
4058         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4059                                         *pnum, &file_pnum);
4060         if (ret2 >= 0) {
4061             /* Ignore errors.  This is just providing extra information, it
4062              * is useful but not necessary.
4063              */
4064             if (!file_pnum) {
4065                 /* !file_pnum indicates an offset at or beyond the EOF; it is
4066                  * perfectly valid for the format block driver to point to such
4067                  * offsets, so catch it and mark everything as zero */
4068                 ret |= BDRV_BLOCK_ZERO;
4069             } else {
4070                 /* Limit request to the range reported by the protocol driver */
4071                 *pnum = file_pnum;
4072                 ret |= (ret2 & BDRV_BLOCK_ZERO);
4073             }
4074         }
4075     }
4076 
4077     return ret;
4078 }
4079 
4080 /* Coroutine wrapper for bdrv_get_block_status() */
4081 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4082 {
4083     BdrvCoGetBlockStatusData *data = opaque;
4084     BlockDriverState *bs = data->bs;
4085 
4086     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4087                                          data->pnum);
4088     data->done = true;
4089 }
4090 
4091 /*
4092  * Synchronous wrapper around bdrv_co_get_block_status().
4093  *
4094  * See bdrv_co_get_block_status() for details.
4095  */
4096 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4097                               int nb_sectors, int *pnum)
4098 {
4099     Coroutine *co;
4100     BdrvCoGetBlockStatusData data = {
4101         .bs = bs,
4102         .sector_num = sector_num,
4103         .nb_sectors = nb_sectors,
4104         .pnum = pnum,
4105         .done = false,
4106     };
4107 
4108     if (qemu_in_coroutine()) {
4109         /* Fast-path if already in coroutine context */
4110         bdrv_get_block_status_co_entry(&data);
4111     } else {
4112         AioContext *aio_context = bdrv_get_aio_context(bs);
4113 
4114         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4115         qemu_coroutine_enter(co, &data);
4116         while (!data.done) {
4117             aio_poll(aio_context, true);
4118         }
4119     }
4120     return data.ret;
4121 }
4122 
4123 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4124                                    int nb_sectors, int *pnum)
4125 {
4126     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4127     if (ret < 0) {
4128         return ret;
4129     }
4130     return !!(ret & BDRV_BLOCK_ALLOCATED);
4131 }
4132 
4133 /*
4134  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4135  *
4136  * Return true if the given sector is allocated in any image between
4137  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4138  * sector is allocated in any image of the chain.  Return false otherwise.
4139  *
4140  * 'pnum' is set to the number of sectors (including and immediately following
4141  *  the specified sector) that are known to be in the same
4142  *  allocated/unallocated state.
4143  *
4144  */
4145 int bdrv_is_allocated_above(BlockDriverState *top,
4146                             BlockDriverState *base,
4147                             int64_t sector_num,
4148                             int nb_sectors, int *pnum)
4149 {
4150     BlockDriverState *intermediate;
4151     int ret, n = nb_sectors;
4152 
4153     intermediate = top;
4154     while (intermediate && intermediate != base) {
4155         int pnum_inter;
4156         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4157                                 &pnum_inter);
4158         if (ret < 0) {
4159             return ret;
4160         } else if (ret) {
4161             *pnum = pnum_inter;
4162             return 1;
4163         }
4164 
4165         /*
4166          * [sector_num, nb_sectors] is unallocated on top but intermediate
4167          * might have
4168          *
4169          * [sector_num+x, nr_sectors] allocated.
4170          */
4171         if (n > pnum_inter &&
4172             (intermediate == top ||
4173              sector_num + pnum_inter < intermediate->total_sectors)) {
4174             n = pnum_inter;
4175         }
4176 
4177         intermediate = intermediate->backing_hd;
4178     }
4179 
4180     *pnum = n;
4181     return 0;
4182 }
4183 
4184 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4185 {
4186     if (bs->backing_hd && bs->backing_hd->encrypted)
4187         return bs->backing_file;
4188     else if (bs->encrypted)
4189         return bs->filename;
4190     else
4191         return NULL;
4192 }
4193 
4194 void bdrv_get_backing_filename(BlockDriverState *bs,
4195                                char *filename, int filename_size)
4196 {
4197     pstrcpy(filename, filename_size, bs->backing_file);
4198 }
4199 
4200 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4201                           const uint8_t *buf, int nb_sectors)
4202 {
4203     BlockDriver *drv = bs->drv;
4204     int ret;
4205 
4206     if (!drv) {
4207         return -ENOMEDIUM;
4208     }
4209     if (!drv->bdrv_write_compressed) {
4210         return -ENOTSUP;
4211     }
4212     ret = bdrv_check_request(bs, sector_num, nb_sectors);
4213     if (ret < 0) {
4214         return ret;
4215     }
4216 
4217     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4218 
4219     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4220 }
4221 
4222 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4223 {
4224     BlockDriver *drv = bs->drv;
4225     if (!drv)
4226         return -ENOMEDIUM;
4227     if (!drv->bdrv_get_info)
4228         return -ENOTSUP;
4229     memset(bdi, 0, sizeof(*bdi));
4230     return drv->bdrv_get_info(bs, bdi);
4231 }
4232 
4233 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4234 {
4235     BlockDriver *drv = bs->drv;
4236     if (drv && drv->bdrv_get_specific_info) {
4237         return drv->bdrv_get_specific_info(bs);
4238     }
4239     return NULL;
4240 }
4241 
4242 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4243                       int64_t pos, int size)
4244 {
4245     QEMUIOVector qiov;
4246     struct iovec iov = {
4247         .iov_base   = (void *) buf,
4248         .iov_len    = size,
4249     };
4250 
4251     qemu_iovec_init_external(&qiov, &iov, 1);
4252     return bdrv_writev_vmstate(bs, &qiov, pos);
4253 }
4254 
4255 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4256 {
4257     BlockDriver *drv = bs->drv;
4258 
4259     if (!drv) {
4260         return -ENOMEDIUM;
4261     } else if (drv->bdrv_save_vmstate) {
4262         return drv->bdrv_save_vmstate(bs, qiov, pos);
4263     } else if (bs->file) {
4264         return bdrv_writev_vmstate(bs->file, qiov, pos);
4265     }
4266 
4267     return -ENOTSUP;
4268 }
4269 
4270 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4271                       int64_t pos, int size)
4272 {
4273     BlockDriver *drv = bs->drv;
4274     if (!drv)
4275         return -ENOMEDIUM;
4276     if (drv->bdrv_load_vmstate)
4277         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4278     if (bs->file)
4279         return bdrv_load_vmstate(bs->file, buf, pos, size);
4280     return -ENOTSUP;
4281 }
4282 
4283 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4284 {
4285     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4286         return;
4287     }
4288 
4289     bs->drv->bdrv_debug_event(bs, event);
4290 }
4291 
4292 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4293                           const char *tag)
4294 {
4295     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4296         bs = bs->file;
4297     }
4298 
4299     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4300         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4301     }
4302 
4303     return -ENOTSUP;
4304 }
4305 
4306 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4307 {
4308     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4309         bs = bs->file;
4310     }
4311 
4312     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4313         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4314     }
4315 
4316     return -ENOTSUP;
4317 }
4318 
4319 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4320 {
4321     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4322         bs = bs->file;
4323     }
4324 
4325     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4326         return bs->drv->bdrv_debug_resume(bs, tag);
4327     }
4328 
4329     return -ENOTSUP;
4330 }
4331 
4332 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4333 {
4334     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4335         bs = bs->file;
4336     }
4337 
4338     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4339         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4340     }
4341 
4342     return false;
4343 }
4344 
4345 int bdrv_is_snapshot(BlockDriverState *bs)
4346 {
4347     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4348 }
4349 
4350 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4351  * relative, it must be relative to the chain.  So, passing in bs->filename
4352  * from a BDS as backing_file should not be done, as that may be relative to
4353  * the CWD rather than the chain. */
4354 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4355         const char *backing_file)
4356 {
4357     char *filename_full = NULL;
4358     char *backing_file_full = NULL;
4359     char *filename_tmp = NULL;
4360     int is_protocol = 0;
4361     BlockDriverState *curr_bs = NULL;
4362     BlockDriverState *retval = NULL;
4363 
4364     if (!bs || !bs->drv || !backing_file) {
4365         return NULL;
4366     }
4367 
4368     filename_full     = g_malloc(PATH_MAX);
4369     backing_file_full = g_malloc(PATH_MAX);
4370     filename_tmp      = g_malloc(PATH_MAX);
4371 
4372     is_protocol = path_has_protocol(backing_file);
4373 
4374     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4375 
4376         /* If either of the filename paths is actually a protocol, then
4377          * compare unmodified paths; otherwise make paths relative */
4378         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4379             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4380                 retval = curr_bs->backing_hd;
4381                 break;
4382             }
4383         } else {
4384             /* If not an absolute filename path, make it relative to the current
4385              * image's filename path */
4386             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4387                          backing_file);
4388 
4389             /* We are going to compare absolute pathnames */
4390             if (!realpath(filename_tmp, filename_full)) {
4391                 continue;
4392             }
4393 
4394             /* We need to make sure the backing filename we are comparing against
4395              * is relative to the current image filename (or absolute) */
4396             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4397                          curr_bs->backing_file);
4398 
4399             if (!realpath(filename_tmp, backing_file_full)) {
4400                 continue;
4401             }
4402 
4403             if (strcmp(backing_file_full, filename_full) == 0) {
4404                 retval = curr_bs->backing_hd;
4405                 break;
4406             }
4407         }
4408     }
4409 
4410     g_free(filename_full);
4411     g_free(backing_file_full);
4412     g_free(filename_tmp);
4413     return retval;
4414 }
4415 
4416 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4417 {
4418     if (!bs->drv) {
4419         return 0;
4420     }
4421 
4422     if (!bs->backing_hd) {
4423         return 0;
4424     }
4425 
4426     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4427 }
4428 
4429 /**************************************************************/
4430 /* async I/Os */
4431 
4432 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4433                            QEMUIOVector *qiov, int nb_sectors,
4434                            BlockCompletionFunc *cb, void *opaque)
4435 {
4436     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4437 
4438     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4439                                  cb, opaque, false);
4440 }
4441 
4442 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4443                             QEMUIOVector *qiov, int nb_sectors,
4444                             BlockCompletionFunc *cb, void *opaque)
4445 {
4446     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4447 
4448     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4449                                  cb, opaque, true);
4450 }
4451 
4452 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4453         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4454         BlockCompletionFunc *cb, void *opaque)
4455 {
4456     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4457 
4458     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4459                                  BDRV_REQ_ZERO_WRITE | flags,
4460                                  cb, opaque, true);
4461 }
4462 
4463 
4464 typedef struct MultiwriteCB {
4465     int error;
4466     int num_requests;
4467     int num_callbacks;
4468     struct {
4469         BlockCompletionFunc *cb;
4470         void *opaque;
4471         QEMUIOVector *free_qiov;
4472     } callbacks[];
4473 } MultiwriteCB;
4474 
4475 static void multiwrite_user_cb(MultiwriteCB *mcb)
4476 {
4477     int i;
4478 
4479     for (i = 0; i < mcb->num_callbacks; i++) {
4480         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4481         if (mcb->callbacks[i].free_qiov) {
4482             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4483         }
4484         g_free(mcb->callbacks[i].free_qiov);
4485     }
4486 }
4487 
4488 static void multiwrite_cb(void *opaque, int ret)
4489 {
4490     MultiwriteCB *mcb = opaque;
4491 
4492     trace_multiwrite_cb(mcb, ret);
4493 
4494     if (ret < 0 && !mcb->error) {
4495         mcb->error = ret;
4496     }
4497 
4498     mcb->num_requests--;
4499     if (mcb->num_requests == 0) {
4500         multiwrite_user_cb(mcb);
4501         g_free(mcb);
4502     }
4503 }
4504 
4505 static int multiwrite_req_compare(const void *a, const void *b)
4506 {
4507     const BlockRequest *req1 = a, *req2 = b;
4508 
4509     /*
4510      * Note that we can't simply subtract req2->sector from req1->sector
4511      * here as that could overflow the return value.
4512      */
4513     if (req1->sector > req2->sector) {
4514         return 1;
4515     } else if (req1->sector < req2->sector) {
4516         return -1;
4517     } else {
4518         return 0;
4519     }
4520 }
4521 
4522 /*
4523  * Takes a bunch of requests and tries to merge them. Returns the number of
4524  * requests that remain after merging.
4525  */
4526 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4527     int num_reqs, MultiwriteCB *mcb)
4528 {
4529     int i, outidx;
4530 
4531     // Sort requests by start sector
4532     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4533 
4534     // Check if adjacent requests touch the same clusters. If so, combine them,
4535     // filling up gaps with zero sectors.
4536     outidx = 0;
4537     for (i = 1; i < num_reqs; i++) {
4538         int merge = 0;
4539         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4540 
4541         // Handle exactly sequential writes and overlapping writes.
4542         if (reqs[i].sector <= oldreq_last) {
4543             merge = 1;
4544         }
4545 
4546         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4547             merge = 0;
4548         }
4549 
4550         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4551             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4552             merge = 0;
4553         }
4554 
4555         if (merge) {
4556             size_t size;
4557             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4558             qemu_iovec_init(qiov,
4559                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4560 
4561             // Add the first request to the merged one. If the requests are
4562             // overlapping, drop the last sectors of the first request.
4563             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4564             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4565 
4566             // We should need to add any zeros between the two requests
4567             assert (reqs[i].sector <= oldreq_last);
4568 
4569             // Add the second request
4570             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4571 
4572             // Add tail of first request, if necessary
4573             if (qiov->size < reqs[outidx].qiov->size) {
4574                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4575                                   reqs[outidx].qiov->size - qiov->size);
4576             }
4577 
4578             reqs[outidx].nb_sectors = qiov->size >> 9;
4579             reqs[outidx].qiov = qiov;
4580 
4581             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4582         } else {
4583             outidx++;
4584             reqs[outidx].sector     = reqs[i].sector;
4585             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4586             reqs[outidx].qiov       = reqs[i].qiov;
4587         }
4588     }
4589 
4590     block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
4591 
4592     return outidx + 1;
4593 }
4594 
4595 /*
4596  * Submit multiple AIO write requests at once.
4597  *
4598  * On success, the function returns 0 and all requests in the reqs array have
4599  * been submitted. In error case this function returns -1, and any of the
4600  * requests may or may not be submitted yet. In particular, this means that the
4601  * callback will be called for some of the requests, for others it won't. The
4602  * caller must check the error field of the BlockRequest to wait for the right
4603  * callbacks (if error != 0, no callback will be called).
4604  *
4605  * The implementation may modify the contents of the reqs array, e.g. to merge
4606  * requests. However, the fields opaque and error are left unmodified as they
4607  * are used to signal failure for a single request to the caller.
4608  */
4609 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4610 {
4611     MultiwriteCB *mcb;
4612     int i;
4613 
4614     /* don't submit writes if we don't have a medium */
4615     if (bs->drv == NULL) {
4616         for (i = 0; i < num_reqs; i++) {
4617             reqs[i].error = -ENOMEDIUM;
4618         }
4619         return -1;
4620     }
4621 
4622     if (num_reqs == 0) {
4623         return 0;
4624     }
4625 
4626     // Create MultiwriteCB structure
4627     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4628     mcb->num_requests = 0;
4629     mcb->num_callbacks = num_reqs;
4630 
4631     for (i = 0; i < num_reqs; i++) {
4632         mcb->callbacks[i].cb = reqs[i].cb;
4633         mcb->callbacks[i].opaque = reqs[i].opaque;
4634     }
4635 
4636     // Check for mergable requests
4637     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4638 
4639     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4640 
4641     /* Run the aio requests. */
4642     mcb->num_requests = num_reqs;
4643     for (i = 0; i < num_reqs; i++) {
4644         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4645                               reqs[i].nb_sectors, reqs[i].flags,
4646                               multiwrite_cb, mcb,
4647                               true);
4648     }
4649 
4650     return 0;
4651 }
4652 
4653 void bdrv_aio_cancel(BlockAIOCB *acb)
4654 {
4655     qemu_aio_ref(acb);
4656     bdrv_aio_cancel_async(acb);
4657     while (acb->refcnt > 1) {
4658         if (acb->aiocb_info->get_aio_context) {
4659             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4660         } else if (acb->bs) {
4661             aio_poll(bdrv_get_aio_context(acb->bs), true);
4662         } else {
4663             abort();
4664         }
4665     }
4666     qemu_aio_unref(acb);
4667 }
4668 
4669 /* Async version of aio cancel. The caller is not blocked if the acb implements
4670  * cancel_async, otherwise we do nothing and let the request normally complete.
4671  * In either case the completion callback must be called. */
4672 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4673 {
4674     if (acb->aiocb_info->cancel_async) {
4675         acb->aiocb_info->cancel_async(acb);
4676     }
4677 }
4678 
4679 /**************************************************************/
4680 /* async block device emulation */
4681 
4682 typedef struct BlockAIOCBSync {
4683     BlockAIOCB common;
4684     QEMUBH *bh;
4685     int ret;
4686     /* vector translation state */
4687     QEMUIOVector *qiov;
4688     uint8_t *bounce;
4689     int is_write;
4690 } BlockAIOCBSync;
4691 
4692 static const AIOCBInfo bdrv_em_aiocb_info = {
4693     .aiocb_size         = sizeof(BlockAIOCBSync),
4694 };
4695 
4696 static void bdrv_aio_bh_cb(void *opaque)
4697 {
4698     BlockAIOCBSync *acb = opaque;
4699 
4700     if (!acb->is_write && acb->ret >= 0) {
4701         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4702     }
4703     qemu_vfree(acb->bounce);
4704     acb->common.cb(acb->common.opaque, acb->ret);
4705     qemu_bh_delete(acb->bh);
4706     acb->bh = NULL;
4707     qemu_aio_unref(acb);
4708 }
4709 
4710 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4711                                       int64_t sector_num,
4712                                       QEMUIOVector *qiov,
4713                                       int nb_sectors,
4714                                       BlockCompletionFunc *cb,
4715                                       void *opaque,
4716                                       int is_write)
4717 
4718 {
4719     BlockAIOCBSync *acb;
4720 
4721     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4722     acb->is_write = is_write;
4723     acb->qiov = qiov;
4724     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4725     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4726 
4727     if (acb->bounce == NULL) {
4728         acb->ret = -ENOMEM;
4729     } else if (is_write) {
4730         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4731         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4732     } else {
4733         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4734     }
4735 
4736     qemu_bh_schedule(acb->bh);
4737 
4738     return &acb->common;
4739 }
4740 
4741 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4742         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4743         BlockCompletionFunc *cb, void *opaque)
4744 {
4745     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4746 }
4747 
4748 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4749         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4750         BlockCompletionFunc *cb, void *opaque)
4751 {
4752     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4753 }
4754 
4755 
/* Request state for AIO that is emulated by running a coroutine. */
typedef struct BlockAIOCBCoroutine {
    /* generic AIOCB header; initialised by qemu_aio_get() */
    BlockAIOCB common;
    /* request parameters (sector, nb_sectors, qiov, flags) and its error */
    BlockRequest req;
    /* selects bdrv_co_do_writev over bdrv_co_do_readv in bdrv_co_do_rw() */
    bool is_write;
    /* NOTE(review): not referenced by the code visible here - confirm use */
    bool *done;
    /* bottom half that invokes the completion callback (bdrv_co_em_bh) */
    QEMUBH* bh;
} BlockAIOCBCoroutine;
4763 
/*
 * AIOCB type descriptor for BlockAIOCBCoroutine requests.  Only the
 * allocation size is set; no cancel_async callback is provided.
 */
static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
};
4767 
4768 static void bdrv_co_em_bh(void *opaque)
4769 {
4770     BlockAIOCBCoroutine *acb = opaque;
4771 
4772     acb->common.cb(acb->common.opaque, acb->req.error);
4773 
4774     qemu_bh_delete(acb->bh);
4775     qemu_aio_unref(acb);
4776 }
4777 
4778 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4779 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4780 {
4781     BlockAIOCBCoroutine *acb = opaque;
4782     BlockDriverState *bs = acb->common.bs;
4783 
4784     if (!acb->is_write) {
4785         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4786             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4787     } else {
4788         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4789             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4790     }
4791 
4792     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4793     qemu_bh_schedule(acb->bh);
4794 }
4795 
4796 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4797                                          int64_t sector_num,
4798                                          QEMUIOVector *qiov,
4799                                          int nb_sectors,
4800                                          BdrvRequestFlags flags,
4801                                          BlockCompletionFunc *cb,
4802                                          void *opaque,
4803                                          bool is_write)
4804 {
4805     Coroutine *co;
4806     BlockAIOCBCoroutine *acb;
4807 
4808     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4809     acb->req.sector = sector_num;
4810     acb->req.nb_sectors = nb_sectors;
4811     acb->req.qiov = qiov;
4812     acb->req.flags = flags;
4813     acb->is_write = is_write;
4814 
4815     co = qemu_coroutine_create(bdrv_co_do_rw);
4816     qemu_coroutine_enter(co, acb);
4817 
4818     return &acb->common;
4819 }
4820 
4821 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4822 {
4823     BlockAIOCBCoroutine *acb = opaque;
4824     BlockDriverState *bs = acb->common.bs;
4825 
4826     acb->req.error = bdrv_co_flush(bs);
4827     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4828     qemu_bh_schedule(acb->bh);
4829 }
4830 
4831 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4832         BlockCompletionFunc *cb, void *opaque)
4833 {
4834     trace_bdrv_aio_flush(bs, opaque);
4835 
4836     Coroutine *co;
4837     BlockAIOCBCoroutine *acb;
4838 
4839     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4840 
4841     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4842     qemu_coroutine_enter(co, acb);
4843 
4844     return &acb->common;
4845 }
4846 
4847 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4848 {
4849     BlockAIOCBCoroutine *acb = opaque;
4850     BlockDriverState *bs = acb->common.bs;
4851 
4852     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4853     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4854     qemu_bh_schedule(acb->bh);
4855 }
4856 
4857 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4858         int64_t sector_num, int nb_sectors,
4859         BlockCompletionFunc *cb, void *opaque)
4860 {
4861     Coroutine *co;
4862     BlockAIOCBCoroutine *acb;
4863 
4864     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4865 
4866     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4867     acb->req.sector = sector_num;
4868     acb->req.nb_sectors = nb_sectors;
4869     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4870     qemu_coroutine_enter(co, acb);
4871 
4872     return &acb->common;
4873 }
4874 
/* Initialise the block layer by calling module_call_init() for
 * MODULE_INIT_BLOCK. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
4879 
/* Same as bdrv_init(), but sets use_bdrv_whitelist first. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
4885 
4886 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4887                    BlockCompletionFunc *cb, void *opaque)
4888 {
4889     BlockAIOCB *acb;
4890 
4891     acb = g_slice_alloc(aiocb_info->aiocb_size);
4892     acb->aiocb_info = aiocb_info;
4893     acb->bs = bs;
4894     acb->cb = cb;
4895     acb->opaque = opaque;
4896     acb->refcnt = 1;
4897     return acb;
4898 }
4899 
/* Take an additional reference on the AIOCB @p (a pointer to a structure
 * that starts with a BlockAIOCB header). */
void qemu_aio_ref(void *p)
{
    BlockAIOCB *acb = p;
    acb->refcnt++;
}
4905 
4906 void qemu_aio_unref(void *p)
4907 {
4908     BlockAIOCB *acb = p;
4909     assert(acb->refcnt > 0);
4910     if (--acb->refcnt == 0) {
4911         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4912     }
4913 }
4914 
4915 /**************************************************************/
4916 /* Coroutine block device emulation */
4917 
/* Completion record used to resume a coroutine that waits for an AIO
 * request (see bdrv_co_io_em_complete). */
typedef struct CoroutineIOCompletion {
    /* coroutine to re-enter once the request has completed */
    Coroutine *coroutine;
    /* result passed to the completion callback */
    int ret;
} CoroutineIOCompletion;
4922 
4923 static void bdrv_co_io_em_complete(void *opaque, int ret)
4924 {
4925     CoroutineIOCompletion *co = opaque;
4926 
4927     co->ret = ret;
4928     qemu_coroutine_enter(co->coroutine, NULL);
4929 }
4930 
4931 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4932                                       int nb_sectors, QEMUIOVector *iov,
4933                                       bool is_write)
4934 {
4935     CoroutineIOCompletion co = {
4936         .coroutine = qemu_coroutine_self(),
4937     };
4938     BlockAIOCB *acb;
4939 
4940     if (is_write) {
4941         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4942                                        bdrv_co_io_em_complete, &co);
4943     } else {
4944         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4945                                       bdrv_co_io_em_complete, &co);
4946     }
4947 
4948     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4949     if (!acb) {
4950         return -EIO;
4951     }
4952     qemu_coroutine_yield();
4953 
4954     return co.ret;
4955 }
4956 
4957 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4958                                          int64_t sector_num, int nb_sectors,
4959                                          QEMUIOVector *iov)
4960 {
4961     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4962 }
4963 
4964 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4965                                          int64_t sector_num, int nb_sectors,
4966                                          QEMUIOVector *iov)
4967 {
4968     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4969 }
4970 
/* Coroutine entry point for bdrv_flush(): run bdrv_co_flush() on rwco->bs
 * and store the result in rwco->ret. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
4977 
/*
 * Flush @bs from coroutine context.
 *
 * Returns 0 without doing anything if @bs is NULL, has no medium inserted or
 * is read-only.  Otherwise the driver's bdrv_co_flush_to_os callback (if any)
 * is called, then - unless BDRV_O_NO_FLUSH is set - bdrv_co_flush_to_disk or,
 * failing that, bdrv_aio_flush.  Finally the function recurses into bs->file.
 *
 * Returns 0 on success or the first negative error reported by a driver
 * callback (-EIO if bdrv_aio_flush returned no AIOCB).
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* bdrv_is_inserted() returns 0 when bs->drv is NULL, so bs->drv is
     * non-NULL below */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            /* Resumed by bdrv_co_io_em_complete(), which also sets co.ret */
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what they are doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    /* A NULL bs->file is handled by the !bs check at the top of the callee */
    return bdrv_co_flush(bs->file);
}
5040 
5041 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
5042 {
5043     Error *local_err = NULL;
5044     int ret;
5045 
5046     if (!bs->drv)  {
5047         return;
5048     }
5049 
5050     if (!(bs->open_flags & BDRV_O_INCOMING)) {
5051         return;
5052     }
5053     bs->open_flags &= ~BDRV_O_INCOMING;
5054 
5055     if (bs->drv->bdrv_invalidate_cache) {
5056         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5057     } else if (bs->file) {
5058         bdrv_invalidate_cache(bs->file, &local_err);
5059     }
5060     if (local_err) {
5061         error_propagate(errp, local_err);
5062         return;
5063     }
5064 
5065     ret = refresh_total_sectors(bs, bs->total_sectors);
5066     if (ret < 0) {
5067         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5068         return;
5069     }
5070 }
5071 
5072 void bdrv_invalidate_cache_all(Error **errp)
5073 {
5074     BlockDriverState *bs;
5075     Error *local_err = NULL;
5076 
5077     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5078         AioContext *aio_context = bdrv_get_aio_context(bs);
5079 
5080         aio_context_acquire(aio_context);
5081         bdrv_invalidate_cache(bs, &local_err);
5082         aio_context_release(aio_context);
5083         if (local_err) {
5084             error_propagate(errp, local_err);
5085             return;
5086         }
5087     }
5088 }
5089 
5090 int bdrv_flush(BlockDriverState *bs)
5091 {
5092     Coroutine *co;
5093     RwCo rwco = {
5094         .bs = bs,
5095         .ret = NOT_DONE,
5096     };
5097 
5098     if (qemu_in_coroutine()) {
5099         /* Fast-path if already in coroutine context */
5100         bdrv_flush_co_entry(&rwco);
5101     } else {
5102         AioContext *aio_context = bdrv_get_aio_context(bs);
5103 
5104         co = qemu_coroutine_create(bdrv_flush_co_entry);
5105         qemu_coroutine_enter(co, &rwco);
5106         while (rwco.ret == NOT_DONE) {
5107             aio_poll(aio_context, true);
5108         }
5109     }
5110 
5111     return rwco.ret;
5112 }
5113 
/* Arguments and result slot handed to bdrv_discard_co_entry(). */
typedef struct DiscardCo {
    /* device to discard from */
    BlockDriverState *bs;
    /* first sector of the range */
    int64_t sector_num;
    /* number of sectors in the range */
    int nb_sectors;
    /* result of bdrv_co_discard(); NOT_DONE until the coroutine stores it */
    int ret;
} DiscardCo;
5120 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5121 {
5122     DiscardCo *rwco = opaque;
5123 
5124     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5125 }
5126 
5127 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5128                                  int nb_sectors)
5129 {
5130     int max_discard, ret;
5131 
5132     if (!bs->drv) {
5133         return -ENOMEDIUM;
5134     }
5135 
5136     ret = bdrv_check_request(bs, sector_num, nb_sectors);
5137     if (ret < 0) {
5138         return ret;
5139     } else if (bs->read_only) {
5140         return -EROFS;
5141     }
5142 
5143     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5144 
5145     /* Do nothing if disabled.  */
5146     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5147         return 0;
5148     }
5149 
5150     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5151         return 0;
5152     }
5153 
5154     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
5155     while (nb_sectors > 0) {
5156         int ret;
5157         int num = nb_sectors;
5158 
5159         /* align request */
5160         if (bs->bl.discard_alignment &&
5161             num >= bs->bl.discard_alignment &&
5162             sector_num % bs->bl.discard_alignment) {
5163             if (num > bs->bl.discard_alignment) {
5164                 num = bs->bl.discard_alignment;
5165             }
5166             num -= sector_num % bs->bl.discard_alignment;
5167         }
5168 
5169         /* limit request size */
5170         if (num > max_discard) {
5171             num = max_discard;
5172         }
5173 
5174         if (bs->drv->bdrv_co_discard) {
5175             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5176         } else {
5177             BlockAIOCB *acb;
5178             CoroutineIOCompletion co = {
5179                 .coroutine = qemu_coroutine_self(),
5180             };
5181 
5182             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5183                                             bdrv_co_io_em_complete, &co);
5184             if (acb == NULL) {
5185                 return -EIO;
5186             } else {
5187                 qemu_coroutine_yield();
5188                 ret = co.ret;
5189             }
5190         }
5191         if (ret && ret != -ENOTSUP) {
5192             return ret;
5193         }
5194 
5195         sector_num += num;
5196         nb_sectors -= num;
5197     }
5198     return 0;
5199 }
5200 
5201 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5202 {
5203     Coroutine *co;
5204     DiscardCo rwco = {
5205         .bs = bs,
5206         .sector_num = sector_num,
5207         .nb_sectors = nb_sectors,
5208         .ret = NOT_DONE,
5209     };
5210 
5211     if (qemu_in_coroutine()) {
5212         /* Fast-path if already in coroutine context */
5213         bdrv_discard_co_entry(&rwco);
5214     } else {
5215         AioContext *aio_context = bdrv_get_aio_context(bs);
5216 
5217         co = qemu_coroutine_create(bdrv_discard_co_entry);
5218         qemu_coroutine_enter(co, &rwco);
5219         while (rwco.ret == NOT_DONE) {
5220             aio_poll(aio_context, true);
5221         }
5222     }
5223 
5224     return rwco.ret;
5225 }
5226 
5227 /**************************************************************/
5228 /* removable device support */
5229 
5230 /**
5231  * Return TRUE if the media is present
5232  */
5233 int bdrv_is_inserted(BlockDriverState *bs)
5234 {
5235     BlockDriver *drv = bs->drv;
5236 
5237     if (!drv)
5238         return 0;
5239     if (!drv->bdrv_is_inserted)
5240         return 1;
5241     return drv->bdrv_is_inserted(bs);
5242 }
5243 
5244 /**
5245  * Return whether the media changed since the last call to this
5246  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5247  */
5248 int bdrv_media_changed(BlockDriverState *bs)
5249 {
5250     BlockDriver *drv = bs->drv;
5251 
5252     if (drv && drv->bdrv_media_changed) {
5253         return drv->bdrv_media_changed(bs);
5254     }
5255     return -ENOTSUP;
5256 }
5257 
5258 /**
5259  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5260  */
5261 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5262 {
5263     BlockDriver *drv = bs->drv;
5264     const char *device_name;
5265 
5266     if (drv && drv->bdrv_eject) {
5267         drv->bdrv_eject(bs, eject_flag);
5268     }
5269 
5270     device_name = bdrv_get_device_name(bs);
5271     if (device_name[0] != '\0') {
5272         qapi_event_send_device_tray_moved(device_name,
5273                                           eject_flag, &error_abort);
5274     }
5275 }
5276 
5277 /**
5278  * Lock or unlock the media (if it is locked, the user won't be able
5279  * to eject it manually).
5280  */
5281 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5282 {
5283     BlockDriver *drv = bs->drv;
5284 
5285     trace_bdrv_lock_medium(bs, locked);
5286 
5287     if (drv && drv->bdrv_lock_medium) {
5288         drv->bdrv_lock_medium(bs, locked);
5289     }
5290 }
5291 
5292 /* needed for generic scsi interface */
5293 
5294 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5295 {
5296     BlockDriver *drv = bs->drv;
5297 
5298     if (drv && drv->bdrv_ioctl)
5299         return drv->bdrv_ioctl(bs, req, buf);
5300     return -ENOTSUP;
5301 }
5302 
5303 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5304         unsigned long int req, void *buf,
5305         BlockCompletionFunc *cb, void *opaque)
5306 {
5307     BlockDriver *drv = bs->drv;
5308 
5309     if (drv && drv->bdrv_aio_ioctl)
5310         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5311     return NULL;
5312 }
5313 
/* Record @align as the guest block size of @bs. */
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}
5318 
/* Allocate @size bytes with qemu_memalign(), aligned to
 * bdrv_opt_mem_align(@bs). */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
5323 
5324 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5325 {
5326     return memset(qemu_blockalign(bs, size), 0, size);
5327 }
5328 
5329 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5330 {
5331     size_t align = bdrv_opt_mem_align(bs);
5332 
5333     /* Ensure that NULL is never returned on success */
5334     assert(align > 0);
5335     if (size == 0) {
5336         size = align;
5337     }
5338 
5339     return qemu_try_memalign(align, size);
5340 }
5341 
5342 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5343 {
5344     void *mem = qemu_try_blockalign(bs, size);
5345 
5346     if (mem) {
5347         memset(mem, 0, size);
5348     }
5349 
5350     return mem;
5351 }
5352 
5353 /*
5354  * Check if all memory in this vector is sector aligned.
5355  */
5356 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5357 {
5358     int i;
5359     size_t alignment = bdrv_opt_mem_align(bs);
5360 
5361     for (i = 0; i < qiov->niov; i++) {
5362         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5363             return false;
5364         }
5365         if (qiov->iov[i].iov_len % alignment) {
5366             return false;
5367         }
5368     }
5369 
5370     return true;
5371 }
5372 
5373 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5374                                           Error **errp)
5375 {
5376     int64_t bitmap_size;
5377     BdrvDirtyBitmap *bitmap;
5378 
5379     assert((granularity & (granularity - 1)) == 0);
5380 
5381     granularity >>= BDRV_SECTOR_BITS;
5382     assert(granularity);
5383     bitmap_size = bdrv_nb_sectors(bs);
5384     if (bitmap_size < 0) {
5385         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5386         errno = -bitmap_size;
5387         return NULL;
5388     }
5389     bitmap = g_new0(BdrvDirtyBitmap, 1);
5390     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5391     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5392     return bitmap;
5393 }
5394 
5395 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5396 {
5397     BdrvDirtyBitmap *bm, *next;
5398     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5399         if (bm == bitmap) {
5400             QLIST_REMOVE(bitmap, list);
5401             hbitmap_free(bitmap->bitmap);
5402             g_free(bitmap);
5403             return;
5404         }
5405     }
5406 }
5407 
5408 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5409 {
5410     BdrvDirtyBitmap *bm;
5411     BlockDirtyInfoList *list = NULL;
5412     BlockDirtyInfoList **plist = &list;
5413 
5414     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5415         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5416         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5417         info->count = bdrv_get_dirty_count(bs, bm);
5418         info->granularity =
5419             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5420         entry->value = info;
5421         *plist = entry;
5422         plist = &entry->next;
5423     }
5424 
5425     return list;
5426 }
5427 
5428 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5429 {
5430     if (bitmap) {
5431         return hbitmap_get(bitmap->bitmap, sector);
5432     } else {
5433         return 0;
5434     }
5435 }
5436 
/* Initialise @hbi to iterate over @bitmap, passing 0 as the start position
 * to hbitmap_iter_init().  @bs is unused. */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5442 
/* Set @nr_sectors sectors starting at @cur_sector in @bitmap via
 * hbitmap_set().  @bs is unused. */
void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                           int64_t cur_sector, int nr_sectors)
{
    hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
}
5448 
/* Reset @nr_sectors sectors starting at @cur_sector in @bitmap via
 * hbitmap_reset().  @bs is unused. */
void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                             int64_t cur_sector, int nr_sectors)
{
    hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
}
5454 
5455 static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5456                            int nr_sectors)
5457 {
5458     BdrvDirtyBitmap *bitmap;
5459     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5460         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5461     }
5462 }
5463 
5464 static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
5465                              int nr_sectors)
5466 {
5467     BdrvDirtyBitmap *bitmap;
5468     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5469         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5470     }
5471 }
5472 
/* Return hbitmap_count() of @bitmap.  @bs is unused. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}
5477 
/* Get a reference to bs: increments bs->refcnt.  Each reference is dropped
 * again with bdrv_unref(). */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
5483 
5484 /* Release a previously grabbed reference to bs.
5485  * If after releasing, reference count is zero, the BlockDriverState is
5486  * deleted. */
5487 void bdrv_unref(BlockDriverState *bs)
5488 {
5489     if (!bs) {
5490         return;
5491     }
5492     assert(bs->refcnt > 0);
5493     if (--bs->refcnt == 0) {
5494         bdrv_delete(bs);
5495     }
5496 }
5497 
/* One entry in a per-operation blocker list (bs->op_blockers[op]). */
struct BdrvOpBlocker {
    /* why the operation is blocked; bdrv_op_unblock() matches entries by
     * this pointer, and the blocker code here never frees it */
    Error *reason;
    /* linkage in bs->op_blockers[op] */
    QLIST_ENTRY(BdrvOpBlocker) list;
};
5502 
5503 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5504 {
5505     BdrvOpBlocker *blocker;
5506     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5507     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5508         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5509         if (errp) {
5510             error_setg(errp, "Device '%s' is busy: %s",
5511                        bdrv_get_device_name(bs),
5512                        error_get_pretty(blocker->reason));
5513         }
5514         return true;
5515     }
5516     return false;
5517 }
5518 
5519 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5520 {
5521     BdrvOpBlocker *blocker;
5522     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5523 
5524     blocker = g_new0(BdrvOpBlocker, 1);
5525     blocker->reason = reason;
5526     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5527 }
5528 
5529 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5530 {
5531     BdrvOpBlocker *blocker, *next;
5532     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5533     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5534         if (blocker->reason == reason) {
5535             QLIST_REMOVE(blocker, list);
5536             g_free(blocker);
5537         }
5538     }
5539 }
5540 
5541 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5542 {
5543     int i;
5544     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5545         bdrv_op_block(bs, i, reason);
5546     }
5547 }
5548 
5549 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5550 {
5551     int i;
5552     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5553         bdrv_op_unblock(bs, i, reason);
5554     }
5555 }
5556 
5557 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5558 {
5559     int i;
5560 
5561     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5562         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5563             return false;
5564         }
5565     }
5566     return true;
5567 }
5568 
/* Enable I/O status tracking on @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5574 
5575 /* The I/O status is only enabled if the drive explicitly
5576  * enables it _and_ the VM is configured to stop on errors */
5577 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5578 {
5579     return (bs->iostatus_enabled &&
5580            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5581             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5582             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5583 }
5584 
/* Disable I/O status tracking on @bs; the stored status is left unchanged. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5589 
5590 void bdrv_iostatus_reset(BlockDriverState *bs)
5591 {
5592     if (bdrv_iostatus_is_enabled(bs)) {
5593         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5594         if (bs->job) {
5595             block_job_iostatus_reset(bs->job);
5596         }
5597     }
5598 }
5599 
5600 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5601 {
5602     assert(bdrv_iostatus_is_enabled(bs));
5603     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5604         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5605                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5606     }
5607 }
5608 
/*
 * Create a new image file.
 *
 * @filename:      name of the image to create; also used to look up the
 *                 protocol driver
 * @fmt:           name of the format driver
 * @base_filename: if non-NULL, stored as the backing file option
 * @base_fmt:      if non-NULL, stored as the backing format option
 * @options:       if non-NULL, an option string parsed into the creation
 *                 options ("-o" options)
 * @img_size:      default value for the size option
 * @flags:         open flags; used (minus BDRV_O_RDWR, BDRV_O_SNAPSHOT and
 *                 BDRV_O_NO_BACKING) to open the backing file when its size
 *                 has to be queried
 * @errp:          set on failure
 * @quiet:         when false, a "Formatting ..." line is printed to stdout
 *
 * The creation options of the format and protocol drivers are merged, filled
 * in from the arguments, and passed to bdrv_create().  If the resulting size
 * option is -1 and a backing file is configured, the size is taken from the
 * backing file.
 */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QemuOptsList *create_opts = NULL;
    QemuOpts *opts = NULL;
    const char *backing_fmt, *backing_file;
    int64_t size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true, errp);
    if (!proto_drv) {
        return;
    }

    if (!drv->create_opts) {
        error_setg(errp, "Format driver '%s' does not support image creation",
                   drv->format_name);
        return;
    }

    if (!proto_drv->create_opts) {
        error_setg(errp, "Protocol driver '%s' does not support image creation",
                   proto_drv->format_name);
        return;
    }

    /* Merge the option lists of the format and the protocol driver */
    create_opts = qemu_opts_append(create_opts, drv->create_opts);
    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);

    /* Create parameter list with default values */
    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);

    /* Parse -o options */
    if (options) {
        qemu_opts_do_parse(opts, options, NULL, &local_err);
        if (local_err) {
            /* Report the parser's message directly and hand the caller a
             * generic error instead */
            error_report_err(local_err);
            local_err = NULL;
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, &local_err);
        if (local_err) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    /* Read the backing file back from opts: it may come from @base_filename
     * or from the parsed @options */
    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
    if (backing_file) {
        if (!strcmp(filename, backing_file)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt) {
        backing_drv = bdrv_find_format(backing_fmt);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there */
    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
    /* NOTE(review): -1 presumably means "no size given" (callers passing
     * (uint64_t)-1 as @img_size) - confirm against the callers */
    if (size == -1) {
        if (backing_file) {
            BlockDriverState *bs;
            char *full_backing = g_new0(char, PATH_MAX);
            /* shadows the outer 'size'; only used for the backing length */
            int64_t size;
            int back_flags;

            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
                                                         full_backing, PATH_MAX,
                                                         &local_err);
            if (local_err) {
                g_free(full_backing);
                goto out;
            }

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, full_backing, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            g_free(full_backing);
            if (ret < 0) {
                goto out;
            }
            size = bdrv_getlength(bs);
            if (size < 0) {
                error_setg_errno(errp, -size, "Could not get size of '%s'",
                                 backing_file);
                bdrv_unref(bs);
                goto out;
            }

            qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s", filename, fmt);
        qemu_opts_print(opts, " ");
        puts("");
    }

    ret = bdrv_create(drv, filename, opts, &local_err);

    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    /* Common cleanup; any error still held in local_err is handed to errp */
    qemu_opts_del(opts);
    qemu_opts_free(create_opts);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}
5777 
5778 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5779 {
5780     return bs->aio_context;
5781 }
5782 
5783 void bdrv_detach_aio_context(BlockDriverState *bs)
5784 {
5785     BdrvAioNotifier *baf;
5786 
5787     if (!bs->drv) {
5788         return;
5789     }
5790 
5791     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5792         baf->detach_aio_context(baf->opaque);
5793     }
5794 
5795     if (bs->io_limits_enabled) {
5796         throttle_detach_aio_context(&bs->throttle_state);
5797     }
5798     if (bs->drv->bdrv_detach_aio_context) {
5799         bs->drv->bdrv_detach_aio_context(bs);
5800     }
5801     if (bs->file) {
5802         bdrv_detach_aio_context(bs->file);
5803     }
5804     if (bs->backing_hd) {
5805         bdrv_detach_aio_context(bs->backing_hd);
5806     }
5807 
5808     bs->aio_context = NULL;
5809 }
5810 
5811 void bdrv_attach_aio_context(BlockDriverState *bs,
5812                              AioContext *new_context)
5813 {
5814     BdrvAioNotifier *ban;
5815 
5816     if (!bs->drv) {
5817         return;
5818     }
5819 
5820     bs->aio_context = new_context;
5821 
5822     if (bs->backing_hd) {
5823         bdrv_attach_aio_context(bs->backing_hd, new_context);
5824     }
5825     if (bs->file) {
5826         bdrv_attach_aio_context(bs->file, new_context);
5827     }
5828     if (bs->drv->bdrv_attach_aio_context) {
5829         bs->drv->bdrv_attach_aio_context(bs, new_context);
5830     }
5831     if (bs->io_limits_enabled) {
5832         throttle_attach_aio_context(&bs->throttle_state, new_context);
5833     }
5834 
5835     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5836         ban->attached_aio_context(new_context, ban->opaque);
5837     }
5838 }
5839 
5840 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5841 {
5842     bdrv_drain_all(); /* ensure there are no in-flight requests */
5843 
5844     bdrv_detach_aio_context(bs);
5845 
5846     /* This function executes in the old AioContext so acquire the new one in
5847      * case it runs in a different thread.
5848      */
5849     aio_context_acquire(new_context);
5850     bdrv_attach_aio_context(bs, new_context);
5851     aio_context_release(new_context);
5852 }
5853 
5854 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5855         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5856         void (*detach_aio_context)(void *opaque), void *opaque)
5857 {
5858     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5859     *ban = (BdrvAioNotifier){
5860         .attached_aio_context = attached_aio_context,
5861         .detach_aio_context   = detach_aio_context,
5862         .opaque               = opaque
5863     };
5864 
5865     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5866 }
5867 
5868 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5869                                       void (*attached_aio_context)(AioContext *,
5870                                                                    void *),
5871                                       void (*detach_aio_context)(void *),
5872                                       void *opaque)
5873 {
5874     BdrvAioNotifier *ban, *ban_next;
5875 
5876     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5877         if (ban->attached_aio_context == attached_aio_context &&
5878             ban->detach_aio_context   == detach_aio_context   &&
5879             ban->opaque               == opaque)
5880         {
5881             QLIST_REMOVE(ban, list);
5882             g_free(ban);
5883 
5884             return;
5885         }
5886     }
5887 
5888     abort();
5889 }
5890 
5891 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5892                                     NotifierWithReturn *notifier)
5893 {
5894     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5895 }
5896 
5897 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5898                        BlockDriverAmendStatusCB *status_cb)
5899 {
5900     if (!bs->drv->bdrv_amend_options) {
5901         return -ENOTSUP;
5902     }
5903     return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5904 }
5905 
5906 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5907  * of block filter and by bdrv_is_first_non_filter.
5908  * It is used to test if the given bs is the candidate or recurse more in the
5909  * node graph.
5910  */
5911 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5912                                       BlockDriverState *candidate)
5913 {
5914     /* return false if basic checks fails */
5915     if (!bs || !bs->drv) {
5916         return false;
5917     }
5918 
5919     /* the code reached a non block filter driver -> check if the bs is
5920      * the same as the candidate. It's the recursion termination condition.
5921      */
5922     if (!bs->drv->is_filter) {
5923         return bs == candidate;
5924     }
5925     /* Down this path the driver is a block filter driver */
5926 
5927     /* If the block filter recursion method is defined use it to recurse down
5928      * the node graph.
5929      */
5930     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5931         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5932     }
5933 
5934     /* the driver is a block filter but don't allow to recurse -> return false
5935      */
5936     return false;
5937 }
5938 
5939 /* This function checks if the candidate is the first non filter bs down it's
5940  * bs chain. Since we don't have pointers to parents it explore all bs chains
5941  * from the top. Some filters can choose not to pass down the recursion.
5942  */
5943 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5944 {
5945     BlockDriverState *bs;
5946 
5947     /* walk down the bs forest recursively */
5948     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5949         bool perm;
5950 
5951         /* try to recurse in this top level bs */
5952         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5953 
5954         /* candidate is the first non filter */
5955         if (perm) {
5956             return true;
5957         }
5958     }
5959 
5960     return false;
5961 }
5962 
5963 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5964 {
5965     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5966     AioContext *aio_context;
5967 
5968     if (!to_replace_bs) {
5969         error_setg(errp, "Node name '%s' not found", node_name);
5970         return NULL;
5971     }
5972 
5973     aio_context = bdrv_get_aio_context(to_replace_bs);
5974     aio_context_acquire(aio_context);
5975 
5976     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5977         to_replace_bs = NULL;
5978         goto out;
5979     }
5980 
5981     /* We don't want arbitrary node of the BDS chain to be replaced only the top
5982      * most non filter in order to prevent data corruption.
5983      * Another benefit is that this tests exclude backing files which are
5984      * blocked by the backing blockers.
5985      */
5986     if (!bdrv_is_first_non_filter(to_replace_bs)) {
5987         error_setg(errp, "Only top most non filter can be replaced");
5988         to_replace_bs = NULL;
5989         goto out;
5990     }
5991 
5992 out:
5993     aio_context_release(aio_context);
5994     return to_replace_bs;
5995 }
5996 
5997 void bdrv_io_plug(BlockDriverState *bs)
5998 {
5999     BlockDriver *drv = bs->drv;
6000     if (drv && drv->bdrv_io_plug) {
6001         drv->bdrv_io_plug(bs);
6002     } else if (bs->file) {
6003         bdrv_io_plug(bs->file);
6004     }
6005 }
6006 
6007 void bdrv_io_unplug(BlockDriverState *bs)
6008 {
6009     BlockDriver *drv = bs->drv;
6010     if (drv && drv->bdrv_io_unplug) {
6011         drv->bdrv_io_unplug(bs);
6012     } else if (bs->file) {
6013         bdrv_io_unplug(bs->file);
6014     }
6015 }
6016 
6017 void bdrv_flush_io_queue(BlockDriverState *bs)
6018 {
6019     BlockDriver *drv = bs->drv;
6020     if (drv && drv->bdrv_flush_io_queue) {
6021         drv->bdrv_flush_io_queue(bs);
6022     } else if (bs->file) {
6023         bdrv_flush_io_queue(bs->file);
6024     }
6025 }
6026 
6027 static bool append_open_options(QDict *d, BlockDriverState *bs)
6028 {
6029     const QDictEntry *entry;
6030     bool found_any = false;
6031 
6032     for (entry = qdict_first(bs->options); entry;
6033          entry = qdict_next(bs->options, entry))
6034     {
6035         /* Only take options for this level and exclude all non-driver-specific
6036          * options */
6037         if (!strchr(qdict_entry_key(entry), '.') &&
6038             strcmp(qdict_entry_key(entry), "node-name"))
6039         {
6040             qobject_incref(qdict_entry_value(entry));
6041             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
6042             found_any = true;
6043         }
6044     }
6045 
6046     return found_any;
6047 }
6048 
6049 /* Updates the following BDS fields:
6050  *  - exact_filename: A filename which may be used for opening a block device
6051  *                    which (mostly) equals the given BDS (even without any
6052  *                    other options; so reading and writing must return the same
6053  *                    results, but caching etc. may be different)
6054  *  - full_open_options: Options which, when given when opening a block device
6055  *                       (without a filename), result in a BDS (mostly)
6056  *                       equalling the given one
6057  *  - filename: If exact_filename is set, it is copied here. Otherwise,
6058  *              full_open_options is converted to a JSON object, prefixed with
6059  *              "json:" (for use through the JSON pseudo protocol) and put here.
6060  */
6061 void bdrv_refresh_filename(BlockDriverState *bs)
6062 {
6063     BlockDriver *drv = bs->drv;
6064     QDict *opts;
6065 
6066     if (!drv) {
6067         return;
6068     }
6069 
6070     /* This BDS's file name will most probably depend on its file's name, so
6071      * refresh that first */
6072     if (bs->file) {
6073         bdrv_refresh_filename(bs->file);
6074     }
6075 
6076     if (drv->bdrv_refresh_filename) {
6077         /* Obsolete information is of no use here, so drop the old file name
6078          * information before refreshing it */
6079         bs->exact_filename[0] = '\0';
6080         if (bs->full_open_options) {
6081             QDECREF(bs->full_open_options);
6082             bs->full_open_options = NULL;
6083         }
6084 
6085         drv->bdrv_refresh_filename(bs);
6086     } else if (bs->file) {
6087         /* Try to reconstruct valid information from the underlying file */
6088         bool has_open_options;
6089 
6090         bs->exact_filename[0] = '\0';
6091         if (bs->full_open_options) {
6092             QDECREF(bs->full_open_options);
6093             bs->full_open_options = NULL;
6094         }
6095 
6096         opts = qdict_new();
6097         has_open_options = append_open_options(opts, bs);
6098 
6099         /* If no specific options have been given for this BDS, the filename of
6100          * the underlying file should suffice for this one as well */
6101         if (bs->file->exact_filename[0] && !has_open_options) {
6102             strcpy(bs->exact_filename, bs->file->exact_filename);
6103         }
6104         /* Reconstructing the full options QDict is simple for most format block
6105          * drivers, as long as the full options are known for the underlying
6106          * file BDS. The full options QDict of that file BDS should somehow
6107          * contain a representation of the filename, therefore the following
6108          * suffices without querying the (exact_)filename of this BDS. */
6109         if (bs->file->full_open_options) {
6110             qdict_put_obj(opts, "driver",
6111                           QOBJECT(qstring_from_str(drv->format_name)));
6112             QINCREF(bs->file->full_open_options);
6113             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6114 
6115             bs->full_open_options = opts;
6116         } else {
6117             QDECREF(opts);
6118         }
6119     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6120         /* There is no underlying file BDS (at least referenced by BDS.file),
6121          * so the full options QDict should be equal to the options given
6122          * specifically for this block device when it was opened (plus the
6123          * driver specification).
6124          * Because those options don't change, there is no need to update
6125          * full_open_options when it's already set. */
6126 
6127         opts = qdict_new();
6128         append_open_options(opts, bs);
6129         qdict_put_obj(opts, "driver",
6130                       QOBJECT(qstring_from_str(drv->format_name)));
6131 
6132         if (bs->exact_filename[0]) {
6133             /* This may not work for all block protocol drivers (some may
6134              * require this filename to be parsed), but we have to find some
6135              * default solution here, so just include it. If some block driver
6136              * does not support pure options without any filename at all or
6137              * needs some special format of the options QDict, it needs to
6138              * implement the driver-specific bdrv_refresh_filename() function.
6139              */
6140             qdict_put_obj(opts, "filename",
6141                           QOBJECT(qstring_from_str(bs->exact_filename)));
6142         }
6143 
6144         bs->full_open_options = opts;
6145     }
6146 
6147     if (bs->exact_filename[0]) {
6148         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6149     } else if (bs->full_open_options) {
6150         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6151         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6152                  qstring_get_str(json));
6153         QDECREF(json);
6154     }
6155 }
6156 
6157 /* This accessor function purpose is to allow the device models to access the
6158  * BlockAcctStats structure embedded inside a BlockDriverState without being
6159  * aware of the BlockDriverState structure layout.
6160  * It will go away when the BlockAcctStats structure will be moved inside
6161  * the device models.
6162  */
6163 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6164 {
6165     return &bs->stats;
6166 }
6167