/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

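/* Example (illustrative sketch, not part of the original file): a caller that
 * wants to throttle a drive enables throttling first and then installs a
 * configuration.  The ThrottleConfig member names below are assumed from
 * util/throttle.h and the limit values are hypothetical:
 *
 *     ThrottleConfig cfg = {
 *         .buckets[THROTTLE_BPS_TOTAL] = { .avg = 10 * 1024 * 1024 },
 *         .buckets[THROTTLE_OPS_TOTAL] = { .avg = 100 },
 *     };
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */
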
/* This function makes an IO wait if needed
 *
 * @bytes:    the number of bytes of the IO
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this IO have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue the IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

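/* Example (sketch, not from the original source): callers allocating bounce
 * buffers for direct I/O are expected to respect this alignment, e.g. with
 * qemu_memalign():
 *
 *     void *buf = qemu_memalign(bdrv_opt_mem_align(bs), len);
 *     ...
 *     qemu_vfree(buf);
 */
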
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

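/* For illustration (hypothetical inputs): path_has_protocol("nbd:0.0.0.0:1234")
 * returns non-zero, while path_has_protocol("/var/images/disk.img") returns 0;
 * path_is_absolute("/var/images/disk.img") returns 1 and
 * path_is_absolute("disk.img") returns 0.
 */
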
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

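/* Example (hypothetical paths): combining a relative backing file name with
 * the location of its overlay:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "base.qcow2");
 *     // dest now holds "/images/base.qcow2"
 */
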
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

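/* Example (sketch with hypothetical values): creating a 1 GiB qcow2 image,
 * using the same QEMUOptionParameter helpers this file uses in
 * bdrv_append_temp_snapshot() below:
 *
 *     Error *err = NULL;
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     bdrv_create(drv, "/tmp/test.qcow2", opts, &err);
 *     free_option_parameters(opts);
 */
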
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

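/* Example (sketch): callers typically allocate a PATH_MAX-sized buffer, as
 * bdrv_append_temp_snapshot() below does:
 *
 *     char *tmp = g_malloc0(PATH_MAX + 1);
 *     if (get_tmp_filename(tmp, PATH_MAX + 1) < 0) {
 *         ... handle error ...
 *     }
 */
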
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

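/* For illustration (hypothetical filenames): "nbd:localhost:10809" resolves
 * to the driver whose protocol_name is "nbd", while a plain path such as
 * "/tmp/disk.img" (no "proto:" prefix) falls back to the "file" driver.
 */
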
static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

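/* Example (sketch): parsing a user-supplied discard mode into open flags:
 *
 *     int flags = bs->open_flags;
 *     if (bdrv_parse_discard_flags("unmap", &flags) < 0) {
 *         ... report invalid discard mode ...
 *     }
 */
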
/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

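/* The resulting flag combinations, for reference:
 *
 *     mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *     writethrough        -                -                -
 *     writeback           -                x                -
 *     none/off            x                x                -
 *     directsync          x                -                -
 *     unsafe              -                x                x
 */
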
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called with a protocol directly as drv. That layer is
     * already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        parse_filename = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

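/* Example (sketch with hypothetical options, not taken from this file): given
 * a flattened options QDict containing "backing.driver=qcow2" and
 * "backing.file.filename=/img/base", a caller could open that referenced
 * image with:
 *
 *     BlockDriverState *backing = NULL;
 *     ret = bdrv_open_image(&backing, NULL, options, "backing",
 *                           bdrv_backing_flags(flags), true, &local_err);
 */
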
void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QEMUOptionParameter *create_options;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }
    total_size &= BDRV_SECTOR_MASK;

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                             NULL);

    set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

    ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
    free_option_parameters(create_options);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            ret = -EINVAL;
            goto fail;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(options, json_options, false);
        QDECREF(json_options);
        filename = NULL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }
    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_flags = bdrv_temp_snapshot_flags(flags);
        flags = bdrv_backing_flags(flags);
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

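/* Example (sketch with hypothetical values): opening a qcow2 image read-write
 * with format probing bypassed by naming the driver in the options QDict,
 * mirroring how this file itself builds option dictionaries:
 *
 *     BlockDriverState *bs = NULL;
 *     QDict *opts = qdict_new();
 *     qdict_put(opts, "driver", qstring_from_str("qcow2"));
 *     ret = bdrv_open(&bs, "/img/guest.qcow2", NULL, opts,
 *                     BDRV_O_RDWR, NULL, &err);
 */
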
typedef struct BlockReopenQueueEntry {
     bool prepared;
     BDRVReopenState state;
     QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be
 * passed back in for subsequent calls that are intended to be of the same
 * atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

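/* Example (sketch): toggling a single device to read-only through the reopen
 * machinery; bdrv_reopen() wraps the queue/prepare/commit sequence shown
 * above:
 *
 *     Error *err = NULL;
 *     int new_flags = bs->open_flags & ~BDRV_O_RDWR;
 *     if (bdrv_reopen(bs, new_flags, &err) < 0) {
 *         ... err describes why the reopen was rolled back ...
 *     }
 */
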
1657 
1658 /*
1659  * Prepares a BlockDriverState for reopen. All changes are staged in the
1660  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1661  * the block driver layer .bdrv_reopen_prepare()
1662  *
1663  * bs is the BlockDriverState to reopen
1664  * flags are the new open flags
1665  * queue is the reopen queue
1666  *
1667  * Returns 0 on success, non-zero on error.  On error errp will be set
1668  * as well.
1669  *
1670  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1671  * It is the responsibility of the caller to then call the abort() or
1672  * commit() for any other BDS that have been left in a prepare() state
1673  *
1674  */
1675 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1676                         Error **errp)
1677 {
1678     int ret = -1;
1679     Error *local_err = NULL;
1680     BlockDriver *drv;
1681 
1682     assert(reopen_state != NULL);
1683     assert(reopen_state->bs->drv != NULL);
1684     drv = reopen_state->bs->drv;
1685 
1686     /* if we are to stay read-only, do not allow permission change
1687      * to r/w */
1688     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1689         reopen_state->flags & BDRV_O_RDWR) {
1690         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1691                   reopen_state->bs->device_name);
1692         goto error;
1693     }
1694 
1695 
1696     ret = bdrv_flush(reopen_state->bs);
1697     if (ret) {
1698         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1699                   strerror(-ret));
1700         goto error;
1701     }
1702 
1703     if (drv->bdrv_reopen_prepare) {
1704         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1705         if (ret) {
1706             if (local_err != NULL) {
1707                 error_propagate(errp, local_err);
1708             } else {
1709                 error_setg(errp, "failed while preparing to reopen image '%s'",
1710                            reopen_state->bs->filename);
1711             }
1712             goto error;
1713         }
1714     } else {
1715         /* It is currently mandatory to have a bdrv_reopen_prepare()
1716          * handler for each supported drv. */
1717         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1718                   drv->format_name, reopen_state->bs->device_name,
1719                  "reopening of file");
1720         ret = -1;
1721         goto error;
1722     }
1723 
1724     ret = 0;
1725 
1726 error:
1727     return ret;
1728 }
1729 
1730 /*
1731  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1732  * makes them final by swapping the staging BlockDriverState contents into
1733  * the active BlockDriverState contents.
1734  */
1735 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1736 {
1737     BlockDriver *drv;
1738 
1739     assert(reopen_state != NULL);
1740     drv = reopen_state->bs->drv;
1741     assert(drv != NULL);
1742 
    /* If there are any driver-level actions to take */
1744     if (drv->bdrv_reopen_commit) {
1745         drv->bdrv_reopen_commit(reopen_state);
1746     }
1747 
1748     /* set BDS specific flags now */
1749     reopen_state->bs->open_flags         = reopen_state->flags;
1750     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1751                                               BDRV_O_CACHE_WB);
1752     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1753 
1754     bdrv_refresh_limits(reopen_state->bs);
1755 }
1756 
1757 /*
1758  * Abort the reopen, and delete and free the staged changes in
1759  * reopen_state
1760  */
1761 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1762 {
1763     BlockDriver *drv;
1764 
1765     assert(reopen_state != NULL);
1766     drv = reopen_state->bs->drv;
1767     assert(drv != NULL);
1768 
1769     if (drv->bdrv_reopen_abort) {
1770         drv->bdrv_reopen_abort(reopen_state);
1771     }
1772 }
1773 
1774 
1775 void bdrv_close(BlockDriverState *bs)
1776 {
1777     if (bs->job) {
1778         block_job_cancel_sync(bs->job);
1779     }
1780     bdrv_drain_all(); /* complete I/O */
1781     bdrv_flush(bs);
1782     bdrv_drain_all(); /* in case flush left pending I/O */
1783     notifier_list_notify(&bs->close_notifiers, bs);
1784 
1785     if (bs->drv) {
1786         if (bs->backing_hd) {
1787             bdrv_unref(bs->backing_hd);
1788             bs->backing_hd = NULL;
1789         }
1790         bs->drv->bdrv_close(bs);
1791         g_free(bs->opaque);
1792         bs->opaque = NULL;
1793         bs->drv = NULL;
1794         bs->copy_on_read = 0;
1795         bs->backing_file[0] = '\0';
1796         bs->backing_format[0] = '\0';
1797         bs->total_sectors = 0;
1798         bs->encrypted = 0;
1799         bs->valid_key = 0;
1800         bs->sg = 0;
1801         bs->growable = 0;
1802         bs->zero_beyond_eof = false;
1803         QDECREF(bs->options);
1804         bs->options = NULL;
1805 
1806         if (bs->file != NULL) {
1807             bdrv_unref(bs->file);
1808             bs->file = NULL;
1809         }
1810     }
1811 
1812     bdrv_dev_change_media_cb(bs, false);
1813 
    /* throttling disk I/O limits */
1815     if (bs->io_limits_enabled) {
1816         bdrv_io_limits_disable(bs);
1817     }
1818 }
1819 
1820 void bdrv_close_all(void)
1821 {
1822     BlockDriverState *bs;
1823 
1824     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1825         bdrv_close(bs);
1826     }
1827 }
1828 
1829 /* Check if any requests are in-flight (including throttled requests) */
1830 static bool bdrv_requests_pending(BlockDriverState *bs)
1831 {
1832     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1833         return true;
1834     }
1835     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1836         return true;
1837     }
1838     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1839         return true;
1840     }
1841     if (bs->file && bdrv_requests_pending(bs->file)) {
1842         return true;
1843     }
1844     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1845         return true;
1846     }
1847     return false;
1848 }
1849 
1850 static bool bdrv_requests_pending_all(void)
1851 {
1852     BlockDriverState *bs;
1853     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1854         if (bdrv_requests_pending(bs)) {
1855             return true;
1856         }
1857     }
1858     return false;
1859 }
1860 
1861 /*
1862  * Wait for pending requests to complete across all BlockDriverStates
1863  *
1864  * This function does not flush data to disk, use bdrv_flush_all() for that
1865  * after calling this function.
1866  *
1867  * Note that completion of an asynchronous I/O operation can trigger any
1868  * number of other I/O operations on other devices---for example a coroutine
1869  * can be arbitrarily complex and a constant flow of I/O can come until the
1870  * coroutine is complete.  Because of this, it is not possible to have a
1871  * function to drain a single device's I/O queue.
1872  */
1873 void bdrv_drain_all(void)
1874 {
1875     /* Always run first iteration so any pending completion BHs run */
1876     bool busy = true;
1877     BlockDriverState *bs;
1878 
1879     while (busy) {
1880         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1881             bdrv_start_throttled_reqs(bs);
1882         }
1883 
1884         busy = bdrv_requests_pending_all();
1885         busy |= aio_poll(qemu_get_aio_context(), busy);
1886     }
1887 }
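
/*
 * Illustrative sketch (an addition, not part of the original code): per the
 * comment above, draining does not flush, so callers that want data on disk
 * typically pair the two calls.
 */
static void example_quiesce_and_flush(void)
{
    bdrv_drain_all();   /* wait for in-flight and throttled requests */
    bdrv_flush_all();   /* then flush every device's write cache */
}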
1888 
/* make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.
 * Also, clear the device_name to prevent a double remove */
1892 void bdrv_make_anon(BlockDriverState *bs)
1893 {
1894     if (bs->device_name[0] != '\0') {
1895         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1896     }
1897     bs->device_name[0] = '\0';
1898     if (bs->node_name[0] != '\0') {
1899         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1900     }
1901     bs->node_name[0] = '\0';
1902 }
1903 
1904 static void bdrv_rebind(BlockDriverState *bs)
1905 {
1906     if (bs->drv && bs->drv->bdrv_rebind) {
1907         bs->drv->bdrv_rebind(bs);
1908     }
1909 }
1910 
1911 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1912                                      BlockDriverState *bs_src)
1913 {
1914     /* move some fields that need to stay attached to the device */
1915 
1916     /* dev info */
1917     bs_dest->dev_ops            = bs_src->dev_ops;
1918     bs_dest->dev_opaque         = bs_src->dev_opaque;
1919     bs_dest->dev                = bs_src->dev;
1920     bs_dest->guest_block_size   = bs_src->guest_block_size;
1921     bs_dest->copy_on_read       = bs_src->copy_on_read;
1922 
1923     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1924 
1925     /* i/o throttled req */
1926     memcpy(&bs_dest->throttle_state,
1927            &bs_src->throttle_state,
1928            sizeof(ThrottleState));
1929     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1930     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1931     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1932 
1933     /* r/w error */
1934     bs_dest->on_read_error      = bs_src->on_read_error;
1935     bs_dest->on_write_error     = bs_src->on_write_error;
1936 
1937     /* i/o status */
1938     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1939     bs_dest->iostatus           = bs_src->iostatus;
1940 
1941     /* dirty bitmap */
1942     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1943 
1944     /* reference count */
1945     bs_dest->refcnt             = bs_src->refcnt;
1946 
1947     /* job */
1948     bs_dest->in_use             = bs_src->in_use;
1949     bs_dest->job                = bs_src->job;
1950 
1951     /* keep the same entry in bdrv_states */
1952     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1953             bs_src->device_name);
1954     bs_dest->device_list = bs_src->device_list;
1955 }
1956 
1957 /*
1958  * Swap bs contents for two image chains while they are live,
1959  * while keeping required fields on the BlockDriverState that is
1960  * actually attached to a device.
1961  *
1962  * This will modify the BlockDriverState fields, and swap contents
1963  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1964  *
1965  * bs_new is required to be anonymous.
1966  *
1967  * This function does not create any image files.
1968  */
1969 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1970 {
1971     BlockDriverState tmp;
1972 
    /* The code needs to swap the node_name, but simply swapping node_list
     * won't work, so first remove the nodes from the graph list, do the swap,
     * and then insert them back if needed.
1976      */
1977     if (bs_new->node_name[0] != '\0') {
1978         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1979     }
1980     if (bs_old->node_name[0] != '\0') {
1981         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1982     }
1983 
1984     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1985     assert(bs_new->device_name[0] == '\0');
1986     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1987     assert(bs_new->job == NULL);
1988     assert(bs_new->dev == NULL);
1989     assert(bs_new->in_use == 0);
1990     assert(bs_new->io_limits_enabled == false);
1991     assert(!throttle_have_timer(&bs_new->throttle_state));
1992 
1993     tmp = *bs_new;
1994     *bs_new = *bs_old;
1995     *bs_old = tmp;
1996 
1997     /* there are some fields that should not be swapped, move them back */
1998     bdrv_move_feature_fields(&tmp, bs_old);
1999     bdrv_move_feature_fields(bs_old, bs_new);
2000     bdrv_move_feature_fields(bs_new, &tmp);
2001 
2002     /* bs_new shouldn't be in bdrv_states even after the swap!  */
2003     assert(bs_new->device_name[0] == '\0');
2004 
2005     /* Check a few fields that should remain attached to the device */
2006     assert(bs_new->dev == NULL);
2007     assert(bs_new->job == NULL);
2008     assert(bs_new->in_use == 0);
2009     assert(bs_new->io_limits_enabled == false);
2010     assert(!throttle_have_timer(&bs_new->throttle_state));
2011 
2012     /* insert the nodes back into the graph node list if needed */
2013     if (bs_new->node_name[0] != '\0') {
2014         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2015     }
2016     if (bs_old->node_name[0] != '\0') {
2017         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2018     }
2019 
2020     bdrv_rebind(bs_new);
2021     bdrv_rebind(bs_old);
2022 }
2023 
2024 /*
2025  * Add new bs contents at the top of an image chain while the chain is
2026  * live, while keeping required fields on the top layer.
2027  *
2028  * This will modify the BlockDriverState fields, and swap contents
2029  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2030  *
2031  * bs_new is required to be anonymous.
2032  *
2033  * This function does not create any image files.
2034  */
2035 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2036 {
2037     bdrv_swap(bs_new, bs_top);
2038 
    /* After bdrv_swap(), bs_top holds the new image's state and bs_new
     * holds the old top; link the old top in as the backing file. */
2041     bs_top->backing_hd = bs_new;
2042     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
2043     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
2044             bs_new->filename);
2045     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
2046             bs_new->drv ? bs_new->drv->format_name : "");
2047 }
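
/*
 * Worked example for bdrv_append() (illustrative): starting from the chain
 *
 *     base <- top          (guest device attached to 'top')
 *
 * calling bdrv_append(new, top) with an anonymous 'new' leaves the device
 * attached to the same BlockDriverState pointer while the freshly opened
 * image becomes the active layer:
 *
 *     base <- old top contents (now in 'new') <- new image (now in 'top')
 *
 * This is how live external snapshots are taken.
 */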
2048 
2049 static void bdrv_delete(BlockDriverState *bs)
2050 {
2051     assert(!bs->dev);
2052     assert(!bs->job);
2053     assert(!bs->in_use);
2054     assert(!bs->refcnt);
2055     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2056 
2057     bdrv_close(bs);
2058 
2059     /* remove from list, if necessary */
2060     bdrv_make_anon(bs);
2061 
2062     g_free(bs);
2063 }
2064 
2065 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2066 /* TODO change to DeviceState *dev when all users are qdevified */
2067 {
2068     if (bs->dev) {
2069         return -EBUSY;
2070     }
2071     bs->dev = dev;
2072     bdrv_iostatus_reset(bs);
2073     return 0;
2074 }
2075 
2076 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2077 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2078 {
2079     if (bdrv_attach_dev(bs, dev) < 0) {
2080         abort();
2081     }
2082 }
2083 
2084 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2085 /* TODO change to DeviceState *dev when all users are qdevified */
2086 {
2087     assert(bs->dev == dev);
2088     bs->dev = NULL;
2089     bs->dev_ops = NULL;
2090     bs->dev_opaque = NULL;
2091     bs->guest_block_size = 512;
2092 }
2093 
2094 /* TODO change to return DeviceState * when all users are qdevified */
2095 void *bdrv_get_attached_dev(BlockDriverState *bs)
2096 {
2097     return bs->dev;
2098 }
2099 
2100 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2101                       void *opaque)
2102 {
2103     bs->dev_ops = ops;
2104     bs->dev_opaque = opaque;
2105 }
2106 
2107 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2108                                enum MonitorEvent ev,
2109                                BlockErrorAction action, bool is_read)
2110 {
2111     QObject *data;
2112     const char *action_str;
2113 
2114     switch (action) {
2115     case BDRV_ACTION_REPORT:
2116         action_str = "report";
2117         break;
2118     case BDRV_ACTION_IGNORE:
2119         action_str = "ignore";
2120         break;
2121     case BDRV_ACTION_STOP:
2122         action_str = "stop";
2123         break;
2124     default:
2125         abort();
2126     }
2127 
2128     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2129                               bdrv->device_name,
2130                               action_str,
2131                               is_read ? "read" : "write");
2132     monitor_protocol_event(ev, data);
2133 
2134     qobject_decref(data);
2135 }
2136 
2137 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2138 {
2139     QObject *data;
2140 
2141     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2142                               bdrv_get_device_name(bs), ejected);
2143     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2144 
2145     qobject_decref(data);
2146 }
2147 
2148 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2149 {
2150     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2151         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2152         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2153         if (tray_was_closed) {
2154             /* tray open */
2155             bdrv_emit_qmp_eject_event(bs, true);
2156         }
2157         if (load) {
2158             /* tray close */
2159             bdrv_emit_qmp_eject_event(bs, false);
2160         }
2161     }
2162 }
2163 
2164 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2165 {
2166     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2167 }
2168 
2169 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2170 {
2171     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2172         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2173     }
2174 }
2175 
2176 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2177 {
2178     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2179         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2180     }
2181     return false;
2182 }
2183 
2184 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2185 {
2186     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2187         bs->dev_ops->resize_cb(bs->dev_opaque);
2188     }
2189 }
2190 
2191 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2192 {
2193     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2194         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2195     }
2196     return false;
2197 }
2198 
2199 /*
2200  * Run consistency checks on an image
2201  *
2202  * Returns 0 if the check could be completed (it doesn't mean that the image is
2203  * free of errors) or -errno when an internal error occurred. The results of the
2204  * check are stored in res.
2205  */
2206 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2207 {
2208     if (bs->drv->bdrv_check == NULL) {
2209         return -ENOTSUP;
2210     }
2211 
2212     memset(res, 0, sizeof(*res));
2213     return bs->drv->bdrv_check(bs, res, fix);
2214 }
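
/*
 * Illustrative usage sketch (an addition, not part of the original code):
 * run a report-only consistency check.  The BdrvCheckResult fields used
 * here (corruptions, leaks, check_errors) are assumed from block.h.
 */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, 0);   /* fix = 0: report only */

    if (ret < 0) {
        return ret;   /* e.g. -ENOTSUP if the driver has no bdrv_check */
    }
    error_report("check: %d corruptions, %d leaks, %d internal errors",
                 res.corruptions, res.leaks, res.check_errors);
    return (res.corruptions || res.check_errors) ? -EIO : 0;
}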
2215 
2216 #define COMMIT_BUF_SECTORS 2048
2217 
2218 /* commit COW file into the raw image */
2219 int bdrv_commit(BlockDriverState *bs)
2220 {
2221     BlockDriver *drv = bs->drv;
2222     int64_t sector, total_sectors, length, backing_length;
2223     int n, ro, open_flags;
2224     int ret = 0;
2225     uint8_t *buf = NULL;
2226     char filename[PATH_MAX];
2227 
    if (!drv) {
        return -ENOMEDIUM;
    }
2230 
2231     if (!bs->backing_hd) {
2232         return -ENOTSUP;
2233     }
2234 
2235     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2236         return -EBUSY;
2237     }
2238 
2239     ro = bs->backing_hd->read_only;
2240     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2241     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2242     open_flags =  bs->backing_hd->open_flags;
2243 
2244     if (ro) {
2245         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2246             return -EACCES;
2247         }
2248     }
2249 
2250     length = bdrv_getlength(bs);
2251     if (length < 0) {
2252         ret = length;
2253         goto ro_cleanup;
2254     }
2255 
2256     backing_length = bdrv_getlength(bs->backing_hd);
2257     if (backing_length < 0) {
2258         ret = backing_length;
2259         goto ro_cleanup;
2260     }
2261 
2262     /* If our top snapshot is larger than the backing file image,
2263      * grow the backing file image if possible.  If not possible,
     * we must return an error. */
2265     if (length > backing_length) {
2266         ret = bdrv_truncate(bs->backing_hd, length);
2267         if (ret < 0) {
2268             goto ro_cleanup;
2269         }
2270     }
2271 
2272     total_sectors = length >> BDRV_SECTOR_BITS;
2273     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2274 
2275     for (sector = 0; sector < total_sectors; sector += n) {
2276         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2277         if (ret < 0) {
2278             goto ro_cleanup;
2279         }
2280         if (ret) {
2281             ret = bdrv_read(bs, sector, buf, n);
2282             if (ret < 0) {
2283                 goto ro_cleanup;
2284             }
2285 
2286             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2287             if (ret < 0) {
2288                 goto ro_cleanup;
2289             }
2290         }
2291     }
2292 
2293     if (drv->bdrv_make_empty) {
2294         ret = drv->bdrv_make_empty(bs);
2295         if (ret < 0) {
2296             goto ro_cleanup;
2297         }
2298         bdrv_flush(bs);
2299     }
2300 
2301     /*
2302      * Make sure all data we wrote to the backing device is actually
2303      * stable on disk.
2304      */
2305     if (bs->backing_hd) {
2306         bdrv_flush(bs->backing_hd);
2307     }
2308 
2309     ret = 0;
2310 ro_cleanup:
2311     g_free(buf);
2312 
2313     if (ro) {
2314         /* ignoring error return here */
2315         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2316     }
2317 
2318     return ret;
2319 }
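
/*
 * Illustrative usage sketch (an addition, not part of the original code):
 * commit a COW overlay into its backing file and report failures.
 */
static void example_commit_overlay(BlockDriverState *bs)
{
    int ret = bdrv_commit(bs);

    if (ret < 0) {
        /* -ENOMEDIUM: no medium, -ENOTSUP: no backing file,
         * -EBUSY: bs or its backing file is in use, -EACCES: cannot
         * reopen the backing file read-write */
        error_report("commit failed: %s", strerror(-ret));
    }
}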
2320 
2321 int bdrv_commit_all(void)
2322 {
2323     BlockDriverState *bs;
2324 
2325     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2326         if (bs->drv && bs->backing_hd) {
2327             int ret = bdrv_commit(bs);
2328             if (ret < 0) {
2329                 return ret;
2330             }
2331         }
2332     }
2333     return 0;
2334 }
2335 
2336 /**
2337  * Remove an active request from the tracked requests list
2338  *
2339  * This function should be called when a tracked request is completing.
2340  */
2341 static void tracked_request_end(BdrvTrackedRequest *req)
2342 {
2343     if (req->serialising) {
2344         req->bs->serialising_in_flight--;
2345     }
2346 
2347     QLIST_REMOVE(req, list);
2348     qemu_co_queue_restart_all(&req->wait_queue);
2349 }
2350 
2351 /**
2352  * Add an active request to the tracked requests list
2353  */
2354 static void tracked_request_begin(BdrvTrackedRequest *req,
2355                                   BlockDriverState *bs,
2356                                   int64_t offset,
2357                                   unsigned int bytes, bool is_write)
2358 {
2359     *req = (BdrvTrackedRequest){
2360         .bs = bs,
2361         .offset         = offset,
2362         .bytes          = bytes,
2363         .is_write       = is_write,
2364         .co             = qemu_coroutine_self(),
2365         .serialising    = false,
2366         .overlap_offset = offset,
2367         .overlap_bytes  = bytes,
2368     };
2369 
2370     qemu_co_queue_init(&req->wait_queue);
2371 
2372     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2373 }
2374 
2375 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2376 {
2377     int64_t overlap_offset = req->offset & ~(align - 1);
2378     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2379                                - overlap_offset;
2380 
2381     if (!req->serialising) {
2382         req->bs->serialising_in_flight++;
2383         req->serialising = true;
2384     }
2385 
2386     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2387     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2388 }
2389 
2390 /**
2391  * Round a region to cluster boundaries
2392  */
2393 void bdrv_round_to_clusters(BlockDriverState *bs,
2394                             int64_t sector_num, int nb_sectors,
2395                             int64_t *cluster_sector_num,
2396                             int *cluster_nb_sectors)
2397 {
2398     BlockDriverInfo bdi;
2399 
2400     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2401         *cluster_sector_num = sector_num;
2402         *cluster_nb_sectors = nb_sectors;
2403     } else {
2404         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2405         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2406         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2407                                             nb_sectors, c);
2408     }
2409 }
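
/*
 * Worked example (illustrative): with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors.  A request at sector_num = 130 with
 * nb_sectors = 10 rounds to cluster_sector_num = 128 and
 * cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128) = 128,
 * i.e. exactly one whole cluster.
 */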
2410 
2411 static int bdrv_get_cluster_size(BlockDriverState *bs)
2412 {
2413     BlockDriverInfo bdi;
2414     int ret;
2415 
2416     ret = bdrv_get_info(bs, &bdi);
2417     if (ret < 0 || bdi.cluster_size == 0) {
2418         return bs->request_alignment;
2419     } else {
2420         return bdi.cluster_size;
2421     }
2422 }
2423 
2424 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2425                                      int64_t offset, unsigned int bytes)
2426 {
2427     /*        aaaa   bbbb */
2428     if (offset >= req->overlap_offset + req->overlap_bytes) {
2429         return false;
2430     }
2431     /* bbbb   aaaa        */
2432     if (req->overlap_offset >= offset + bytes) {
2433         return false;
2434     }
2435     return true;
2436 }
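
/*
 * Worked example (illustrative): a request with overlap_offset = 4096 and
 * overlap_bytes = 4096 covers [4096, 8192).  Probing offset = 8192,
 * bytes = 512 fails the first check (no overlap); offset = 0,
 * bytes = 4096 fails the second check (no overlap); offset = 8000,
 * bytes = 512 passes both and overlaps.
 */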
2437 
2438 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2439 {
2440     BlockDriverState *bs = self->bs;
2441     BdrvTrackedRequest *req;
2442     bool retry;
2443     bool waited = false;
2444 
2445     if (!bs->serialising_in_flight) {
2446         return false;
2447     }
2448 
2449     do {
2450         retry = false;
2451         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2452             if (req == self || (!req->serialising && !self->serialising)) {
2453                 continue;
2454             }
2455             if (tracked_request_overlaps(req, self->overlap_offset,
2456                                          self->overlap_bytes))
2457             {
2458                 /* Hitting this means there was a reentrant request, for
2459                  * example, a block driver issuing nested requests.  This must
2460                  * never happen since it means deadlock.
2461                  */
2462                 assert(qemu_coroutine_self() != req->co);
2463 
2464                 /* If the request is already (indirectly) waiting for us, or
2465                  * will wait for us as soon as it wakes up, then just go on
2466                  * (instead of producing a deadlock in the former case). */
2467                 if (!req->waiting_for) {
2468                     self->waiting_for = req;
2469                     qemu_co_queue_wait(&req->wait_queue);
2470                     self->waiting_for = NULL;
2471                     retry = true;
2472                     waited = true;
2473                     break;
2474                 }
2475             }
2476         }
2477     } while (retry);
2478 
2479     return waited;
2480 }
2481 
2482 /*
2483  * Return values:
2484  * 0        - success
2485  * -EINVAL  - backing format specified, but no file
2486  * -ENOSPC  - can't update the backing file because no space is left in the
2487  *            image file header
2488  * -ENOTSUP - format driver doesn't support changing the backing file
2489  */
2490 int bdrv_change_backing_file(BlockDriverState *bs,
2491     const char *backing_file, const char *backing_fmt)
2492 {
2493     BlockDriver *drv = bs->drv;
2494     int ret;
2495 
2496     /* Backing file format doesn't make sense without a backing file */
2497     if (backing_fmt && !backing_file) {
2498         return -EINVAL;
2499     }
2500 
2501     if (drv->bdrv_change_backing_file != NULL) {
2502         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2503     } else {
2504         ret = -ENOTSUP;
2505     }
2506 
2507     if (ret == 0) {
2508         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2509         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2510     }
2511     return ret;
2512 }
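
/*
 * Illustrative usage sketch (an addition, not part of the original code):
 * repoint an image at a new backing file.  The "qcow2" format string is an
 * assumption for the example.
 */
static int example_rebase_header(BlockDriverState *bs, const char *backing)
{
    int ret = bdrv_change_backing_file(bs, backing, "qcow2");

    if (ret == -ENOTSUP) {
        error_report("%s: driver cannot change the backing file",
                     bdrv_get_device_name(bs));
    }
    return ret;
}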
2513 
2514 /*
2515  * Finds the image layer in the chain that has 'bs' as its backing file.
2516  *
2517  * active is the current topmost image.
2518  *
2519  * Returns NULL if bs is not found in active's image chain,
2520  * or if active == bs.
2521  */
2522 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2523                                     BlockDriverState *bs)
2524 {
2525     BlockDriverState *overlay = NULL;
2526     BlockDriverState *intermediate;
2527 
2528     assert(active != NULL);
2529     assert(bs != NULL);
2530 
2531     /* if bs is the same as active, then by definition it has no overlay
2532      */
2533     if (active == bs) {
2534         return NULL;
2535     }
2536 
2537     intermediate = active;
2538     while (intermediate->backing_hd) {
2539         if (intermediate->backing_hd == bs) {
2540             overlay = intermediate;
2541             break;
2542         }
2543         intermediate = intermediate->backing_hd;
2544     }
2545 
2546     return overlay;
2547 }
2548 
2549 typedef struct BlkIntermediateStates {
2550     BlockDriverState *bs;
2551     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2552 } BlkIntermediateStates;
2553 
2554 
2555 /*
2556  * Drops images above 'base' up to and including 'top', and sets the image
2557  * above 'top' to have base as its backing file.
2558  *
2559  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2560  * information in 'bs' can be properly updated.
2561  *
2562  * E.g., this will convert the following chain:
2563  * bottom <- base <- intermediate <- top <- active
2564  *
2565  * to
2566  *
2567  * bottom <- base <- active
2568  *
2569  * It is allowed for bottom==base, in which case it converts:
2570  *
2571  * base <- intermediate <- top <- active
2572  *
2573  * to
2574  *
2575  * base <- active
2576  *
2577  * Error conditions:
2578  *  if active == top, that is considered an error
2579  *
2580  */
2581 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2582                            BlockDriverState *base)
2583 {
2584     BlockDriverState *intermediate;
2585     BlockDriverState *base_bs = NULL;
2586     BlockDriverState *new_top_bs = NULL;
2587     BlkIntermediateStates *intermediate_state, *next;
2588     int ret = -EIO;
2589 
2590     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2591     QSIMPLEQ_INIT(&states_to_delete);
2592 
2593     if (!top->drv || !base->drv) {
2594         goto exit;
2595     }
2596 
2597     new_top_bs = bdrv_find_overlay(active, top);
2598 
2599     if (new_top_bs == NULL) {
2600         /* we could not find the image above 'top', this is an error */
2601         goto exit;
2602     }
2603 
2604     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2605      * to do, no intermediate images */
2606     if (new_top_bs->backing_hd == base) {
2607         ret = 0;
2608         goto exit;
2609     }
2610 
2611     intermediate = top;
2612 
2613     /* now we will go down through the list, and add each BDS we find
2614      * into our deletion queue, until we hit the 'base'
2615      */
2616     while (intermediate) {
2617         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2618         intermediate_state->bs = intermediate;
2619         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2620 
2621         if (intermediate->backing_hd == base) {
2622             base_bs = intermediate->backing_hd;
2623             break;
2624         }
2625         intermediate = intermediate->backing_hd;
2626     }
2627     if (base_bs == NULL) {
        /* something went wrong, we did not end at the base.  Safely
         * unravel everything, and exit with an error */
2630         goto exit;
2631     }
2632 
2633     /* success - we can delete the intermediate states, and link top->base */
2634     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2635                                    base_bs->drv ? base_bs->drv->format_name : "");
2636     if (ret) {
2637         goto exit;
2638     }
2639     new_top_bs->backing_hd = base_bs;
2640 
2641     bdrv_refresh_limits(new_top_bs);
2642 
2643     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2644         /* so that bdrv_close() does not recursively close the chain */
2645         intermediate_state->bs->backing_hd = NULL;
2646         bdrv_unref(intermediate_state->bs);
2647     }
2648     ret = 0;
2649 
2650 exit:
2651     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2652         g_free(intermediate_state);
2653     }
2654     return ret;
2655 }
2656 
2657 
2658 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2659                                    size_t size)
2660 {
2661     int64_t len;
2662 
2663     if (size > INT_MAX) {
2664         return -EIO;
2665     }
2666 
    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }
2680 
2681     return 0;
2682 }
2683 
2684 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2685                               int nb_sectors)
2686 {
2687     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2688         return -EIO;
2689     }
2690 
2691     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2692                                    nb_sectors * BDRV_SECTOR_SIZE);
2693 }
2694 
2695 typedef struct RwCo {
2696     BlockDriverState *bs;
2697     int64_t offset;
2698     QEMUIOVector *qiov;
2699     bool is_write;
2700     int ret;
2701     BdrvRequestFlags flags;
2702 } RwCo;
2703 
2704 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2705 {
2706     RwCo *rwco = opaque;
2707 
2708     if (!rwco->is_write) {
2709         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2710                                       rwco->qiov->size, rwco->qiov,
2711                                       rwco->flags);
2712     } else {
2713         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2714                                        rwco->qiov->size, rwco->qiov,
2715                                        rwco->flags);
2716     }
2717 }
2718 
2719 /*
2720  * Process a vectored synchronous request using coroutines
2721  */
2722 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2723                         QEMUIOVector *qiov, bool is_write,
2724                         BdrvRequestFlags flags)
2725 {
2726     Coroutine *co;
2727     RwCo rwco = {
2728         .bs = bs,
2729         .offset = offset,
2730         .qiov = qiov,
2731         .is_write = is_write,
2732         .ret = NOT_DONE,
2733         .flags = flags,
2734     };
2735 
2736     /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire, so the I/O throttling function has to be disabled here
     * if it has been enabled.
2740      */
2741     if (bs->io_limits_enabled) {
2742         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2743                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2744         bdrv_io_limits_disable(bs);
2745     }
2746 
2747     if (qemu_in_coroutine()) {
2748         /* Fast-path if already in coroutine context */
2749         bdrv_rw_co_entry(&rwco);
2750     } else {
2751         co = qemu_coroutine_create(bdrv_rw_co_entry);
2752         qemu_coroutine_enter(co, &rwco);
2753         while (rwco.ret == NOT_DONE) {
2754             qemu_aio_wait();
2755         }
2756     }
2757     return rwco.ret;
2758 }
2759 
2760 /*
2761  * Process a synchronous request using coroutines
2762  */
2763 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2764                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2765 {
2766     QEMUIOVector qiov;
2767     struct iovec iov = {
2768         .iov_base = (void *)buf,
2769         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2770     };
2771 
2772     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2773         return -EINVAL;
2774     }
2775 
2776     qemu_iovec_init_external(&qiov, &iov, 1);
2777     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2778                         &qiov, is_write, flags);
2779 }
2780 
2781 /* return < 0 if error. See bdrv_write() for the return codes */
2782 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2783               uint8_t *buf, int nb_sectors)
2784 {
2785     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2786 }
2787 
2788 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2789 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2790                           uint8_t *buf, int nb_sectors)
2791 {
2792     bool enabled;
2793     int ret;
2794 
2795     enabled = bs->io_limits_enabled;
2796     bs->io_limits_enabled = false;
2797     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2798     bs->io_limits_enabled = enabled;
2799     return ret;
2800 }
2801 
2802 /* Return < 0 if error. Important errors are:
2803   -EIO         generic I/O error (may happen for all errors)
2804   -ENOMEDIUM   No media inserted.
2805   -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write to a read-only device
2807 */
2808 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2809                const uint8_t *buf, int nb_sectors)
2810 {
2811     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2812 }
2813 
2814 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2815                       int nb_sectors, BdrvRequestFlags flags)
2816 {
2817     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2818                       BDRV_REQ_ZERO_WRITE | flags);
2819 }
2820 
2821 /*
2822  * Completely zero out a block device with the help of bdrv_write_zeroes.
2823  * The operation is sped up by checking the block status and only writing
2824  * zeroes to the device if they currently do not return zeroes. Optional
2825  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2826  *
2827  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2828  */
2829 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2830 {
2831     int64_t target_size;
2832     int64_t ret, nb_sectors, sector_num = 0;
2833     int n;
2834 
2835     target_size = bdrv_getlength(bs);
2836     if (target_size < 0) {
2837         return target_size;
2838     }
2839     target_size /= BDRV_SECTOR_SIZE;
2840 
2841     for (;;) {
2842         nb_sectors = target_size - sector_num;
2843         if (nb_sectors <= 0) {
2844             return 0;
2845         }
2846         if (nb_sectors > INT_MAX) {
2847             nb_sectors = INT_MAX;
2848         }
2849         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2850         if (ret < 0) {
2851             error_report("error getting block status at sector %" PRId64 ": %s",
2852                          sector_num, strerror(-ret));
2853             return ret;
2854         }
2855         if (ret & BDRV_BLOCK_ZERO) {
2856             sector_num += n;
2857             continue;
2858         }
2859         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2860         if (ret < 0) {
2861             error_report("error writing zeroes at sector %" PRId64 ": %s",
2862                          sector_num, strerror(-ret));
2863             return ret;
2864         }
2865         sector_num += n;
2866     }
2867 }
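
/*
 * Illustrative usage sketch (an addition, not part of the original code):
 * zero an entire device, allowing the driver to unmap where supported.
 */
static int example_wipe_device(BlockDriverState *bs)
{
    /* BDRV_REQ_MAY_UNMAP lets bdrv_write_zeroes punch holes if the
     * driver can guarantee zeroes are read back afterwards */
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}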
2868 
2869 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2870 {
2871     QEMUIOVector qiov;
2872     struct iovec iov = {
2873         .iov_base = (void *)buf,
2874         .iov_len = bytes,
2875     };
2876     int ret;
2877 
2878     if (bytes < 0) {
2879         return -EINVAL;
2880     }
2881 
2882     qemu_iovec_init_external(&qiov, &iov, 1);
2883     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2884     if (ret < 0) {
2885         return ret;
2886     }
2887 
2888     return bytes;
2889 }
2890 
2891 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2892 {
2893     int ret;
2894 
2895     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2896     if (ret < 0) {
2897         return ret;
2898     }
2899 
2900     return qiov->size;
2901 }
2902 
2903 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2904                 const void *buf, int bytes)
2905 {
2906     QEMUIOVector qiov;
2907     struct iovec iov = {
2908         .iov_base   = (void *) buf,
2909         .iov_len    = bytes,
2910     };
2911 
2912     if (bytes < 0) {
2913         return -EINVAL;
2914     }
2915 
2916     qemu_iovec_init_external(&qiov, &iov, 1);
2917     return bdrv_pwritev(bs, offset, &qiov);
2918 }
2919 
2920 /*
2921  * Writes to the file and ensures that no writes are reordered across this
2922  * request (acts as a barrier)
2923  *
2924  * Returns 0 on success, -errno in error cases.
2925  */
2926 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2927     const void *buf, int count)
2928 {
2929     int ret;
2930 
2931     ret = bdrv_pwrite(bs, offset, buf, count);
2932     if (ret < 0) {
2933         return ret;
2934     }
2935 
2936     /* No flush needed for cache modes that already do it */
2937     if (bs->enable_write_cache) {
2938         bdrv_flush(bs);
2939     }
2940 
2941     return 0;
2942 }
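
/*
 * Illustrative usage sketch (an addition, not part of the original code):
 * write a metadata header with barrier semantics so later writes cannot
 * be reordered ahead of it.  The 64-byte size and offset 0 are
 * hypothetical.
 */
static int example_update_header(BlockDriverState *bs, const void *header)
{
    /* writes, then flushes unless the cache mode already guarantees it */
    return bdrv_pwrite_sync(bs, 0, header, 64);
}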
2943 
2944 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2945         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2946 {
2947     /* Perform I/O through a temporary buffer so that users who scribble over
2948      * their read buffer while the operation is in progress do not end up
2949      * modifying the image file.  This is critical for zero-copy guest I/O
2950      * where anything might happen inside guest memory.
2951      */
2952     void *bounce_buffer;
2953 
2954     BlockDriver *drv = bs->drv;
2955     struct iovec iov;
2956     QEMUIOVector bounce_qiov;
2957     int64_t cluster_sector_num;
2958     int cluster_nb_sectors;
2959     size_t skip_bytes;
2960     int ret;
2961 
2962     /* Cover entire cluster so no additional backing file I/O is required when
     * allocating a cluster in the image file.
2964      */
2965     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2966                            &cluster_sector_num, &cluster_nb_sectors);
2967 
2968     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2969                                    cluster_sector_num, cluster_nb_sectors);
2970 
2971     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2972     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2973     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2974 
2975     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2976                              &bounce_qiov);
2977     if (ret < 0) {
2978         goto err;
2979     }
2980 
2981     if (drv->bdrv_co_write_zeroes &&
2982         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2983         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2984                                       cluster_nb_sectors, 0);
2985     } else {
2986         /* This does not change the data on the disk, it is not necessary
2987          * to flush even in cache=writethrough mode.
2988          */
2989         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2990                                   &bounce_qiov);
2991     }
2992 
2993     if (ret < 0) {
2994         /* It might be okay to ignore write errors for guest requests.  If this
2995          * is a deliberate copy-on-read then we don't want to ignore the error.
2996          * Simply report it in all cases.
2997          */
2998         goto err;
2999     }
3000 
3001     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3002     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3003                         nb_sectors * BDRV_SECTOR_SIZE);
3004 
3005 err:
3006     qemu_vfree(bounce_buffer);
3007     return ret;
3008 }
3009 
3010 /*
3011  * Forwards an already correctly aligned request to the BlockDriver. This
3012  * handles copy on read and zeroing after EOF; any other features must be
3013  * implemented by the caller.
3014  */
3015 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3016     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3017     int64_t align, QEMUIOVector *qiov, int flags)
3018 {
3019     BlockDriver *drv = bs->drv;
3020     int ret;
3021 
3022     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3023     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3024 
3025     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3026     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3027 
3028     /* Handle Copy on Read and associated serialisation */
3029     if (flags & BDRV_REQ_COPY_ON_READ) {
3030         /* If we touch the same cluster it counts as an overlap.  This
3031          * guarantees that allocating writes will be serialized and not race
3032          * with each other for the same cluster.  For example, in copy-on-read
3033          * it ensures that the CoR read and write operations are atomic and
3034          * guest writes cannot interleave between them. */
3035         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3036     }
3037 
3038     wait_serialising_requests(req);
3039 
3040     if (flags & BDRV_REQ_COPY_ON_READ) {
3041         int pnum;
3042 
3043         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3044         if (ret < 0) {
3045             goto out;
3046         }
3047 
3048         if (!ret || pnum != nb_sectors) {
3049             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3050             goto out;
3051         }
3052     }
3053 
3054     /* Forward the request to the BlockDriver */
3055     if (!(bs->zero_beyond_eof && bs->growable)) {
3056         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3057     } else {
3058         /* Read zeros after EOF of growable BDSes */
3059         int64_t len, total_sectors, max_nb_sectors;
3060 
3061         len = bdrv_getlength(bs);
3062         if (len < 0) {
3063             ret = len;
3064             goto out;
3065         }
3066 
3067         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
3068         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3069                                   align >> BDRV_SECTOR_BITS);
3070         if (max_nb_sectors > 0) {
3071             ret = drv->bdrv_co_readv(bs, sector_num,
3072                                      MIN(nb_sectors, max_nb_sectors), qiov);
3073         } else {
3074             ret = 0;
3075         }
3076 
3077         /* Reading beyond end of file is supposed to produce zeroes */
3078         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3079             uint64_t offset = MAX(0, total_sectors - sector_num);
3080             uint64_t bytes = (sector_num + nb_sectors - offset) *
3081                               BDRV_SECTOR_SIZE;
3082             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3083         }
3084     }
3085 
3086 out:
3087     return ret;
3088 }
3089 
3090 /*
3091  * Handle a read request in coroutine context
3092  */
3093 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3094     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3095     BdrvRequestFlags flags)
3096 {
3097     BlockDriver *drv = bs->drv;
3098     BdrvTrackedRequest req;
3099 
3100     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3101     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3102     uint8_t *head_buf = NULL;
3103     uint8_t *tail_buf = NULL;
3104     QEMUIOVector local_qiov;
3105     bool use_local_qiov = false;
3106     int ret;
3107 
3108     if (!drv) {
3109         return -ENOMEDIUM;
3110     }
3111     if (bdrv_check_byte_request(bs, offset, bytes)) {
3112         return -EIO;
3113     }
3114 
3115     if (bs->copy_on_read) {
3116         flags |= BDRV_REQ_COPY_ON_READ;
3117     }
3118 
3119     /* throttling disk I/O */
3120     if (bs->io_limits_enabled) {
3121         bdrv_io_limits_intercept(bs, bytes, false);
3122     }
3123 
3124     /* Align read if necessary by padding qiov */
3125     if (offset & (align - 1)) {
3126         head_buf = qemu_blockalign(bs, align);
3127         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3128         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3129         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3130         use_local_qiov = true;
3131 
3132         bytes += offset & (align - 1);
3133         offset = offset & ~(align - 1);
3134     }
3135 
3136     if ((offset + bytes) & (align - 1)) {
3137         if (!use_local_qiov) {
3138             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3139             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3140             use_local_qiov = true;
3141         }
3142         tail_buf = qemu_blockalign(bs, align);
3143         qemu_iovec_add(&local_qiov, tail_buf,
3144                        align - ((offset + bytes) & (align - 1)));
3145 
3146         bytes = ROUND_UP(bytes, align);
3147     }
3148 
3149     tracked_request_begin(&req, bs, offset, bytes, false);
3150     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3151                               use_local_qiov ? &local_qiov : qiov,
3152                               flags);
3153     tracked_request_end(&req);
3154 
3155     if (use_local_qiov) {
3156         qemu_iovec_destroy(&local_qiov);
3157         qemu_vfree(head_buf);
3158         qemu_vfree(tail_buf);
3159     }
3160 
3161     return ret;
3162 }
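
/*
 * Worked alignment example (illustrative): with align = 4096, a read of
 * bytes = 3000 at offset = 5000 gets a head pad of 5000 & 4095 = 904
 * bytes (offset becomes 4096, bytes 3904) and a tail pad of
 * 4096 - (8000 & 4095) = 192 bytes, so the driver sees a single aligned
 * request of 4096 bytes at offset 4096.
 */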
3163 
3164 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3165     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3166     BdrvRequestFlags flags)
3167 {
3168     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3169         return -EINVAL;
3170     }
3171 
3172     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3173                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3174 }
3175 
3176 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3177     int nb_sectors, QEMUIOVector *qiov)
3178 {
3179     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3180 
3181     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3182 }
3183 
3184 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3185     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3186 {
3187     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3188 
3189     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3190                             BDRV_REQ_COPY_ON_READ);
3191 }
3192 
/* if no limit is specified in the BlockLimits, use a default
 * of 32768 512-byte sectors (16 MiB) per request.
3195  */
3196 #define MAX_WRITE_ZEROES_DEFAULT 32768
3197 
3198 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3199     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3200 {
3201     BlockDriver *drv = bs->drv;
3202     QEMUIOVector qiov;
3203     struct iovec iov = {0};
3204     int ret = 0;
3205 
3206     int max_write_zeroes = bs->bl.max_write_zeroes ?
3207                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3208 
3209     while (nb_sectors > 0 && !ret) {
3210         int num = nb_sectors;
3211 
3212         /* Align request.  Block drivers can expect the "bulk" of the request
3213          * to be aligned.
3214          */
3215         if (bs->bl.write_zeroes_alignment
3216             && num > bs->bl.write_zeroes_alignment) {
3217             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3218                 /* Make a small request up to the first aligned sector.  */
3219                 num = bs->bl.write_zeroes_alignment;
3220                 num -= sector_num % bs->bl.write_zeroes_alignment;
3221             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3222                 /* Shorten the request to the last aligned sector.  num cannot
3223                  * underflow because num > bs->bl.write_zeroes_alignment.
3224                  */
3225                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3226             }
3227         }
3228 
3229         /* limit request size */
3230         if (num > max_write_zeroes) {
3231             num = max_write_zeroes;
3232         }
3233 
3234         ret = -ENOTSUP;
3235         /* First try the efficient write zeroes operation */
3236         if (drv->bdrv_co_write_zeroes) {
3237             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3238         }
3239 
3240         if (ret == -ENOTSUP) {
3241             /* Fall back to bounce buffer if write zeroes is unsupported */
3242             iov.iov_len = num * BDRV_SECTOR_SIZE;
3243             if (iov.iov_base == NULL) {
3244                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3245                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3246             }
3247             qemu_iovec_init_external(&qiov, &iov, 1);
3248 
3249             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3250 
            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
3254             if (num < max_write_zeroes) {
3255                 qemu_vfree(iov.iov_base);
3256                 iov.iov_base = NULL;
3257             }
3258         }
3259 
3260         sector_num += num;
3261         nb_sectors -= num;
3262     }
3263 
3264     qemu_vfree(iov.iov_base);
3265     return ret;
3266 }
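
/*
 * Worked example (illustrative): with write_zeroes_alignment = 8, a
 * request for sector_num = 5, nb_sectors = 20 is split into a 3-sector
 * head (sectors 5..7), one aligned 16-sector chunk (8..23) and a
 * 1-sector tail (24): requests of 3, 16 and 1 sectors.
 */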
3267 
3268 /*
3269  * Forwards an already correctly aligned write request to the BlockDriver.
3270  */
3271 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3272     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3273     QEMUIOVector *qiov, int flags)
3274 {
3275     BlockDriver *drv = bs->drv;
3276     bool waited;
3277     int ret;
3278 
3279     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3280     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3281 
3282     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3283     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3284 
3285     waited = wait_serialising_requests(req);
3286     assert(!waited || !req->serialising);
3287     assert(req->overlap_offset <= offset);
3288     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3289 
3290     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3291 
3292     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3293         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3294         qemu_iovec_is_zero(qiov)) {
3295         flags |= BDRV_REQ_ZERO_WRITE;
3296         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3297             flags |= BDRV_REQ_MAY_UNMAP;
3298         }
3299     }
3300 
3301     if (ret < 0) {
3302         /* Do nothing, write notifier decided to fail this request */
3303     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3304         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3305         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3306     } else {
3307         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3308         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3309     }
3310     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3311 
3312     if (ret == 0 && !bs->enable_write_cache) {
3313         ret = bdrv_co_flush(bs);
3314     }
3315 
3316     bdrv_set_dirty(bs, sector_num, nb_sectors);
3317 
3318     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3319         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3320     }
3321     if (bs->growable && ret >= 0) {
3322         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3323     }
3324 
3325     return ret;
3326 }
3327 
3328 /*
3329  * Handle a write request in coroutine context
3330  */
3331 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3332     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3333     BdrvRequestFlags flags)
3334 {
3335     BdrvTrackedRequest req;
3336     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3337     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3338     uint8_t *head_buf = NULL;
3339     uint8_t *tail_buf = NULL;
3340     QEMUIOVector local_qiov;
3341     bool use_local_qiov = false;
3342     int ret;
3343 
3344     if (!bs->drv) {
3345         return -ENOMEDIUM;
3346     }
3347     if (bs->read_only) {
3348         return -EACCES;
3349     }
3350     if (bdrv_check_byte_request(bs, offset, bytes)) {
3351         return -EIO;
3352     }
3353 
3354     /* throttling disk I/O */
3355     if (bs->io_limits_enabled) {
3356         bdrv_io_limits_intercept(bs, bytes, true);
3357     }
3358 
3359     /*
3360      * Align write if necessary by performing a read-modify-write cycle.
3361      * Pad qiov with the read parts and be sure to have a tracked request not
3362      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3363      */
3364     tracked_request_begin(&req, bs, offset, bytes, true);
3365 
3366     if (offset & (align - 1)) {
3367         QEMUIOVector head_qiov;
3368         struct iovec head_iov;
3369 
3370         mark_request_serialising(&req, align);
3371         wait_serialising_requests(&req);
3372 
3373         head_buf = qemu_blockalign(bs, align);
3374         head_iov = (struct iovec) {
3375             .iov_base   = head_buf,
3376             .iov_len    = align,
3377         };
3378         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3379 
3380         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3381         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3382                                   align, &head_qiov, 0);
3383         if (ret < 0) {
3384             goto fail;
3385         }
3386         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3387 
3388         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3389         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3390         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3391         use_local_qiov = true;
3392 
3393         bytes += offset & (align - 1);
3394         offset = offset & ~(align - 1);
3395     }
3396 
3397     if ((offset + bytes) & (align - 1)) {
3398         QEMUIOVector tail_qiov;
3399         struct iovec tail_iov;
3400         size_t tail_bytes;
3401         bool waited;
3402 
3403         mark_request_serialising(&req, align);
3404         waited = wait_serialising_requests(&req);
3405         assert(!waited || !use_local_qiov);
3406 
3407         tail_buf = qemu_blockalign(bs, align);
3408         tail_iov = (struct iovec) {
3409             .iov_base   = tail_buf,
3410             .iov_len    = align,
3411         };
3412         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3413 
3414         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3415         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3416                                   align, &tail_qiov, 0);
3417         if (ret < 0) {
3418             goto fail;
3419         }
3420         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3421 
3422         if (!use_local_qiov) {
3423             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3424             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3425             use_local_qiov = true;
3426         }
3427 
3428         tail_bytes = (offset + bytes) & (align - 1);
3429         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3430 
3431         bytes = ROUND_UP(bytes, align);
3432     }
3433 
3434     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3435                                use_local_qiov ? &local_qiov : qiov,
3436                                flags);
3437 
3438 fail:
3439     tracked_request_end(&req);
3440 
3441     if (use_local_qiov) {
3442         qemu_iovec_destroy(&local_qiov);
3443     }
3444     qemu_vfree(head_buf);
3445     qemu_vfree(tail_buf);
3446 
3447     return ret;
3448 }
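
/*
 * Worked example of the alignment code above (editorial addition,
 * assuming align == 512): a write of bytes == 1000 at offset == 700
 * becomes a single aligned request as follows:
 *
 *     head: read 512 bytes at offset 512, keep bytes [512, 700)
 *     body: the caller's 1000 bytes covering [700, 1700)
 *     tail: read 512 bytes at offset 1536, keep bytes [1700, 2048)
 *
 * so bdrv_aligned_pwritev() is called with offset == 512 and
 * bytes == ROUND_UP(1000 + (700 & 511), 512) == 1536.
 */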
3449 
3450 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3451     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3452     BdrvRequestFlags flags)
3453 {
3454     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3455         return -EINVAL;
3456     }
3457 
3458     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3459                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3460 }
3461 
3462 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3463     int nb_sectors, QEMUIOVector *qiov)
3464 {
3465     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3466 
3467     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3468 }
3469 
3470 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3471                                       int64_t sector_num, int nb_sectors,
3472                                       BdrvRequestFlags flags)
3473 {
3474     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3475 
3476     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3477         flags &= ~BDRV_REQ_MAY_UNMAP;
3478     }
3479 
3480     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3481                              BDRV_REQ_ZERO_WRITE | flags);
3482 }
3483 
3484 /**
3485  * Truncate file to 'offset' bytes (needed only for file protocols)
3486  */
3487 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3488 {
3489     BlockDriver *drv = bs->drv;
3490     int ret;
3491     if (!drv)
3492         return -ENOMEDIUM;
3493     if (!drv->bdrv_truncate)
3494         return -ENOTSUP;
3495     if (bs->read_only)
3496         return -EACCES;
3497     if (bdrv_in_use(bs))
3498         return -EBUSY;
3499     ret = drv->bdrv_truncate(bs, offset);
3500     if (ret == 0) {
3501         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3502         bdrv_dev_resize_cb(bs);
3503     }
3504     return ret;
3505 }
3506 
3507 /**
3508  * Length of an allocated file in bytes. Sparse files are counted by their
3509  * actually allocated space. Returns < 0 on error or if unknown.
3510  */
3511 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3512 {
3513     BlockDriver *drv = bs->drv;
3514     if (!drv) {
3515         return -ENOMEDIUM;
3516     }
3517     if (drv->bdrv_get_allocated_file_size) {
3518         return drv->bdrv_get_allocated_file_size(bs);
3519     }
3520     if (bs->file) {
3521         return bdrv_get_allocated_file_size(bs->file);
3522     }
3523     return -ENOTSUP;
3524 }
3525 
3526 /**
3527  * Length of a file in bytes. Returns < 0 on error or if unknown.
3528  */
3529 int64_t bdrv_getlength(BlockDriverState *bs)
3530 {
3531     BlockDriver *drv = bs->drv;
3532     if (!drv)
3533         return -ENOMEDIUM;
3534 
3535     if (drv->has_variable_length) {
3536         int ret = refresh_total_sectors(bs, bs->total_sectors);
3537         if (ret < 0) {
3538             return ret;
3539         }
3540     }
3541     return bs->total_sectors * BDRV_SECTOR_SIZE;
3542 }
3543 
3544 /* Return 0 as the number of sectors if no device is present or on error */
3545 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3546 {
3547     int64_t length;
3548     length = bdrv_getlength(bs);
3549     if (length < 0)
3550         length = 0;
3551     else
3552         length = length >> BDRV_SECTOR_BITS;
3553     *nb_sectors_ptr = length;
3554 }
3555 
3556 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3557                        BlockdevOnError on_write_error)
3558 {
3559     bs->on_read_error = on_read_error;
3560     bs->on_write_error = on_write_error;
3561 }
3562 
3563 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3564 {
3565     return is_read ? bs->on_read_error : bs->on_write_error;
3566 }
3567 
3568 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3569 {
3570     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3571 
3572     switch (on_err) {
3573     case BLOCKDEV_ON_ERROR_ENOSPC:
3574         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3575     case BLOCKDEV_ON_ERROR_STOP:
3576         return BDRV_ACTION_STOP;
3577     case BLOCKDEV_ON_ERROR_REPORT:
3578         return BDRV_ACTION_REPORT;
3579     case BLOCKDEV_ON_ERROR_IGNORE:
3580         return BDRV_ACTION_IGNORE;
3581     default:
3582         abort();
3583     }
3584 }
3585 
3586 /* This is done by device models because, while the block layer knows
3587  * about the error, it does not know whether an operation comes from
3588  * the device or the block layer (from a job, for example).
3589  */
3590 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3591                        bool is_read, int error)
3592 {
3593     assert(error >= 0);
3594     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3595     if (action == BDRV_ACTION_STOP) {
3596         vm_stop(RUN_STATE_IO_ERROR);
3597         bdrv_iostatus_set_err(bs, error);
3598     }
3599 }
3600 
3601 int bdrv_is_read_only(BlockDriverState *bs)
3602 {
3603     return bs->read_only;
3604 }
3605 
3606 int bdrv_is_sg(BlockDriverState *bs)
3607 {
3608     return bs->sg;
3609 }
3610 
3611 int bdrv_enable_write_cache(BlockDriverState *bs)
3612 {
3613     return bs->enable_write_cache;
3614 }
3615 
3616 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3617 {
3618     bs->enable_write_cache = wce;
3619 
3620     /* so a reopen() will preserve wce */
3621     if (wce) {
3622         bs->open_flags |= BDRV_O_CACHE_WB;
3623     } else {
3624         bs->open_flags &= ~BDRV_O_CACHE_WB;
3625     }
3626 }
3627 
3628 int bdrv_is_encrypted(BlockDriverState *bs)
3629 {
3630     if (bs->backing_hd && bs->backing_hd->encrypted)
3631         return 1;
3632     return bs->encrypted;
3633 }
3634 
3635 int bdrv_key_required(BlockDriverState *bs)
3636 {
3637     BlockDriverState *backing_hd = bs->backing_hd;
3638 
3639     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3640         return 1;
3641     return (bs->encrypted && !bs->valid_key);
3642 }
3643 
3644 int bdrv_set_key(BlockDriverState *bs, const char *key)
3645 {
3646     int ret;
3647     if (bs->backing_hd && bs->backing_hd->encrypted) {
3648         ret = bdrv_set_key(bs->backing_hd, key);
3649         if (ret < 0)
3650             return ret;
3651         if (!bs->encrypted)
3652             return 0;
3653     }
3654     if (!bs->encrypted) {
3655         return -EINVAL;
3656     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3657         return -ENOMEDIUM;
3658     }
3659     ret = bs->drv->bdrv_set_key(bs, key);
3660     if (ret < 0) {
3661         bs->valid_key = 0;
3662     } else if (!bs->valid_key) {
3663         bs->valid_key = 1;
3664         /* call the change callback now, we skipped it on open */
3665         bdrv_dev_change_media_cb(bs, true);
3666     }
3667     return ret;
3668 }
3669 
3670 const char *bdrv_get_format_name(BlockDriverState *bs)
3671 {
3672     return bs->drv ? bs->drv->format_name : NULL;
3673 }
3674 
3675 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3676                          void *opaque)
3677 {
3678     BlockDriver *drv;
3679     int count = 0;
3680     const char **formats = NULL;
3681 
3682     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3683         if (drv->format_name) {
3684             bool found = false;
3685             int i = count;
3686             while (formats && i && !found) {
3687                 found = !strcmp(formats[--i], drv->format_name);
3688             }
3689 
3690             if (!found) {
3691                 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3692                 formats[count++] = drv->format_name;
3693                 it(opaque, drv->format_name);
3694             }
3695         }
3696     }
3697     g_free(formats);
3698 }
3699 
3700 /* Find a block backend by its device name */
3701 BlockDriverState *bdrv_find(const char *name)
3702 {
3703     BlockDriverState *bs;
3704 
3705     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3706         if (!strcmp(name, bs->device_name)) {
3707             return bs;
3708         }
3709     }
3710     return NULL;
3711 }
3712 
3713 /* Find a node in the graph of named BlockDriverStates */
3714 BlockDriverState *bdrv_find_node(const char *node_name)
3715 {
3716     BlockDriverState *bs;
3717 
3718     assert(node_name);
3719 
3720     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3721         if (!strcmp(node_name, bs->node_name)) {
3722             return bs;
3723         }
3724     }
3725     return NULL;
3726 }
3727 
3728 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3729 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3730 {
3731     BlockDeviceInfoList *list, *entry;
3732     BlockDriverState *bs;
3733 
3734     list = NULL;
3735     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3736         entry = g_malloc0(sizeof(*entry));
3737         entry->value = bdrv_block_device_info(bs);
3738         entry->next = list;
3739         list = entry;
3740     }
3741 
3742     return list;
3743 }
3744 
3745 BlockDriverState *bdrv_lookup_bs(const char *device,
3746                                  const char *node_name,
3747                                  Error **errp)
3748 {
3749     BlockDriverState *bs = NULL;
3750 
3751     if (device) {
3752         bs = bdrv_find(device);
3753 
3754         if (bs) {
3755             return bs;
3756         }
3757     }
3758 
3759     if (node_name) {
3760         bs = bdrv_find_node(node_name);
3761 
3762         if (bs) {
3763             return bs;
3764         }
3765     }
3766 
3767     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3768                      device ? device : "",
3769                      node_name ? node_name : "");
3770     return NULL;
3771 }
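
/*
 * Usage sketch (editorial addition; the device name is hypothetical):
 * QMP handlers that accept either a device name or a node name can
 * resolve both with a single call:
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs("virtio0", NULL, &local_err);
 *     if (!bs) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */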
3772 
3773 BlockDriverState *bdrv_next(BlockDriverState *bs)
3774 {
3775     if (!bs) {
3776         return QTAILQ_FIRST(&bdrv_states);
3777     }
3778     return QTAILQ_NEXT(bs, device_list);
3779 }
3780 
3781 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3782 {
3783     BlockDriverState *bs;
3784 
3785     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3786         it(opaque, bs);
3787     }
3788 }
3789 
3790 const char *bdrv_get_device_name(BlockDriverState *bs)
3791 {
3792     return bs->device_name;
3793 }
3794 
3795 int bdrv_get_flags(BlockDriverState *bs)
3796 {
3797     return bs->open_flags;
3798 }
3799 
3800 int bdrv_flush_all(void)
3801 {
3802     BlockDriverState *bs;
3803     int result = 0;
3804 
3805     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3806         int ret = bdrv_flush(bs);
3807         if (ret < 0 && !result) {
3808             result = ret;
3809         }
3810     }
3811 
3812     return result;
3813 }
3814 
3815 int bdrv_has_zero_init_1(BlockDriverState *bs)
3816 {
3817     return 1;
3818 }
3819 
3820 int bdrv_has_zero_init(BlockDriverState *bs)
3821 {
3822     assert(bs->drv);
3823 
3824     /* If BS is a copy on write image, it is initialized to
3825        the contents of the base image, which may not be zeroes.  */
3826     if (bs->backing_hd) {
3827         return 0;
3828     }
3829     if (bs->drv->bdrv_has_zero_init) {
3830         return bs->drv->bdrv_has_zero_init(bs);
3831     }
3832 
3833     /* safe default */
3834     return 0;
3835 }
3836 
3837 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3838 {
3839     BlockDriverInfo bdi;
3840 
3841     if (bs->backing_hd) {
3842         return false;
3843     }
3844 
3845     if (bdrv_get_info(bs, &bdi) == 0) {
3846         return bdi.unallocated_blocks_are_zero;
3847     }
3848 
3849     return false;
3850 }
3851 
3852 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3853 {
3854     BlockDriverInfo bdi;
3855 
3856     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3857         return false;
3858     }
3859 
3860     if (bdrv_get_info(bs, &bdi) == 0) {
3861         return bdi.can_write_zeroes_with_unmap;
3862     }
3863 
3864     return false;
3865 }
3866 
3867 typedef struct BdrvCoGetBlockStatusData {
3868     BlockDriverState *bs;
3869     BlockDriverState *base;
3870     int64_t sector_num;
3871     int nb_sectors;
3872     int *pnum;
3873     int64_t ret;
3874     bool done;
3875 } BdrvCoGetBlockStatusData;
3876 
3877 /*
3878  * Returns the allocation status (BDRV_BLOCK_* flags) of the specified sectors.
3879  * Drivers not implementing the functionality are assumed to not support backing
3880  * files, hence all their sectors are reported as allocated.
3881  *
3882  * If 'sector_num' is beyond the end of the disk image the return value is 0
3883  * and 'pnum' is set to 0.
3884  *
3885  * 'pnum' is set to the number of sectors (including and immediately following
3886  * the specified sector) that are known to be in the same
3887  * allocated/unallocated state.
3888  *
3889  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3890  * beyond the end of the disk image it will be clamped.
3891  */
3892 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3893                                                      int64_t sector_num,
3894                                                      int nb_sectors, int *pnum)
3895 {
3896     int64_t length;
3897     int64_t n;
3898     int64_t ret, ret2;
3899 
3900     length = bdrv_getlength(bs);
3901     if (length < 0) {
3902         return length;
3903     }
3904 
3905     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3906         *pnum = 0;
3907         return 0;
3908     }
3909 
3910     n = bs->total_sectors - sector_num;
3911     if (n < nb_sectors) {
3912         nb_sectors = n;
3913     }
3914 
3915     if (!bs->drv->bdrv_co_get_block_status) {
3916         *pnum = nb_sectors;
3917         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3918         if (bs->drv->protocol_name) {
3919             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3920         }
3921         return ret;
3922     }
3923 
3924     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3925     if (ret < 0) {
3926         *pnum = 0;
3927         return ret;
3928     }
3929 
3930     if (ret & BDRV_BLOCK_RAW) {
3931         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3932         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3933                                      *pnum, pnum);
3934     }
3935 
3936     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3937         ret |= BDRV_BLOCK_ALLOCATED;
3938     }
3939 
3940     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3941         if (bdrv_unallocated_blocks_are_zero(bs)) {
3942             ret |= BDRV_BLOCK_ZERO;
3943         } else if (bs->backing_hd) {
3944             BlockDriverState *bs2 = bs->backing_hd;
3945             int64_t length2 = bdrv_getlength(bs2);
3946             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3947                 ret |= BDRV_BLOCK_ZERO;
3948             }
3949         }
3950     }
3951 
3952     if (bs->file &&
3953         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3954         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3955         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3956                                         *pnum, pnum);
3957         if (ret2 >= 0) {
3958             /* Ignore errors.  This is just providing extra information, it
3959              * is useful but not necessary.
3960              */
3961             ret |= (ret2 & BDRV_BLOCK_ZERO);
3962         }
3963     }
3964 
3965     return ret;
3966 }
3967 
3968 /* Coroutine wrapper for bdrv_get_block_status() */
3969 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3970 {
3971     BdrvCoGetBlockStatusData *data = opaque;
3972     BlockDriverState *bs = data->bs;
3973 
3974     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3975                                          data->pnum);
3976     data->done = true;
3977 }
3978 
3979 /*
3980  * Synchronous wrapper around bdrv_co_get_block_status().
3981  *
3982  * See bdrv_co_get_block_status() for details.
3983  */
3984 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3985                               int nb_sectors, int *pnum)
3986 {
3987     Coroutine *co;
3988     BdrvCoGetBlockStatusData data = {
3989         .bs = bs,
3990         .sector_num = sector_num,
3991         .nb_sectors = nb_sectors,
3992         .pnum = pnum,
3993         .done = false,
3994     };
3995 
3996     if (qemu_in_coroutine()) {
3997         /* Fast-path if already in coroutine context */
3998         bdrv_get_block_status_co_entry(&data);
3999     } else {
4000         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4001         qemu_coroutine_enter(co, &data);
4002         while (!data.done) {
4003             qemu_aio_wait();
4004         }
4005     }
4006     return data.ret;
4007 }
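
/*
 * Usage sketch (editorial addition): querying the status of the first
 * 64 sectors of an image:
 *
 *     int pnum;
 *     int64_t status = bdrv_get_block_status(bs, 0, 64, &pnum);
 *     if (status >= 0 && (status & BDRV_BLOCK_ZERO)) {
 *         // the first pnum sectors read as zeroes
 *     }
 *
 * When BDRV_BLOCK_OFFSET_VALID is set, the upper bits of the returned
 * value carry the host offset of the data, which is why the code above
 * shifts the status right by BDRV_SECTOR_BITS before reusing it as a
 * sector number.
 */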
4008 
4009 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4010                                    int nb_sectors, int *pnum)
4011 {
4012     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4013     if (ret < 0) {
4014         return ret;
4015     }
4016     return (ret & BDRV_BLOCK_ALLOCATED);
4017 }
4018 
4019 /*
4020  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4021  *
4022  * Return true if the given sector is allocated in any image between
4023  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4024  * sector is allocated in any image of the chain.  Return false otherwise.
4025  *
4026  * 'pnum' is set to the number of sectors (including and immediately following
4027  *  the specified sector) that are known to be in the same
4028  *  allocated/unallocated state.
4029  *
4030  */
4031 int bdrv_is_allocated_above(BlockDriverState *top,
4032                             BlockDriverState *base,
4033                             int64_t sector_num,
4034                             int nb_sectors, int *pnum)
4035 {
4036     BlockDriverState *intermediate;
4037     int ret, n = nb_sectors;
4038 
4039     intermediate = top;
4040     while (intermediate && intermediate != base) {
4041         int pnum_inter;
4042         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4043                                 &pnum_inter);
4044         if (ret < 0) {
4045             return ret;
4046         } else if (ret) {
4047             *pnum = pnum_inter;
4048             return 1;
4049         }
4050 
4051         /*
4052          * [sector_num, nb_sectors] is unallocated on top but an intermediate
4053          * image might have
4054          *
4055          * [sector_num+x, nb_sectors] allocated.
4056          */
4057         if (n > pnum_inter &&
4058             (intermediate == top ||
4059              sector_num + pnum_inter < intermediate->total_sectors)) {
4060             n = pnum_inter;
4061         }
4062 
4063         intermediate = intermediate->backing_hd;
4064     }
4065 
4066     *pnum = n;
4067     return 0;
4068 }
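
/*
 * Example (editorial addition): for a chain base <- mid <- top,
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, 0, 64, &pnum);
 *
 * returns 1 if the requested range starts allocated in top or mid
 * (*pnum then covers that allocated run), and 0 if the first *pnum
 * sectors come from base or are unallocated everywhere above it.
 */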
4069 
4070 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4071 {
4072     if (bs->backing_hd && bs->backing_hd->encrypted)
4073         return bs->backing_file;
4074     else if (bs->encrypted)
4075         return bs->filename;
4076     else
4077         return NULL;
4078 }
4079 
4080 void bdrv_get_backing_filename(BlockDriverState *bs,
4081                                char *filename, int filename_size)
4082 {
4083     pstrcpy(filename, filename_size, bs->backing_file);
4084 }
4085 
4086 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4087                           const uint8_t *buf, int nb_sectors)
4088 {
4089     BlockDriver *drv = bs->drv;
4090     if (!drv)
4091         return -ENOMEDIUM;
4092     if (!drv->bdrv_write_compressed)
4093         return -ENOTSUP;
4094     if (bdrv_check_request(bs, sector_num, nb_sectors))
4095         return -EIO;
4096 
4097     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4098 
4099     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4100 }
4101 
4102 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4103 {
4104     BlockDriver *drv = bs->drv;
4105     if (!drv)
4106         return -ENOMEDIUM;
4107     if (!drv->bdrv_get_info)
4108         return -ENOTSUP;
4109     memset(bdi, 0, sizeof(*bdi));
4110     return drv->bdrv_get_info(bs, bdi);
4111 }
4112 
4113 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4114 {
4115     BlockDriver *drv = bs->drv;
4116     if (drv && drv->bdrv_get_specific_info) {
4117         return drv->bdrv_get_specific_info(bs);
4118     }
4119     return NULL;
4120 }
4121 
4122 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4123                       int64_t pos, int size)
4124 {
4125     QEMUIOVector qiov;
4126     struct iovec iov = {
4127         .iov_base   = (void *) buf,
4128         .iov_len    = size,
4129     };
4130 
4131     qemu_iovec_init_external(&qiov, &iov, 1);
4132     return bdrv_writev_vmstate(bs, &qiov, pos);
4133 }
4134 
4135 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4136 {
4137     BlockDriver *drv = bs->drv;
4138 
4139     if (!drv) {
4140         return -ENOMEDIUM;
4141     } else if (drv->bdrv_save_vmstate) {
4142         return drv->bdrv_save_vmstate(bs, qiov, pos);
4143     } else if (bs->file) {
4144         return bdrv_writev_vmstate(bs->file, qiov, pos);
4145     }
4146 
4147     return -ENOTSUP;
4148 }
4149 
4150 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4151                       int64_t pos, int size)
4152 {
4153     BlockDriver *drv = bs->drv;
4154     if (!drv)
4155         return -ENOMEDIUM;
4156     if (drv->bdrv_load_vmstate)
4157         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4158     if (bs->file)
4159         return bdrv_load_vmstate(bs->file, buf, pos, size);
4160     return -ENOTSUP;
4161 }
4162 
4163 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4164 {
4165     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4166         return;
4167     }
4168 
4169     bs->drv->bdrv_debug_event(bs, event);
4170 }
4171 
4172 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4173                           const char *tag)
4174 {
4175     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4176         bs = bs->file;
4177     }
4178 
4179     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4180         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4181     }
4182 
4183     return -ENOTSUP;
4184 }
4185 
4186 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4187 {
4188     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4189         bs = bs->file;
4190     }
4191 
4192     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4193         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4194     }
4195 
4196     return -ENOTSUP;
4197 }
4198 
4199 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4200 {
4201     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4202         bs = bs->file;
4203     }
4204 
4205     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4206         return bs->drv->bdrv_debug_resume(bs, tag);
4207     }
4208 
4209     return -ENOTSUP;
4210 }
4211 
4212 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4213 {
4214     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4215         bs = bs->file;
4216     }
4217 
4218     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4219         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4220     }
4221 
4222     return false;
4223 }
4224 
4225 int bdrv_is_snapshot(BlockDriverState *bs)
4226 {
4227     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4228 }
4229 
4230 /* backing_file can be relative, absolute, or a protocol.  If it is
4231  * relative, it must be relative to the chain.  So, passing in bs->filename
4232  * from a BDS as backing_file should not be done, as that may be relative to
4233  * the CWD rather than the chain. */
4234 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4235         const char *backing_file)
4236 {
4237     char *filename_full = NULL;
4238     char *backing_file_full = NULL;
4239     char *filename_tmp = NULL;
4240     int is_protocol = 0;
4241     BlockDriverState *curr_bs = NULL;
4242     BlockDriverState *retval = NULL;
4243 
4244     if (!bs || !bs->drv || !backing_file) {
4245         return NULL;
4246     }
4247 
4248     filename_full     = g_malloc(PATH_MAX);
4249     backing_file_full = g_malloc(PATH_MAX);
4250     filename_tmp      = g_malloc(PATH_MAX);
4251 
4252     is_protocol = path_has_protocol(backing_file);
4253 
4254     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4255 
4256         /* If either of the filename paths is actually a protocol, then
4257          * compare unmodified paths; otherwise make paths relative */
4258         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4259             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4260                 retval = curr_bs->backing_hd;
4261                 break;
4262             }
4263         } else {
4264             /* If not an absolute filename path, make it relative to the current
4265              * image's filename path */
4266             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4267                          backing_file);
4268 
4269             /* We are going to compare absolute pathnames */
4270             if (!realpath(filename_tmp, filename_full)) {
4271                 continue;
4272             }
4273 
4274             /* We need to make sure the backing filename we are comparing against
4275              * is relative to the current image filename (or absolute) */
4276             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4277                          curr_bs->backing_file);
4278 
4279             if (!realpath(filename_tmp, backing_file_full)) {
4280                 continue;
4281             }
4282 
4283             if (strcmp(backing_file_full, filename_full) == 0) {
4284                 retval = curr_bs->backing_hd;
4285                 break;
4286             }
4287         }
4288     }
4289 
4290     g_free(filename_full);
4291     g_free(backing_file_full);
4292     g_free(filename_tmp);
4293     return retval;
4294 }
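
/*
 * Example (editorial addition; file names hypothetical): for a chain
 * opened from /vm/top.qcow2 whose backing file is recorded as
 * "mid.qcow2", both of the following resolve to the same backing
 * BlockDriverState:
 *
 *     bdrv_find_backing_image(bs, "mid.qcow2");
 *     bdrv_find_backing_image(bs, "/vm/mid.qcow2");
 *
 * because relative names are combined with the referring image's path
 * via path_combine() and canonicalized with realpath() before they are
 * compared.
 */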
4295 
4296 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4297 {
4298     if (!bs->drv) {
4299         return 0;
4300     }
4301 
4302     if (!bs->backing_hd) {
4303         return 0;
4304     }
4305 
4306     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4307 }
4308 
4309 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4310 {
4311     BlockDriverState *curr_bs = NULL;
4312 
4313     if (!bs) {
4314         return NULL;
4315     }
4316 
4317     curr_bs = bs;
4318 
4319     while (curr_bs->backing_hd) {
4320         curr_bs = curr_bs->backing_hd;
4321     }
4322     return curr_bs;
4323 }
4324 
4325 /**************************************************************/
4326 /* async I/Os */
4327 
4328 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4329                                  QEMUIOVector *qiov, int nb_sectors,
4330                                  BlockDriverCompletionFunc *cb, void *opaque)
4331 {
4332     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4333 
4334     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4335                                  cb, opaque, false);
4336 }
4337 
4338 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4339                                   QEMUIOVector *qiov, int nb_sectors,
4340                                   BlockDriverCompletionFunc *cb, void *opaque)
4341 {
4342     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4343 
4344     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4345                                  cb, opaque, true);
4346 }
4347 
4348 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4349         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4350         BlockDriverCompletionFunc *cb, void *opaque)
4351 {
4352     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4353 
4354     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4355                                  BDRV_REQ_ZERO_WRITE | flags,
4356                                  cb, opaque, true);
4357 }
4358 
4359 
4360 typedef struct MultiwriteCB {
4361     int error;
4362     int num_requests;
4363     int num_callbacks;
4364     struct {
4365         BlockDriverCompletionFunc *cb;
4366         void *opaque;
4367         QEMUIOVector *free_qiov;
4368     } callbacks[];
4369 } MultiwriteCB;
4370 
4371 static void multiwrite_user_cb(MultiwriteCB *mcb)
4372 {
4373     int i;
4374 
4375     for (i = 0; i < mcb->num_callbacks; i++) {
4376         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4377         if (mcb->callbacks[i].free_qiov) {
4378             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4379         }
4380         g_free(mcb->callbacks[i].free_qiov);
4381     }
4382 }
4383 
4384 static void multiwrite_cb(void *opaque, int ret)
4385 {
4386     MultiwriteCB *mcb = opaque;
4387 
4388     trace_multiwrite_cb(mcb, ret);
4389 
4390     if (ret < 0 && !mcb->error) {
4391         mcb->error = ret;
4392     }
4393 
4394     mcb->num_requests--;
4395     if (mcb->num_requests == 0) {
4396         multiwrite_user_cb(mcb);
4397         g_free(mcb);
4398     }
4399 }
4400 
4401 static int multiwrite_req_compare(const void *a, const void *b)
4402 {
4403     const BlockRequest *req1 = a, *req2 = b;
4404 
4405     /*
4406      * Note that we can't simply subtract req2->sector from req1->sector
4407      * here as that could overflow the return value.
4408      */
4409     if (req1->sector > req2->sector) {
4410         return 1;
4411     } else if (req1->sector < req2->sector) {
4412         return -1;
4413     } else {
4414         return 0;
4415     }
4416 }
4417 
4418 /*
4419  * Takes a bunch of requests and tries to merge them. Returns the number of
4420  * requests that remain after merging.
4421  */
4422 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4423     int num_reqs, MultiwriteCB *mcb)
4424 {
4425     int i, outidx;
4426 
4427     // Sort requests by start sector
4428     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4429 
4430     // Check if adjacent requests touch the same clusters. If so, combine them;
4431     // only exactly sequential or overlapping requests are merged.
4432     outidx = 0;
4433     for (i = 1; i < num_reqs; i++) {
4434         int merge = 0;
4435         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4436 
4437         // Handle exactly sequential writes and overlapping writes.
4438         if (reqs[i].sector <= oldreq_last) {
4439             merge = 1;
4440         }
4441 
4442         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4443             merge = 0;
4444         }
4445 
4446         if (merge) {
4447             size_t size;
4448             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4449             qemu_iovec_init(qiov,
4450                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4451 
4452             // Add the first request to the merged one. If the requests are
4453             // overlapping, drop the last sectors of the first request.
4454             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4455             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4456 
4457             // We should not need to add any zeros between the two requests
4458             assert(reqs[i].sector <= oldreq_last);
4459 
4460             // Add the second request
4461             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4462 
4463             reqs[outidx].nb_sectors = qiov->size >> 9;
4464             reqs[outidx].qiov = qiov;
4465 
4466             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4467         } else {
4468             outidx++;
4469             reqs[outidx].sector     = reqs[i].sector;
4470             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4471             reqs[outidx].qiov       = reqs[i].qiov;
4472         }
4473     }
4474 
4475     return outidx + 1;
4476 }
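
/*
 * Worked example (editorial addition): three write requests covering
 * sectors [0, 8), [8, 16) and [32, 40) are first sorted by start
 * sector; [0, 8) and [8, 16) are exactly sequential and are merged
 * into one request covering [0, 16), while [32, 40) leaves a gap and
 * stays separate, so the function returns 2.
 */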
4477 
4478 /*
4479  * Submit multiple AIO write requests at once.
4480  *
4481  * On success, the function returns 0 and all requests in the reqs array have
4482  * been submitted. On error, this function returns -1, and any of the
4483  * requests may or may not have been submitted yet. In particular, this means
4484  * that the callback will be called for some of the requests but not for
4485  * others. The caller must check the error field of each BlockRequest to wait
4486  * for the right callbacks (if error != 0, no callback will be called).
4487  *
4488  * The implementation may modify the contents of the reqs array, e.g. to merge
4489  * requests. However, the fields opaque and error are left unmodified as they
4490  * are used to signal failure for a single request to the caller.
4491  */
4492 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4493 {
4494     MultiwriteCB *mcb;
4495     int i;
4496 
4497     /* don't submit writes if we don't have a medium */
4498     if (bs->drv == NULL) {
4499         for (i = 0; i < num_reqs; i++) {
4500             reqs[i].error = -ENOMEDIUM;
4501         }
4502         return -1;
4503     }
4504 
4505     if (num_reqs == 0) {
4506         return 0;
4507     }
4508 
4509     // Create MultiwriteCB structure
4510     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4511     mcb->num_requests = 0;
4512     mcb->num_callbacks = num_reqs;
4513 
4514     for (i = 0; i < num_reqs; i++) {
4515         mcb->callbacks[i].cb = reqs[i].cb;
4516         mcb->callbacks[i].opaque = reqs[i].opaque;
4517     }
4518 
4519     // Check for mergable requests
4520     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4521 
4522     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4523 
4524     /* Run the aio requests. */
4525     mcb->num_requests = num_reqs;
4526     for (i = 0; i < num_reqs; i++) {
4527         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4528                               reqs[i].nb_sectors, reqs[i].flags,
4529                               multiwrite_cb, mcb,
4530                               true);
4531     }
4532 
4533     return 0;
4534 }
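
/*
 * Usage sketch (editorial addition; the callback and opaque values are
 * hypothetical):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_write_cb, .opaque = req1 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov2,
 *           .cb = my_write_cb, .opaque = req2 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error: requests with error != 0 were never
 *         // submitted and will not get a callback
 *     }
 */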
4535 
4536 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4537 {
4538     acb->aiocb_info->cancel(acb);
4539 }
4540 
4541 /**************************************************************/
4542 /* async block device emulation */
4543 
4544 typedef struct BlockDriverAIOCBSync {
4545     BlockDriverAIOCB common;
4546     QEMUBH *bh;
4547     int ret;
4548     /* vector translation state */
4549     QEMUIOVector *qiov;
4550     uint8_t *bounce;
4551     int is_write;
4552 } BlockDriverAIOCBSync;
4553 
4554 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4555 {
4556     BlockDriverAIOCBSync *acb =
4557         container_of(blockacb, BlockDriverAIOCBSync, common);
4558     qemu_bh_delete(acb->bh);
4559     acb->bh = NULL;
4560     qemu_aio_release(acb);
4561 }
4562 
4563 static const AIOCBInfo bdrv_em_aiocb_info = {
4564     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4565     .cancel             = bdrv_aio_cancel_em,
4566 };
4567 
4568 static void bdrv_aio_bh_cb(void *opaque)
4569 {
4570     BlockDriverAIOCBSync *acb = opaque;
4571 
4572     if (!acb->is_write)
4573         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4574     qemu_vfree(acb->bounce);
4575     acb->common.cb(acb->common.opaque, acb->ret);
4576     qemu_bh_delete(acb->bh);
4577     acb->bh = NULL;
4578     qemu_aio_release(acb);
4579 }
4580 
4581 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4582                                             int64_t sector_num,
4583                                             QEMUIOVector *qiov,
4584                                             int nb_sectors,
4585                                             BlockDriverCompletionFunc *cb,
4586                                             void *opaque,
4587                                             int is_write)
4588 
4589 {
4590     BlockDriverAIOCBSync *acb;
4591 
4592     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4593     acb->is_write = is_write;
4594     acb->qiov = qiov;
4595     acb->bounce = qemu_blockalign(bs, qiov->size);
4596     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4597 
4598     if (is_write) {
4599         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4600         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4601     } else {
4602         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4603     }
4604 
4605     qemu_bh_schedule(acb->bh);
4606 
4607     return &acb->common;
4608 }
4609 
4610 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4611         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4612         BlockDriverCompletionFunc *cb, void *opaque)
4613 {
4614     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4615 }
4616 
4617 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4618         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4619         BlockDriverCompletionFunc *cb, void *opaque)
4620 {
4621     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4622 }
4623 
4624 
4625 typedef struct BlockDriverAIOCBCoroutine {
4626     BlockDriverAIOCB common;
4627     BlockRequest req;
4628     bool is_write;
4629     bool *done;
4630     QEMUBH* bh;
4631 } BlockDriverAIOCBCoroutine;
4632 
4633 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4634 {
4635     BlockDriverAIOCBCoroutine *acb =
4636         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4637     bool done = false;
4638 
4639     acb->done = &done;
4640     while (!done) {
4641         qemu_aio_wait();
4642     }
4643 }
4644 
4645 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4646     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4647     .cancel             = bdrv_aio_co_cancel_em,
4648 };
4649 
4650 static void bdrv_co_em_bh(void *opaque)
4651 {
4652     BlockDriverAIOCBCoroutine *acb = opaque;
4653 
4654     acb->common.cb(acb->common.opaque, acb->req.error);
4655 
4656     if (acb->done) {
4657         *acb->done = true;
4658     }
4659 
4660     qemu_bh_delete(acb->bh);
4661     qemu_aio_release(acb);
4662 }
4663 
4664 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4665 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4666 {
4667     BlockDriverAIOCBCoroutine *acb = opaque;
4668     BlockDriverState *bs = acb->common.bs;
4669 
4670     if (!acb->is_write) {
4671         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4672             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4673     } else {
4674         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4675             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4676     }
4677 
4678     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4679     qemu_bh_schedule(acb->bh);
4680 }
4681 
4682 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4683                                                int64_t sector_num,
4684                                                QEMUIOVector *qiov,
4685                                                int nb_sectors,
4686                                                BdrvRequestFlags flags,
4687                                                BlockDriverCompletionFunc *cb,
4688                                                void *opaque,
4689                                                bool is_write)
4690 {
4691     Coroutine *co;
4692     BlockDriverAIOCBCoroutine *acb;
4693 
4694     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4695     acb->req.sector = sector_num;
4696     acb->req.nb_sectors = nb_sectors;
4697     acb->req.qiov = qiov;
4698     acb->req.flags = flags;
4699     acb->is_write = is_write;
4700     acb->done = NULL;
4701 
4702     co = qemu_coroutine_create(bdrv_co_do_rw);
4703     qemu_coroutine_enter(co, acb);
4704 
4705     return &acb->common;
4706 }
4707 
4708 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4709 {
4710     BlockDriverAIOCBCoroutine *acb = opaque;
4711     BlockDriverState *bs = acb->common.bs;
4712 
4713     acb->req.error = bdrv_co_flush(bs);
4714     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4715     qemu_bh_schedule(acb->bh);
4716 }
4717 
4718 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4719         BlockDriverCompletionFunc *cb, void *opaque)
4720 {
4721     trace_bdrv_aio_flush(bs, opaque);
4722 
4723     Coroutine *co;
4724     BlockDriverAIOCBCoroutine *acb;
4725 
4726     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4727     acb->done = NULL;
4728 
4729     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4730     qemu_coroutine_enter(co, acb);
4731 
4732     return &acb->common;
4733 }
4734 
4735 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4736 {
4737     BlockDriverAIOCBCoroutine *acb = opaque;
4738     BlockDriverState *bs = acb->common.bs;
4739 
4740     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4741     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4742     qemu_bh_schedule(acb->bh);
4743 }
4744 
4745 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4746         int64_t sector_num, int nb_sectors,
4747         BlockDriverCompletionFunc *cb, void *opaque)
4748 {
4749     Coroutine *co;
4750     BlockDriverAIOCBCoroutine *acb;
4751 
4752     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4753 
4754     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4755     acb->req.sector = sector_num;
4756     acb->req.nb_sectors = nb_sectors;
4757     acb->done = NULL;
4758     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4759     qemu_coroutine_enter(co, acb);
4760 
4761     return &acb->common;
4762 }
4763 
4764 void bdrv_init(void)
4765 {
4766     module_call_init(MODULE_INIT_BLOCK);
4767 }
4768 
4769 void bdrv_init_with_whitelist(void)
4770 {
4771     use_bdrv_whitelist = 1;
4772     bdrv_init();
4773 }
4774 
4775 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4776                    BlockDriverCompletionFunc *cb, void *opaque)
4777 {
4778     BlockDriverAIOCB *acb;
4779 
4780     acb = g_slice_alloc(aiocb_info->aiocb_size);
4781     acb->aiocb_info = aiocb_info;
4782     acb->bs = bs;
4783     acb->cb = cb;
4784     acb->opaque = opaque;
4785     return acb;
4786 }
4787 
4788 void qemu_aio_release(void *p)
4789 {
4790     BlockDriverAIOCB *acb = p;
4791     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4792 }
4793 
4794 /**************************************************************/
4795 /* Coroutine block device emulation */
4796 
4797 typedef struct CoroutineIOCompletion {
4798     Coroutine *coroutine;
4799     int ret;
4800 } CoroutineIOCompletion;
4801 
4802 static void bdrv_co_io_em_complete(void *opaque, int ret)
4803 {
4804     CoroutineIOCompletion *co = opaque;
4805 
4806     co->ret = ret;
4807     qemu_coroutine_enter(co->coroutine, NULL);
4808 }
4809 
4810 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4811                                       int nb_sectors, QEMUIOVector *iov,
4812                                       bool is_write)
4813 {
4814     CoroutineIOCompletion co = {
4815         .coroutine = qemu_coroutine_self(),
4816     };
4817     BlockDriverAIOCB *acb;
4818 
4819     if (is_write) {
4820         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4821                                        bdrv_co_io_em_complete, &co);
4822     } else {
4823         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4824                                       bdrv_co_io_em_complete, &co);
4825     }
4826 
4827     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4828     if (!acb) {
4829         return -EIO;
4830     }
4831     qemu_coroutine_yield();
4832 
4833     return co.ret;
4834 }
4835 
4836 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4837                                          int64_t sector_num, int nb_sectors,
4838                                          QEMUIOVector *iov)
4839 {
4840     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4841 }
4842 
4843 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4844                                          int64_t sector_num, int nb_sectors,
4845                                          QEMUIOVector *iov)
4846 {
4847     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4848 }
4849 
4850 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4851 {
4852     RwCo *rwco = opaque;
4853 
4854     rwco->ret = bdrv_co_flush(rwco->bs);
4855 }
4856 
4857 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4858 {
4859     int ret;
4860 
4861     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4862         return 0;
4863     }
4864 
4865     /* Write back cached data to the OS even with cache=unsafe */
4866     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4867     if (bs->drv->bdrv_co_flush_to_os) {
4868         ret = bs->drv->bdrv_co_flush_to_os(bs);
4869         if (ret < 0) {
4870             return ret;
4871         }
4872     }
4873 
4874     /* But don't actually force it to the disk with cache=unsafe */
4875     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4876         goto flush_parent;
4877     }
4878 
4879     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4880     if (bs->drv->bdrv_co_flush_to_disk) {
4881         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4882     } else if (bs->drv->bdrv_aio_flush) {
4883         BlockDriverAIOCB *acb;
4884         CoroutineIOCompletion co = {
4885             .coroutine = qemu_coroutine_self(),
4886         };
4887 
4888         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4889         if (acb == NULL) {
4890             ret = -EIO;
4891         } else {
4892             qemu_coroutine_yield();
4893             ret = co.ret;
4894         }
4895     } else {
4896         /*
4897          * Some block drivers always operate in either writethrough or unsafe
4898          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4899          * know how the server works (because the behaviour is hardcoded or
4900          * depends on server-side configuration), so we can't ensure that
4901          * everything is safe on disk. Returning an error doesn't work because
4902          * that would break guests even if the server operates in writethrough
4903          * mode.
4904          *
4905          * Let's hope the user knows what he's doing.
4906          */
4907         ret = 0;
4908     }
4909     if (ret < 0) {
4910         return ret;
4911     }
4912 
4913     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4914      * in the case of cache=unsafe, so there are no useless flushes.
4915      */
4916 flush_parent:
4917     return bdrv_co_flush(bs->file);
4918 }
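
/*
 * Editorial summary of the driver hooks used by the flush path above,
 * in order of preference:
 *
 *     bdrv_co_flush_to_os    - flush internal caches to the host OS
 *     bdrv_co_flush_to_disk  - coroutine flush to stable storage
 *     bdrv_aio_flush         - AIO fallback, bridged into the coroutine
 *                              via bdrv_co_io_em_complete()
 *     (none of the above)    - assume the driver needs no flushing
 *
 * Unless an error occurred, bs->file is flushed afterwards; with
 * BDRV_O_NO_FLUSH the flush to stable storage is skipped entirely.
 */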
4919 
4920 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4921 {
4922     Error *local_err = NULL;
4923     int ret;
4924 
4925     if (!bs->drv)  {
4926         return;
4927     }
4928 
4929     if (bs->drv->bdrv_invalidate_cache) {
4930         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4931     } else if (bs->file) {
4932         bdrv_invalidate_cache(bs->file, &local_err);
4933     }
4934     if (local_err) {
4935         error_propagate(errp, local_err);
4936         return;
4937     }
4938 
4939     ret = refresh_total_sectors(bs, bs->total_sectors);
4940     if (ret < 0) {
4941         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4942         return;
4943     }
4944 }
4945 
4946 void bdrv_invalidate_cache_all(Error **errp)
4947 {
4948     BlockDriverState *bs;
4949     Error *local_err = NULL;
4950 
4951     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4952         bdrv_invalidate_cache(bs, &local_err);
4953         if (local_err) {
4954             error_propagate(errp, local_err);
4955             return;
4956         }
4957     }
4958 }
4959 
4960 void bdrv_clear_incoming_migration_all(void)
4961 {
4962     BlockDriverState *bs;
4963 
4964     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4965         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4966     }
4967 }
4968 
4969 int bdrv_flush(BlockDriverState *bs)
4970 {
4971     Coroutine *co;
4972     RwCo rwco = {
4973         .bs = bs,
4974         .ret = NOT_DONE,
4975     };
4976 
4977     if (qemu_in_coroutine()) {
4978         /* Fast-path if already in coroutine context */
4979         bdrv_flush_co_entry(&rwco);
4980     } else {
4981         co = qemu_coroutine_create(bdrv_flush_co_entry);
4982         qemu_coroutine_enter(co, &rwco);
4983         while (rwco.ret == NOT_DONE) {
4984             qemu_aio_wait();
4985         }
4986     }
4987 
4988     return rwco.ret;
4989 }
4990 
4991 typedef struct DiscardCo {
4992     BlockDriverState *bs;
4993     int64_t sector_num;
4994     int nb_sectors;
4995     int ret;
4996 } DiscardCo;
4997 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4998 {
4999     DiscardCo *rwco = opaque;
5000 
5001     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5002 }
5003 
5004 /* If no limit is specified in the BlockLimits, use a default
5005  * of 32768 512-byte sectors (16 MiB) per request.
5006  */
5007 #define MAX_DISCARD_DEFAULT 32768
5008 
5009 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5010                                  int nb_sectors)
5011 {
5012     int max_discard;
5013 
5014     if (!bs->drv) {
5015         return -ENOMEDIUM;
5016     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5017         return -EIO;
5018     } else if (bs->read_only) {
5019         return -EROFS;
5020     }
5021 
5022     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5023 
5024     /* Do nothing if disabled.  */
5025     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5026         return 0;
5027     }
5028 
5029     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5030         return 0;
5031     }
5032 
5033     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5034     while (nb_sectors > 0) {
5035         int ret;
5036         int num = nb_sectors;
5037 
5038         /* align request */
5039         if (bs->bl.discard_alignment &&
5040             num >= bs->bl.discard_alignment &&
5041             sector_num % bs->bl.discard_alignment) {
5042             if (num > bs->bl.discard_alignment) {
5043                 num = bs->bl.discard_alignment;
5044             }
5045             num -= sector_num % bs->bl.discard_alignment;
5046         }
5047 
5048         /* limit request size */
5049         if (num > max_discard) {
5050             num = max_discard;
5051         }
5052 
5053         if (bs->drv->bdrv_co_discard) {
5054             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5055         } else {
5056             BlockDriverAIOCB *acb;
5057             CoroutineIOCompletion co = {
5058                 .coroutine = qemu_coroutine_self(),
5059             };
5060 
5061             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5062                                             bdrv_co_io_em_complete, &co);
5063             if (acb == NULL) {
5064                 return -EIO;
5065             } else {
5066                 qemu_coroutine_yield();
5067                 ret = co.ret;
5068             }
5069         }
5070         if (ret && ret != -ENOTSUP) {
5071             return ret;
5072         }
5073 
5074         sector_num += num;
5075         nb_sectors -= num;
5076     }
5077     return 0;
5078 }
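
/*
 * Worked example of the splitting loop above (editorial addition,
 * assuming bs->bl.discard_alignment == 8 and a large max_discard): a
 * discard of 20 sectors starting at sector 5 is issued as a 3-sector
 * request covering [5, 8) to reach the alignment boundary, followed by
 * a single 17-sector request covering [8, 25).
 */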
5079 
5080 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5081 {
5082     Coroutine *co;
5083     DiscardCo rwco = {
5084         .bs = bs,
5085         .sector_num = sector_num,
5086         .nb_sectors = nb_sectors,
5087         .ret = NOT_DONE,
5088     };
5089 
5090     if (qemu_in_coroutine()) {
5091         /* Fast-path if already in coroutine context */
5092         bdrv_discard_co_entry(&rwco);
5093     } else {
5094         co = qemu_coroutine_create(bdrv_discard_co_entry);
5095         qemu_coroutine_enter(co, &rwco);
5096         while (rwco.ret == NOT_DONE) {
5097             qemu_aio_wait();
5098         }
5099     }
5100 
5101     return rwco.ret;
5102 }
5103 
5104 /**************************************************************/
5105 /* removable device support */
5106 
5107 /**
5108  * Return TRUE if the media is present
5109  */
5110 int bdrv_is_inserted(BlockDriverState *bs)
5111 {
5112     BlockDriver *drv = bs->drv;
5113 
5114     if (!drv)
5115         return 0;
5116     if (!drv->bdrv_is_inserted)
5117         return 1;
5118     return drv->bdrv_is_inserted(bs);
5119 }
5120 
5121 /**
5122  * Return whether the media changed since the last call to this
5123  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5124  */
5125 int bdrv_media_changed(BlockDriverState *bs)
5126 {
5127     BlockDriver *drv = bs->drv;
5128 
5129     if (drv && drv->bdrv_media_changed) {
5130         return drv->bdrv_media_changed(bs);
5131     }
5132     return -ENOTSUP;
5133 }
5134 
5135 /**
5136  * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5137  */
5138 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5139 {
5140     BlockDriver *drv = bs->drv;
5141 
5142     if (drv && drv->bdrv_eject) {
5143         drv->bdrv_eject(bs, eject_flag);
5144     }
5145 
5146     if (bs->device_name[0] != '\0') {
5147         bdrv_emit_qmp_eject_event(bs, eject_flag);
5148     }
5149 }
5150 
5151 /**
5152  * Lock or unlock the media (if it is locked, the user won't be able
5153  * to eject it manually).
5154  */
5155 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5156 {
5157     BlockDriver *drv = bs->drv;
5158 
5159     trace_bdrv_lock_medium(bs, locked);
5160 
5161     if (drv && drv->bdrv_lock_medium) {
5162         drv->bdrv_lock_medium(bs, locked);
5163     }
5164 }
5165 
5166 /* needed for generic scsi interface */
5167 
5168 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5169 {
5170     BlockDriver *drv = bs->drv;
5171 
    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
5174     return -ENOTSUP;
5175 }
5176 
5177 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5178         unsigned long int req, void *buf,
5179         BlockDriverCompletionFunc *cb, void *opaque)
5180 {
5181     BlockDriver *drv = bs->drv;
5182 
    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
5185     return NULL;
5186 }
5187 
5188 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5189 {
5190     bs->guest_block_size = align;
5191 }
5192 
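/* Allocate a buffer that honours the memory alignment bs requires for I/O
 * (cf. bdrv_opt_mem_align()); release it with qemu_vfree(). */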
5193 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5194 {
5195     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5196 }
5197 
5198 /*
5199  * Check if all buffers in this vector meet the driver's memory alignment.
5200  */
5201 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5202 {
5203     int i;
5204     size_t alignment = bdrv_opt_mem_align(bs);
5205 
5206     for (i = 0; i < qiov->niov; i++) {
5207         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5208             return false;
5209         }
5210         if (qiov->iov[i].iov_len % alignment) {
5211             return false;
5212         }
5213     }
5214 
5215     return true;
5216 }
5217 
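/*
 * Create a dirty bitmap that tracks writes to bs.  granularity is in bytes,
 * must be a power of two and at least BDRV_SECTOR_SIZE; each bit of the
 * bitmap covers granularity bytes of the image.
 */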
5218 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5219                                           Error **errp)
5220 {
5221     int64_t bitmap_size;
5222     BdrvDirtyBitmap *bitmap;
5223 
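    /* x & (x - 1) clears the lowest set bit, so this asserts that
     * granularity is a power of two */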
5224     assert((granularity & (granularity - 1)) == 0);
5225 
5226     granularity >>= BDRV_SECTOR_BITS;
5227     assert(granularity);
5228     bitmap_size = bdrv_getlength(bs);
5229     if (bitmap_size < 0) {
5230         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5231         errno = -bitmap_size;
5232         return NULL;
5233     }
5234     bitmap_size >>= BDRV_SECTOR_BITS;
5235     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
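    /* granularity is a power of two in sectors here, so ffs(granularity) - 1
     * is its log2, which is the shift hbitmap expects */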
5236     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5237     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5238     return bitmap;
5239 }
5240 
5241 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5242 {
5243     BdrvDirtyBitmap *bm, *next;
5244     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5245         if (bm == bitmap) {
5246             QLIST_REMOVE(bitmap, list);
5247             hbitmap_free(bitmap->bitmap);
5248             g_free(bitmap);
5249             return;
5250         }
5251     }
5252 }
5253 
5254 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5255 {
5256     BdrvDirtyBitmap *bm;
5257     BlockDirtyInfoList *list = NULL;
5258     BlockDirtyInfoList **plist = &list;
5259 
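    /* plist always points at the tail link, so entries are appended in
     * the order the bitmaps appear on the list */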
5260     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5261         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5262         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5263         info->count = bdrv_get_dirty_count(bs, bm);
5264         info->granularity =
5265             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5266         entry->value = info;
5267         *plist = entry;
5268         plist = &entry->next;
5269     }
5270 
5271     return list;
5272 }
5273 
5274 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5275 {
5276     if (bitmap) {
5277         return hbitmap_get(bitmap->bitmap, sector);
5278     } else {
5279         return 0;
5280     }
5281 }
5282 
5283 void bdrv_dirty_iter_init(BlockDriverState *bs,
5284                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5285 {
5286     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5287 }
5288 
5289 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5290                     int nr_sectors)
5291 {
5292     BdrvDirtyBitmap *bitmap;
5293     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5294         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5295     }
5296 }
5297 
5298 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5299 {
5300     BdrvDirtyBitmap *bitmap;
5301     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5302         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5303     }
5304 }
5305 
5306 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5307 {
5308     return hbitmap_count(bitmap->bitmap);
5309 }
5310 
5311 /* Get a reference to bs */
5312 void bdrv_ref(BlockDriverState *bs)
5313 {
5314     bs->refcnt++;
5315 }
5316 
5317 /* Release a previously acquired reference to bs.
5318  * If the reference count drops to zero after the release, the
5319  * BlockDriverState is deleted. */
5320 void bdrv_unref(BlockDriverState *bs)
5321 {
5322     assert(bs->refcnt > 0);
5323     if (--bs->refcnt == 0) {
5324         bdrv_delete(bs);
5325     }
5326 }
5327 
5328 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5329 {
5330     assert(bs->in_use != in_use);
5331     bs->in_use = in_use;
5332 }
5333 
5334 int bdrv_in_use(BlockDriverState *bs)
5335 {
5336     return bs->in_use;
5337 }
5338 
5339 void bdrv_iostatus_enable(BlockDriverState *bs)
5340 {
5341     bs->iostatus_enabled = true;
5342     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5343 }
5344 
5345 /* The I/O status is only enabled if the drive explicitly
5346  * enables it _and_ the VM is configured to stop on errors */
5347 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5348 {
5349     return (bs->iostatus_enabled &&
5350            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5351             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5352             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5353 }
5354 
5355 void bdrv_iostatus_disable(BlockDriverState *bs)
5356 {
5357     bs->iostatus_enabled = false;
5358 }
5359 
5360 void bdrv_iostatus_reset(BlockDriverState *bs)
5361 {
5362     if (bdrv_iostatus_is_enabled(bs)) {
5363         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5364         if (bs->job) {
5365             block_job_iostatus_reset(bs->job);
5366         }
5367     }
5368 }
5369 
5370 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5371 {
5372     assert(bdrv_iostatus_is_enabled(bs));
5373     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5374         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5375                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5376     }
5377 }
5378 
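/* I/O accounting: a caller fills in a BlockAcctCookie with bdrv_acct_start()
 * before issuing a request and hands it to bdrv_acct_done() on completion;
 * the per-type byte, operation and total-time counters of bs are updated
 * there. */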
5379 void
5380 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5381         enum BlockAcctType type)
5382 {
5383     assert(type < BDRV_MAX_IOTYPE);
5384 
5385     cookie->bytes = bytes;
5386     cookie->start_time_ns = get_clock();
5387     cookie->type = type;
5388 }
5389 
5390 void
5391 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5392 {
5393     assert(cookie->type < BDRV_MAX_IOTYPE);
5394 
5395     bs->nr_bytes[cookie->type] += cookie->bytes;
5396     bs->nr_ops[cookie->type]++;
5397     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5398 }
5399 
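/*
 * Create a new disk image.  A minimal hypothetical caller (qemu-img's
 * "create" command does essentially this) could look like:
 *
 *     Error *err = NULL;
 *     bdrv_img_create("disk.qcow2", "qcow2", NULL, NULL, NULL,
 *                     8 * 1024 * 1024 * 1024ULL, 0, &err, false);
 *     if (err) {
 *         error_report("%s", error_get_pretty(err));
 *         error_free(err);
 *     }
 *
 * On failure *errp is set and no image is created.
 */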
5400 void bdrv_img_create(const char *filename, const char *fmt,
5401                      const char *base_filename, const char *base_fmt,
5402                      char *options, uint64_t img_size, int flags,
5403                      Error **errp, bool quiet)
5404 {
5405     QEMUOptionParameter *param = NULL, *create_options = NULL;
5406     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5407     BlockDriver *drv, *proto_drv;
5408     BlockDriver *backing_drv = NULL;
5409     Error *local_err = NULL;
5410     int ret = 0;
5411 
5412     /* Find driver and parse its options */
5413     drv = bdrv_find_format(fmt);
5414     if (!drv) {
5415         error_setg(errp, "Unknown file format '%s'", fmt);
5416         return;
5417     }
5418 
5419     proto_drv = bdrv_find_protocol(filename, true);
5420     if (!proto_drv) {
5421         error_setg(errp, "Unknown protocol '%s'", filename);
5422         return;
5423     }
5424 
5425     create_options = append_option_parameters(create_options,
5426                                               drv->create_options);
5427     create_options = append_option_parameters(create_options,
5428                                               proto_drv->create_options);
5429 
5430     /* Create parameter list with default values */
5431     param = parse_option_parameters("", create_options, param);
5432 
5433     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5434 
5435     /* Parse -o options */
5436     if (options) {
5437         param = parse_option_parameters(options, create_options, param);
5438         if (param == NULL) {
5439             error_setg(errp, "Invalid options for file format '%s'", fmt);
5440             goto out;
5441         }
5442     }
5443 
5444     if (base_filename) {
5445         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5446                                  base_filename)) {
5447             error_setg(errp, "Backing file not supported for file format '%s'",
5448                        fmt);
5449             goto out;
5450         }
5451     }
5452 
5453     if (base_fmt) {
5454         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5455             error_setg(errp, "Backing file format not supported for file "
5456                              "format '%s'", fmt);
5457             goto out;
5458         }
5459     }
5460 
5461     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5462     if (backing_file && backing_file->value.s) {
5463         if (!strcmp(filename, backing_file->value.s)) {
5464             error_setg(errp, "Trying to create an image with the same "
5465                              "filename as the backing file");
5466             goto out;
5467         }
5468     }
5469 
5470     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5471     if (backing_fmt && backing_fmt->value.s) {
5472         backing_drv = bdrv_find_format(backing_fmt->value.s);
5473         if (!backing_drv) {
5474             error_setg(errp, "Unknown backing file format '%s'",
5475                        backing_fmt->value.s);
5476             goto out;
5477         }
5478     }
5479 
5480     /* The size for the image must always be specified, with one exception:
5481      * if we are using a backing file, we can obtain the size from there. */
5482     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5483     if (size && size->value.n == -1) {
5484         if (backing_file && backing_file->value.s) {
5485             BlockDriverState *bs;
5486             uint64_t backing_size;
5487             char buf[32];
5488             int back_flags;
5489 
5490             /* backing files always opened read-only */
5491             back_flags =
5492                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5493 
5494             bs = NULL;
5495             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5496                             backing_drv, &local_err);
5497             if (ret < 0) {
5498                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5499                                  backing_file->value.s,
5500                                  error_get_pretty(local_err));
5501                 error_free(local_err);
5502                 local_err = NULL;
5503                 goto out;
5504             }
5505             bdrv_get_geometry(bs, &backing_size);
5506             backing_size *= BDRV_SECTOR_SIZE;
5507 
5508             snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
5509             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5510 
5511             bdrv_unref(bs);
5512         } else {
5513             error_setg(errp, "Image creation needs a size parameter");
5514             goto out;
5515         }
5516     }
5517 
5518     if (!quiet) {
5519         printf("Formatting '%s', fmt=%s ", filename, fmt);
5520         print_option_parameters(param);
5521         puts("");
5522     }
5523     ret = bdrv_create(drv, filename, param, &local_err);
5524     if (ret == -EFBIG) {
5525         /* This is generally a better message than whatever the driver would
5526          * deliver (especially because of the cluster_size_hint), since that
5527          * is most probably not much different from "image too large". */
5528         const char *cluster_size_hint = "";
5529         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5530             cluster_size_hint = " (try using a larger cluster size)";
5531         }
5532         error_setg(errp, "The image size is too large for file format '%s'"
5533                    "%s", fmt, cluster_size_hint);
5534         error_free(local_err);
5535         local_err = NULL;
5536     }
5537 
5538 out:
5539     free_option_parameters(create_options);
5540     free_option_parameters(param);
5541 
5542     if (local_err) {
5543         error_propagate(errp, local_err);
5544     }
5545 }
5546 
5547 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5548 {
5549     /* Currently BlockDriverState always uses the main loop AioContext */
5550     return qemu_get_aio_context();
5551 }
5552 
5553 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5554                                     NotifierWithReturn *notifier)
5555 {
5556     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5557 }
5558 
5559 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5560 {
5561     if (!bs->drv || !bs->drv->bdrv_amend_options) {
5562         return -ENOTSUP;
5563     }
5564     return bs->drv->bdrv_amend_options(bs, options);
5565 }
5566 
5567 /* This function is called by the bdrv_recurse_is_first_non_filter method
5568  * of block filters and by bdrv_is_first_non_filter.
5569  * It is used to test whether the given bs is the candidate, or to recurse
5570  * further down the node graph.
5571  */
5572 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5573                                       BlockDriverState *candidate)
5574 {
5575     /* return false if the basic checks fail */
5576     if (!bs || !bs->drv) {
5577         return false;
5578     }
5579 
5580     /* the code reached a non-filter driver -> check whether bs is the same
5581      * as the candidate.  This is the recursion's termination condition.
5582      */
5583     if (!bs->drv->is_filter) {
5584         return bs == candidate;
5585     }
5586     /* Down this path the driver is a block filter driver */
5587 
5588     /* If the block filter recursion method is defined, use it to recurse
5589      * down the node graph.
5590      */
5591     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5592         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5593     }
5594 
5595     /* the driver is a block filter but does not allow recursion ->
5596      * return false */
5597     return false;
5598 }
5599 
5600 /* This function checks whether the candidate is the first non-filter bs
5601  * down its bs chain.  Since we have no pointers to parents, it explores
5602  * all bs chains from the top.  Some filters can choose not to pass down
5603  * the recursion. */
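/* For example, with blkdebug (a filter driver) stacked on top of a qcow2
 * node, the qcow2 bs is the first non-filter of that chain. */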
5604 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5605 {
5606     BlockDriverState *bs;
5607 
5608     /* walk down the bs forest recursively */
5609     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5610         bool perm;
5611 
5612         /* try to recurse in this top level bs */
5613         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5614 
5615         /* candidate is the first non filter */
5616         if (perm) {
5617             return true;
5618         }
5619     }
5620 
5621     return false;
5622 }
5623