/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

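/*
 * Example (illustrative sketch, not part of the original file): enabling
 * throttling and capping a drive at roughly 1 MB/s of total bandwidth.
 * The ThrottleConfig/LeakyBucket field names are assumed from
 * include/qemu/throttle.h of this era; "example_limit_drive" is a
 * hypothetical caller.
 */
#if 0 /* illustrative example, not compiled */
static void example_limit_drive(BlockDriverState *bs)
{
    ThrottleConfig cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1 * 1024 * 1024; /* bytes/s */

    bdrv_io_limits_enable(bs);    /* must precede bdrv_set_io_limits() */
    bdrv_set_io_limits(bs, &cfg);
}
#endif
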
/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or if any request of this type is already queued,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

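/*
 * Example (illustrative sketch, not part of the original file): allocating
 * a bounce buffer suitable for O_DIRECT-style I/O on bs. qemu_memalign()
 * and qemu_vfree() are the usual QEMU helpers for aligned allocations.
 */
#if 0 /* illustrative example, not compiled */
static void *example_alloc_io_buffer(BlockDriverState *bs, size_t len)
{
    /* respect the device's optimal memory alignment */
    return qemu_memalign(bdrv_opt_mem_align(bs), len);
    /* free later with qemu_vfree() */
}
#endif
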
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

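/*
 * Example (illustrative, not part of the original file): how path_combine()
 * behaves for a backing file next to its image.
 *
 *   char dest[PATH_MAX];
 *   path_combine(dest, sizeof(dest), "/images/base.qcow2", "snap.qcow2");
 *       => dest == "/images/snap.qcow2"
 *   path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs.qcow2");
 *       => dest == "/abs.qcow2"  (absolute filenames are copied verbatim)
 */
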
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

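/*
 * Example (hypothetical sketch, not part of the original file): the minimal
 * shape of a format driver registration. "bdrv_mydrv", "MyDrvState" and the
 * mydrv_* callbacks are invented names; real drivers fill in many more
 * fields. Drivers that provide .bdrv_co_readv/.bdrv_co_writev skip the
 * emulation fallbacks installed above.
 */
#if 0 /* illustrative example, not compiled */
static BlockDriver bdrv_mydrv = {
    .format_name    = "mydrv",
    .instance_size  = sizeof(MyDrvState),
    .bdrv_open      = mydrv_open,
    .bdrv_close     = mydrv_close,
    .bdrv_co_readv  = mydrv_co_readv,   /* native coroutine path */
    .bdrv_co_writev = mydrv_co_writev,
};

static void bdrv_mydrv_init(void)
{
    bdrv_register(&bdrv_mydrv);
}

block_init(bdrv_mydrv_init);
#endif
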
/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

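/*
 * Example (illustrative sketch, not part of the original file): creating a
 * 1 GB qcow2 image, mirroring what bdrv_append_temp_snapshot() below does
 * for its temporary overlay. "example_create_qcow2" is a hypothetical
 * helper.
 */
#if 0 /* illustrative example, not compiled */
static int example_create_qcow2(const char *filename, Error **errp)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *opts =
        parse_option_parameters("", drv->create_options, NULL);
    int ret;

    set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);
    ret = bdrv_create(drv, filename, opts, errp);
    free_option_parameters(opts);
    return ret;
}
#endif
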
int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

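/*
 * Example (illustrative, not part of the original file): typical use, as in
 * bdrv_append_temp_snapshot() further down.
 *
 *   char *tmp = g_malloc0(PATH_MAX + 1);
 *   if (get_tmp_filename(tmp, PATH_MAX + 1) < 0) {
 *       ... report the error ...
 *   }
 *   ... create an image at tmp ...
 *   g_free(tmp);
 */
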
/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

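/*
 * Summary of the mapping implemented above (flags OR-ed into *flags;
 * added here as an illustrative aid, not part of the original file):
 *
 *   off/none     -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   directsync   -> BDRV_O_NOCACHE
 *   writeback    -> BDRV_O_CACHE_WB
 *   unsafe       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   writethrough -> (no flags; the default)
 */
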
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

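/*
 * Example (illustrative, not part of the original file): because the flag
 * is a reference count, nested users compose safely.
 *
 *   bdrv_enable_copy_on_read(bs);   // user A
 *   bdrv_enable_copy_on_read(bs);   // user B
 *   bdrv_disable_copy_on_read(bs);  // A done; still enabled for B
 *   bdrv_disable_copy_on_read(bs);  // now actually disabled
 */
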
/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called with a protocol driver directly as drv. This
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or a pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        parse_filename = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}

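/*
 * Example (illustrative, not part of the original file): with bdref_key
 * "file", either of the following option sets selects the protocol layer;
 * the names are hypothetical.
 *
 *   file=drive0                        <- reference to an existing BDS
 *
 *   file.driver=file \
 *   file.filename=/tmp/disk.img        <- flattened inline BlockdevRef
 */
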
void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QEMUOptionParameter *create_options;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }
    total_size &= BDRV_SECTOR_MASK;

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                             NULL);

    set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

    ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
    free_option_parameters(create_options);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}

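/*
 * Example (illustrative, not part of the original file): a json: filename
 * and the flattened QDict it produces.
 *
 *   json:{"driver": "qcow2",
 *         "file": {"driver": "file", "filename": "/tmp/disk.qcow2"}}
 *
 * becomes the options
 *
 *   driver=qcow2, file.driver=file, file.filename=/tmp/disk.qcow2
 */
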
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            ret = -EINVAL;
            goto fail;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(options, json_options, false);
        QDECREF(json_options);
        filename = NULL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }
    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_flags = bdrv_temp_snapshot_flags(flags);
        flags = bdrv_backing_flags(flags);
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

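/*
 * Example (illustrative sketch, not part of the original file): opening an
 * image through an options QDict, much as bdrv_append_temp_snapshot() does
 * above. Note that bdrv_open() consumes the QDict reference even on
 * failure. "example_open" is a hypothetical helper.
 */
#if 0 /* illustrative example, not compiled */
static BlockDriverState *example_open(const char *path, Error **errp)
{
    BlockDriverState *bs = NULL;
    QDict *opts = qdict_new();

    qdict_put(opts, "file.driver", qstring_from_str("file"));
    qdict_put(opts, "file.filename", qstring_from_str(path));

    /* format driver is probed since drv == NULL and no "driver" option */
    if (bdrv_open(&bs, NULL, NULL, opts, BDRV_O_RDWR, NULL, errp) < 0) {
        return NULL;   /* opts has already been consumed */
    }
    return bs;
}
#endif
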
typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

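/*
 * Example (illustrative sketch, not part of the original file): atomically
 * switching two devices to read-only with one transactional reopen.
 * bdrv_reopen_multiple() frees the queue itself, whether it succeeds or
 * fails. "example_reopen_ro" is a hypothetical helper.
 */
#if 0 /* illustrative example, not compiled */
static int example_reopen_ro(BlockDriverState *a, BlockDriverState *b,
                             Error **errp)
{
    BlockReopenQueue *queue;

    queue = bdrv_reopen_queue(NULL, a, a->open_flags & ~BDRV_O_RDWR);
    queue = bdrv_reopen_queue(queue, b, b->open_flags & ~BDRV_O_RDWR);
    return bdrv_reopen_multiple(queue, errp);
}
#endif
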
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

1730 /*
1731  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1732  * makes them final by swapping the staging BlockDriverState contents into
1733  * the active BlockDriverState contents.
1734  */
1735 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1736 {
1737     BlockDriver *drv;
1738 
1739     assert(reopen_state != NULL);
1740     drv = reopen_state->bs->drv;
1741     assert(drv != NULL);
1742 
1743     /* If there are any driver level actions to take */
1744     if (drv->bdrv_reopen_commit) {
1745         drv->bdrv_reopen_commit(reopen_state);
1746     }
1747 
1748     /* set BDS specific flags now */
1749     reopen_state->bs->open_flags         = reopen_state->flags;
1750     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1751                                               BDRV_O_CACHE_WB);
1752     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1753 
1754     bdrv_refresh_limits(reopen_state->bs);
1755 }
1756 
1757 /*
1758  * Abort the reopen, and delete and free the staged changes in
1759  * reopen_state
1760  */
1761 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1762 {
1763     BlockDriver *drv;
1764 
1765     assert(reopen_state != NULL);
1766     drv = reopen_state->bs->drv;
1767     assert(drv != NULL);
1768 
1769     if (drv->bdrv_reopen_abort) {
1770         drv->bdrv_reopen_abort(reopen_state);
1771     }
1772 }
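/*
 * Example (illustrative sketch): the shape of the driver callbacks that
 * back the prepare/commit/abort sequence above.  The driver stages its
 * changes in reopen_state->opaque during prepare and either applies or
 * discards them later; "ExampleReopenState" and the callback bodies are
 * hypothetical.
 */
#if 0
typedef struct ExampleReopenState {
    int new_fd;                          /* staged, not yet active */
} ExampleReopenState;

static int example_reopen_prepare(BDRVReopenState *state,
                                  BlockReopenQueue *queue, Error **errp)
{
    ExampleReopenState *s = g_new0(ExampleReopenState, 1);

    s->new_fd = -1;                      /* acquire new resources here */
    state->opaque = s;
    return 0;
}

static void example_reopen_commit(BDRVReopenState *state)
{
    /* swap the staged resources into the active state, then free them */
    g_free(state->opaque);
    state->opaque = NULL;
}

static void example_reopen_abort(BDRVReopenState *state)
{
    /* release the staged resources without touching the active state */
    g_free(state->opaque);
    state->opaque = NULL;
}
#endif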
1773 
1774 
1775 void bdrv_close(BlockDriverState *bs)
1776 {
1777     if (bs->job) {
1778         block_job_cancel_sync(bs->job);
1779     }
1780     bdrv_drain_all(); /* complete I/O */
1781     bdrv_flush(bs);
1782     bdrv_drain_all(); /* in case flush left pending I/O */
1783     notifier_list_notify(&bs->close_notifiers, bs);
1784 
1785     if (bs->drv) {
1786         if (bs->backing_hd) {
1787             bdrv_unref(bs->backing_hd);
1788             bs->backing_hd = NULL;
1789         }
1790         bs->drv->bdrv_close(bs);
1791         g_free(bs->opaque);
1792         bs->opaque = NULL;
1793         bs->drv = NULL;
1794         bs->copy_on_read = 0;
1795         bs->backing_file[0] = '\0';
1796         bs->backing_format[0] = '\0';
1797         bs->total_sectors = 0;
1798         bs->encrypted = 0;
1799         bs->valid_key = 0;
1800         bs->sg = 0;
1801         bs->growable = 0;
1802         bs->zero_beyond_eof = false;
1803         QDECREF(bs->options);
1804         bs->options = NULL;
1805 
1806         if (bs->file != NULL) {
1807             bdrv_unref(bs->file);
1808             bs->file = NULL;
1809         }
1810     }
1811 
1812     bdrv_dev_change_media_cb(bs, false);
1813 
1814     /* throttling disk I/O limits */
1815     if (bs->io_limits_enabled) {
1816         bdrv_io_limits_disable(bs);
1817     }
1818 }
1819 
1820 void bdrv_close_all(void)
1821 {
1822     BlockDriverState *bs;
1823 
1824     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1825         bdrv_close(bs);
1826     }
1827 }
1828 
1829 /* Check if any requests are in-flight (including throttled requests) */
1830 static bool bdrv_requests_pending(BlockDriverState *bs)
1831 {
1832     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1833         return true;
1834     }
1835     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1836         return true;
1837     }
1838     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1839         return true;
1840     }
1841     if (bs->file && bdrv_requests_pending(bs->file)) {
1842         return true;
1843     }
1844     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1845         return true;
1846     }
1847     return false;
1848 }
1849 
1850 static bool bdrv_requests_pending_all(void)
1851 {
1852     BlockDriverState *bs;
1853     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1854         if (bdrv_requests_pending(bs)) {
1855             return true;
1856         }
1857     }
1858     return false;
1859 }
1860 
1861 /*
1862  * Wait for pending requests to complete across all BlockDriverStates
1863  *
1864  * This function does not flush data to disk, use bdrv_flush_all() for that
1865  * after calling this function.
1866  *
1867  * Note that completion of an asynchronous I/O operation can trigger any
1868  * number of other I/O operations on other devices: for example, a coroutine
1869  * can be arbitrarily complex and can keep submitting new I/O until it
1870  * completes.  Because of this, it is not possible to have a function that
1871  * drains a single device's I/O queue.
1872  */
1873 void bdrv_drain_all(void)
1874 {
1875     /* Always run first iteration so any pending completion BHs run */
1876     bool busy = true;
1877     BlockDriverState *bs;
1878 
1879     while (busy) {
1880         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1881             bdrv_start_throttled_reqs(bs);
1882         }
1883 
1884         busy = bdrv_requests_pending_all();
1885         busy |= aio_poll(qemu_get_aio_context(), busy);
1886     }
1887 }
1888 
1889 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
1890  * and graph_bdrv_states lists.  Also, NUL-terminate device_name to
1891  * prevent a double remove. */
1892 void bdrv_make_anon(BlockDriverState *bs)
1893 {
1894     if (bs->device_name[0] != '\0') {
1895         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1896     }
1897     bs->device_name[0] = '\0';
1898     if (bs->node_name[0] != '\0') {
1899         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1900     }
1901     bs->node_name[0] = '\0';
1902 }
1903 
1904 static void bdrv_rebind(BlockDriverState *bs)
1905 {
1906     if (bs->drv && bs->drv->bdrv_rebind) {
1907         bs->drv->bdrv_rebind(bs);
1908     }
1909 }
1910 
1911 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1912                                      BlockDriverState *bs_src)
1913 {
1914     /* move some fields that need to stay attached to the device */
1915 
1916     /* dev info */
1917     bs_dest->dev_ops            = bs_src->dev_ops;
1918     bs_dest->dev_opaque         = bs_src->dev_opaque;
1919     bs_dest->dev                = bs_src->dev;
1920     bs_dest->guest_block_size   = bs_src->guest_block_size;
1921     bs_dest->copy_on_read       = bs_src->copy_on_read;
1922 
1923     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1924 
1925     /* i/o throttled req */
1926     memcpy(&bs_dest->throttle_state,
1927            &bs_src->throttle_state,
1928            sizeof(ThrottleState));
1929     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1930     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1931     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1932 
1933     /* r/w error */
1934     bs_dest->on_read_error      = bs_src->on_read_error;
1935     bs_dest->on_write_error     = bs_src->on_write_error;
1936 
1937     /* i/o status */
1938     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1939     bs_dest->iostatus           = bs_src->iostatus;
1940 
1941     /* dirty bitmap */
1942     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1943 
1944     /* reference count */
1945     bs_dest->refcnt             = bs_src->refcnt;
1946 
1947     /* job */
1948     bs_dest->in_use             = bs_src->in_use;
1949     bs_dest->job                = bs_src->job;
1950 
1951     /* keep the same entry in bdrv_states */
1952     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1953             bs_src->device_name);
1954     bs_dest->device_list = bs_src->device_list;
1955 }
1956 
1957 /*
1958  * Swap bs contents for two image chains while they are live,
1959  * while keeping required fields on the BlockDriverState that is
1960  * actually attached to a device.
1961  *
1962  * This will modify the BlockDriverState fields, and swap contents
1963  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1964  *
1965  * bs_new is required to be anonymous.
1966  *
1967  * This function does not create any image files.
1968  */
1969 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1970 {
1971     BlockDriverState tmp;
1972 
1973     /* The code needs to swap the node_name, but simply swapping node_list
1974      * won't work: first remove both nodes from the graph list, do the swap,
1975      * and then insert them back if needed.
1976      */
1977     if (bs_new->node_name[0] != '\0') {
1978         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1979     }
1980     if (bs_old->node_name[0] != '\0') {
1981         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1982     }
1983 
1984     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1985     assert(bs_new->device_name[0] == '\0');
1986     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1987     assert(bs_new->job == NULL);
1988     assert(bs_new->dev == NULL);
1989     assert(bs_new->in_use == 0);
1990     assert(bs_new->io_limits_enabled == false);
1991     assert(!throttle_have_timer(&bs_new->throttle_state));
1992 
1993     tmp = *bs_new;
1994     *bs_new = *bs_old;
1995     *bs_old = tmp;
1996 
1997     /* there are some fields that should not be swapped, move them back */
1998     bdrv_move_feature_fields(&tmp, bs_old);
1999     bdrv_move_feature_fields(bs_old, bs_new);
2000     bdrv_move_feature_fields(bs_new, &tmp);
2001 
2002     /* bs_new shouldn't be in bdrv_states even after the swap!  */
2003     assert(bs_new->device_name[0] == '\0');
2004 
2005     /* Check a few fields that should remain attached to the device */
2006     assert(bs_new->dev == NULL);
2007     assert(bs_new->job == NULL);
2008     assert(bs_new->in_use == 0);
2009     assert(bs_new->io_limits_enabled == false);
2010     assert(!throttle_have_timer(&bs_new->throttle_state));
2011 
2012     /* insert the nodes back into the graph node list if needed */
2013     if (bs_new->node_name[0] != '\0') {
2014         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2015     }
2016     if (bs_old->node_name[0] != '\0') {
2017         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2018     }
2019 
2020     bdrv_rebind(bs_new);
2021     bdrv_rebind(bs_old);
2022 }
2023 
2024 /*
2025  * Add new bs contents at the top of an image chain while the chain is
2026  * live, while keeping required fields on the top layer.
2027  *
2028  * This will modify the BlockDriverState fields, and swap contents
2029  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2030  *
2031  * bs_new is required to be anonymous.
2032  *
2033  * This function does not create any image files.
2034  */
2035 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2036 {
2037     bdrv_swap(bs_new, bs_top);
2038 
2039     /* After the swap, bs_new holds what used to be the top layer, so hook
2040      * it in as bs_top's new backing file. */
2041     bs_top->backing_hd = bs_new;
2042     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
2043     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
2044             bs_new->filename);
2045     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
2046             bs_new->drv ? bs_new->drv->format_name : "");
2047 }
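/*
 * Example (illustrative sketch): bdrv_append() is how a live external
 * snapshot can be installed.  A freshly opened, anonymous overlay is
 * pushed on top of the active layer; the device keeps using the same
 * BlockDriverState pointer throughout.  "overlay" is assumed to have been
 * created and opened with the active image as its backing file.
 */
#if 0
static void example_take_snapshot(BlockDriverState *active,
                                  BlockDriverState *overlay)
{
    /* After this call, 'active' is the new top layer and its backing_hd
     * points at what used to be the top of the chain. */
    bdrv_append(overlay, active);
}
#endif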
2048 
2049 static void bdrv_delete(BlockDriverState *bs)
2050 {
2051     assert(!bs->dev);
2052     assert(!bs->job);
2053     assert(!bs->in_use);
2054     assert(!bs->refcnt);
2055     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2056 
2057     bdrv_close(bs);
2058 
2059     /* remove from list, if necessary */
2060     bdrv_make_anon(bs);
2061 
2062     g_free(bs);
2063 }
2064 
2065 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2066 /* TODO change to DeviceState *dev when all users are qdevified */
2067 {
2068     if (bs->dev) {
2069         return -EBUSY;
2070     }
2071     bs->dev = dev;
2072     bdrv_iostatus_reset(bs);
2073     return 0;
2074 }
2075 
2076 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2077 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2078 {
2079     if (bdrv_attach_dev(bs, dev) < 0) {
2080         abort();
2081     }
2082 }
2083 
2084 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2085 /* TODO change to DeviceState *dev when all users are qdevified */
2086 {
2087     assert(bs->dev == dev);
2088     bs->dev = NULL;
2089     bs->dev_ops = NULL;
2090     bs->dev_opaque = NULL;
2091     bs->guest_block_size = 512;
2092 }
2093 
2094 /* TODO change to return DeviceState * when all users are qdevified */
2095 void *bdrv_get_attached_dev(BlockDriverState *bs)
2096 {
2097     return bs->dev;
2098 }
2099 
2100 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2101                       void *opaque)
2102 {
2103     bs->dev_ops = ops;
2104     bs->dev_opaque = opaque;
2105 }
2106 
2107 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2108                                enum MonitorEvent ev,
2109                                BlockErrorAction action, bool is_read)
2110 {
2111     QObject *data;
2112     const char *action_str;
2113 
2114     switch (action) {
2115     case BDRV_ACTION_REPORT:
2116         action_str = "report";
2117         break;
2118     case BDRV_ACTION_IGNORE:
2119         action_str = "ignore";
2120         break;
2121     case BDRV_ACTION_STOP:
2122         action_str = "stop";
2123         break;
2124     default:
2125         abort();
2126     }
2127 
2128     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2129                               bdrv->device_name,
2130                               action_str,
2131                               is_read ? "read" : "write");
2132     monitor_protocol_event(ev, data);
2133 
2134     qobject_decref(data);
2135 }
2136 
2137 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2138 {
2139     QObject *data;
2140 
2141     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2142                               bdrv_get_device_name(bs), ejected);
2143     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2144 
2145     qobject_decref(data);
2146 }
2147 
2148 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2149 {
2150     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2151         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2152         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2153         if (tray_was_closed) {
2154             /* tray open */
2155             bdrv_emit_qmp_eject_event(bs, true);
2156         }
2157         if (load) {
2158             /* tray close */
2159             bdrv_emit_qmp_eject_event(bs, false);
2160         }
2161     }
2162 }
2163 
2164 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2165 {
2166     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2167 }
2168 
2169 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2170 {
2171     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2172         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2173     }
2174 }
2175 
2176 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2177 {
2178     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2179         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2180     }
2181     return false;
2182 }
2183 
2184 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2185 {
2186     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2187         bs->dev_ops->resize_cb(bs->dev_opaque);
2188     }
2189 }
2190 
2191 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2192 {
2193     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2194         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2195     }
2196     return false;
2197 }
2198 
2199 /*
2200  * Run consistency checks on an image
2201  *
2202  * Returns 0 if the check could be completed (it doesn't mean that the image is
2203  * free of errors) or -errno when an internal error occurred. The results of the
2204  * check are stored in res.
2205  */
2206 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2207 {
2208     if (bs->drv->bdrv_check == NULL) {
2209         return -ENOTSUP;
2210     }
2211 
2212     memset(res, 0, sizeof(*res));
2213     return bs->drv->bdrv_check(bs, res, fix);
2214 }
2215 
2216 #define COMMIT_BUF_SECTORS 2048
2217 
2218 /* commit COW file into the raw image */
2219 int bdrv_commit(BlockDriverState *bs)
2220 {
2221     BlockDriver *drv = bs->drv;
2222     int64_t sector, total_sectors, length, backing_length;
2223     int n, ro, open_flags;
2224     int ret = 0;
2225     uint8_t *buf = NULL;
2226     char filename[PATH_MAX];
2227 
2228     if (!drv)
2229         return -ENOMEDIUM;
2230 
2231     if (!bs->backing_hd) {
2232         return -ENOTSUP;
2233     }
2234 
2235     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2236         return -EBUSY;
2237     }
2238 
2239     ro = bs->backing_hd->read_only;
2240     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2241     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2242     open_flags = bs->backing_hd->open_flags;
2243 
2244     if (ro) {
2245         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2246             return -EACCES;
2247         }
2248     }
2249 
2250     length = bdrv_getlength(bs);
2251     if (length < 0) {
2252         ret = length;
2253         goto ro_cleanup;
2254     }
2255 
2256     backing_length = bdrv_getlength(bs->backing_hd);
2257     if (backing_length < 0) {
2258         ret = backing_length;
2259         goto ro_cleanup;
2260     }
2261 
2262     /* If our top snapshot is larger than the backing file image,
2263      * grow the backing file image if possible.  If not possible,
2264      * we must return an error */
2265     if (length > backing_length) {
2266         ret = bdrv_truncate(bs->backing_hd, length);
2267         if (ret < 0) {
2268             goto ro_cleanup;
2269         }
2270     }
2271 
2272     total_sectors = length >> BDRV_SECTOR_BITS;
2273     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2274 
2275     for (sector = 0; sector < total_sectors; sector += n) {
2276         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2277         if (ret < 0) {
2278             goto ro_cleanup;
2279         }
2280         if (ret) {
2281             ret = bdrv_read(bs, sector, buf, n);
2282             if (ret < 0) {
2283                 goto ro_cleanup;
2284             }
2285 
2286             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2287             if (ret < 0) {
2288                 goto ro_cleanup;
2289             }
2290         }
2291     }
2292 
2293     if (drv->bdrv_make_empty) {
2294         ret = drv->bdrv_make_empty(bs);
2295         if (ret < 0) {
2296             goto ro_cleanup;
2297         }
2298         bdrv_flush(bs);
2299     }
2300 
2301     /*
2302      * Make sure all data we wrote to the backing device is actually
2303      * stable on disk.
2304      */
2305     if (bs->backing_hd) {
2306         bdrv_flush(bs->backing_hd);
2307     }
2308 
2309     ret = 0;
2310 ro_cleanup:
2311     g_free(buf);
2312 
2313     if (ro) {
2314         /* ignoring error return here */
2315         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2316     }
2317 
2318     return ret;
2319 }
2320 
2321 int bdrv_commit_all(void)
2322 {
2323     BlockDriverState *bs;
2324 
2325     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2326         if (bs->drv && bs->backing_hd) {
2327             int ret = bdrv_commit(bs);
2328             if (ret < 0) {
2329                 return ret;
2330             }
2331         }
2332     }
2333     return 0;
2334 }
2335 
2336 /**
2337  * Remove an active request from the tracked requests list
2338  *
2339  * This function should be called when a tracked request is completing.
2340  */
2341 static void tracked_request_end(BdrvTrackedRequest *req)
2342 {
2343     if (req->serialising) {
2344         req->bs->serialising_in_flight--;
2345     }
2346 
2347     QLIST_REMOVE(req, list);
2348     qemu_co_queue_restart_all(&req->wait_queue);
2349 }
2350 
2351 /**
2352  * Add an active request to the tracked requests list
2353  */
2354 static void tracked_request_begin(BdrvTrackedRequest *req,
2355                                   BlockDriverState *bs,
2356                                   int64_t offset,
2357                                   unsigned int bytes, bool is_write)
2358 {
2359     *req = (BdrvTrackedRequest){
2360         .bs = bs,
2361         .offset         = offset,
2362         .bytes          = bytes,
2363         .is_write       = is_write,
2364         .co             = qemu_coroutine_self(),
2365         .serialising    = false,
2366         .overlap_offset = offset,
2367         .overlap_bytes  = bytes,
2368     };
2369 
2370     qemu_co_queue_init(&req->wait_queue);
2371 
2372     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2373 }
2374 
2375 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2376 {
2377     int64_t overlap_offset = req->offset & ~(align - 1);
2378     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2379                                - overlap_offset;
2380 
2381     if (!req->serialising) {
2382         req->bs->serialising_in_flight++;
2383         req->serialising = true;
2384     }
2385 
2386     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2387     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2388 }
2389 
2390 /**
2391  * Round a region to cluster boundaries
2392  */
2393 void bdrv_round_to_clusters(BlockDriverState *bs,
2394                             int64_t sector_num, int nb_sectors,
2395                             int64_t *cluster_sector_num,
2396                             int *cluster_nb_sectors)
2397 {
2398     BlockDriverInfo bdi;
2399 
2400     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2401         *cluster_sector_num = sector_num;
2402         *cluster_nb_sectors = nb_sectors;
2403     } else {
2404         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2405         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2406         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2407                                             nb_sectors, c);
2408     }
2409 }
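/*
 * Example (illustrative, assuming a 64 KiB cluster size, i.e. 128 sectors
 * per cluster): a 4-sector request at sector 130 is widened to the whole
 * 128-sector cluster range starting at sector 128.
 */
#if 0
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    bdrv_round_to_clusters(bs, 130, 4, &cluster_sector_num,
                           &cluster_nb_sectors);
    /* cluster_sector_num == 128, cluster_nb_sectors == 128 */
#endif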
2410 
2411 static int bdrv_get_cluster_size(BlockDriverState *bs)
2412 {
2413     BlockDriverInfo bdi;
2414     int ret;
2415 
2416     ret = bdrv_get_info(bs, &bdi);
2417     if (ret < 0 || bdi.cluster_size == 0) {
2418         return bs->request_alignment;
2419     } else {
2420         return bdi.cluster_size;
2421     }
2422 }
2423 
2424 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2425                                      int64_t offset, unsigned int bytes)
2426 {
2427     /*        aaaa   bbbb */
2428     if (offset >= req->overlap_offset + req->overlap_bytes) {
2429         return false;
2430     }
2431     /* bbbb   aaaa        */
2432     if (req->overlap_offset >= offset + bytes) {
2433         return false;
2434     }
2435     return true;
2436 }
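/*
 * The checks above treat [overlap_offset, overlap_offset + overlap_bytes)
 * as a half-open interval: two requests overlap exactly when neither one
 * starts at or after the other's end.  For example, byte ranges [0, 4096)
 * and [4096, 8192) do not overlap, while [0, 4096) and [4095, 8192) do.
 */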
2437 
2438 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2439 {
2440     BlockDriverState *bs = self->bs;
2441     BdrvTrackedRequest *req;
2442     bool retry;
2443     bool waited = false;
2444 
2445     if (!bs->serialising_in_flight) {
2446         return false;
2447     }
2448 
2449     do {
2450         retry = false;
2451         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2452             if (req == self || (!req->serialising && !self->serialising)) {
2453                 continue;
2454             }
2455             if (tracked_request_overlaps(req, self->overlap_offset,
2456                                          self->overlap_bytes))
2457             {
2458                 /* Hitting this means there was a reentrant request, for
2459                  * example, a block driver issuing nested requests.  This must
2460                  * never happen since it means deadlock.
2461                  */
2462                 assert(qemu_coroutine_self() != req->co);
2463 
2464                 /* If the request is already (indirectly) waiting for us, or
2465                  * will wait for us as soon as it wakes up, then just go on
2466                  * (instead of producing a deadlock in the former case). */
2467                 if (!req->waiting_for) {
2468                     self->waiting_for = req;
2469                     qemu_co_queue_wait(&req->wait_queue);
2470                     self->waiting_for = NULL;
2471                     retry = true;
2472                     waited = true;
2473                     break;
2474                 }
2475             }
2476         }
2477     } while (retry);
2478 
2479     return waited;
2480 }
2481 
2482 /*
2483  * Return values:
2484  * 0        - success
2485  * -EINVAL  - backing format specified, but no file
2486  * -ENOSPC  - can't update the backing file because no space is left in the
2487  *            image file header
2488  * -ENOTSUP - format driver doesn't support changing the backing file
2489  */
2490 int bdrv_change_backing_file(BlockDriverState *bs,
2491     const char *backing_file, const char *backing_fmt)
2492 {
2493     BlockDriver *drv = bs->drv;
2494     int ret;
2495 
2496     /* Backing file format doesn't make sense without a backing file */
2497     if (backing_fmt && !backing_file) {
2498         return -EINVAL;
2499     }
2500 
2501     if (drv->bdrv_change_backing_file != NULL) {
2502         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2503     } else {
2504         ret = -ENOTSUP;
2505     }
2506 
2507     if (ret == 0) {
2508         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2509         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2510     }
2511     return ret;
2512 }
2513 
2514 /*
2515  * Finds the image layer in the chain that has 'bs' as its backing file.
2516  *
2517  * active is the current topmost image.
2518  *
2519  * Returns NULL if bs is not found in active's image chain,
2520  * or if active == bs.
2521  */
2522 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2523                                     BlockDriverState *bs)
2524 {
2525     BlockDriverState *overlay = NULL;
2526     BlockDriverState *intermediate;
2527 
2528     assert(active != NULL);
2529     assert(bs != NULL);
2530 
2531     /* if bs is the same as active, then by definition it has no overlay
2532      */
2533     if (active == bs) {
2534         return NULL;
2535     }
2536 
2537     intermediate = active;
2538     while (intermediate->backing_hd) {
2539         if (intermediate->backing_hd == bs) {
2540             overlay = intermediate;
2541             break;
2542         }
2543         intermediate = intermediate->backing_hd;
2544     }
2545 
2546     return overlay;
2547 }
2548 
2549 typedef struct BlkIntermediateStates {
2550     BlockDriverState *bs;
2551     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2552 } BlkIntermediateStates;
2553 
2554 
2555 /*
2556  * Drops images above 'base' up to and including 'top', and sets the image
2557  * above 'top' to have base as its backing file.
2558  *
2559  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2560  * information in 'bs' can be properly updated.
2561  *
2562  * E.g., this will convert the following chain:
2563  * bottom <- base <- intermediate <- top <- active
2564  *
2565  * to
2566  *
2567  * bottom <- base <- active
2568  *
2569  * It is allowed for bottom==base, in which case it converts:
2570  *
2571  * base <- intermediate <- top <- active
2572  *
2573  * to
2574  *
2575  * base <- active
2576  *
2577  * Error conditions:
2578  *  if active == top, that is considered an error
2579  *
2580  */
2581 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2582                            BlockDriverState *base)
2583 {
2584     BlockDriverState *intermediate;
2585     BlockDriverState *base_bs = NULL;
2586     BlockDriverState *new_top_bs = NULL;
2587     BlkIntermediateStates *intermediate_state, *next;
2588     int ret = -EIO;
2589 
2590     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2591     QSIMPLEQ_INIT(&states_to_delete);
2592 
2593     if (!top->drv || !base->drv) {
2594         goto exit;
2595     }
2596 
2597     new_top_bs = bdrv_find_overlay(active, top);
2598 
2599     if (new_top_bs == NULL) {
2600         /* we could not find the image above 'top', this is an error */
2601         goto exit;
2602     }
2603 
2604     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2605      * to do, no intermediate images */
2606     if (new_top_bs->backing_hd == base) {
2607         ret = 0;
2608         goto exit;
2609     }
2610 
2611     intermediate = top;
2612 
2613     /* now we will go down through the list, and add each BDS we find
2614      * into our deletion queue, until we hit the 'base'
2615      */
2616     while (intermediate) {
2617         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2618         intermediate_state->bs = intermediate;
2619         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2620 
2621         if (intermediate->backing_hd == base) {
2622             base_bs = intermediate->backing_hd;
2623             break;
2624         }
2625         intermediate = intermediate->backing_hd;
2626     }
2627     if (base_bs == NULL) {
2628         /* something went wrong, we did not end at the base. safely
2629          * unravel everything, and exit with error */
2630         goto exit;
2631     }
2632 
2633     /* success - we can delete the intermediate states, and link top->base */
2634     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2635                                    base_bs->drv ? base_bs->drv->format_name : "");
2636     if (ret) {
2637         goto exit;
2638     }
2639     new_top_bs->backing_hd = base_bs;
2640 
2641     bdrv_refresh_limits(new_top_bs);
2642 
2643     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2644         /* so that bdrv_close() does not recursively close the chain */
2645         intermediate_state->bs->backing_hd = NULL;
2646         bdrv_unref(intermediate_state->bs);
2647     }
2648     ret = 0;
2649 
2650 exit:
2651     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2652         g_free(intermediate_state);
2653     }
2654     return ret;
2655 }
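/*
 * Example (illustrative): given the chain documented above, this drops
 * 'intermediate' and 'top' in one call; 'active', 'top' and 'base' are
 * assumed to have been looked up by the caller.
 */
#if 0
    /* before: bottom <- base <- intermediate <- top <- active */
    if (bdrv_drop_intermediate(active, top, base) < 0) {
        /* on failure the chain is left unchanged */
    }
    /* after:  bottom <- base <- active */
#endif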
2656 
2657 
2658 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2659                                    size_t size)
2660 {
2661     int64_t len;
2662 
2663     if (size > INT_MAX) {
2664         return -EIO;
2665     }
2666 
2667     if (!bdrv_is_inserted(bs))
2668         return -ENOMEDIUM;
2669 
2670     if (bs->growable)
2671         return 0;
2672 
2673     len = bdrv_getlength(bs);
2674 
2675     if (offset < 0)
2676         return -EIO;
2677 
2678     if ((offset > len) || (len - offset < size))
2679         return -EIO;
2680 
2681     return 0;
2682 }
2683 
2684 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2685                               int nb_sectors)
2686 {
2687     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2688         return -EIO;
2689     }
2690 
2691     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2692                                    nb_sectors * BDRV_SECTOR_SIZE);
2693 }
2694 
2695 typedef struct RwCo {
2696     BlockDriverState *bs;
2697     int64_t offset;
2698     QEMUIOVector *qiov;
2699     bool is_write;
2700     int ret;
2701     BdrvRequestFlags flags;
2702 } RwCo;
2703 
2704 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2705 {
2706     RwCo *rwco = opaque;
2707 
2708     if (!rwco->is_write) {
2709         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2710                                       rwco->qiov->size, rwco->qiov,
2711                                       rwco->flags);
2712     } else {
2713         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2714                                        rwco->qiov->size, rwco->qiov,
2715                                        rwco->flags);
2716     }
2717 }
2718 
2719 /*
2720  * Process a vectored synchronous request using coroutines
2721  */
2722 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2723                         QEMUIOVector *qiov, bool is_write,
2724                         BdrvRequestFlags flags)
2725 {
2726     Coroutine *co;
2727     RwCo rwco = {
2728         .bs = bs,
2729         .offset = offset,
2730         .qiov = qiov,
2731         .is_write = is_write,
2732         .ret = NOT_DONE,
2733         .flags = flags,
2734     };
2735 
2736     /**
2737      * In a synchronous call context the vcpu is blocked, so the throttling
2738      * timer will never fire and a throttled request would hang forever;
2739      * therefore I/O throttling has to be disabled here if it is enabled.
2740      */
2741     if (bs->io_limits_enabled) {
2742         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2743                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2744         bdrv_io_limits_disable(bs);
2745     }
2746 
2747     if (qemu_in_coroutine()) {
2748         /* Fast-path if already in coroutine context */
2749         bdrv_rw_co_entry(&rwco);
2750     } else {
2751         co = qemu_coroutine_create(bdrv_rw_co_entry);
2752         qemu_coroutine_enter(co, &rwco);
2753         while (rwco.ret == NOT_DONE) {
2754             qemu_aio_wait();
2755         }
2756     }
2757     return rwco.ret;
2758 }
2759 
2760 /*
2761  * Process a synchronous request using coroutines
2762  */
2763 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2764                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2765 {
2766     QEMUIOVector qiov;
2767     struct iovec iov = {
2768         .iov_base = (void *)buf,
2769         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2770     };
2771 
2772     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2773         return -EINVAL;
2774     }
2775 
2776     qemu_iovec_init_external(&qiov, &iov, 1);
2777     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2778                         &qiov, is_write, flags);
2779 }
2780 
2781 /* return < 0 if error. See bdrv_write() for the return codes */
2782 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2783               uint8_t *buf, int nb_sectors)
2784 {
2785     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2786 }
2787 
2788 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2789 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2790                           uint8_t *buf, int nb_sectors)
2791 {
2792     bool enabled;
2793     int ret;
2794 
2795     enabled = bs->io_limits_enabled;
2796     bs->io_limits_enabled = false;
2797     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2798     bs->io_limits_enabled = enabled;
2799     return ret;
2800 }
2801 
2802 /* Return < 0 if error. Important errors are:
2803   -EIO         generic I/O error (may happen for all errors)
2804   -ENOMEDIUM   No media inserted.
2805   -EINVAL      Invalid sector number or nb_sectors
2806   -EACCES      Trying to write a read-only device
2807 */
2808 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2809                const uint8_t *buf, int nb_sectors)
2810 {
2811     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2812 }
2813 
2814 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2815                       int nb_sectors, BdrvRequestFlags flags)
2816 {
2817     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2818                       BDRV_REQ_ZERO_WRITE | flags);
2819 }
2820 
2821 /*
2822  * Completely zero out a block device with the help of bdrv_write_zeroes.
2823  * The operation is sped up by checking the block status and only writing
2824  * zeroes to the device if they currently do not return zeroes. Optional
2825  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2826  *
2827  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2828  */
2829 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2830 {
2831     int64_t target_size;
2832     int64_t ret, nb_sectors, sector_num = 0;
2833     int n;
2834 
2835     target_size = bdrv_getlength(bs);
2836     if (target_size < 0) {
2837         return target_size;
2838     }
2839     target_size /= BDRV_SECTOR_SIZE;
2840 
2841     for (;;) {
2842         nb_sectors = target_size - sector_num;
2843         if (nb_sectors <= 0) {
2844             return 0;
2845         }
2846         if (nb_sectors > INT_MAX) {
2847             nb_sectors = INT_MAX;
2848         }
2849         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2850         if (ret < 0) {
2851             error_report("error getting block status at sector %" PRId64 ": %s",
2852                          sector_num, strerror(-ret));
2853             return ret;
2854         }
2855         if (ret & BDRV_BLOCK_ZERO) {
2856             sector_num += n;
2857             continue;
2858         }
2859         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2860         if (ret < 0) {
2861             error_report("error writing zeroes at sector %" PRId64 ": %s",
2862                          sector_num, strerror(-ret));
2863             return ret;
2864         }
2865         sector_num += n;
2866     }
2867 }
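/*
 * Example (illustrative): zero a whole device while letting the driver
 * discard/unmap ranges instead of writing literal zeroes where possible.
 */
#if 0
    int ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
    if (ret < 0) {
        /* see bdrv_write() for the meaning of the error codes */
    }
#endif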
2868 
2869 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2870 {
2871     QEMUIOVector qiov;
2872     struct iovec iov = {
2873         .iov_base = (void *)buf,
2874         .iov_len = bytes,
2875     };
2876     int ret;
2877 
2878     if (bytes < 0) {
2879         return -EINVAL;
2880     }
2881 
2882     qemu_iovec_init_external(&qiov, &iov, 1);
2883     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2884     if (ret < 0) {
2885         return ret;
2886     }
2887 
2888     return bytes;
2889 }
2890 
2891 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2892 {
2893     int ret;
2894 
2895     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2896     if (ret < 0) {
2897         return ret;
2898     }
2899 
2900     return qiov->size;
2901 }
2902 
2903 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2904                 const void *buf, int bytes)
2905 {
2906     QEMUIOVector qiov;
2907     struct iovec iov = {
2908         .iov_base   = (void *) buf,
2909         .iov_len    = bytes,
2910     };
2911 
2912     if (bytes < 0) {
2913         return -EINVAL;
2914     }
2915 
2916     qemu_iovec_init_external(&qiov, &iov, 1);
2917     return bdrv_pwritev(bs, offset, &qiov);
2918 }
2919 
2920 /*
2921  * Writes to the file and ensures that no writes are reordered across this
2922  * request (acts as a barrier)
2923  *
2924  * Returns 0 on success, -errno in error cases.
2925  */
2926 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2927     const void *buf, int count)
2928 {
2929     int ret;
2930 
2931     ret = bdrv_pwrite(bs, offset, buf, count);
2932     if (ret < 0) {
2933         return ret;
2934     }
2935 
2936     /* No flush needed for cache modes that already do it */
2937     if (bs->enable_write_cache) {
2938         bdrv_flush(bs);
2939     }
2940 
2941     return 0;
2942 }
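/*
 * Example (illustrative sketch): format drivers rely on this barrier
 * semantic when on-disk ordering matters, e.g. making sure a new table is
 * stable before the reference to it is published.  All offsets, buffers
 * and sizes below are hypothetical.
 */
#if 0
    /* 1. Write the new table and wait until it is stable on disk... */
    ret = bdrv_pwrite_sync(bs->file, table_offset, table, table_size);
    if (ret < 0) {
        return ret;
    }
    /* 2. ...only then publish the reference to it. */
    ret = bdrv_pwrite_sync(bs->file, header_offset, &new_table_ref,
                           sizeof(new_table_ref));
#endif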
2943 
2944 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2945         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2946 {
2947     /* Perform I/O through a temporary buffer so that users who scribble over
2948      * their read buffer while the operation is in progress do not end up
2949      * modifying the image file.  This is critical for zero-copy guest I/O
2950      * where anything might happen inside guest memory.
2951      */
2952     void *bounce_buffer;
2953 
2954     BlockDriver *drv = bs->drv;
2955     struct iovec iov;
2956     QEMUIOVector bounce_qiov;
2957     int64_t cluster_sector_num;
2958     int cluster_nb_sectors;
2959     size_t skip_bytes;
2960     int ret;
2961 
2962     /* Cover entire cluster so no additional backing file I/O is required when
2963      * allocating the cluster in the image file.
2964      */
2965     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2966                            &cluster_sector_num, &cluster_nb_sectors);
2967 
2968     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2969                                    cluster_sector_num, cluster_nb_sectors);
2970 
2971     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2972     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2973     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2974 
2975     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2976                              &bounce_qiov);
2977     if (ret < 0) {
2978         goto err;
2979     }
2980 
2981     if (drv->bdrv_co_write_zeroes &&
2982         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2983         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2984                                       cluster_nb_sectors, 0);
2985     } else {
2986         /* This does not change the data on the disk, so it is not necessary
2987          * to flush even in cache=writethrough mode.
2988          */
2989         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2990                                   &bounce_qiov);
2991     }
2992 
2993     if (ret < 0) {
2994         /* It might be okay to ignore write errors for guest requests.  If this
2995          * is a deliberate copy-on-read then we don't want to ignore the error.
2996          * Simply report it in all cases.
2997          */
2998         goto err;
2999     }
3000 
3001     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3002     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3003                         nb_sectors * BDRV_SECTOR_SIZE);
3004 
3005 err:
3006     qemu_vfree(bounce_buffer);
3007     return ret;
3008 }
3009 
3010 /*
3011  * Forwards an already correctly aligned request to the BlockDriver. This
3012  * handles copy on read and zeroing after EOF; any other features must be
3013  * implemented by the caller.
3014  */
3015 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3016     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3017     int64_t align, QEMUIOVector *qiov, int flags)
3018 {
3019     BlockDriver *drv = bs->drv;
3020     int ret;
3021 
3022     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3023     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3024 
3025     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3026     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3027 
3028     /* Handle Copy on Read and associated serialisation */
3029     if (flags & BDRV_REQ_COPY_ON_READ) {
3030         /* If we touch the same cluster it counts as an overlap.  This
3031          * guarantees that allocating writes will be serialized and not race
3032          * with each other for the same cluster.  For example, in copy-on-read
3033          * it ensures that the CoR read and write operations are atomic and
3034          * guest writes cannot interleave between them. */
3035         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3036     }
3037 
3038     wait_serialising_requests(req);
3039 
3040     if (flags & BDRV_REQ_COPY_ON_READ) {
3041         int pnum;
3042 
3043         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3044         if (ret < 0) {
3045             goto out;
3046         }
3047 
3048         if (!ret || pnum != nb_sectors) {
3049             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3050             goto out;
3051         }
3052     }
3053 
3054     /* Forward the request to the BlockDriver */
3055     if (!(bs->zero_beyond_eof && bs->growable)) {
3056         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3057     } else {
3058         /* Read zeroes after EOF of growable BDSes */
3059         int64_t len, total_sectors, max_nb_sectors;
3060 
3061         len = bdrv_getlength(bs);
3062         if (len < 0) {
3063             ret = len;
3064             goto out;
3065         }
3066 
3067         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
3068         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3069                                   align >> BDRV_SECTOR_BITS);
3070         if (max_nb_sectors > 0) {
3071             ret = drv->bdrv_co_readv(bs, sector_num,
3072                                      MIN(nb_sectors, max_nb_sectors), qiov);
3073         } else {
3074             ret = 0;
3075         }
3076 
3077         /* Reading beyond end of file is supposed to produce zeroes */
3078         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3079             uint64_t offset = MAX(0, total_sectors - sector_num);
3080             uint64_t bytes = (sector_num + nb_sectors - offset) *
3081                               BDRV_SECTOR_SIZE;
3082             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3083         }
3084     }
3085 
3086 out:
3087     return ret;
3088 }
3089 
3090 /*
3091  * Handle a read request in coroutine context
3092  */
3093 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3094     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3095     BdrvRequestFlags flags)
3096 {
3097     BlockDriver *drv = bs->drv;
3098     BdrvTrackedRequest req;
3099 
3100     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3101     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3102     uint8_t *head_buf = NULL;
3103     uint8_t *tail_buf = NULL;
3104     QEMUIOVector local_qiov;
3105     bool use_local_qiov = false;
3106     int ret;
3107 
3108     if (!drv) {
3109         return -ENOMEDIUM;
3110     }
3111     if (bdrv_check_byte_request(bs, offset, bytes)) {
3112         return -EIO;
3113     }
3114 
3115     if (bs->copy_on_read) {
3116         flags |= BDRV_REQ_COPY_ON_READ;
3117     }
3118 
3119     /* throttling disk I/O */
3120     if (bs->io_limits_enabled) {
3121         bdrv_io_limits_intercept(bs, bytes, false);
3122     }
3123 
3124     /* Align read if necessary by padding qiov */
3125     if (offset & (align - 1)) {
3126         head_buf = qemu_blockalign(bs, align);
3127         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3128         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3129         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3130         use_local_qiov = true;
3131 
3132         bytes += offset & (align - 1);
3133         offset = offset & ~(align - 1);
3134     }
3135 
3136     if ((offset + bytes) & (align - 1)) {
3137         if (!use_local_qiov) {
3138             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3139             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3140             use_local_qiov = true;
3141         }
3142         tail_buf = qemu_blockalign(bs, align);
3143         qemu_iovec_add(&local_qiov, tail_buf,
3144                        align - ((offset + bytes) & (align - 1)));
3145 
3146         bytes = ROUND_UP(bytes, align);
3147     }
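    /* For example, with align = 4096, a 512-byte read at offset 4090 grows
     * to the 8192-byte range [0, 8192): 4090 head-pad bytes before the
     * caller's data and 3590 tail-pad bytes after it, all read into scratch
     * buffers that are never copied back into the caller's qiov. */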
3148 
3149     tracked_request_begin(&req, bs, offset, bytes, false);
3150     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3151                               use_local_qiov ? &local_qiov : qiov,
3152                               flags);
3153     tracked_request_end(&req);
3154 
3155     if (use_local_qiov) {
3156         qemu_iovec_destroy(&local_qiov);
3157         qemu_vfree(head_buf);
3158         qemu_vfree(tail_buf);
3159     }
3160 
3161     return ret;
3162 }
3163 
3164 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3165     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3166     BdrvRequestFlags flags)
3167 {
3168     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3169         return -EINVAL;
3170     }
3171 
3172     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3173                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3174 }
3175 
3176 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3177     int nb_sectors, QEMUIOVector *qiov)
3178 {
3179     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3180 
3181     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3182 }
3183 
3184 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3185     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3186 {
3187     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3188 
3189     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3190                             BDRV_REQ_COPY_ON_READ);
3191 }
3192 
3193 /* If no limit is specified in the BlockLimits, use a default
3194  * of 32768 512-byte sectors (16 MiB) per request.
3195  */
3196 #define MAX_WRITE_ZEROES_DEFAULT 32768
3197 
3198 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3199     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3200 {
3201     BlockDriver *drv = bs->drv;
3202     QEMUIOVector qiov;
3203     struct iovec iov = {0};
3204     int ret = 0;
3205 
3206     int max_write_zeroes = bs->bl.max_write_zeroes ?
3207                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3208 
3209     while (nb_sectors > 0 && !ret) {
3210         int num = nb_sectors;
3211 
3212         /* Align request.  Block drivers can expect the "bulk" of the request
3213          * to be aligned.
3214          */
3215         if (bs->bl.write_zeroes_alignment
3216             && num > bs->bl.write_zeroes_alignment) {
3217             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3218                 /* Make a small request up to the first aligned sector.  */
3219                 num = bs->bl.write_zeroes_alignment;
3220                 num -= sector_num % bs->bl.write_zeroes_alignment;
3221             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3222                 /* Shorten the request to the last aligned sector.  num cannot
3223                  * underflow because num > bs->bl.write_zeroes_alignment.
3224                  */
3225                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3226             }
3227         }
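        /* For example, with write_zeroes_alignment = 8, a request for
         * sectors [5, 105) is issued as [5, 8) (unaligned head), [8, 104)
         * (aligned bulk), and [104, 105) (tail) on successive loop
         * iterations. */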
3228 
3229         /* limit request size */
3230         if (num > max_write_zeroes) {
3231             num = max_write_zeroes;
3232         }
3233 
3234         ret = -ENOTSUP;
3235         /* First try the efficient write zeroes operation */
3236         if (drv->bdrv_co_write_zeroes) {
3237             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3238         }
3239 
3240         if (ret == -ENOTSUP) {
3241             /* Fall back to bounce buffer if write zeroes is unsupported */
3242             iov.iov_len = num * BDRV_SECTOR_SIZE;
3243             if (iov.iov_base == NULL) {
3244                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3245                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3246             }
3247             qemu_iovec_init_external(&qiov, &iov, 1);
3248 
3249             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3250 
3251             /* Keep the bounce buffer around if it is big enough for
3252              * all future requests.
3253              */
3254             if (num < max_write_zeroes) {
3255                 qemu_vfree(iov.iov_base);
3256                 iov.iov_base = NULL;
3257             }
3258         }
3259 
3260         sector_num += num;
3261         nb_sectors -= num;
3262     }
3263 
3264     qemu_vfree(iov.iov_base);
3265     return ret;
3266 }
3267 
3268 /*
3269  * Forwards an already correctly aligned write request to the BlockDriver.
3270  */
3271 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3272     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3273     QEMUIOVector *qiov, int flags)
3274 {
3275     BlockDriver *drv = bs->drv;
3276     bool waited;
3277     int ret;
3278 
3279     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3280     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3281 
3282     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3283     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3284 
3285     waited = wait_serialising_requests(req);
3286     assert(!waited || !req->serialising);
3287     assert(req->overlap_offset <= offset);
3288     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3289 
3290     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3291 
3292     if (ret < 0) {
3293         /* Do nothing, write notifier decided to fail this request */
3294     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3295         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3296         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3297     } else {
3298         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3299         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3300     }
3301     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3302 
3303     if (ret == 0 && !bs->enable_write_cache) {
3304         ret = bdrv_co_flush(bs);
3305     }
3306 
3307     bdrv_set_dirty(bs, sector_num, nb_sectors);
3308 
3309     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3310         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3311     }
3312     if (bs->growable && ret >= 0) {
3313         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3314     }
3315 
3316     return ret;
3317 }
3318 
3319 /*
3320  * Handle a write request in coroutine context
3321  */
3322 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3323     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3324     BdrvRequestFlags flags)
3325 {
3326     BdrvTrackedRequest req;
3327     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3328     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3329     uint8_t *head_buf = NULL;
3330     uint8_t *tail_buf = NULL;
3331     QEMUIOVector local_qiov;
3332     bool use_local_qiov = false;
3333     int ret;
3334 
3335     if (!bs->drv) {
3336         return -ENOMEDIUM;
3337     }
3338     if (bs->read_only) {
3339         return -EACCES;
3340     }
3341     if (bdrv_check_byte_request(bs, offset, bytes)) {
3342         return -EIO;
3343     }
3344 
3345     /* throttling disk I/O */
3346     if (bs->io_limits_enabled) {
3347         bdrv_io_limits_intercept(bs, bytes, true);
3348     }
3349 
3350     /*
3351      * Align write if necessary by performing a read-modify-write cycle.
3352      * Pad qiov with the read parts and be sure to have a tracked request not
3353      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3354      */
3355     tracked_request_begin(&req, bs, offset, bytes, true);
3356 
3357     if (offset & (align - 1)) {
3358         QEMUIOVector head_qiov;
3359         struct iovec head_iov;
3360 
3361         mark_request_serialising(&req, align);
3362         wait_serialising_requests(&req);
3363 
3364         head_buf = qemu_blockalign(bs, align);
3365         head_iov = (struct iovec) {
3366             .iov_base   = head_buf,
3367             .iov_len    = align,
3368         };
3369         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3370 
3371         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3372         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3373                                   align, &head_qiov, 0);
3374         if (ret < 0) {
3375             goto fail;
3376         }
3377         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3378 
3379         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3380         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3381         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3382         use_local_qiov = true;
3383 
3384         bytes += offset & (align - 1);
3385         offset = offset & ~(align - 1);
3386     }
3387 
3388     if ((offset + bytes) & (align - 1)) {
3389         QEMUIOVector tail_qiov;
3390         struct iovec tail_iov;
3391         size_t tail_bytes;
3392         bool waited;
3393 
3394         mark_request_serialising(&req, align);
3395         waited = wait_serialising_requests(&req);
3396         assert(!waited || !use_local_qiov);
3397 
3398         tail_buf = qemu_blockalign(bs, align);
3399         tail_iov = (struct iovec) {
3400             .iov_base   = tail_buf,
3401             .iov_len    = align,
3402         };
3403         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3404 
3405         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3406         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3407                                   align, &tail_qiov, 0);
3408         if (ret < 0) {
3409             goto fail;
3410         }
3411         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3412 
3413         if (!use_local_qiov) {
3414             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3415             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3416             use_local_qiov = true;
3417         }
3418 
3419         tail_bytes = (offset + bytes) & (align - 1);
3420         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3421 
3422         bytes = ROUND_UP(bytes, align);
3423     }
3424 
3425     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3426                                use_local_qiov ? &local_qiov : qiov,
3427                                flags);
3428 
3429 fail:
3430     tracked_request_end(&req);
3431 
3432     if (use_local_qiov) {
3433         qemu_iovec_destroy(&local_qiov);
3434     }
3435     qemu_vfree(head_buf);
3436     qemu_vfree(tail_buf);
3437 
3438     return ret;
3439 }
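/*
 * Worked example (illustrative, not part of the original source), assuming
 * align = 512: a write of 100 bytes at offset 1000 has offset & (align - 1)
 * == 488, so a 512-byte head covering [512, 1024) is read and 488 bytes of
 * it are prepended; the request becomes offset = 512, bytes = 588.  Since
 * (512 + 588) & 511 == 76, a 512-byte tail covering [1024, 1536) is read,
 * its last 436 bytes are appended, and bytes = ROUND_UP(588, 512) = 1024.
 * The driver ends up seeing a single aligned write of [512, 1536).
 */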
3440 
3441 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3442     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3443     BdrvRequestFlags flags)
3444 {
3445     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3446         return -EINVAL;
3447     }
3448 
3449     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3450                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3451 }
3452 
3453 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3454     int nb_sectors, QEMUIOVector *qiov)
3455 {
3456     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3457 
3458     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3459 }
3460 
3461 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3462                                       int64_t sector_num, int nb_sectors,
3463                                       BdrvRequestFlags flags)
3464 {
3465     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3466 
3467     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3468         flags &= ~BDRV_REQ_MAY_UNMAP;
3469     }
3470 
3471     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3472                              BDRV_REQ_ZERO_WRITE | flags);
3473 }
3474 
3475 /**
3476  * Truncate file to 'offset' bytes (needed only for file protocols)
3477  */
3478 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3479 {
3480     BlockDriver *drv = bs->drv;
3481     int ret;
3482     if (!drv)
3483         return -ENOMEDIUM;
3484     if (!drv->bdrv_truncate)
3485         return -ENOTSUP;
3486     if (bs->read_only)
3487         return -EACCES;
3488     if (bdrv_in_use(bs))
3489         return -EBUSY;
3490     ret = drv->bdrv_truncate(bs, offset);
3491     if (ret == 0) {
3492         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3493         bdrv_dev_resize_cb(bs);
3494     }
3495     return ret;
3496 }
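/*
 * Usage sketch (illustrative, not part of the original source): growing an
 * image to 1 GiB.  The error codes mirror the checks above.
 *
 *     int ret = bdrv_truncate(bs, 1024 * 1024 * 1024);
 *     if (ret < 0) {
 *         // -ENOMEDIUM, -ENOTSUP, -EACCES, -EBUSY or a driver error
 *     }
 */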
3497 
3498 /**
3499  * Length of an allocated file in bytes. Sparse files are counted by the
3500  * space they actually occupy on disk. Return < 0 on error or if unknown.
3501  */
3502 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3503 {
3504     BlockDriver *drv = bs->drv;
3505     if (!drv) {
3506         return -ENOMEDIUM;
3507     }
3508     if (drv->bdrv_get_allocated_file_size) {
3509         return drv->bdrv_get_allocated_file_size(bs);
3510     }
3511     if (bs->file) {
3512         return bdrv_get_allocated_file_size(bs->file);
3513     }
3514     return -ENOTSUP;
3515 }
3516 
3517 /**
3518  * Length of a file in bytes. Return < 0 on error or if unknown.
3519  */
3520 int64_t bdrv_getlength(BlockDriverState *bs)
3521 {
3522     BlockDriver *drv = bs->drv;
3523     if (!drv)
3524         return -ENOMEDIUM;
3525 
3526     if (drv->has_variable_length) {
3527         int ret = refresh_total_sectors(bs, bs->total_sectors);
3528         if (ret < 0) {
3529             return ret;
3530         }
3531     }
3532     return bs->total_sectors * BDRV_SECTOR_SIZE;
3533 }
3534 
3535 /* Return 0 as the number of sectors if no device is present or on error */
3536 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3537 {
3538     int64_t length;
3539     length = bdrv_getlength(bs);
3540     if (length < 0)
3541         length = 0;
3542     else
3543         length = length >> BDRV_SECTOR_BITS;
3544     *nb_sectors_ptr = length;
3545 }
3546 
3547 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3548                        BlockdevOnError on_write_error)
3549 {
3550     bs->on_read_error = on_read_error;
3551     bs->on_write_error = on_write_error;
3552 }
3553 
3554 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3555 {
3556     return is_read ? bs->on_read_error : bs->on_write_error;
3557 }
3558 
3559 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3560 {
3561     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3562 
3563     switch (on_err) {
3564     case BLOCKDEV_ON_ERROR_ENOSPC:
3565         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3566     case BLOCKDEV_ON_ERROR_STOP:
3567         return BDRV_ACTION_STOP;
3568     case BLOCKDEV_ON_ERROR_REPORT:
3569         return BDRV_ACTION_REPORT;
3570     case BLOCKDEV_ON_ERROR_IGNORE:
3571         return BDRV_ACTION_IGNORE;
3572     default:
3573         abort();
3574     }
3575 }
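/*
 * Example (illustrative): with the enospc policy (werror=enospc), a write
 * failing with ENOSPC yields BDRV_ACTION_STOP (pause the VM so the operator
 * can free space), while any other errno yields BDRV_ACTION_REPORT and is
 * passed to the guest.
 */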
3576 
3577 /* Handling the error is done by device models because, while the block
3578  * layer knows about the error, it does not know whether an operation
3579  * comes from the device or the block layer itself (from a job, for example).
3580  */
3581 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3582                        bool is_read, int error)
3583 {
3584     assert(error >= 0);
3585     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3586     if (action == BDRV_ACTION_STOP) {
3587         vm_stop(RUN_STATE_IO_ERROR);
3588         bdrv_iostatus_set_err(bs, error);
3589     }
3590 }
3591 
3592 int bdrv_is_read_only(BlockDriverState *bs)
3593 {
3594     return bs->read_only;
3595 }
3596 
3597 int bdrv_is_sg(BlockDriverState *bs)
3598 {
3599     return bs->sg;
3600 }
3601 
3602 int bdrv_enable_write_cache(BlockDriverState *bs)
3603 {
3604     return bs->enable_write_cache;
3605 }
3606 
3607 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3608 {
3609     bs->enable_write_cache = wce;
3610 
3611     /* so a reopen() will preserve wce */
3612     if (wce) {
3613         bs->open_flags |= BDRV_O_CACHE_WB;
3614     } else {
3615         bs->open_flags &= ~BDRV_O_CACHE_WB;
3616     }
3617 }
3618 
3619 int bdrv_is_encrypted(BlockDriverState *bs)
3620 {
3621     if (bs->backing_hd && bs->backing_hd->encrypted)
3622         return 1;
3623     return bs->encrypted;
3624 }
3625 
3626 int bdrv_key_required(BlockDriverState *bs)
3627 {
3628     BlockDriverState *backing_hd = bs->backing_hd;
3629 
3630     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3631         return 1;
3632     return (bs->encrypted && !bs->valid_key);
3633 }
3634 
3635 int bdrv_set_key(BlockDriverState *bs, const char *key)
3636 {
3637     int ret;
3638     if (bs->backing_hd && bs->backing_hd->encrypted) {
3639         ret = bdrv_set_key(bs->backing_hd, key);
3640         if (ret < 0)
3641             return ret;
3642         if (!bs->encrypted)
3643             return 0;
3644     }
3645     if (!bs->encrypted) {
3646         return -EINVAL;
3647     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3648         return -ENOMEDIUM;
3649     }
3650     ret = bs->drv->bdrv_set_key(bs, key);
3651     if (ret < 0) {
3652         bs->valid_key = 0;
3653     } else if (!bs->valid_key) {
3654         bs->valid_key = 1;
3655         /* call the change callback now, we skipped it on open */
3656         bdrv_dev_change_media_cb(bs, true);
3657     }
3658     return ret;
3659 }
3660 
3661 const char *bdrv_get_format_name(BlockDriverState *bs)
3662 {
3663     return bs->drv ? bs->drv->format_name : NULL;
3664 }
3665 
3666 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3667                          void *opaque)
3668 {
3669     BlockDriver *drv;
3670     int count = 0;
3671     const char **formats = NULL;
3672 
3673     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3674         if (drv->format_name) {
3675             bool found = false;
3676             int i = count;
3677             while (formats && i && !found) {
3678                 found = !strcmp(formats[--i], drv->format_name);
3679             }
3680 
3681             if (!found) {
3682                 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3683                 formats[count++] = drv->format_name;
3684                 it(opaque, drv->format_name);
3685             }
3686         }
3687     }
3688     g_free(formats);
3689 }
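/*
 * Usage sketch (illustrative, not part of the original source): printing the
 * deduplicated format names via the callback.
 *
 *     static void print_format(void *opaque, const char *name)
 *     {
 *         printf("%s ", name);
 *     }
 *
 *     bdrv_iterate_format(print_format, NULL);
 */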
3690 
3691 /* Find a block backend by its device name */
3692 BlockDriverState *bdrv_find(const char *name)
3693 {
3694     BlockDriverState *bs;
3695 
3696     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3697         if (!strcmp(name, bs->device_name)) {
3698             return bs;
3699         }
3700     }
3701     return NULL;
3702 }
3703 
3704 /* Find a node in the graph of named BlockDriverStates by its node name */
3705 BlockDriverState *bdrv_find_node(const char *node_name)
3706 {
3707     BlockDriverState *bs;
3708 
3709     assert(node_name);
3710 
3711     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3712         if (!strcmp(node_name, bs->node_name)) {
3713             return bs;
3714         }
3715     }
3716     return NULL;
3717 }
3718 
3719 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3720 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3721 {
3722     BlockDeviceInfoList *list, *entry;
3723     BlockDriverState *bs;
3724 
3725     list = NULL;
3726     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3727         entry = g_malloc0(sizeof(*entry));
3728         entry->value = bdrv_block_device_info(bs);
3729         entry->next = list;
3730         list = entry;
3731     }
3732 
3733     return list;
3734 }
3735 
3736 BlockDriverState *bdrv_lookup_bs(const char *device,
3737                                  const char *node_name,
3738                                  Error **errp)
3739 {
3740     BlockDriverState *bs = NULL;
3741 
3742     if (device) {
3743         bs = bdrv_find(device);
3744 
3745         if (bs) {
3746             return bs;
3747         }
3748     }
3749 
3750     if (node_name) {
3751         bs = bdrv_find_node(node_name);
3752 
3753         if (bs) {
3754             return bs;
3755         }
3756     }
3757 
3758     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3759                      device ? device : "",
3760                      node_name ? node_name : "");
3761     return NULL;
3762 }
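/*
 * Usage sketch (illustrative): QMP-style callers typically accept either a
 * device name or a node name and resolve them in one call.  The has_device,
 * has_node_name and errp identifiers below are hypothetical caller state.
 *
 *     Error *err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs(has_device ? device : NULL,
 *                                           has_node_name ? node_name : NULL,
 *                                           &err);
 *     if (!bs) {
 *         error_propagate(errp, err);
 *         return;
 *     }
 */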
3763 
3764 BlockDriverState *bdrv_next(BlockDriverState *bs)
3765 {
3766     if (!bs) {
3767         return QTAILQ_FIRST(&bdrv_states);
3768     }
3769     return QTAILQ_NEXT(bs, device_list);
3770 }
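/*
 * Typical iteration idiom (illustrative): pass NULL to start and the
 * previous result to continue.
 *
 *     BlockDriverState *it = NULL;
 *     while ((it = bdrv_next(it)) != NULL) {
 *         // inspect it
 *     }
 */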
3771 
3772 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3773 {
3774     BlockDriverState *bs;
3775 
3776     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3777         it(opaque, bs);
3778     }
3779 }
3780 
3781 const char *bdrv_get_device_name(BlockDriverState *bs)
3782 {
3783     return bs->device_name;
3784 }
3785 
3786 int bdrv_get_flags(BlockDriverState *bs)
3787 {
3788     return bs->open_flags;
3789 }
3790 
3791 int bdrv_flush_all(void)
3792 {
3793     BlockDriverState *bs;
3794     int result = 0;
3795 
3796     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3797         int ret = bdrv_flush(bs);
3798         if (ret < 0 && !result) {
3799             result = ret;
3800         }
3801     }
3802 
3803     return result;
3804 }
3805 
3806 int bdrv_has_zero_init_1(BlockDriverState *bs)
3807 {
3808     return 1;
3809 }
3810 
3811 int bdrv_has_zero_init(BlockDriverState *bs)
3812 {
3813     assert(bs->drv);
3814 
3815     /* If BS is a copy-on-write image, it is initialized to
3816        the contents of the base image, which may not be zeroes.  */
3817     if (bs->backing_hd) {
3818         return 0;
3819     }
3820     if (bs->drv->bdrv_has_zero_init) {
3821         return bs->drv->bdrv_has_zero_init(bs);
3822     }
3823 
3824     /* safe default */
3825     return 0;
3826 }
3827 
3828 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3829 {
3830     BlockDriverInfo bdi;
3831 
3832     if (bs->backing_hd) {
3833         return false;
3834     }
3835 
3836     if (bdrv_get_info(bs, &bdi) == 0) {
3837         return bdi.unallocated_blocks_are_zero;
3838     }
3839 
3840     return false;
3841 }
3842 
3843 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3844 {
3845     BlockDriverInfo bdi;
3846 
3847     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3848         return false;
3849     }
3850 
3851     if (bdrv_get_info(bs, &bdi) == 0) {
3852         return bdi.can_write_zeroes_with_unmap;
3853     }
3854 
3855     return false;
3856 }
3857 
3858 typedef struct BdrvCoGetBlockStatusData {
3859     BlockDriverState *bs;
3860     BlockDriverState *base;
3861     int64_t sector_num;
3862     int nb_sectors;
3863     int *pnum;
3864     int64_t ret;
3865     bool done;
3866 } BdrvCoGetBlockStatusData;
3867 
3868 /*
3869  * Returns the allocation status of the specified sectors as BDRV_BLOCK_*
3870  * flags. Drivers that do not implement the functionality are assumed not to
3871  * support backing files, so all of their sectors are reported as allocated.
3872  *
3873  * If 'sector_num' is beyond the end of the disk image the return value is 0
3874  * and 'pnum' is set to 0.
3875  *
3876  * 'pnum' is set to the number of sectors (including and immediately following
3877  * the specified sector) that are known to be in the same
3878  * allocated/unallocated state.
3879  *
3880  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3881  * beyond the end of the disk image it will be clamped.
3882  */
3883 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3884                                                      int64_t sector_num,
3885                                                      int nb_sectors, int *pnum)
3886 {
3887     int64_t length;
3888     int64_t n;
3889     int64_t ret, ret2;
3890 
3891     length = bdrv_getlength(bs);
3892     if (length < 0) {
3893         return length;
3894     }
3895 
3896     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3897         *pnum = 0;
3898         return 0;
3899     }
3900 
3901     n = bs->total_sectors - sector_num;
3902     if (n < nb_sectors) {
3903         nb_sectors = n;
3904     }
3905 
3906     if (!bs->drv->bdrv_co_get_block_status) {
3907         *pnum = nb_sectors;
3908         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3909         if (bs->drv->protocol_name) {
3910             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3911         }
3912         return ret;
3913     }
3914 
3915     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3916     if (ret < 0) {
3917         *pnum = 0;
3918         return ret;
3919     }
3920 
3921     if (ret & BDRV_BLOCK_RAW) {
3922         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3923         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3924                                      *pnum, pnum);
3925     }
3926 
3927     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3928         ret |= BDRV_BLOCK_ALLOCATED;
3929     }
3930 
3931     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3932         if (bdrv_unallocated_blocks_are_zero(bs)) {
3933             ret |= BDRV_BLOCK_ZERO;
3934         } else if (bs->backing_hd) {
3935             BlockDriverState *bs2 = bs->backing_hd;
3936             int64_t length2 = bdrv_getlength(bs2);
3937             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3938                 ret |= BDRV_BLOCK_ZERO;
3939             }
3940         }
3941     }
3942 
3943     if (bs->file &&
3944         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3945         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3946         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3947                                         *pnum, pnum);
3948         if (ret2 >= 0) {
3949             /* Ignore errors.  This is just providing extra information; it
3950              * is useful but not necessary.
3951              */
3952             ret |= (ret2 & BDRV_BLOCK_ZERO);
3953         }
3954     }
3955 
3956     return ret;
3957 }
3958 
3959 /* Coroutine wrapper for bdrv_get_block_status() */
3960 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3961 {
3962     BdrvCoGetBlockStatusData *data = opaque;
3963     BlockDriverState *bs = data->bs;
3964 
3965     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3966                                          data->pnum);
3967     data->done = true;
3968 }
3969 
3970 /*
3971  * Synchronous wrapper around bdrv_co_get_block_status().
3972  *
3973  * See bdrv_co_get_block_status() for details.
3974  */
3975 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3976                               int nb_sectors, int *pnum)
3977 {
3978     Coroutine *co;
3979     BdrvCoGetBlockStatusData data = {
3980         .bs = bs,
3981         .sector_num = sector_num,
3982         .nb_sectors = nb_sectors,
3983         .pnum = pnum,
3984         .done = false,
3985     };
3986 
3987     if (qemu_in_coroutine()) {
3988         /* Fast-path if already in coroutine context */
3989         bdrv_get_block_status_co_entry(&data);
3990     } else {
3991         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3992         qemu_coroutine_enter(co, &data);
3993         while (!data.done) {
3994             qemu_aio_wait();
3995         }
3996     }
3997     return data.ret;
3998 }
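/*
 * Example (illustrative sketch): interpreting the return value.  The low
 * bits carry BDRV_BLOCK_* flags; when BDRV_BLOCK_OFFSET_VALID is set, the
 * sector-aligned upper bits encode the byte offset of the data in bs->file.
 *
 *     int pnum;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ZERO)) {
 *         // the next pnum sectors are known to read as zeroes
 *     }
 */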
3999 
4000 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4001                                    int nb_sectors, int *pnum)
4002 {
4003     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4004     if (ret < 0) {
4005         return ret;
4006     }
4007     return (ret & BDRV_BLOCK_ALLOCATED);
4008 }
4009 
4010 /*
4011  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4012  *
4013  * Return true if the given sector is allocated in any image between
4014  * BASE (excluded) and TOP (included).  BASE can be NULL to check if the
4015  * given sector is allocated in any image of the chain.  Return false otherwise.
4016  *
4017  * 'pnum' is set to the number of sectors (including and immediately following
4018  *  the specified sector) that are known to be in the same
4019  *  allocated/unallocated state.
4020  *
4021  */
4022 int bdrv_is_allocated_above(BlockDriverState *top,
4023                             BlockDriverState *base,
4024                             int64_t sector_num,
4025                             int nb_sectors, int *pnum)
4026 {
4027     BlockDriverState *intermediate;
4028     int ret, n = nb_sectors;
4029 
4030     intermediate = top;
4031     while (intermediate && intermediate != base) {
4032         int pnum_inter;
4033         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4034                                 &pnum_inter);
4035         if (ret < 0) {
4036             return ret;
4037         } else if (ret) {
4038             *pnum = pnum_inter;
4039             return 1;
4040         }
4041 
4042         /*
4043          * [sector_num, nb_sectors] is unallocated on top but the
4044          * intermediate image might have
4045          *
4046          * [sector_num+x, nb_sectors-x] allocated.
4047          */
4048         if (n > pnum_inter &&
4049             (intermediate == top ||
4050              sector_num + pnum_inter < intermediate->total_sectors)) {
4051             n = pnum_inter;
4052         }
4053 
4054         intermediate = intermediate->backing_hd;
4055     }
4056 
4057     *pnum = n;
4058     return 0;
4059 }
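/*
 * Example (illustrative): for a chain base <- mid <- top, a query with these
 * endpoints checks top first and then mid, stopping before base; it returns
 * 1 with *pnum from the first image that has the sector allocated, and 0
 * with *pnum sectors known unallocated in all of the checked images.
 */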
4060 
4061 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4062 {
4063     if (bs->backing_hd && bs->backing_hd->encrypted)
4064         return bs->backing_file;
4065     else if (bs->encrypted)
4066         return bs->filename;
4067     else
4068         return NULL;
4069 }
4070 
4071 void bdrv_get_backing_filename(BlockDriverState *bs,
4072                                char *filename, int filename_size)
4073 {
4074     pstrcpy(filename, filename_size, bs->backing_file);
4075 }
4076 
4077 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4078                           const uint8_t *buf, int nb_sectors)
4079 {
4080     BlockDriver *drv = bs->drv;
4081     if (!drv)
4082         return -ENOMEDIUM;
4083     if (!drv->bdrv_write_compressed)
4084         return -ENOTSUP;
4085     if (bdrv_check_request(bs, sector_num, nb_sectors))
4086         return -EIO;
4087 
4088     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4089 
4090     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4091 }
4092 
4093 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4094 {
4095     BlockDriver *drv = bs->drv;
4096     if (!drv)
4097         return -ENOMEDIUM;
4098     if (!drv->bdrv_get_info)
4099         return -ENOTSUP;
4100     memset(bdi, 0, sizeof(*bdi));
4101     return drv->bdrv_get_info(bs, bdi);
4102 }
4103 
4104 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4105 {
4106     BlockDriver *drv = bs->drv;
4107     if (drv && drv->bdrv_get_specific_info) {
4108         return drv->bdrv_get_specific_info(bs);
4109     }
4110     return NULL;
4111 }
4112 
4113 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4114                       int64_t pos, int size)
4115 {
4116     QEMUIOVector qiov;
4117     struct iovec iov = {
4118         .iov_base   = (void *) buf,
4119         .iov_len    = size,
4120     };
4121 
4122     qemu_iovec_init_external(&qiov, &iov, 1);
4123     return bdrv_writev_vmstate(bs, &qiov, pos);
4124 }
4125 
4126 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4127 {
4128     BlockDriver *drv = bs->drv;
4129 
4130     if (!drv) {
4131         return -ENOMEDIUM;
4132     } else if (drv->bdrv_save_vmstate) {
4133         return drv->bdrv_save_vmstate(bs, qiov, pos);
4134     } else if (bs->file) {
4135         return bdrv_writev_vmstate(bs->file, qiov, pos);
4136     }
4137 
4138     return -ENOTSUP;
4139 }
4140 
4141 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4142                       int64_t pos, int size)
4143 {
4144     BlockDriver *drv = bs->drv;
4145     if (!drv)
4146         return -ENOMEDIUM;
4147     if (drv->bdrv_load_vmstate)
4148         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4149     if (bs->file)
4150         return bdrv_load_vmstate(bs->file, buf, pos, size);
4151     return -ENOTSUP;
4152 }
4153 
4154 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4155 {
4156     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4157         return;
4158     }
4159 
4160     bs->drv->bdrv_debug_event(bs, event);
4161 }
4162 
4163 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4164                           const char *tag)
4165 {
4166     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4167         bs = bs->file;
4168     }
4169 
4170     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4171         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4172     }
4173 
4174     return -ENOTSUP;
4175 }
4176 
4177 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4178 {
4179     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4180         bs = bs->file;
4181     }
4182 
4183     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4184         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4185     }
4186 
4187     return -ENOTSUP;
4188 }
4189 
4190 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4191 {
4192     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4193         bs = bs->file;
4194     }
4195 
4196     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4197         return bs->drv->bdrv_debug_resume(bs, tag);
4198     }
4199 
4200     return -ENOTSUP;
4201 }
4202 
4203 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4204 {
4205     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4206         bs = bs->file;
4207     }
4208 
4209     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4210         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4211     }
4212 
4213     return false;
4214 }
4215 
4216 int bdrv_is_snapshot(BlockDriverState *bs)
4217 {
4218     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4219 }
4220 
4221 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4222  * relative, it must be relative to the chain.  So, passing in bs->filename
4223  * from a BDS as backing_file should not be done, as that may be relative to
4224  * the CWD rather than the chain. */
4225 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4226         const char *backing_file)
4227 {
4228     char *filename_full = NULL;
4229     char *backing_file_full = NULL;
4230     char *filename_tmp = NULL;
4231     int is_protocol = 0;
4232     BlockDriverState *curr_bs = NULL;
4233     BlockDriverState *retval = NULL;
4234 
4235     if (!bs || !bs->drv || !backing_file) {
4236         return NULL;
4237     }
4238 
4239     filename_full     = g_malloc(PATH_MAX);
4240     backing_file_full = g_malloc(PATH_MAX);
4241     filename_tmp      = g_malloc(PATH_MAX);
4242 
4243     is_protocol = path_has_protocol(backing_file);
4244 
4245     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4246 
4247         /* If either of the filename paths is actually a protocol, then
4248          * compare unmodified paths; otherwise make paths relative */
4249         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4250             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4251                 retval = curr_bs->backing_hd;
4252                 break;
4253             }
4254         } else {
4255             /* If not an absolute filename path, make it relative to the current
4256              * image's filename path */
4257             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4258                          backing_file);
4259 
4260             /* We are going to compare absolute pathnames */
4261             if (!realpath(filename_tmp, filename_full)) {
4262                 continue;
4263             }
4264 
4265             /* We need to make sure the backing filename we are comparing against
4266              * is relative to the current image filename (or absolute) */
4267             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4268                          curr_bs->backing_file);
4269 
4270             if (!realpath(filename_tmp, backing_file_full)) {
4271                 continue;
4272             }
4273 
4274             if (strcmp(backing_file_full, filename_full) == 0) {
4275                 retval = curr_bs->backing_hd;
4276                 break;
4277             }
4278         }
4279     }
4280 
4281     g_free(filename_full);
4282     g_free(backing_file_full);
4283     g_free(filename_tmp);
4284     return retval;
4285 }
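/*
 * Example (illustrative): with images living in /img, a caller passing
 * backing_file = "base.qcow2" matches a chain entry whose recorded
 * backing_file is also relative, because both names are first combined with
 * the current image's path (giving "/img/base.qcow2") and canonicalized with
 * realpath() before being compared.
 */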
4286 
4287 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4288 {
4289     if (!bs->drv) {
4290         return 0;
4291     }
4292 
4293     if (!bs->backing_hd) {
4294         return 0;
4295     }
4296 
4297     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4298 }
4299 
4300 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4301 {
4302     BlockDriverState *curr_bs = NULL;
4303 
4304     if (!bs) {
4305         return NULL;
4306     }
4307 
4308     curr_bs = bs;
4309 
4310     while (curr_bs->backing_hd) {
4311         curr_bs = curr_bs->backing_hd;
4312     }
4313     return curr_bs;
4314 }
4315 
4316 /**************************************************************/
4317 /* async I/Os */
4318 
4319 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4320                                  QEMUIOVector *qiov, int nb_sectors,
4321                                  BlockDriverCompletionFunc *cb, void *opaque)
4322 {
4323     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4324 
4325     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4326                                  cb, opaque, false);
4327 }
4328 
4329 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4330                                   QEMUIOVector *qiov, int nb_sectors,
4331                                   BlockDriverCompletionFunc *cb, void *opaque)
4332 {
4333     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4334 
4335     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4336                                  cb, opaque, true);
4337 }
4338 
4339 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4340         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4341         BlockDriverCompletionFunc *cb, void *opaque)
4342 {
4343     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4344 
4345     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4346                                  BDRV_REQ_ZERO_WRITE | flags,
4347                                  cb, opaque, true);
4348 }
4349 
4350 
4351 typedef struct MultiwriteCB {
4352     int error;
4353     int num_requests;
4354     int num_callbacks;
4355     struct {
4356         BlockDriverCompletionFunc *cb;
4357         void *opaque;
4358         QEMUIOVector *free_qiov;
4359     } callbacks[];
4360 } MultiwriteCB;
4361 
4362 static void multiwrite_user_cb(MultiwriteCB *mcb)
4363 {
4364     int i;
4365 
4366     for (i = 0; i < mcb->num_callbacks; i++) {
4367         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4368         if (mcb->callbacks[i].free_qiov) {
4369             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4370         }
4371         g_free(mcb->callbacks[i].free_qiov);
4372     }
4373 }
4374 
4375 static void multiwrite_cb(void *opaque, int ret)
4376 {
4377     MultiwriteCB *mcb = opaque;
4378 
4379     trace_multiwrite_cb(mcb, ret);
4380 
4381     if (ret < 0 && !mcb->error) {
4382         mcb->error = ret;
4383     }
4384 
4385     mcb->num_requests--;
4386     if (mcb->num_requests == 0) {
4387         multiwrite_user_cb(mcb);
4388         g_free(mcb);
4389     }
4390 }
4391 
4392 static int multiwrite_req_compare(const void *a, const void *b)
4393 {
4394     const BlockRequest *req1 = a, *req2 = b;
4395 
4396     /*
4397      * Note that we can't simply subtract req2->sector from req1->sector
4398      * here as that could overflow the return value.
4399      */
4400     if (req1->sector > req2->sector) {
4401         return 1;
4402     } else if (req1->sector < req2->sector) {
4403         return -1;
4404     } else {
4405         return 0;
4406     }
4407 }
4408 
4409 /*
4410  * Takes a bunch of requests and tries to merge them. Returns the number of
4411  * requests that remain after merging.
4412  */
4413 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4414     int num_reqs, MultiwriteCB *mcb)
4415 {
4416     int i, outidx;
4417 
4418     // Sort requests by start sector
4419     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4420 
4421     // Check if adjacent requests touch the same clusters. If so, combine
4422     // them; only exactly sequential or overlapping requests are merged.
4423     outidx = 0;
4424     for (i = 1; i < num_reqs; i++) {
4425         int merge = 0;
4426         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4427 
4428         // Handle exactly sequential writes and overlapping writes.
4429         if (reqs[i].sector <= oldreq_last) {
4430             merge = 1;
4431         }
4432 
4433         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4434             merge = 0;
4435         }
4436 
4437         if (merge) {
4438             size_t size;
4439             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4440             qemu_iovec_init(qiov,
4441                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4442 
4443             // Add the first request to the merged one. If the requests are
4444             // overlapping, drop the last sectors of the first request.
4445             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4446             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4447 
4448             // We should not need to add any zeros between the two requests
4449             assert (reqs[i].sector <= oldreq_last);
4450 
4451             // Add the second request
4452             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4453 
4454             reqs[outidx].nb_sectors = qiov->size >> 9;
4455             reqs[outidx].qiov = qiov;
4456 
4457             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4458         } else {
4459             outidx++;
4460             reqs[outidx].sector     = reqs[i].sector;
4461             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4462             reqs[outidx].qiov       = reqs[i].qiov;
4463         }
4464     }
4465 
4466     return outidx + 1;
4467 }
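/*
 * Worked example (illustrative): writes covering sectors [0, 8) and [4, 12)
 * sort adjacently and satisfy reqs[i].sector <= oldreq_last (4 <= 8), so
 * they merge: the first request contributes (4 - 0) << 9 bytes of its qiov,
 * the second contributes all of its own, and the merged request covers
 * [0, 12) with nb_sectors = 12.
 */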
4468 
4469 /*
4470  * Submit multiple AIO write requests at once.
4471  *
4472  * On success, the function returns 0 and all requests in the reqs array have
4473  * been submitted. On error it returns -1; each individual request may or may
4474  * not have been submitted, and the callback is invoked only for requests
4475  * that actually were. The caller must check the error field of each
4476  * BlockRequest to know which callbacks to wait for (if error != 0, no
4477  * callback will be called for that request).
4478  *
4479  * The implementation may modify the contents of the reqs array, e.g. to merge
4480  * requests. However, the fields opaque and error are left unmodified as they
4481  * are used to signal failure for a single request to the caller.
4482  */
4483 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4484 {
4485     MultiwriteCB *mcb;
4486     int i;
4487 
4488     /* don't submit writes if we don't have a medium */
4489     if (bs->drv == NULL) {
4490         for (i = 0; i < num_reqs; i++) {
4491             reqs[i].error = -ENOMEDIUM;
4492         }
4493         return -1;
4494     }
4495 
4496     if (num_reqs == 0) {
4497         return 0;
4498     }
4499 
4500     // Create MultiwriteCB structure
4501     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4502     mcb->num_requests = 0;
4503     mcb->num_callbacks = num_reqs;
4504 
4505     for (i = 0; i < num_reqs; i++) {
4506         mcb->callbacks[i].cb = reqs[i].cb;
4507         mcb->callbacks[i].opaque = reqs[i].opaque;
4508     }
4509 
4510     // Check for mergeable requests
4511     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4512 
4513     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4514 
4515     /* Run the aio requests. */
4516     mcb->num_requests = num_reqs;
4517     for (i = 0; i < num_reqs; i++) {
4518         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4519                               reqs[i].nb_sectors, reqs[i].flags,
4520                               multiwrite_cb, mcb,
4521                               true);
4522     }
4523 
4524     return 0;
4525 }
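/*
 * Usage sketch (illustrative, not part of the original source): a device
 * model batching two writes.  qiov0, qiov1, my_cb and dev are hypothetical.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_cb, .opaque = dev },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_cb, .opaque = dev },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error to learn which callbacks will still run
 *     }
 */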
4526 
4527 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4528 {
4529     acb->aiocb_info->cancel(acb);
4530 }
4531 
4532 /**************************************************************/
4533 /* async block device emulation */
4534 
4535 typedef struct BlockDriverAIOCBSync {
4536     BlockDriverAIOCB common;
4537     QEMUBH *bh;
4538     int ret;
4539     /* vector translation state */
4540     QEMUIOVector *qiov;
4541     uint8_t *bounce;
4542     int is_write;
4543 } BlockDriverAIOCBSync;
4544 
4545 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4546 {
4547     BlockDriverAIOCBSync *acb =
4548         container_of(blockacb, BlockDriverAIOCBSync, common);
4549     qemu_bh_delete(acb->bh);
4550     acb->bh = NULL;
4551     qemu_aio_release(acb);
4552 }
4553 
4554 static const AIOCBInfo bdrv_em_aiocb_info = {
4555     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4556     .cancel             = bdrv_aio_cancel_em,
4557 };
4558 
4559 static void bdrv_aio_bh_cb(void *opaque)
4560 {
4561     BlockDriverAIOCBSync *acb = opaque;
4562 
4563     if (!acb->is_write)
4564         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4565     qemu_vfree(acb->bounce);
4566     acb->common.cb(acb->common.opaque, acb->ret);
4567     qemu_bh_delete(acb->bh);
4568     acb->bh = NULL;
4569     qemu_aio_release(acb);
4570 }
4571 
4572 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4573                                             int64_t sector_num,
4574                                             QEMUIOVector *qiov,
4575                                             int nb_sectors,
4576                                             BlockDriverCompletionFunc *cb,
4577                                             void *opaque,
4578                                             int is_write)
4579 
4580 {
4581     BlockDriverAIOCBSync *acb;
4582 
4583     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4584     acb->is_write = is_write;
4585     acb->qiov = qiov;
4586     acb->bounce = qemu_blockalign(bs, qiov->size);
4587     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4588 
4589     if (is_write) {
4590         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4591         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4592     } else {
4593         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4594     }
4595 
4596     qemu_bh_schedule(acb->bh);
4597 
4598     return &acb->common;
4599 }
4600 
4601 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4602         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4603         BlockDriverCompletionFunc *cb, void *opaque)
4604 {
4605     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4606 }
4607 
4608 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4609         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4610         BlockDriverCompletionFunc *cb, void *opaque)
4611 {
4612     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4613 }
4614 
4615 
4616 typedef struct BlockDriverAIOCBCoroutine {
4617     BlockDriverAIOCB common;
4618     BlockRequest req;
4619     bool is_write;
4620     bool *done;
4621     QEMUBH* bh;
4622 } BlockDriverAIOCBCoroutine;
4623 
4624 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4625 {
4626     BlockDriverAIOCBCoroutine *acb =
4627         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4628     bool done = false;
4629 
4630     acb->done = &done;
4631     while (!done) {
4632         qemu_aio_wait();
4633     }
4634 }
4635 
4636 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4637     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4638     .cancel             = bdrv_aio_co_cancel_em,
4639 };
4640 
4641 static void bdrv_co_em_bh(void *opaque)
4642 {
4643     BlockDriverAIOCBCoroutine *acb = opaque;
4644 
4645     acb->common.cb(acb->common.opaque, acb->req.error);
4646 
4647     if (acb->done) {
4648         *acb->done = true;
4649     }
4650 
4651     qemu_bh_delete(acb->bh);
4652     qemu_aio_release(acb);
4653 }
4654 
4655 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4656 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4657 {
4658     BlockDriverAIOCBCoroutine *acb = opaque;
4659     BlockDriverState *bs = acb->common.bs;
4660 
4661     if (!acb->is_write) {
4662         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4663             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4664     } else {
4665         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4666             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4667     }
4668 
4669     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4670     qemu_bh_schedule(acb->bh);
4671 }
4672 
4673 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4674                                                int64_t sector_num,
4675                                                QEMUIOVector *qiov,
4676                                                int nb_sectors,
4677                                                BdrvRequestFlags flags,
4678                                                BlockDriverCompletionFunc *cb,
4679                                                void *opaque,
4680                                                bool is_write)
4681 {
4682     Coroutine *co;
4683     BlockDriverAIOCBCoroutine *acb;
4684 
4685     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4686     acb->req.sector = sector_num;
4687     acb->req.nb_sectors = nb_sectors;
4688     acb->req.qiov = qiov;
4689     acb->req.flags = flags;
4690     acb->is_write = is_write;
4691     acb->done = NULL;
4692 
4693     co = qemu_coroutine_create(bdrv_co_do_rw);
4694     qemu_coroutine_enter(co, acb);
4695 
4696     return &acb->common;
4697 }
4698 
4699 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4700 {
4701     BlockDriverAIOCBCoroutine *acb = opaque;
4702     BlockDriverState *bs = acb->common.bs;
4703 
4704     acb->req.error = bdrv_co_flush(bs);
4705     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4706     qemu_bh_schedule(acb->bh);
4707 }
4708 
4709 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4710         BlockDriverCompletionFunc *cb, void *opaque)
4711 {
4712     trace_bdrv_aio_flush(bs, opaque);
4713 
4714     Coroutine *co;
4715     BlockDriverAIOCBCoroutine *acb;
4716 
4717     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4718     acb->done = NULL;
4719 
4720     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4721     qemu_coroutine_enter(co, acb);
4722 
4723     return &acb->common;
4724 }
4725 
4726 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4727 {
4728     BlockDriverAIOCBCoroutine *acb = opaque;
4729     BlockDriverState *bs = acb->common.bs;
4730 
4731     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4732     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4733     qemu_bh_schedule(acb->bh);
4734 }
4735 
4736 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4737         int64_t sector_num, int nb_sectors,
4738         BlockDriverCompletionFunc *cb, void *opaque)
4739 {
4740     Coroutine *co;
4741     BlockDriverAIOCBCoroutine *acb;
4742 
4743     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4744 
4745     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4746     acb->req.sector = sector_num;
4747     acb->req.nb_sectors = nb_sectors;
4748     acb->done = NULL;
4749     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4750     qemu_coroutine_enter(co, acb);
4751 
4752     return &acb->common;
4753 }
4754 
4755 void bdrv_init(void)
4756 {
4757     module_call_init(MODULE_INIT_BLOCK);
4758 }
4759 
4760 void bdrv_init_with_whitelist(void)
4761 {
4762     use_bdrv_whitelist = 1;
4763     bdrv_init();
4764 }
4765 
4766 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4767                    BlockDriverCompletionFunc *cb, void *opaque)
4768 {
4769     BlockDriverAIOCB *acb;
4770 
4771     acb = g_slice_alloc(aiocb_info->aiocb_size);
4772     acb->aiocb_info = aiocb_info;
4773     acb->bs = bs;
4774     acb->cb = cb;
4775     acb->opaque = opaque;
4776     return acb;
4777 }
4778 
4779 void qemu_aio_release(void *p)
4780 {
4781     BlockDriverAIOCB *acb = p;
4782     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4783 }
4784 
4785 /**************************************************************/
4786 /* Coroutine block device emulation */
4787 
4788 typedef struct CoroutineIOCompletion {
4789     Coroutine *coroutine;
4790     int ret;
4791 } CoroutineIOCompletion;
4792 
4793 static void bdrv_co_io_em_complete(void *opaque, int ret)
4794 {
4795     CoroutineIOCompletion *co = opaque;
4796 
4797     co->ret = ret;
4798     qemu_coroutine_enter(co->coroutine, NULL);
4799 }
4800 
4801 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4802                                       int nb_sectors, QEMUIOVector *iov,
4803                                       bool is_write)
4804 {
4805     CoroutineIOCompletion co = {
4806         .coroutine = qemu_coroutine_self(),
4807     };
4808     BlockDriverAIOCB *acb;
4809 
4810     if (is_write) {
4811         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4812                                        bdrv_co_io_em_complete, &co);
4813     } else {
4814         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4815                                       bdrv_co_io_em_complete, &co);
4816     }
4817 
4818     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4819     if (!acb) {
4820         return -EIO;
4821     }
4822     qemu_coroutine_yield();
4823 
4824     return co.ret;
4825 }
4826 
4827 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4828                                          int64_t sector_num, int nb_sectors,
4829                                          QEMUIOVector *iov)
4830 {
4831     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4832 }
4833 
4834 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4835                                          int64_t sector_num, int nb_sectors,
4836                                          QEMUIOVector *iov)
4837 {
4838     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4839 }
4840 
4841 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4842 {
4843     RwCo *rwco = opaque;
4844 
4845     rwco->ret = bdrv_co_flush(rwco->bs);
4846 }
4847 
4848 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4849 {
4850     int ret;
4851 
4852     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4853         return 0;
4854     }
4855 
4856     /* Write back cached data to the OS even with cache=unsafe */
4857     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4858     if (bs->drv->bdrv_co_flush_to_os) {
4859         ret = bs->drv->bdrv_co_flush_to_os(bs);
4860         if (ret < 0) {
4861             return ret;
4862         }
4863     }
4864 
4865     /* But don't actually force it to the disk with cache=unsafe */
4866     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4867         goto flush_parent;
4868     }
4869 
4870     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4871     if (bs->drv->bdrv_co_flush_to_disk) {
4872         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4873     } else if (bs->drv->bdrv_aio_flush) {
4874         BlockDriverAIOCB *acb;
4875         CoroutineIOCompletion co = {
4876             .coroutine = qemu_coroutine_self(),
4877         };
4878 
4879         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4880         if (acb == NULL) {
4881             ret = -EIO;
4882         } else {
4883             qemu_coroutine_yield();
4884             ret = co.ret;
4885         }
4886     } else {
4887         /*
4888          * Some block drivers always operate in either writethrough or unsafe
4889          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
4890          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4891          * depends on server-side configuration), so we can't ensure that
4892          * everything is safe on disk. Returning an error doesn't work because
4893          * that would break guests even if the server operates in writethrough
4894          * mode.
4895          *
4896          * Let's hope the user knows what he's doing.
4897          * Let's hope the user knows what they're doing.
4898         ret = 0;
4899     }
4900     if (ret < 0) {
4901         return ret;
4902     }
4903 
4904     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4905      * in the case of cache=unsafe, so there are no useless flushes.
4906      */
4907 flush_parent:
4908     return bdrv_co_flush(bs->file);
4909 }
4910 
4911 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4912 {
4913     Error *local_err = NULL;
4914     int ret;
4915 
4916     if (!bs->drv)  {
4917         return;
4918     }
4919 
4920     if (bs->drv->bdrv_invalidate_cache) {
4921         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4922     } else if (bs->file) {
4923         bdrv_invalidate_cache(bs->file, &local_err);
4924     }
4925     if (local_err) {
4926         error_propagate(errp, local_err);
4927         return;
4928     }
4929 
4930     ret = refresh_total_sectors(bs, bs->total_sectors);
4931     if (ret < 0) {
4932         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4933         return;
4934     }
4935 }
4936 
4937 void bdrv_invalidate_cache_all(Error **errp)
4938 {
4939     BlockDriverState *bs;
4940     Error *local_err = NULL;
4941 
4942     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4943         bdrv_invalidate_cache(bs, &local_err);
4944         if (local_err) {
4945             error_propagate(errp, local_err);
4946             return;
4947         }
4948     }
4949 }
4950 
4951 void bdrv_clear_incoming_migration_all(void)
4952 {
4953     BlockDriverState *bs;
4954 
4955     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4956         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4957     }
4958 }
4959 
4960 int bdrv_flush(BlockDriverState *bs)
4961 {
4962     Coroutine *co;
4963     RwCo rwco = {
4964         .bs = bs,
4965         .ret = NOT_DONE,
4966     };
4967 
4968     if (qemu_in_coroutine()) {
4969         /* Fast-path if already in coroutine context */
4970         bdrv_flush_co_entry(&rwco);
4971     } else {
4972         co = qemu_coroutine_create(bdrv_flush_co_entry);
4973         qemu_coroutine_enter(co, &rwco);
4974         while (rwco.ret == NOT_DONE) {
4975             qemu_aio_wait();
4976         }
4977     }
4978 
4979     return rwco.ret;
4980 }
4981 
4982 typedef struct DiscardCo {
4983     BlockDriverState *bs;
4984     int64_t sector_num;
4985     int nb_sectors;
4986     int ret;
4987 } DiscardCo;
4988 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4989 {
4990     DiscardCo *rwco = opaque;
4991 
4992     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4993 }
4994 
4995 /* If no limit is specified in the BlockLimits, use a default
4996  * of 32768 512-byte sectors (16 MiB) per request.
4997  */
4998 #define MAX_DISCARD_DEFAULT 32768
4999 
5000 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5001                                  int nb_sectors)
5002 {
5003     int max_discard;
5004 
5005     if (!bs->drv) {
5006         return -ENOMEDIUM;
5007     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5008         return -EIO;
5009     } else if (bs->read_only) {
5010         return -EROFS;
5011     }
5012 
5013     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5014 
5015     /* Do nothing if disabled.  */
5016     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5017         return 0;
5018     }
5019 
5020     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5021         return 0;
5022     }
5023 
5024     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5025     while (nb_sectors > 0) {
5026         int ret;
5027         int num = nb_sectors;
5028 
5029         /* align request */
5030         if (bs->bl.discard_alignment &&
5031             num >= bs->bl.discard_alignment &&
5032             sector_num % bs->bl.discard_alignment) {
5033             if (num > bs->bl.discard_alignment) {
5034                 num = bs->bl.discard_alignment;
5035             }
5036             num -= sector_num % bs->bl.discard_alignment;
5037         }
5038 
5039         /* limit request size */
5040         if (num > max_discard) {
5041             num = max_discard;
5042         }
5043 
5044         if (bs->drv->bdrv_co_discard) {
5045             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5046         } else {
5047             BlockDriverAIOCB *acb;
5048             CoroutineIOCompletion co = {
5049                 .coroutine = qemu_coroutine_self(),
5050             };
5051 
5052             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5053                                             bdrv_co_io_em_complete, &co);
5054             if (acb == NULL) {
5055                 return -EIO;
5056             } else {
5057                 qemu_coroutine_yield();
5058                 ret = co.ret;
5059             }
5060         }
5061         if (ret && ret != -ENOTSUP) {
5062             return ret;
5063         }
5064 
5065         sector_num += num;
5066         nb_sectors -= num;
5067     }
5068     return 0;
5069 }
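/*
 * Worked example (illustrative): with bl.discard_alignment = 8 and a request
 * for sectors [5, 25), the first iteration clamps num to 8 and subtracts
 * 5 % 8, issuing a 3-sector discard for [5, 8) so later requests start
 * aligned; the next iteration then discards [8, 25) in one aligned chunk
 * (subject to max_discard).
 */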
5070 
5071 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5072 {
5073     Coroutine *co;
5074     DiscardCo rwco = {
5075         .bs = bs,
5076         .sector_num = sector_num,
5077         .nb_sectors = nb_sectors,
5078         .ret = NOT_DONE,
5079     };
5080 
5081     if (qemu_in_coroutine()) {
5082         /* Fast-path if already in coroutine context */
5083         bdrv_discard_co_entry(&rwco);
5084     } else {
5085         co = qemu_coroutine_create(bdrv_discard_co_entry);
5086         qemu_coroutine_enter(co, &rwco);
5087         while (rwco.ret == NOT_DONE) {
5088             qemu_aio_wait();
5089         }
5090     }
5091 
5092     return rwco.ret;
5093 }
5094 
5095 /**************************************************************/
5096 /* removable device support */
5097 
5098 /**
5099  * Return TRUE if the media is present
5100  */
5101 int bdrv_is_inserted(BlockDriverState *bs)
5102 {
5103     BlockDriver *drv = bs->drv;
5104 
5105     if (!drv)
5106         return 0;
5107     if (!drv->bdrv_is_inserted)
5108         return 1;
5109     return drv->bdrv_is_inserted(bs);
5110 }
5111 
5112 /**
5113  * Return whether the media changed since the last call to this
5114  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5115  */
5116 int bdrv_media_changed(BlockDriverState *bs)
5117 {
5118     BlockDriver *drv = bs->drv;
5119 
5120     if (drv && drv->bdrv_media_changed) {
5121         return drv->bdrv_media_changed(bs);
5122     }
5123     return -ENOTSUP;
5124 }
5125 
5126 /**
5127  * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
5128  */
5129 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5130 {
5131     BlockDriver *drv = bs->drv;
5132 
5133     if (drv && drv->bdrv_eject) {
5134         drv->bdrv_eject(bs, eject_flag);
5135     }
5136 
5137     if (bs->device_name[0] != '\0') {
5138         bdrv_emit_qmp_eject_event(bs, eject_flag);
5139     }
5140 }
5141 
5142 /**
5143  * Lock or unlock the media (if it is locked, the user won't be able
5144  * to eject it manually).
5145  */
5146 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5147 {
5148     BlockDriver *drv = bs->drv;
5149 
5150     trace_bdrv_lock_medium(bs, locked);
5151 
5152     if (drv && drv->bdrv_lock_medium) {
5153         drv->bdrv_lock_medium(bs, locked);
5154     }
5155 }
5156 
5157 /* needed for generic scsi interface */
5158 
5159 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5160 {
5161     BlockDriver *drv = bs->drv;
5162 
5163     if (drv && drv->bdrv_ioctl)
5164         return drv->bdrv_ioctl(bs, req, buf);
5165     return -ENOTSUP;
5166 }
5167 
5168 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5169         unsigned long int req, void *buf,
5170         BlockDriverCompletionFunc *cb, void *opaque)
5171 {
5172     BlockDriver *drv = bs->drv;
5173 
5174     if (drv && drv->bdrv_aio_ioctl)
5175         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5176     return NULL;
5177 }
5178 
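/* Remember the block size used by the guest device on top of bs;
 * a trivial setter. */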
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
    bs->guest_block_size = align;
}

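/* Allocate a buffer whose alignment satisfies the memory alignment
 * requirements of bs (see bdrv_opt_mem_align()), e.g. for O_DIRECT I/O. */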
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
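
/*
 * Illustrative pairing (hypothetical caller): buffers from
 * qemu_blockalign() are released with qemu_vfree():
 *
 *     void *buf = qemu_blockalign(bs, len);
 *     ...
 *     qemu_vfree(buf);
 */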

/*
 * Check if all memory in this vector is aligned to the memory alignment
 * required by bs.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_opt_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

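/*
 * Create a dirty bitmap that tracks writes to bs.  granularity is in
 * bytes and must be a power of two no smaller than BDRV_SECTOR_SIZE.
 * Returns NULL and sets errp if the device length cannot be determined.
 */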
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    assert((granularity & (granularity - 1)) == 0);

    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_getlength(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        errno = -bitmap_size;
        return NULL;
    }
    bitmap_size >>= BDRV_SECTOR_BITS;
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}

void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmap *bm, *next;
    QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
        if (bm == bitmap) {
            QLIST_REMOVE(bitmap, list);
            hbitmap_free(bitmap->bitmap);
            g_free(bitmap);
            return;
        }
    }
}

BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
{
    BdrvDirtyBitmap *bm;
    BlockDirtyInfoList *list = NULL;
    BlockDirtyInfoList **plist = &list;

    QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
        BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
        BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
        info->count = bdrv_get_dirty_count(bs, bm);
        info->granularity =
            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
        entry->value = info;
        *plist = entry;
        plist = &entry->next;
    }

    return list;
}

int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
{
    if (bitmap) {
        return hbitmap_get(bitmap->bitmap, sector);
    } else {
        return 0;
    }
}

void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
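
/*
 * Sketch of the dirty bitmap lifecycle (illustrative only; it mirrors the
 * pattern used by block jobs such as drive-mirror, and 'sectors_per_chunk'
 * is a hypothetical name):
 *
 *     BdrvDirtyBitmap *bitmap = bdrv_create_dirty_bitmap(bs, 65536, errp);
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         ... copy out the chunk of sectors starting at 'sector' ...
 *         bdrv_reset_dirty(bs, sector, sectors_per_chunk);
 *     }
 *     bdrv_release_dirty_bitmap(bs, bitmap);
 */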

void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                    int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
{
    BdrvDirtyBitmap *bitmap;
    QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
        hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
    }
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
{
    return hbitmap_count(bitmap->bitmap);
}

/* Get a reference to bs */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}

/* Release a previously grabbed reference to bs.
 * If after releasing, reference count is zero, the BlockDriverState is
 * deleted. */
void bdrv_unref(BlockDriverState *bs)
{
    assert(bs->refcnt > 0);
    if (--bs->refcnt == 0) {
        bdrv_delete(bs);
    }
}
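
/*
 * Illustrative use (hypothetical): take a reference to keep bs alive
 * across a call that may drop the caller's reference:
 *
 *     bdrv_ref(bs);
 *     do_something_that_may_delete_the_device(bs);
 *     bdrv_unref(bs);
 */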

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
                     int64_t bytes, enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

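/*
 * Accounting usage sketch (hypothetical device emulation code): wrap an
 * I/O operation in a start/done pair so it is counted in query-blockstats:
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ... submit the read and wait for completion ...
 *     bdrv_acct_done(bs, &cookie);
 */
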
/*
 * Create a disk image.  fmt selects the format driver; options is the
 * "-o" option string of qemu-img; img_size may be overridden by an
 * explicit size option or, if a backing file is given, derived from it.
 * On failure, an error is stored in errp.
 */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * If we are using a backing file, we can obtain the size from there */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            uint64_t backing_size;
            char buf[32];
            int back_flags;

            /* backing files are always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= BDRV_SECTOR_SIZE;

            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (local_err) {
        error_propagate(errp, local_err);
    }
}

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

/* Register a notifier that is invoked with each write request before it
 * is submitted; used, for example, by the backup block job to implement
 * copy-before-write. */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

/* Let the format driver modify options of an existing image; returns
 * -ENOTSUP if the driver does not support amendment. */
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test if the given bs is the candidate or to recurse further
 * into the node graph.
 */
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    /* return false if basic checks fail */
    if (!bs || !bs->drv) {
        return false;
    }

    /* the code reached a non block filter driver -> check if the bs is
     * the same as the candidate. It's the recursion termination condition.
     */
    if (!bs->drv->is_filter) {
        return bs == candidate;
    }
    /* Down this path the driver is a block filter driver */

    /* If the block filter recursion method is defined use it to recurse down
     * the node graph.
     */
    if (bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    /* the driver is a block filter but does not allow recursion -> return
     * false
     */
    return false;
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain. Since we don't have pointers to parents, it explores all bs
 * chains from the top. Some filters can choose not to pass down the
 * recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        /* try to recurse in this top level bs */
        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non filter */
        if (perm) {
            return true;
        }
    }

    return false;
}
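
/*
 * Hypothetical caller sketch (illustrative only): a command that must not
 * operate on an intermediate filter node can validate its target like this:
 *
 *     if (!bdrv_is_first_non_filter(candidate_bs)) {
 *         error_setg(errp, "Node is used as a block filter");
 *         return;
 *     }
 */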
5614