/* Source: /openbmc/qemu/block.c
 * (revision 636ea3708c253e9d2ddac6bd7d96854ba95fb7f5) */
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
#endif
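
/* Examples (illustrative notes, not in the original source):
 *   is_windows_drive("d:")         returns 1 (bare drive name)
 *   is_windows_drive("\\.\d:")     returns 1 (device namespace prefix)
 *   is_windows_drive("d:\foo.img") returns 0 (only the drive-prefix check
 *                                             matches, so it is a file path)
 */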

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
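
/* Usage sketch (illustrative, not part of the original file): a caller such
 * as the drive-configuration code enables throttling first and then installs
 * the limits, here capping the device at 100 IOPS total:
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;
 *     if (!bs->io_limits_enabled) {
 *         bdrv_io_limits_enable(bs);
 *     }
 *     bdrv_set_io_limits(bs, &cfg);
 *
 * ThrottleConfig and THROTTLE_OPS_TOTAL are defined in qemu/throttle.h.
 */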

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O need to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already queued,
     * wait in the throttled queue */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
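
/* Examples (illustrative, not part of the original file):
 *   path_has_protocol("nbd:localhost:10809") -> 1  ("nbd" prefix before ':')
 *   path_has_protocol("/images/disk.img")    -> 0  ('/' seen before any ':')
 *   path_has_protocol("c:\disk.img")         -> 0  on Windows (drive prefix)
 */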

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p) {
            p++;
        } else {
            p = base_path;
        }
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1) {
                p1 = p2;
            }
        }
#endif
        if (p1) {
            p1++;
        } else {
            p1 = base_path;
        }
        if (p1 > p) {
            p = p1;
        }
        len = p - base_path;
        if (len > dest_size - 1) {
            len = dest_size - 1;
        }
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
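
/* Worked example (illustrative, not part of the original file): resolving a
 * relative backing file name against the image that references it:
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "base.qcow2");
 *     // dest is now "/images/base.qcow2"
 *
 * For a URL base_path, the prefix kept ends after the first ':' or the last
 * '/', whichever comes later, so "nbd:base" plus "other" yields "nbd:other".
 */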

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
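
/* Usage sketch (illustrative, not part of the original file): creating a
 * 1 GiB qcow2 image, mirroring what bdrv_append_temp_snapshot() does below
 * for its temporary overlay:
 *
 *     Error *local_err = NULL;
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     int ret = bdrv_create(drv, "/tmp/test.qcow2", opts, &local_err);
 *     free_option_parameters(opts);
 */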

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
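
/* Usage sketch (illustrative, not part of the original file):
 *
 *     char tmp[PATH_MAX];
 *     int ret = get_tmp_filename(tmp, sizeof(tmp));
 *     if (ret == 0) {
 *         // on POSIX hosts tmp now names an empty file such as
 *         // "/var/tmp/vl.a1b2c3" (or under $TMPDIR if that is set)
 *     }
 */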

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1) {
        len = sizeof(protocol) - 1;
    }
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
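
/* Examples (illustrative, not part of the original file):
 *   bdrv_find_protocol("/dev/cdrom", true)           -> host CD-ROM driver,
 *                                              via bdrv_probe_device() scoring
 *   bdrv_find_protocol("nbd:localhost:10809", true)  -> the "nbd" driver
 *   bdrv_find_protocol("disk.qcow2", true)           -> the "file" driver
 *   bdrv_find_protocol("nbd:localhost:10809", false) -> the "file" driver,
 *                                              since prefixes are disallowed
 */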

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg) {
        return 0;
    }

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
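
/* Worked example (illustrative, not part of the original file): the cache
 * modes map onto flag combinations like this:
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB (O_DIRECT + writeback)
 *     bdrv_parse_cache_flags("writethrough", &flags);
 *     // flags == 0: no flag set, writes complete only once they are stable
 */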

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /* The backing file of a temporary snapshot is read-only */
    if (flags & BDRV_O_SNAPSHOT) {
        open_flags &= ~BDRV_O_RDWR;
    }

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* an empty string is not a valid node name */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* avoid collisions with the device-name namespace */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* avoid duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called with a protocol directly as drv. That layer is
     * already opened, so swap it into bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        parse_filename = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
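
/* Worked example (illustrative, not part of the original file): with
 * bdref_key = "file", a flattened options QDict such as
 *
 *     { "file.driver": "file", "file.filename": "/tmp/test.qcow2" }
 *
 * is split by qdict_extract_subqdict() into an image_options QDict of
 * { "driver": "file", "filename": "/tmp/test.qcow2" }, which is then passed
 * to bdrv_open(). Alternatively, { "file": "node0" } would reference the
 * existing block device whose node or device name is "node0".
 */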

void bdrv_append_temp_snapshot(BlockDriverState *bs, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];

    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QEMUOptionParameter *create_options;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        return;
    }
    total_size &= BDRV_SECTOR_MASK;

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        return;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                             NULL);

    set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

    ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
    free_option_parameters(create_options);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        return;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("");
    bs_snapshot->is_temporary = 1;

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    bs->open_flags & ~BDRV_O_SNAPSHOT, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        return;
    }

    bdrv_append(bs_snapshot, bs);
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP) |
                          BDRV_O_PROTOCOL, true, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (flags & BDRV_O_SNAPSHOT) {
        bdrv_append_temp_snapshot(bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See the fail path, but now the BDS has to be closed in all cases */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
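
/* Usage sketch (illustrative, not part of the original file): queuing two
 * devices for an all-or-nothing reopen, e.g. to drop BDRV_O_RDWR:
 *
 *     Error *local_err = NULL;
 *     BlockReopenQueue *queue = NULL;
 *     queue = bdrv_reopen_queue(queue, bs_a, bs_a->open_flags & ~BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags & ~BDRV_O_RDWR);
 *     if (bdrv_reopen_multiple(queue, &local_err) < 0) {
 *         // no device was changed; local_err describes the failure
 *     }
 *
 * Note that bdrv_reopen_multiple() frees the queue itself. For the
 * single-device case, bdrv_reopen() below wraps exactly this pattern.
 */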

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes.
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer's .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}
1687 
1688 
1689 void bdrv_close(BlockDriverState *bs)
1690 {
1691     if (bs->job) {
1692         block_job_cancel_sync(bs->job);
1693     }
1694     bdrv_drain_all(); /* complete I/O */
1695     bdrv_flush(bs);
1696     bdrv_drain_all(); /* in case flush left pending I/O */
1697     notifier_list_notify(&bs->close_notifiers, bs);
1698 
1699     if (bs->drv) {
1700         if (bs->backing_hd) {
1701             bdrv_unref(bs->backing_hd);
1702             bs->backing_hd = NULL;
1703         }
1704         bs->drv->bdrv_close(bs);
1705         g_free(bs->opaque);
1706 #ifdef _WIN32
1707         if (bs->is_temporary) {
1708             unlink(bs->filename);
1709         }
1710 #endif
1711         bs->opaque = NULL;
1712         bs->drv = NULL;
1713         bs->copy_on_read = 0;
1714         bs->backing_file[0] = '\0';
1715         bs->backing_format[0] = '\0';
1716         bs->total_sectors = 0;
1717         bs->encrypted = 0;
1718         bs->valid_key = 0;
1719         bs->sg = 0;
1720         bs->growable = 0;
1721         bs->zero_beyond_eof = false;
1722         QDECREF(bs->options);
1723         bs->options = NULL;
1724 
1725         if (bs->file != NULL) {
1726             bdrv_unref(bs->file);
1727             bs->file = NULL;
1728         }
1729     }
1730 
1731     bdrv_dev_change_media_cb(bs, false);
1732 
1733     /* throttling disk I/O limits */
1734     if (bs->io_limits_enabled) {
1735         bdrv_io_limits_disable(bs);
1736     }
1737 }
1738 
1739 void bdrv_close_all(void)
1740 {
1741     BlockDriverState *bs;
1742 
1743     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1744         bdrv_close(bs);
1745     }
1746 }
1747 
1748 /* Check if any requests are in-flight (including throttled requests) */
1749 static bool bdrv_requests_pending(BlockDriverState *bs)
1750 {
1751     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1752         return true;
1753     }
1754     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1755         return true;
1756     }
1757     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1758         return true;
1759     }
1760     if (bs->file && bdrv_requests_pending(bs->file)) {
1761         return true;
1762     }
1763     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1764         return true;
1765     }
1766     return false;
1767 }
1768 
1769 static bool bdrv_requests_pending_all(void)
1770 {
1771     BlockDriverState *bs;
1772     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1773         if (bdrv_requests_pending(bs)) {
1774             return true;
1775         }
1776     }
1777     return false;
1778 }
1779 
1780 /*
1781  * Wait for pending requests to complete across all BlockDriverStates
1782  *
1783  * This function does not flush data to disk, use bdrv_flush_all() for that
1784  * after calling this function.
1785  *
1786  * Note that completion of an asynchronous I/O operation can trigger any
1787  * number of other I/O operations on other devices---for example a coroutine
1788  * can be arbitrarily complex and keep issuing a constant flow of I/O until
1789  * it completes.  Because of this, it is not possible to provide a function
1790  * that drains a single device's I/O queue.
1791  */
1792 void bdrv_drain_all(void)
1793 {
1794     /* Always run first iteration so any pending completion BHs run */
1795     bool busy = true;
1796     BlockDriverState *bs;
1797 
1798     while (busy) {
1799         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1800             bdrv_start_throttled_reqs(bs);
1801         }
1802 
1803         busy = bdrv_requests_pending_all();
1804         busy |= aio_poll(qemu_get_aio_context(), busy);
1805     }
1806 }
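
/*
 * Usage note (illustrative): bdrv_drain_all() only waits for in-flight and
 * throttled requests; it does not push completed writes to stable storage.
 * A shutdown path that wants both therefore pairs it with bdrv_flush_all():
 *
 *     bdrv_drain_all();
 *     bdrv_flush_all();
 */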
1807 
1808 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
1809  * and graph_bdrv_states lists.  Also empty device_name and node_name so
1810  * that a second removal becomes a no-op. */
1811 void bdrv_make_anon(BlockDriverState *bs)
1812 {
1813     if (bs->device_name[0] != '\0') {
1814         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1815     }
1816     bs->device_name[0] = '\0';
1817     if (bs->node_name[0] != '\0') {
1818         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1819     }
1820     bs->node_name[0] = '\0';
1821 }
1822 
1823 static void bdrv_rebind(BlockDriverState *bs)
1824 {
1825     if (bs->drv && bs->drv->bdrv_rebind) {
1826         bs->drv->bdrv_rebind(bs);
1827     }
1828 }
1829 
1830 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1831                                      BlockDriverState *bs_src)
1832 {
1833     /* move some fields that need to stay attached to the device */
1834     bs_dest->open_flags         = bs_src->open_flags;
1835 
1836     /* dev info */
1837     bs_dest->dev_ops            = bs_src->dev_ops;
1838     bs_dest->dev_opaque         = bs_src->dev_opaque;
1839     bs_dest->dev                = bs_src->dev;
1840     bs_dest->guest_block_size   = bs_src->guest_block_size;
1841     bs_dest->copy_on_read       = bs_src->copy_on_read;
1842 
1843     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1844 
1845     /* i/o throttled req */
1846     memcpy(&bs_dest->throttle_state,
1847            &bs_src->throttle_state,
1848            sizeof(ThrottleState));
1849     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1850     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1851     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1852 
1853     /* r/w error */
1854     bs_dest->on_read_error      = bs_src->on_read_error;
1855     bs_dest->on_write_error     = bs_src->on_write_error;
1856 
1857     /* i/o status */
1858     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1859     bs_dest->iostatus           = bs_src->iostatus;
1860 
1861     /* dirty bitmap */
1862     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1863 
1864     /* reference count */
1865     bs_dest->refcnt             = bs_src->refcnt;
1866 
1867     /* job */
1868     bs_dest->in_use             = bs_src->in_use;
1869     bs_dest->job                = bs_src->job;
1870 
1871     /* keep the same entry in bdrv_states */
1872     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1873             bs_src->device_name);
1874     bs_dest->device_list = bs_src->device_list;
1875 }
1876 
1877 /*
1878  * Swap bs contents for two image chains while they are live,
1879  * while keeping required fields on the BlockDriverState that is
1880  * actually attached to a device.
1881  *
1882  * This will modify the BlockDriverState fields, and swap contents
1883  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1884  *
1885  * bs_new is required to be anonymous.
1886  *
1887  * This function does not create any image files.
1888  */
1889 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1890 {
1891     BlockDriverState tmp;
1892 
1893     /* The code needs to swap the node_name but simply swapping node_list won't
1894      * work so first remove the nodes from the graph list, do the swap then
1895      * insert them back if needed.
1896      */
1897     if (bs_new->node_name[0] != '\0') {
1898         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1899     }
1900     if (bs_old->node_name[0] != '\0') {
1901         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1902     }
1903 
1904     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1905     assert(bs_new->device_name[0] == '\0');
1906     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1907     assert(bs_new->job == NULL);
1908     assert(bs_new->dev == NULL);
1909     assert(bs_new->in_use == 0);
1910     assert(bs_new->io_limits_enabled == false);
1911     assert(!throttle_have_timer(&bs_new->throttle_state));
1912 
1913     tmp = *bs_new;
1914     *bs_new = *bs_old;
1915     *bs_old = tmp;
1916 
1917     /* there are some fields that should not be swapped, move them back */
1918     bdrv_move_feature_fields(&tmp, bs_old);
1919     bdrv_move_feature_fields(bs_old, bs_new);
1920     bdrv_move_feature_fields(bs_new, &tmp);
1921 
1922     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1923     assert(bs_new->device_name[0] == '\0');
1924 
1925     /* Check a few fields that should remain attached to the device */
1926     assert(bs_new->dev == NULL);
1927     assert(bs_new->job == NULL);
1928     assert(bs_new->in_use == 0);
1929     assert(bs_new->io_limits_enabled == false);
1930     assert(!throttle_have_timer(&bs_new->throttle_state));
1931 
1932     /* insert the nodes back into the graph node list if needed */
1933     if (bs_new->node_name[0] != '\0') {
1934         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
1935     }
1936     if (bs_old->node_name[0] != '\0') {
1937         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
1938     }
1939 
1940     bdrv_rebind(bs_new);
1941     bdrv_rebind(bs_old);
1942 }
1943 
1944 /*
1945  * Add new bs contents at the top of an image chain while the chain is
1946  * live, while keeping required fields on the top layer.
1947  *
1948  * This will modify the BlockDriverState fields, and swap contents
1949  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1950  *
1951  * bs_new is required to be anonymous.
1952  *
1953  * This function does not create any image files.
1954  */
1955 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1956 {
1957     bdrv_swap(bs_new, bs_top);
1958 
1959     /* After the swap, bs_top (still attached to the device) holds the new
1960      * image's contents and bs_new holds the old top; link them up. */
1961     bs_top->backing_hd = bs_new;
1962     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1963     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1964             bs_new->filename);
1965     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1966             bs_new->drv ? bs_new->drv->format_name : "");
1967 }
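
/*
 * Illustrative use of bdrv_append(), simplified from the external snapshot
 * pattern; the open parameters and error handling are elided, so treat the
 * lines below as a sketch rather than exact call signatures:
 *
 *     new_bs = bdrv_new(...);                   anonymous, no device name
 *     bdrv_open(new_bs, ... BDRV_O_NO_BACKING ...);
 *     bdrv_append(new_bs, top_bs);
 *
 * Afterwards top_bs, still attached to the device, holds the new image's
 * contents and its backing chain starts at the old top.
 */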
1968 
1969 static void bdrv_delete(BlockDriverState *bs)
1970 {
1971     assert(!bs->dev);
1972     assert(!bs->job);
1973     assert(!bs->in_use);
1974     assert(!bs->refcnt);
1975     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1976 
1977     bdrv_close(bs);
1978 
1979     /* remove from list, if necessary */
1980     bdrv_make_anon(bs);
1981 
1982     g_free(bs);
1983 }
1984 
1985 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1986 /* TODO change to DeviceState *dev when all users are qdevified */
1987 {
1988     if (bs->dev) {
1989         return -EBUSY;
1990     }
1991     bs->dev = dev;
1992     bdrv_iostatus_reset(bs);
1993     return 0;
1994 }
1995 
1996 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1997 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1998 {
1999     if (bdrv_attach_dev(bs, dev) < 0) {
2000         abort();
2001     }
2002 }
2003 
2004 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2005 /* TODO change to DeviceState *dev when all users are qdevified */
2006 {
2007     assert(bs->dev == dev);
2008     bs->dev = NULL;
2009     bs->dev_ops = NULL;
2010     bs->dev_opaque = NULL;
2011     bs->guest_block_size = 512;
2012 }
2013 
2014 /* TODO change to return DeviceState * when all users are qdevified */
2015 void *bdrv_get_attached_dev(BlockDriverState *bs)
2016 {
2017     return bs->dev;
2018 }
2019 
2020 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2021                       void *opaque)
2022 {
2023     bs->dev_ops = ops;
2024     bs->dev_opaque = opaque;
2025 }
2026 
2027 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2028                                enum MonitorEvent ev,
2029                                BlockErrorAction action, bool is_read)
2030 {
2031     QObject *data;
2032     const char *action_str;
2033 
2034     switch (action) {
2035     case BDRV_ACTION_REPORT:
2036         action_str = "report";
2037         break;
2038     case BDRV_ACTION_IGNORE:
2039         action_str = "ignore";
2040         break;
2041     case BDRV_ACTION_STOP:
2042         action_str = "stop";
2043         break;
2044     default:
2045         abort();
2046     }
2047 
2048     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2049                               bdrv->device_name,
2050                               action_str,
2051                               is_read ? "read" : "write");
2052     monitor_protocol_event(ev, data);
2053 
2054     qobject_decref(data);
2055 }
2056 
2057 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2058 {
2059     QObject *data;
2060 
2061     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2062                               bdrv_get_device_name(bs), ejected);
2063     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2064 
2065     qobject_decref(data);
2066 }
2067 
2068 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2069 {
2070     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2071         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2072         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2073         if (tray_was_closed) {
2074             /* tray open */
2075             bdrv_emit_qmp_eject_event(bs, true);
2076         }
2077         if (load) {
2078             /* tray close */
2079             bdrv_emit_qmp_eject_event(bs, false);
2080         }
2081     }
2082 }
2083 
2084 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2085 {
2086     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2087 }
2088 
2089 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2090 {
2091     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2092         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2093     }
2094 }
2095 
2096 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2097 {
2098     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2099         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2100     }
2101     return false;
2102 }
2103 
2104 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2105 {
2106     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2107         bs->dev_ops->resize_cb(bs->dev_opaque);
2108     }
2109 }
2110 
2111 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2112 {
2113     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2114         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2115     }
2116     return false;
2117 }
2118 
2119 /*
2120  * Run consistency checks on an image
2121  *
2122  * Returns 0 if the check could be completed (it doesn't mean that the image is
2123  * free of errors) or -errno when an internal error occurred. The results of the
2124  * check are stored in res.
2125  */
2126 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2127 {
2128     if (bs->drv->bdrv_check == NULL) {
2129         return -ENOTSUP;
2130     }
2131 
2132     memset(res, 0, sizeof(*res));
2133     return bs->drv->bdrv_check(bs, res, fix);
2134 }
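
/*
 * Illustrative caller (hypothetical helper): run a check, requesting repair
 * of leaks and errors where the format driver supports it, and report the
 * totals from the result structure.
 */
static void G_GNUC_UNUSED bdrv_check_example(BlockDriverState *bs)
{
    BdrvCheckResult res;
    int ret = bdrv_check(bs, &res, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);

    if (ret < 0) {
        error_report("check could not be completed: %s", strerror(-ret));
        return;
    }
    error_report("%d corruptions, %d leaks (%d/%d repaired)",
                 res.corruptions, res.leaks,
                 res.corruptions_fixed, res.leaks_fixed);
}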
2135 
2136 #define COMMIT_BUF_SECTORS 2048
2137 
2138 /* commit COW file into the raw image */
2139 int bdrv_commit(BlockDriverState *bs)
2140 {
2141     BlockDriver *drv = bs->drv;
2142     int64_t sector, total_sectors, length, backing_length;
2143     int n, ro, open_flags;
2144     int ret = 0;
2145     uint8_t *buf = NULL;
2146     char filename[PATH_MAX];
2147 
2148     if (!drv)
2149         return -ENOMEDIUM;
2150 
2151     if (!bs->backing_hd) {
2152         return -ENOTSUP;
2153     }
2154 
2155     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2156         return -EBUSY;
2157     }
2158 
2159     ro = bs->backing_hd->read_only;
2160     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2161     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2162     open_flags = bs->backing_hd->open_flags;
2163 
2164     if (ro) {
2165         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2166             return -EACCES;
2167         }
2168     }
2169 
2170     length = bdrv_getlength(bs);
2171     if (length < 0) {
2172         ret = length;
2173         goto ro_cleanup;
2174     }
2175 
2176     backing_length = bdrv_getlength(bs->backing_hd);
2177     if (backing_length < 0) {
2178         ret = backing_length;
2179         goto ro_cleanup;
2180     }
2181 
2182     /* If our top snapshot is larger than the backing file image,
2183      * grow the backing file image if possible.  If not possible,
2184      * we must return an error */
2185     if (length > backing_length) {
2186         ret = bdrv_truncate(bs->backing_hd, length);
2187         if (ret < 0) {
2188             goto ro_cleanup;
2189         }
2190     }
2191 
2192     total_sectors = length >> BDRV_SECTOR_BITS;
2193     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2194 
2195     for (sector = 0; sector < total_sectors; sector += n) {
2196         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2197         if (ret < 0) {
2198             goto ro_cleanup;
2199         }
2200         if (ret) {
2201             ret = bdrv_read(bs, sector, buf, n);
2202             if (ret < 0) {
2203                 goto ro_cleanup;
2204             }
2205 
2206             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2207             if (ret < 0) {
2208                 goto ro_cleanup;
2209             }
2210         }
2211     }
2212 
2213     if (drv->bdrv_make_empty) {
2214         ret = drv->bdrv_make_empty(bs);
2215         if (ret < 0) {
2216             goto ro_cleanup;
2217         }
2218         bdrv_flush(bs);
2219     }
2220 
2221     /*
2222      * Make sure all data we wrote to the backing device is actually
2223      * stable on disk.
2224      */
2225     if (bs->backing_hd) {
2226         bdrv_flush(bs->backing_hd);
2227     }
2228 
2229     ret = 0;
2230 ro_cleanup:
2231     g_free(buf);
2232 
2233     if (ro) {
2234         /* ignoring error return here */
2235         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2236     }
2237 
2238     return ret;
2239 }
2240 
2241 int bdrv_commit_all(void)
2242 {
2243     BlockDriverState *bs;
2244 
2245     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2246         if (bs->drv && bs->backing_hd) {
2247             int ret = bdrv_commit(bs);
2248             if (ret < 0) {
2249                 return ret;
2250             }
2251         }
2252     }
2253     return 0;
2254 }
2255 
2256 /**
2257  * Remove an active request from the tracked requests list
2258  *
2259  * This function should be called when a tracked request is completing.
2260  */
2261 static void tracked_request_end(BdrvTrackedRequest *req)
2262 {
2263     if (req->serialising) {
2264         req->bs->serialising_in_flight--;
2265     }
2266 
2267     QLIST_REMOVE(req, list);
2268     qemu_co_queue_restart_all(&req->wait_queue);
2269 }
2270 
2271 /**
2272  * Add an active request to the tracked requests list
2273  */
2274 static void tracked_request_begin(BdrvTrackedRequest *req,
2275                                   BlockDriverState *bs,
2276                                   int64_t offset,
2277                                   unsigned int bytes, bool is_write)
2278 {
2279     *req = (BdrvTrackedRequest){
2280         .bs = bs,
2281         .offset         = offset,
2282         .bytes          = bytes,
2283         .is_write       = is_write,
2284         .co             = qemu_coroutine_self(),
2285         .serialising    = false,
2286         .overlap_offset = offset,
2287         .overlap_bytes  = bytes,
2288     };
2289 
2290     qemu_co_queue_init(&req->wait_queue);
2291 
2292     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2293 }
2294 
2295 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2296 {
2297     int64_t overlap_offset = req->offset & ~(align - 1);
2298     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2299                                - overlap_offset;
2300 
2301     if (!req->serialising) {
2302         req->bs->serialising_in_flight++;
2303         req->serialising = true;
2304     }
2305 
2306     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2307     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2308 }
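
/*
 * Worked example for the rounding above (illustrative numbers): with
 * align = 4096, a request at offset = 5000 with bytes = 1000 gets
 *
 *     overlap_offset = 5000 & ~4095                 = 4096
 *     overlap_bytes  = ROUND_UP(6000, 4096) - 4096  = 4096
 *
 * i.e. the serialised region is widened to the containing [4096, 8192)
 * window, and any request overlapping that window is ordered against it.
 */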
2309 
2310 /**
2311  * Round a region to cluster boundaries
2312  */
2313 void bdrv_round_to_clusters(BlockDriverState *bs,
2314                             int64_t sector_num, int nb_sectors,
2315                             int64_t *cluster_sector_num,
2316                             int *cluster_nb_sectors)
2317 {
2318     BlockDriverInfo bdi;
2319 
2320     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2321         *cluster_sector_num = sector_num;
2322         *cluster_nb_sectors = nb_sectors;
2323     } else {
2324         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2325         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2326         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2327                                             nb_sectors, c);
2328     }
2329 }
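
/*
 * Worked example (illustrative): with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors.  A request with sector_num = 130 and
 * nb_sectors = 4 is rounded down to *cluster_sector_num = 128 and up to
 * *cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 4, 128) = 128, i.e.
 * exactly the one cluster containing the request.
 */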
2330 
2331 static int bdrv_get_cluster_size(BlockDriverState *bs)
2332 {
2333     BlockDriverInfo bdi;
2334     int ret;
2335 
2336     ret = bdrv_get_info(bs, &bdi);
2337     if (ret < 0 || bdi.cluster_size == 0) {
2338         return bs->request_alignment;
2339     } else {
2340         return bdi.cluster_size;
2341     }
2342 }
2343 
2344 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2345                                      int64_t offset, unsigned int bytes)
2346 {
2347     /*        aaaa   bbbb */
2348     if (offset >= req->overlap_offset + req->overlap_bytes) {
2349         return false;
2350     }
2351     /* bbbb   aaaa        */
2352     if (req->overlap_offset >= offset + bytes) {
2353         return false;
2354     }
2355     return true;
2356 }
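
/*
 * Example (illustrative): a request whose overlap region is [4096, 8192)
 * overlaps [8191, 8192) but not [8192, 8193); the two early returns above
 * are the "strictly after" and "strictly before" cases of half-open
 * interval intersection.
 */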
2357 
2358 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2359 {
2360     BlockDriverState *bs = self->bs;
2361     BdrvTrackedRequest *req;
2362     bool retry;
2363     bool waited = false;
2364 
2365     if (!bs->serialising_in_flight) {
2366         return false;
2367     }
2368 
2369     do {
2370         retry = false;
2371         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2372             if (req == self || (!req->serialising && !self->serialising)) {
2373                 continue;
2374             }
2375             if (tracked_request_overlaps(req, self->overlap_offset,
2376                                          self->overlap_bytes))
2377             {
2378                 /* Hitting this means there was a reentrant request, for
2379                  * example, a block driver issuing nested requests.  This must
2380                  * never happen since it means deadlock.
2381                  */
2382                 assert(qemu_coroutine_self() != req->co);
2383 
2384                 /* If the request is already (indirectly) waiting for us, or
2385                  * will wait for us as soon as it wakes up, then just go on
2386                  * (instead of producing a deadlock in the former case). */
2387                 if (!req->waiting_for) {
2388                     self->waiting_for = req;
2389                     qemu_co_queue_wait(&req->wait_queue);
2390                     self->waiting_for = NULL;
2391                     retry = true;
2392                     waited = true;
2393                     break;
2394                 }
2395             }
2396         }
2397     } while (retry);
2398 
2399     return waited;
2400 }
2401 
2402 /*
2403  * Return values:
2404  * 0        - success
2405  * -EINVAL  - backing format specified, but no file
2406  * -ENOSPC  - can't update the backing file because no space is left in the
2407  *            image file header
2408  * -ENOTSUP - format driver doesn't support changing the backing file
2409  */
2410 int bdrv_change_backing_file(BlockDriverState *bs,
2411     const char *backing_file, const char *backing_fmt)
2412 {
2413     BlockDriver *drv = bs->drv;
2414     int ret;
2415 
2416     /* Backing file format doesn't make sense without a backing file */
2417     if (backing_fmt && !backing_file) {
2418         return -EINVAL;
2419     }
2420 
2421     if (drv->bdrv_change_backing_file != NULL) {
2422         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2423     } else {
2424         ret = -ENOTSUP;
2425     }
2426 
2427     if (ret == 0) {
2428         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2429         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2430     }
2431     return ret;
2432 }
2433 
2434 /*
2435  * Finds the image layer in the chain that has 'bs' as its backing file.
2436  *
2437  * active is the current topmost image.
2438  *
2439  * Returns NULL if bs is not found in active's image chain,
2440  * or if active == bs.
2441  */
2442 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2443                                     BlockDriverState *bs)
2444 {
2445     BlockDriverState *overlay = NULL;
2446     BlockDriverState *intermediate;
2447 
2448     assert(active != NULL);
2449     assert(bs != NULL);
2450 
2451     /* if bs is the same as active, then by definition it has no overlay
2452      */
2453     if (active == bs) {
2454         return NULL;
2455     }
2456 
2457     intermediate = active;
2458     while (intermediate->backing_hd) {
2459         if (intermediate->backing_hd == bs) {
2460             overlay = intermediate;
2461             break;
2462         }
2463         intermediate = intermediate->backing_hd;
2464     }
2465 
2466     return overlay;
2467 }
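
/*
 * Example (illustrative): in the chain base <- mid <- active,
 * bdrv_find_overlay(active, base) returns mid,
 * bdrv_find_overlay(active, mid) returns active, and
 * bdrv_find_overlay(active, active) returns NULL.
 */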
2468 
2469 typedef struct BlkIntermediateStates {
2470     BlockDriverState *bs;
2471     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2472 } BlkIntermediateStates;
2473 
2474 
2475 /*
2476  * Drops images above 'base' up to and including 'top', and sets the image
2477  * above 'top' to have base as its backing file.
2478  *
2479  * Requires that the overlay of 'top' is opened r/w, so that its backing
2480  * file information can be properly updated.
2481  *
2482  * E.g., this will convert the following chain:
2483  * bottom <- base <- intermediate <- top <- active
2484  *
2485  * to
2486  *
2487  * bottom <- base <- active
2488  *
2489  * It is allowed for bottom==base, in which case it converts:
2490  *
2491  * base <- intermediate <- top <- active
2492  *
2493  * to
2494  *
2495  * base <- active
2496  *
2497  * Error conditions:
2498  *  if active == top, that is considered an error
2499  *
2500  */
2501 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2502                            BlockDriverState *base)
2503 {
2504     BlockDriverState *intermediate;
2505     BlockDriverState *base_bs = NULL;
2506     BlockDriverState *new_top_bs = NULL;
2507     BlkIntermediateStates *intermediate_state, *next;
2508     int ret = -EIO;
2509 
2510     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2511     QSIMPLEQ_INIT(&states_to_delete);
2512 
2513     if (!top->drv || !base->drv) {
2514         goto exit;
2515     }
2516 
2517     new_top_bs = bdrv_find_overlay(active, top);
2518 
2519     if (new_top_bs == NULL) {
2520         /* we could not find the image above 'top', this is an error */
2521         goto exit;
2522     }
2523 
2524     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2525      * to do, no intermediate images */
2526     if (new_top_bs->backing_hd == base) {
2527         ret = 0;
2528         goto exit;
2529     }
2530 
2531     intermediate = top;
2532 
2533     /* now we will go down through the list, and add each BDS we find
2534      * into our deletion queue, until we hit the 'base'
2535      */
2536     while (intermediate) {
2537         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2538         intermediate_state->bs = intermediate;
2539         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2540 
2541         if (intermediate->backing_hd == base) {
2542             base_bs = intermediate->backing_hd;
2543             break;
2544         }
2545         intermediate = intermediate->backing_hd;
2546     }
2547     if (base_bs == NULL) {
2548         /* Something went wrong: we did not end at the base.  Safely
2549          * unravel everything, and exit with an error. */
2550         goto exit;
2551     }
2552 
2553     /* success - we can delete the intermediate states, and link top->base */
2554     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2555                                    base_bs->drv ? base_bs->drv->format_name : "");
2556     if (ret) {
2557         goto exit;
2558     }
2559     new_top_bs->backing_hd = base_bs;
2560 
2561     bdrv_refresh_limits(new_top_bs);
2562 
2563     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2564         /* so that bdrv_close() does not recursively close the chain */
2565         intermediate_state->bs->backing_hd = NULL;
2566         bdrv_unref(intermediate_state->bs);
2567     }
2568     ret = 0;
2569 
2570 exit:
2571     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2572         g_free(intermediate_state);
2573     }
2574     return ret;
2575 }
2576 
2577 
2578 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2579                                    size_t size)
2580 {
2581     int64_t len;
2582 
2583     if (size > INT_MAX) {
2584         return -EIO;
2585     }
2586 
2587     if (!bdrv_is_inserted(bs))
2588         return -ENOMEDIUM;
2589 
2590     if (bs->growable)
2591         return 0;
2592 
2593     len = bdrv_getlength(bs);
2594 
2595     if (offset < 0)
2596         return -EIO;
2597 
2598     if ((offset > len) || (len - offset < size))
2599         return -EIO;
2600 
2601     return 0;
2602 }
2603 
2604 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2605                               int nb_sectors)
2606 {
2607     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2608         return -EIO;
2609     }
2610 
2611     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2612                                    nb_sectors * BDRV_SECTOR_SIZE);
2613 }
2614 
2615 typedef struct RwCo {
2616     BlockDriverState *bs;
2617     int64_t offset;
2618     QEMUIOVector *qiov;
2619     bool is_write;
2620     int ret;
2621     BdrvRequestFlags flags;
2622 } RwCo;
2623 
2624 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2625 {
2626     RwCo *rwco = opaque;
2627 
2628     if (!rwco->is_write) {
2629         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2630                                       rwco->qiov->size, rwco->qiov,
2631                                       rwco->flags);
2632     } else {
2633         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2634                                        rwco->qiov->size, rwco->qiov,
2635                                        rwco->flags);
2636     }
2637 }
2638 
2639 /*
2640  * Process a vectored synchronous request using coroutines
2641  */
2642 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2643                         QEMUIOVector *qiov, bool is_write,
2644                         BdrvRequestFlags flags)
2645 {
2646     Coroutine *co;
2647     RwCo rwco = {
2648         .bs = bs,
2649         .offset = offset,
2650         .qiov = qiov,
2651         .is_write = is_write,
2652         .ret = NOT_DONE,
2653         .flags = flags,
2654     };
2655 
2656     /**
2657      * In a synchronous call context the vcpu is blocked, so the throttling
2658      * timer will not fire; therefore I/O throttling has to be disabled here
2659      * if it has been enabled.
2660      */
2661     if (bs->io_limits_enabled) {
2662         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2663                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2664         bdrv_io_limits_disable(bs);
2665     }
2666 
2667     if (qemu_in_coroutine()) {
2668         /* Fast-path if already in coroutine context */
2669         bdrv_rw_co_entry(&rwco);
2670     } else {
2671         co = qemu_coroutine_create(bdrv_rw_co_entry);
2672         qemu_coroutine_enter(co, &rwco);
2673         while (rwco.ret == NOT_DONE) {
2674             qemu_aio_wait();
2675         }
2676     }
2677     return rwco.ret;
2678 }
2679 
2680 /*
2681  * Process a synchronous request using coroutines
2682  */
2683 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2684                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2685 {
2686     QEMUIOVector qiov;
2687     struct iovec iov = {
2688         .iov_base = (void *)buf,
2689         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2690     };
2691 
2692     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2693         return -EINVAL;
2694     }
2695 
2696     qemu_iovec_init_external(&qiov, &iov, 1);
2697     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2698                         &qiov, is_write, flags);
2699 }
2700 
2701 /* return < 0 if error. See bdrv_write() for the return codes */
2702 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2703               uint8_t *buf, int nb_sectors)
2704 {
2705     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2706 }
2707 
2708 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2709 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2710                           uint8_t *buf, int nb_sectors)
2711 {
2712     bool enabled;
2713     int ret;
2714 
2715     enabled = bs->io_limits_enabled;
2716     bs->io_limits_enabled = false;
2717     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2718     bs->io_limits_enabled = enabled;
2719     return ret;
2720 }
2721 
2722 /* Return < 0 if error. Important errors are:
2723   -EIO         generic I/O error (may happen for all errors)
2724   -ENOMEDIUM   No media inserted.
2725   -EINVAL      Invalid sector number or nb_sectors
2726   -EACCES      Trying to write a read-only device
2727 */
2728 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2729                const uint8_t *buf, int nb_sectors)
2730 {
2731     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2732 }
2733 
2734 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2735                       int nb_sectors, BdrvRequestFlags flags)
2736 {
2737     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2738                       BDRV_REQ_ZERO_WRITE | flags);
2739 }
2740 
2741 /*
2742  * Completely zero out a block device with the help of bdrv_write_zeroes.
2743  * The operation is sped up by checking the block status and only writing
2744  * zeroes to ranges that do not already read back as zeroes.  Optional
2745  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2746  *
2747  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2748  */
2749 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2750 {
2751     int64_t target_size;
2752     int64_t ret, nb_sectors, sector_num = 0;
2753     int n;
2754 
2755     target_size = bdrv_getlength(bs);
2756     if (target_size < 0) {
2757         return target_size;
2758     }
2759     target_size /= BDRV_SECTOR_SIZE;
2760 
2761     for (;;) {
2762         nb_sectors = target_size - sector_num;
2763         if (nb_sectors <= 0) {
2764             return 0;
2765         }
2766         if (nb_sectors > INT_MAX) {
2767             nb_sectors = INT_MAX;
2768         }
2769         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2770         if (ret < 0) {
2771             error_report("error getting block status at sector %" PRId64 ": %s",
2772                          sector_num, strerror(-ret));
2773             return ret;
2774         }
2775         if (ret & BDRV_BLOCK_ZERO) {
2776             sector_num += n;
2777             continue;
2778         }
2779         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2780         if (ret < 0) {
2781             error_report("error writing zeroes at sector %" PRId64 ": %s",
2782                          sector_num, strerror(-ret));
2783             return ret;
2784         }
2785         sector_num += n;
2786     }
2787 }
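
/*
 * Illustrative caller: pre-zero a whole device, allowing the driver to
 * unmap (discard) ranges where it can:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         error_report("zeroing failed: %s", strerror(-ret));
 *     }
 */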
2788 
2789 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2790 {
2791     QEMUIOVector qiov;
2792     struct iovec iov = {
2793         .iov_base = (void *)buf,
2794         .iov_len = bytes,
2795     };
2796     int ret;
2797 
2798     if (bytes < 0) {
2799         return -EINVAL;
2800     }
2801 
2802     qemu_iovec_init_external(&qiov, &iov, 1);
2803     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2804     if (ret < 0) {
2805         return ret;
2806     }
2807 
2808     return bytes;
2809 }
2810 
2811 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2812 {
2813     int ret;
2814 
2815     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2816     if (ret < 0) {
2817         return ret;
2818     }
2819 
2820     return qiov->size;
2821 }
2822 
2823 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2824                 const void *buf, int bytes)
2825 {
2826     QEMUIOVector qiov;
2827     struct iovec iov = {
2828         .iov_base   = (void *) buf,
2829         .iov_len    = bytes,
2830     };
2831 
2832     if (bytes < 0) {
2833         return -EINVAL;
2834     }
2835 
2836     qemu_iovec_init_external(&qiov, &iov, 1);
2837     return bdrv_pwritev(bs, offset, &qiov);
2838 }
2839 
2840 /*
2841  * Writes to the file and ensures that no writes are reordered across this
2842  * request (acts as a barrier)
2843  *
2844  * Returns 0 on success, -errno in error cases.
2845  */
2846 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2847     const void *buf, int count)
2848 {
2849     int ret;
2850 
2851     ret = bdrv_pwrite(bs, offset, buf, count);
2852     if (ret < 0) {
2853         return ret;
2854     }
2855 
2856     /* No flush needed for cache modes that already do it */
2857     if (bs->enable_write_cache) {
2858         bdrv_flush(bs);
2859     }
2860 
2861     return 0;
2862 }
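
/*
 * Illustrative pattern (names hypothetical, not a real caller): a format
 * driver uses bdrv_pwrite_sync() when one metadata update must be stable
 * on disk before a dependent update is issued:
 *
 *     ret = bdrv_pwrite_sync(bs->file, table_offset, &entry, sizeof(entry));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     only now is it safe to write the structure pointing at table_offset
 */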
2863 
2864 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2865         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2866 {
2867     /* Perform I/O through a temporary buffer so that users who scribble over
2868      * their read buffer while the operation is in progress do not end up
2869      * modifying the image file.  This is critical for zero-copy guest I/O
2870      * where anything might happen inside guest memory.
2871      */
2872     void *bounce_buffer;
2873 
2874     BlockDriver *drv = bs->drv;
2875     struct iovec iov;
2876     QEMUIOVector bounce_qiov;
2877     int64_t cluster_sector_num;
2878     int cluster_nb_sectors;
2879     size_t skip_bytes;
2880     int ret;
2881 
2882     /* Cover entire cluster so no additional backing file I/O is required when
2883      * allocating a cluster in the image file.
2884      */
2885     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2886                            &cluster_sector_num, &cluster_nb_sectors);
2887 
2888     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2889                                    cluster_sector_num, cluster_nb_sectors);
2890 
2891     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2892     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2893     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2894 
2895     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2896                              &bounce_qiov);
2897     if (ret < 0) {
2898         goto err;
2899     }
2900 
2901     if (drv->bdrv_co_write_zeroes &&
2902         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2903         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2904                                       cluster_nb_sectors, 0);
2905     } else {
2906         /* This does not change the data on the disk, so it is not
2907          * necessary to flush even in cache=writethrough mode.
2908          */
2909         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2910                                   &bounce_qiov);
2911     }
2912 
2913     if (ret < 0) {
2914         /* It might be okay to ignore write errors for guest requests.  If this
2915          * is a deliberate copy-on-read then we don't want to ignore the error.
2916          * Simply report it in all cases.
2917          */
2918         goto err;
2919     }
2920 
2921     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2922     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2923                         nb_sectors * BDRV_SECTOR_SIZE);
2924 
2925 err:
2926     qemu_vfree(bounce_buffer);
2927     return ret;
2928 }
2929 
2930 /*
2931  * Forwards an already correctly aligned request to the BlockDriver. This
2932  * handles copy on read and zeroing after EOF; any other features must be
2933  * implemented by the caller.
2934  */
2935 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2936     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2937     int64_t align, QEMUIOVector *qiov, int flags)
2938 {
2939     BlockDriver *drv = bs->drv;
2940     int ret;
2941 
2942     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2943     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2944 
2945     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2946     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2947 
2948     /* Handle Copy on Read and associated serialisation */
2949     if (flags & BDRV_REQ_COPY_ON_READ) {
2950         /* If we touch the same cluster it counts as an overlap.  This
2951          * guarantees that allocating writes will be serialized and not race
2952          * with each other for the same cluster.  For example, in copy-on-read
2953          * it ensures that the CoR read and write operations are atomic and
2954          * guest writes cannot interleave between them. */
2955         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2956     }
2957 
2958     wait_serialising_requests(req);
2959 
2960     if (flags & BDRV_REQ_COPY_ON_READ) {
2961         int pnum;
2962 
2963         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2964         if (ret < 0) {
2965             goto out;
2966         }
2967 
2968         if (!ret || pnum != nb_sectors) {
2969             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2970             goto out;
2971         }
2972     }
2973 
2974     /* Forward the request to the BlockDriver */
2975     if (!(bs->zero_beyond_eof && bs->growable)) {
2976         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2977     } else {
2978         /* Read zeroes after EOF of growable BDSes */
2979         int64_t len, total_sectors, max_nb_sectors;
2980 
2981         len = bdrv_getlength(bs);
2982         if (len < 0) {
2983             ret = len;
2984             goto out;
2985         }
2986 
2987         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2988         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2989                                   align >> BDRV_SECTOR_BITS);
2990         if (max_nb_sectors > 0) {
2991             ret = drv->bdrv_co_readv(bs, sector_num,
2992                                      MIN(nb_sectors, max_nb_sectors), qiov);
2993         } else {
2994             ret = 0;
2995         }
2996 
2997         /* Reading beyond end of file is supposed to produce zeroes */
2998         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2999             uint64_t offset = MAX(0, total_sectors - sector_num);
3000             uint64_t bytes = (sector_num + nb_sectors - offset) *
3001                               BDRV_SECTOR_SIZE;
3002             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3003         }
3004     }
3005 
3006 out:
3007     return ret;
3008 }
3009 
3010 /*
3011  * Handle a read request in coroutine context
3012  */
3013 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3014     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3015     BdrvRequestFlags flags)
3016 {
3017     BlockDriver *drv = bs->drv;
3018     BdrvTrackedRequest req;
3019 
3020     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3021     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3022     uint8_t *head_buf = NULL;
3023     uint8_t *tail_buf = NULL;
3024     QEMUIOVector local_qiov;
3025     bool use_local_qiov = false;
3026     int ret;
3027 
3028     if (!drv) {
3029         return -ENOMEDIUM;
3030     }
3031     if (bdrv_check_byte_request(bs, offset, bytes)) {
3032         return -EIO;
3033     }
3034 
3035     if (bs->copy_on_read) {
3036         flags |= BDRV_REQ_COPY_ON_READ;
3037     }
3038 
3039     /* throttling disk I/O */
3040     if (bs->io_limits_enabled) {
3041         bdrv_io_limits_intercept(bs, bytes, false);
3042     }
3043 
3044     /* Align read if necessary by padding qiov */
3045     if (offset & (align - 1)) {
3046         head_buf = qemu_blockalign(bs, align);
3047         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3048         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3049         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3050         use_local_qiov = true;
3051 
3052         bytes += offset & (align - 1);
3053         offset = offset & ~(align - 1);
3054     }
3055 
3056     if ((offset + bytes) & (align - 1)) {
3057         if (!use_local_qiov) {
3058             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3059             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3060             use_local_qiov = true;
3061         }
3062         tail_buf = qemu_blockalign(bs, align);
3063         qemu_iovec_add(&local_qiov, tail_buf,
3064                        align - ((offset + bytes) & (align - 1)));
3065 
3066         bytes = ROUND_UP(bytes, align);
3067     }
3068 
3069     tracked_request_begin(&req, bs, offset, bytes, false);
3070     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3071                               use_local_qiov ? &local_qiov : qiov,
3072                               flags);
3073     tracked_request_end(&req);
3074 
3075     if (use_local_qiov) {
3076         qemu_iovec_destroy(&local_qiov);
3077         qemu_vfree(head_buf);
3078         qemu_vfree(tail_buf);
3079     }
3080 
3081     return ret;
3082 }
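
/*
 * Worked example for the padding above (illustrative numbers): with
 * align = 4096, a guest read at offset = 5000 for bytes = 1000 becomes
 *
 *     head pad:  904 bytes   (5000 & 4095), offset lowered to 4096
 *     tail pad: 2192 bytes   (4096 - (6000 & 4095))
 *
 * so the driver sees one aligned request covering [4096, 8192) while the
 * guest buffer receives only the middle [5000, 6000) slice via local_qiov.
 */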
3083 
3084 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3085     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3086     BdrvRequestFlags flags)
3087 {
3088     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3089         return -EINVAL;
3090     }
3091 
3092     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3093                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3094 }
3095 
3096 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3097     int nb_sectors, QEMUIOVector *qiov)
3098 {
3099     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3100 
3101     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3102 }
3103 
3104 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3105     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3106 {
3107     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3108 
3109     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3110                             BDRV_REQ_COPY_ON_READ);
3111 }
3112 
3113 /* If no limit is specified in the BlockLimits, use a default
3114  * of 32768 512-byte sectors (16 MiB) per request.
3115  */
3116 #define MAX_WRITE_ZEROES_DEFAULT 32768
3117 
3118 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3119     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3120 {
3121     BlockDriver *drv = bs->drv;
3122     QEMUIOVector qiov;
3123     struct iovec iov = {0};
3124     int ret = 0;
3125 
3126     int max_write_zeroes = bs->bl.max_write_zeroes ?
3127                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3128 
3129     while (nb_sectors > 0 && !ret) {
3130         int num = nb_sectors;
3131 
3132         /* Align request.  Block drivers can expect the "bulk" of the request
3133          * to be aligned.
3134          */
3135         if (bs->bl.write_zeroes_alignment
3136             && num > bs->bl.write_zeroes_alignment) {
3137             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3138                 /* Make a small request up to the first aligned sector.  */
3139                 num = bs->bl.write_zeroes_alignment;
3140                 num -= sector_num % bs->bl.write_zeroes_alignment;
3141             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3142                 /* Shorten the request to the last aligned sector.  num cannot
3143                  * underflow because num > bs->bl.write_zeroes_alignment.
3144                  */
3145                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3146             }
3147         }
3148 
3149         /* limit request size */
3150         if (num > max_write_zeroes) {
3151             num = max_write_zeroes;
3152         }
3153 
3154         ret = -ENOTSUP;
3155         /* First try the efficient write zeroes operation */
3156         if (drv->bdrv_co_write_zeroes) {
3157             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3158         }
3159 
3160         if (ret == -ENOTSUP) {
3161             /* Fall back to bounce buffer if write zeroes is unsupported */
3162             iov.iov_len = num * BDRV_SECTOR_SIZE;
3163             if (iov.iov_base == NULL) {
3164                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3165                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3166             }
3167             qemu_iovec_init_external(&qiov, &iov, 1);
3168 
3169             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3170 
3171             /* Keep the bounce buffer around if it is big enough for
3172              * all future requests.
3173              */
3174             if (num < max_write_zeroes) {
3175                 qemu_vfree(iov.iov_base);
3176                 iov.iov_base = NULL;
3177             }
3178         }
3179 
3180         sector_num += num;
3181         nb_sectors -= num;
3182     }
3183 
3184     qemu_vfree(iov.iov_base);
3185     return ret;
3186 }
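
/*
 * Worked example for the alignment logic above (illustrative numbers):
 * with write_zeroes_alignment = 8, a request for sectors [5, 25) is issued
 * as [5, 8) (head, up to the first aligned sector), [8, 24) (aligned bulk)
 * and [24, 25) (tail), keeping the bulk aligned so that a driver whose
 * efficient zeroing only works on aligned regions can still use it.
 */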
3187 
3188 /*
3189  * Forwards an already correctly aligned write request to the BlockDriver.
3190  */
3191 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3192     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3193     QEMUIOVector *qiov, int flags)
3194 {
3195     BlockDriver *drv = bs->drv;
3196     bool waited;
3197     int ret;
3198 
3199     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3200     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3201 
3202     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3203     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3204 
3205     waited = wait_serialising_requests(req);
3206     assert(!waited || !req->serialising);
3207     assert(req->overlap_offset <= offset);
3208     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3209 
3210     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3211 
3212     if (ret < 0) {
3213         /* Do nothing, write notifier decided to fail this request */
3214     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3215         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3216         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3217     } else {
3218         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3219         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3220     }
3221     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3222 
3223     if (ret == 0 && !bs->enable_write_cache) {
3224         ret = bdrv_co_flush(bs);
3225     }
3226 
3227     bdrv_set_dirty(bs, sector_num, nb_sectors);
3228 
3229     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3230         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3231     }
3232     if (bs->growable && ret >= 0) {
3233         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3234     }
3235 
3236     return ret;
3237 }
3238 
3239 /*
3240  * Handle a write request in coroutine context
3241  */
3242 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3243     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3244     BdrvRequestFlags flags)
3245 {
3246     BdrvTrackedRequest req;
3247     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3248     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3249     uint8_t *head_buf = NULL;
3250     uint8_t *tail_buf = NULL;
3251     QEMUIOVector local_qiov;
3252     bool use_local_qiov = false;
3253     int ret;
3254 
3255     if (!bs->drv) {
3256         return -ENOMEDIUM;
3257     }
3258     if (bs->read_only) {
3259         return -EACCES;
3260     }
3261     if (bdrv_check_byte_request(bs, offset, bytes)) {
3262         return -EIO;
3263     }
3264 
3265     /* throttling disk I/O */
3266     if (bs->io_limits_enabled) {
3267         bdrv_io_limits_intercept(bs, bytes, true);
3268     }
3269 
3270     /*
3271      * Align write if necessary by performing a read-modify-write cycle.
3272      * Pad qiov with the read parts and be sure to have a tracked request not
3273      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3274      */
3275     tracked_request_begin(&req, bs, offset, bytes, true);
3276 
3277     if (offset & (align - 1)) {
3278         QEMUIOVector head_qiov;
3279         struct iovec head_iov;
3280 
3281         mark_request_serialising(&req, align);
3282         wait_serialising_requests(&req);
3283 
3284         head_buf = qemu_blockalign(bs, align);
3285         head_iov = (struct iovec) {
3286             .iov_base   = head_buf,
3287             .iov_len    = align,
3288         };
3289         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3290 
3291         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3292         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3293                                   align, &head_qiov, 0);
3294         if (ret < 0) {
3295             goto fail;
3296         }
3297         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3298 
3299         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3300         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3301         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3302         use_local_qiov = true;
3303 
3304         bytes += offset & (align - 1);
3305         offset = offset & ~(align - 1);
3306     }
3307 
3308     if ((offset + bytes) & (align - 1)) {
3309         QEMUIOVector tail_qiov;
3310         struct iovec tail_iov;
3311         size_t tail_bytes;
3312         bool waited;
3313 
3314         mark_request_serialising(&req, align);
3315         waited = wait_serialising_requests(&req);
3316         assert(!waited || !use_local_qiov);
3317 
3318         tail_buf = qemu_blockalign(bs, align);
3319         tail_iov = (struct iovec) {
3320             .iov_base   = tail_buf,
3321             .iov_len    = align,
3322         };
3323         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3324 
3325         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3326         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3327                                   align, &tail_qiov, 0);
3328         if (ret < 0) {
3329             goto fail;
3330         }
3331         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3332 
3333         if (!use_local_qiov) {
3334             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3335             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3336             use_local_qiov = true;
3337         }
3338 
3339         tail_bytes = (offset + bytes) & (align - 1);
3340         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3341 
3342         bytes = ROUND_UP(bytes, align);
3343     }
3344 
3345     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3346                                use_local_qiov ? &local_qiov : qiov,
3347                                flags);
3348 
3349 fail:
3350     tracked_request_end(&req);
3351 
3352     if (use_local_qiov) {
3353         qemu_iovec_destroy(&local_qiov);
3354     }
3355     qemu_vfree(head_buf);
3356     qemu_vfree(tail_buf);
3357 
3358     return ret;
3359 }
3360 
3361 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3362     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3363     BdrvRequestFlags flags)
3364 {
3365     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3366         return -EINVAL;
3367     }
3368 
3369     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3370                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3371 }
3372 
3373 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3374     int nb_sectors, QEMUIOVector *qiov)
3375 {
3376     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3377 
3378     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3379 }
3380 
3381 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3382                                       int64_t sector_num, int nb_sectors,
3383                                       BdrvRequestFlags flags)
3384 {
3385     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3386 
3387     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3388         flags &= ~BDRV_REQ_MAY_UNMAP;
3389     }
3390 
3391     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3392                              BDRV_REQ_ZERO_WRITE | flags);
3393 }
3394 
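/*
 * Minimal usage sketch (illustrative; called from coroutine context
 * with 'bs' open and writable):
 *
 *     int ret = bdrv_co_write_zeroes(bs, 0, 2048, BDRV_REQ_MAY_UNMAP);
 *
 * Note that BDRV_REQ_MAY_UNMAP is silently dropped unless the image
 * was opened with BDRV_O_UNMAP, as the check above shows.
 */
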
3395 /**
3396  * Truncate file to 'offset' bytes (needed only for file protocols)
3397  */
3398 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3399 {
3400     BlockDriver *drv = bs->drv;
3401     int ret;
3402     if (!drv)
3403         return -ENOMEDIUM;
3404     if (!drv->bdrv_truncate)
3405         return -ENOTSUP;
3406     if (bs->read_only)
3407         return -EACCES;
3408     if (bdrv_in_use(bs))
3409         return -EBUSY;
3410     ret = drv->bdrv_truncate(bs, offset);
3411     if (ret == 0) {
3412         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3413         bdrv_dev_resize_cb(bs);
3414     }
3415     return ret;
3416 }
3417 
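/*
 * Usage sketch (illustrative): grow an image to 1 GiB; on success the
 * device model is notified of the new size via bdrv_dev_resize_cb().
 *
 *     int ret = bdrv_truncate(bs, 1024 * 1024 * 1024LL);
 *     if (ret < 0) {
 *         // -ENOTSUP, -EACCES or -EBUSY per the checks below
 *     }
 */
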
3418 /**
3419  * Length of an allocated file in bytes. Sparse files are counted by their
3420  * actually allocated space. Return < 0 on error or if unknown.
3421  */
3422 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3423 {
3424     BlockDriver *drv = bs->drv;
3425     if (!drv) {
3426         return -ENOMEDIUM;
3427     }
3428     if (drv->bdrv_get_allocated_file_size) {
3429         return drv->bdrv_get_allocated_file_size(bs);
3430     }
3431     if (bs->file) {
3432         return bdrv_get_allocated_file_size(bs->file);
3433     }
3434     return -ENOTSUP;
3435 }
3436 
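/*
 * For sparse images the two size queries can differ widely; an
 * illustrative comparison (error handling omitted):
 *
 *     int64_t virtual_size   = bdrv_getlength(bs);
 *     int64_t allocated_size = bdrv_get_allocated_file_size(bs);
 *
 * A freshly created 10 GiB raw image on a sparse filesystem would
 * typically report a 10 GiB virtual size but a near-zero allocated
 * size.
 */
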
3437 /**
3438  * Length of a file in bytes. Return < 0 if error or unknown.
3439  */
3440 int64_t bdrv_getlength(BlockDriverState *bs)
3441 {
3442     BlockDriver *drv = bs->drv;
3443     if (!drv)
3444         return -ENOMEDIUM;
3445 
3446     if (drv->has_variable_length) {
3447         int ret = refresh_total_sectors(bs, bs->total_sectors);
3448         if (ret < 0) {
3449             return ret;
3450         }
3451     }
3452     return bs->total_sectors * BDRV_SECTOR_SIZE;
3453 }
3454 
3455 /* Return 0 as the number of sectors if no device is present or on error */
3456 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3457 {
3458     int64_t length;
3459     length = bdrv_getlength(bs);
3460     if (length < 0)
3461         length = 0;
3462     else
3463         length = length >> BDRV_SECTOR_BITS;
3464     *nb_sectors_ptr = length;
3465 }
3466 
3467 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3468                        BlockdevOnError on_write_error)
3469 {
3470     bs->on_read_error = on_read_error;
3471     bs->on_write_error = on_write_error;
3472 }
3473 
3474 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3475 {
3476     return is_read ? bs->on_read_error : bs->on_write_error;
3477 }
3478 
3479 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3480 {
3481     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3482 
3483     switch (on_err) {
3484     case BLOCKDEV_ON_ERROR_ENOSPC:
3485         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3486     case BLOCKDEV_ON_ERROR_STOP:
3487         return BDRV_ACTION_STOP;
3488     case BLOCKDEV_ON_ERROR_REPORT:
3489         return BDRV_ACTION_REPORT;
3490     case BLOCKDEV_ON_ERROR_IGNORE:
3491         return BDRV_ACTION_IGNORE;
3492     default:
3493         abort();
3494     }
3495 }
3496 
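/*
 * Example of the mapping above (illustrative): with werror=enospc
 * configured on the drive,
 *
 *     bdrv_get_error_action(bs, false, ENOSPC) == BDRV_ACTION_STOP
 *     bdrv_get_error_action(bs, false, EIO)    == BDRV_ACTION_REPORT
 *
 * i.e. the ENOSPC policy stops the VM only for genuine out-of-space
 * conditions and reports all other write errors to the guest.
 */
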
3497 /* This is done by device models because, while the block layer knows
3498  * about the error, it does not know whether an operation comes from
3499  * the device or the block layer (from a job, for example).
3500  */
3501 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3502                        bool is_read, int error)
3503 {
3504     assert(error >= 0);
3505     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3506     if (action == BDRV_ACTION_STOP) {
3507         vm_stop(RUN_STATE_IO_ERROR);
3508         bdrv_iostatus_set_err(bs, error);
3509     }
3510 }
3511 
3512 int bdrv_is_read_only(BlockDriverState *bs)
3513 {
3514     return bs->read_only;
3515 }
3516 
3517 int bdrv_is_sg(BlockDriverState *bs)
3518 {
3519     return bs->sg;
3520 }
3521 
3522 int bdrv_enable_write_cache(BlockDriverState *bs)
3523 {
3524     return bs->enable_write_cache;
3525 }
3526 
3527 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3528 {
3529     bs->enable_write_cache = wce;
3530 
3531     /* so a reopen() will preserve wce */
3532     if (wce) {
3533         bs->open_flags |= BDRV_O_CACHE_WB;
3534     } else {
3535         bs->open_flags &= ~BDRV_O_CACHE_WB;
3536     }
3537 }
3538 
3539 int bdrv_is_encrypted(BlockDriverState *bs)
3540 {
3541     if (bs->backing_hd && bs->backing_hd->encrypted)
3542         return 1;
3543     return bs->encrypted;
3544 }
3545 
3546 int bdrv_key_required(BlockDriverState *bs)
3547 {
3548     BlockDriverState *backing_hd = bs->backing_hd;
3549 
3550     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3551         return 1;
3552     return (bs->encrypted && !bs->valid_key);
3553 }
3554 
3555 int bdrv_set_key(BlockDriverState *bs, const char *key)
3556 {
3557     int ret;
3558     if (bs->backing_hd && bs->backing_hd->encrypted) {
3559         ret = bdrv_set_key(bs->backing_hd, key);
3560         if (ret < 0)
3561             return ret;
3562         if (!bs->encrypted)
3563             return 0;
3564     }
3565     if (!bs->encrypted) {
3566         return -EINVAL;
3567     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3568         return -ENOMEDIUM;
3569     }
3570     ret = bs->drv->bdrv_set_key(bs, key);
3571     if (ret < 0) {
3572         bs->valid_key = 0;
3573     } else if (!bs->valid_key) {
3574         bs->valid_key = 1;
3575         /* call the change callback now, we skipped it on open */
3576         bdrv_dev_change_media_cb(bs, true);
3577     }
3578     return ret;
3579 }
3580 
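/*
 * Usage sketch (illustrative; 'passphrase' is a hypothetical
 * caller-provided buffer): unlocking an encrypted image. The
 * recursion above applies the key to an encrypted backing file first.
 *
 *     if (bdrv_key_required(bs)) {
 *         int ret = bdrv_set_key(bs, passphrase);
 *         ...
 *     }
 */
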
3581 const char *bdrv_get_format_name(BlockDriverState *bs)
3582 {
3583     return bs->drv ? bs->drv->format_name : NULL;
3584 }
3585 
3586 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3587                          void *opaque)
3588 {
3589     BlockDriver *drv;
3590 
3591     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3592         it(opaque, drv->format_name);
3593     }
3594 }
3595 
3596 /* Find a block backend by its device name */
3597 BlockDriverState *bdrv_find(const char *name)
3598 {
3599     BlockDriverState *bs;
3600 
3601     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3602         if (!strcmp(name, bs->device_name)) {
3603             return bs;
3604         }
3605     }
3606     return NULL;
3607 }
3608 
3609 /* Find a named node in the graph of BlockDriverStates */
3610 BlockDriverState *bdrv_find_node(const char *node_name)
3611 {
3612     BlockDriverState *bs;
3613 
3614     assert(node_name);
3615 
3616     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3617         if (!strcmp(node_name, bs->node_name)) {
3618             return bs;
3619         }
3620     }
3621     return NULL;
3622 }
3623 
3624 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3625 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3626 {
3627     BlockDeviceInfoList *list, *entry;
3628     BlockDriverState *bs;
3629 
3630     list = NULL;
3631     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3632         entry = g_malloc0(sizeof(*entry));
3633         entry->value = bdrv_block_device_info(bs);
3634         entry->next = list;
3635         list = entry;
3636     }
3637 
3638     return list;
3639 }
3640 
3641 BlockDriverState *bdrv_lookup_bs(const char *device,
3642                                  const char *node_name,
3643                                  Error **errp)
3644 {
3645     BlockDriverState *bs = NULL;
3646 
3647     if (device) {
3648         bs = bdrv_find(device);
3649 
3650         if (bs) {
3651             return bs;
3652         }
3653     }
3654 
3655     if (node_name) {
3656         bs = bdrv_find_node(node_name);
3657 
3658         if (bs) {
3659             return bs;
3660         }
3661     }
3662 
3663     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3664                      device ? device : "",
3665                      node_name ? node_name : "");
3666     return NULL;
3667 }
3668 
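/*
 * Usage sketch (illustrative; "drive0" is a hypothetical name):
 * resolve a QMP reference that may name either a backend or a node.
 *
 *     Error *local_err = NULL;
 *     BlockDriverState *bs = bdrv_lookup_bs("drive0", NULL, &local_err);
 *     if (!bs) {
 *         ... report local_err ...
 *     }
 */
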
3669 BlockDriverState *bdrv_next(BlockDriverState *bs)
3670 {
3671     if (!bs) {
3672         return QTAILQ_FIRST(&bdrv_states);
3673     }
3674     return QTAILQ_NEXT(bs, device_list);
3675 }
3676 
3677 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3678 {
3679     BlockDriverState *bs;
3680 
3681     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3682         it(opaque, bs);
3683     }
3684 }
3685 
3686 const char *bdrv_get_device_name(BlockDriverState *bs)
3687 {
3688     return bs->device_name;
3689 }
3690 
3691 int bdrv_get_flags(BlockDriverState *bs)
3692 {
3693     return bs->open_flags;
3694 }
3695 
3696 int bdrv_flush_all(void)
3697 {
3698     BlockDriverState *bs;
3699     int result = 0;
3700 
3701     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3702         int ret = bdrv_flush(bs);
3703         if (ret < 0 && !result) {
3704             result = ret;
3705         }
3706     }
3707 
3708     return result;
3709 }
3710 
3711 int bdrv_has_zero_init_1(BlockDriverState *bs)
3712 {
3713     return 1;
3714 }
3715 
3716 int bdrv_has_zero_init(BlockDriverState *bs)
3717 {
3718     assert(bs->drv);
3719 
3720     /* If BS is a copy-on-write image, it is initialized to
3721        the contents of the base image, which may not be zeroes.  */
3722     if (bs->backing_hd) {
3723         return 0;
3724     }
3725     if (bs->drv->bdrv_has_zero_init) {
3726         return bs->drv->bdrv_has_zero_init(bs);
3727     }
3728 
3729     /* safe default */
3730     return 0;
3731 }
3732 
3733 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3734 {
3735     BlockDriverInfo bdi;
3736 
3737     if (bs->backing_hd) {
3738         return false;
3739     }
3740 
3741     if (bdrv_get_info(bs, &bdi) == 0) {
3742         return bdi.unallocated_blocks_are_zero;
3743     }
3744 
3745     return false;
3746 }
3747 
3748 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3749 {
3750     BlockDriverInfo bdi;
3751 
3752     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3753         return false;
3754     }
3755 
3756     if (bdrv_get_info(bs, &bdi) == 0) {
3757         return bdi.can_write_zeroes_with_unmap;
3758     }
3759 
3760     return false;
3761 }
3762 
3763 typedef struct BdrvCoGetBlockStatusData {
3764     BlockDriverState *bs;
3765     BlockDriverState *base;
3766     int64_t sector_num;
3767     int nb_sectors;
3768     int *pnum;
3769     int64_t ret;
3770     bool done;
3771 } BdrvCoGetBlockStatusData;
3772 
3773 /*
3774  * Returns the allocation status (BDRV_BLOCK_* flags) of the specified sector. Drivers
3775  * not implementing the functionality are assumed to not support backing files,
3776  * hence all their sectors are reported as allocated.
3777  *
3778  * If 'sector_num' is beyond the end of the disk image the return value is 0
3779  * and 'pnum' is set to 0.
3780  *
3781  * 'pnum' is set to the number of sectors (including and immediately following
3782  * the specified sector) that are known to be in the same
3783  * allocated/unallocated state.
3784  *
3785  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3786  * beyond the end of the disk image it will be clamped.
3787  */
3788 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3789                                                      int64_t sector_num,
3790                                                      int nb_sectors, int *pnum)
3791 {
3792     int64_t length;
3793     int64_t n;
3794     int64_t ret, ret2;
3795 
3796     length = bdrv_getlength(bs);
3797     if (length < 0) {
3798         return length;
3799     }
3800 
3801     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3802         *pnum = 0;
3803         return 0;
3804     }
3805 
3806     n = bs->total_sectors - sector_num;
3807     if (n < nb_sectors) {
3808         nb_sectors = n;
3809     }
3810 
3811     if (!bs->drv->bdrv_co_get_block_status) {
3812         *pnum = nb_sectors;
3813         ret = BDRV_BLOCK_DATA;
3814         if (bs->drv->protocol_name) {
3815             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3816         }
3817         return ret;
3818     }
3819 
3820     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3821     if (ret < 0) {
3822         *pnum = 0;
3823         return ret;
3824     }
3825 
3826     if (ret & BDRV_BLOCK_RAW) {
3827         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3828         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3829                                      *pnum, pnum);
3830     }
3831 
3832     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3833         if (bdrv_unallocated_blocks_are_zero(bs)) {
3834             ret |= BDRV_BLOCK_ZERO;
3835         } else if (bs->backing_hd) {
3836             BlockDriverState *bs2 = bs->backing_hd;
3837             int64_t length2 = bdrv_getlength(bs2);
3838             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3839                 ret |= BDRV_BLOCK_ZERO;
3840             }
3841         }
3842     }
3843 
3844     if (bs->file &&
3845         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3846         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3847         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3848                                         *pnum, pnum);
3849         if (ret2 >= 0) {
3850             /* Ignore errors.  This is just providing extra information, it
3851              * is useful but not necessary.
3852              */
3853             ret |= (ret2 & BDRV_BLOCK_ZERO);
3854         }
3855     }
3856 
3857     return ret;
3858 }
3859 
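/*
 * Decoding a status word (illustrative sketch; BDRV_BLOCK_OFFSET_MASK
 * is assumed to come from block.h alongside the other flags):
 *
 *     int pnum;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &pnum);
 *     if (ret >= 0) {
 *         bool has_data = ret & BDRV_BLOCK_DATA;
 *         bool is_zero  = ret & BDRV_BLOCK_ZERO;
 *         if (ret & BDRV_BLOCK_OFFSET_VALID) {
 *             int64_t host_offset = ret & BDRV_BLOCK_OFFSET_MASK;
 *         }
 *     }
 *
 * All 'pnum' sectors starting at 'sector_num' share this state.
 */
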
3860 /* Coroutine wrapper for bdrv_get_block_status() */
3861 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3862 {
3863     BdrvCoGetBlockStatusData *data = opaque;
3864     BlockDriverState *bs = data->bs;
3865 
3866     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3867                                          data->pnum);
3868     data->done = true;
3869 }
3870 
3871 /*
3872  * Synchronous wrapper around bdrv_co_get_block_status().
3873  *
3874  * See bdrv_co_get_block_status() for details.
3875  */
3876 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3877                               int nb_sectors, int *pnum)
3878 {
3879     Coroutine *co;
3880     BdrvCoGetBlockStatusData data = {
3881         .bs = bs,
3882         .sector_num = sector_num,
3883         .nb_sectors = nb_sectors,
3884         .pnum = pnum,
3885         .done = false,
3886     };
3887 
3888     if (qemu_in_coroutine()) {
3889         /* Fast-path if already in coroutine context */
3890         bdrv_get_block_status_co_entry(&data);
3891     } else {
3892         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3893         qemu_coroutine_enter(co, &data);
3894         while (!data.done) {
3895             qemu_aio_wait();
3896         }
3897     }
3898     return data.ret;
3899 }
3900 
3901 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3902                                    int nb_sectors, int *pnum)
3903 {
3904     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3905     if (ret < 0) {
3906         return ret;
3907     }
3908     return
3909         (ret & BDRV_BLOCK_DATA) ||
3910         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3911 }
3912 
3913 /*
3914  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3915  *
3916  * Return true if the given sector is allocated in any image between
3917  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3918  * sector is allocated in any image of the chain.  Return false otherwise.
3919  *
3920  * 'pnum' is set to the number of sectors (including and immediately following
3921  *  the specified sector) that are known to be in the same
3922  *  allocated/unallocated state.
3923  *
3924  */
3925 int bdrv_is_allocated_above(BlockDriverState *top,
3926                             BlockDriverState *base,
3927                             int64_t sector_num,
3928                             int nb_sectors, int *pnum)
3929 {
3930     BlockDriverState *intermediate;
3931     int ret, n = nb_sectors;
3932 
3933     intermediate = top;
3934     while (intermediate && intermediate != base) {
3935         int pnum_inter;
3936         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3937                                 &pnum_inter);
3938         if (ret < 0) {
3939             return ret;
3940         } else if (ret) {
3941             *pnum = pnum_inter;
3942             return 1;
3943         }
3944 
3945         /*
3946          * [sector_num, nb_sectors] is unallocated on top but intermediate
3947          * might have
3948          *
3949      * [sector_num+x, nb_sectors] allocated.
3950          */
3951         if (n > pnum_inter &&
3952             (intermediate == top ||
3953              sector_num + pnum_inter < intermediate->total_sectors)) {
3954             n = pnum_inter;
3955         }
3956 
3957         intermediate = intermediate->backing_hd;
3958     }
3959 
3960     *pnum = n;
3961     return 0;
3962 }
3963 
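/*
 * Example (illustrative): for a chain base <- sn1 <- top, ask whether
 * a range is served by an image above 'base' without reading 'base':
 *
 *     int pnum;
 *     int ret = bdrv_is_allocated_above(top, base, sector_num,
 *                                       nb_sectors, &pnum);
 *
 * ret == 1 means some image above 'base' provides the data; ret == 0
 * means reads of the first 'pnum' sectors would fall through to
 * 'base'.
 */
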
3964 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3965 {
3966     if (bs->backing_hd && bs->backing_hd->encrypted)
3967         return bs->backing_file;
3968     else if (bs->encrypted)
3969         return bs->filename;
3970     else
3971         return NULL;
3972 }
3973 
3974 void bdrv_get_backing_filename(BlockDriverState *bs,
3975                                char *filename, int filename_size)
3976 {
3977     pstrcpy(filename, filename_size, bs->backing_file);
3978 }
3979 
3980 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3981                           const uint8_t *buf, int nb_sectors)
3982 {
3983     BlockDriver *drv = bs->drv;
3984     if (!drv)
3985         return -ENOMEDIUM;
3986     if (!drv->bdrv_write_compressed)
3987         return -ENOTSUP;
3988     if (bdrv_check_request(bs, sector_num, nb_sectors))
3989         return -EIO;
3990 
3991     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3992 
3993     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3994 }
3995 
3996 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3997 {
3998     BlockDriver *drv = bs->drv;
3999     if (!drv)
4000         return -ENOMEDIUM;
4001     if (!drv->bdrv_get_info)
4002         return -ENOTSUP;
4003     memset(bdi, 0, sizeof(*bdi));
4004     return drv->bdrv_get_info(bs, bdi);
4005 }
4006 
4007 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4008 {
4009     BlockDriver *drv = bs->drv;
4010     if (drv && drv->bdrv_get_specific_info) {
4011         return drv->bdrv_get_specific_info(bs);
4012     }
4013     return NULL;
4014 }
4015 
4016 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4017                       int64_t pos, int size)
4018 {
4019     QEMUIOVector qiov;
4020     struct iovec iov = {
4021         .iov_base   = (void *) buf,
4022         .iov_len    = size,
4023     };
4024 
4025     qemu_iovec_init_external(&qiov, &iov, 1);
4026     return bdrv_writev_vmstate(bs, &qiov, pos);
4027 }
4028 
4029 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4030 {
4031     BlockDriver *drv = bs->drv;
4032 
4033     if (!drv) {
4034         return -ENOMEDIUM;
4035     } else if (drv->bdrv_save_vmstate) {
4036         return drv->bdrv_save_vmstate(bs, qiov, pos);
4037     } else if (bs->file) {
4038         return bdrv_writev_vmstate(bs->file, qiov, pos);
4039     }
4040 
4041     return -ENOTSUP;
4042 }
4043 
4044 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4045                       int64_t pos, int size)
4046 {
4047     BlockDriver *drv = bs->drv;
4048     if (!drv)
4049         return -ENOMEDIUM;
4050     if (drv->bdrv_load_vmstate)
4051         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4052     if (bs->file)
4053         return bdrv_load_vmstate(bs->file, buf, pos, size);
4054     return -ENOTSUP;
4055 }
4056 
4057 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4058 {
4059     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4060         return;
4061     }
4062 
4063     bs->drv->bdrv_debug_event(bs, event);
4064 }
4065 
4066 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4067                           const char *tag)
4068 {
4069     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4070         bs = bs->file;
4071     }
4072 
4073     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4074         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4075     }
4076 
4077     return -ENOTSUP;
4078 }
4079 
4080 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4081 {
4082     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4083         bs = bs->file;
4084     }
4085 
4086     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4087         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4088     }
4089 
4090     return -ENOTSUP;
4091 }
4092 
4093 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4094 {
4095     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4096         bs = bs->file;
4097     }
4098 
4099     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4100         return bs->drv->bdrv_debug_resume(bs, tag);
4101     }
4102 
4103     return -ENOTSUP;
4104 }
4105 
4106 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4107 {
4108     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4109         bs = bs->file;
4110     }
4111 
4112     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4113         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4114     }
4115 
4116     return false;
4117 }
4118 
4119 int bdrv_is_snapshot(BlockDriverState *bs)
4120 {
4121     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4122 }
4123 
4124 /* backing_file can be relative, absolute, or a protocol.  If it is
4125  * relative, it must be relative to the chain.  So, passing in bs->filename
4126  * from a BDS as backing_file should not be done, as that may be relative to
4127  * the CWD rather than the chain. */
4128 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4129         const char *backing_file)
4130 {
4131     char *filename_full = NULL;
4132     char *backing_file_full = NULL;
4133     char *filename_tmp = NULL;
4134     int is_protocol = 0;
4135     BlockDriverState *curr_bs = NULL;
4136     BlockDriverState *retval = NULL;
4137 
4138     if (!bs || !bs->drv || !backing_file) {
4139         return NULL;
4140     }
4141 
4142     filename_full     = g_malloc(PATH_MAX);
4143     backing_file_full = g_malloc(PATH_MAX);
4144     filename_tmp      = g_malloc(PATH_MAX);
4145 
4146     is_protocol = path_has_protocol(backing_file);
4147 
4148     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4149 
4150         /* If either of the filename paths is actually a protocol, then
4151          * compare unmodified paths; otherwise make paths relative */
4152         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4153             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4154                 retval = curr_bs->backing_hd;
4155                 break;
4156             }
4157         } else {
4158             /* If not an absolute filename path, make it relative to the current
4159              * image's filename path */
4160             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4161                          backing_file);
4162 
4163             /* We are going to compare absolute pathnames */
4164             if (!realpath(filename_tmp, filename_full)) {
4165                 continue;
4166             }
4167 
4168             /* We need to make sure the backing filename we are comparing against
4169              * is relative to the current image filename (or absolute) */
4170             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4171                          curr_bs->backing_file);
4172 
4173             if (!realpath(filename_tmp, backing_file_full)) {
4174                 continue;
4175             }
4176 
4177             if (strcmp(backing_file_full, filename_full) == 0) {
4178                 retval = curr_bs->backing_hd;
4179                 break;
4180             }
4181         }
4182     }
4183 
4184     g_free(filename_full);
4185     g_free(backing_file_full);
4186     g_free(filename_tmp);
4187     return retval;
4188 }
4189 
4190 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4191 {
4192     if (!bs->drv) {
4193         return 0;
4194     }
4195 
4196     if (!bs->backing_hd) {
4197         return 0;
4198     }
4199 
4200     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4201 }
4202 
4203 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4204 {
4205     BlockDriverState *curr_bs = NULL;
4206 
4207     if (!bs) {
4208         return NULL;
4209     }
4210 
4211     curr_bs = bs;
4212 
4213     while (curr_bs->backing_hd) {
4214         curr_bs = curr_bs->backing_hd;
4215     }
4216     return curr_bs;
4217 }
4218 
4219 /**************************************************************/
4220 /* async I/Os */
4221 
4222 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4223                                  QEMUIOVector *qiov, int nb_sectors,
4224                                  BlockDriverCompletionFunc *cb, void *opaque)
4225 {
4226     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4227 
4228     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4229                                  cb, opaque, false);
4230 }
4231 
4232 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4233                                   QEMUIOVector *qiov, int nb_sectors,
4234                                   BlockDriverCompletionFunc *cb, void *opaque)
4235 {
4236     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4237 
4238     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4239                                  cb, opaque, true);
4240 }
4241 
4242 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4243         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4244         BlockDriverCompletionFunc *cb, void *opaque)
4245 {
4246     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4247 
4248     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4249                                  BDRV_REQ_ZERO_WRITE | flags,
4250                                  cb, opaque, true);
4251 }
4252 
4253 
4254 typedef struct MultiwriteCB {
4255     int error;
4256     int num_requests;
4257     int num_callbacks;
4258     struct {
4259         BlockDriverCompletionFunc *cb;
4260         void *opaque;
4261         QEMUIOVector *free_qiov;
4262     } callbacks[];
4263 } MultiwriteCB;
4264 
4265 static void multiwrite_user_cb(MultiwriteCB *mcb)
4266 {
4267     int i;
4268 
4269     for (i = 0; i < mcb->num_callbacks; i++) {
4270         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4271         if (mcb->callbacks[i].free_qiov) {
4272             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4273         }
4274         g_free(mcb->callbacks[i].free_qiov);
4275     }
4276 }
4277 
4278 static void multiwrite_cb(void *opaque, int ret)
4279 {
4280     MultiwriteCB *mcb = opaque;
4281 
4282     trace_multiwrite_cb(mcb, ret);
4283 
4284     if (ret < 0 && !mcb->error) {
4285         mcb->error = ret;
4286     }
4287 
4288     mcb->num_requests--;
4289     if (mcb->num_requests == 0) {
4290         multiwrite_user_cb(mcb);
4291         g_free(mcb);
4292     }
4293 }
4294 
4295 static int multiwrite_req_compare(const void *a, const void *b)
4296 {
4297     const BlockRequest *req1 = a, *req2 = b;
4298 
4299     /*
4300      * Note that we can't simply subtract req2->sector from req1->sector
4301      * here as that could overflow the return value.
4302      */
4303     if (req1->sector > req2->sector) {
4304         return 1;
4305     } else if (req1->sector < req2->sector) {
4306         return -1;
4307     } else {
4308         return 0;
4309     }
4310 }
4311 
4312 /*
4313  * Takes a bunch of requests and tries to merge them. Returns the number of
4314  * requests that remain after merging.
4315  */
4316 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4317     int num_reqs, MultiwriteCB *mcb)
4318 {
4319     int i, outidx;
4320 
4321     // Sort requests by start sector
4322     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4323 
4324     // Check if adjacent requests touch the same clusters. If so, combine them,
4325     // filling up gaps with zero sectors.
4326     outidx = 0;
4327     for (i = 1; i < num_reqs; i++) {
4328         int merge = 0;
4329         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4330 
4331         // Handle exactly sequential writes and overlapping writes.
4332         if (reqs[i].sector <= oldreq_last) {
4333             merge = 1;
4334         }
4335 
4336         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4337             merge = 0;
4338         }
4339 
4340         if (merge) {
4341             size_t size;
4342             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4343             qemu_iovec_init(qiov,
4344                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4345 
4346             // Add the first request to the merged one. If the requests are
4347             // overlapping, drop the last sectors of the first request.
4348             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4349             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4350 
4351             // We should not need to add any zeros between the two requests
4352             assert (reqs[i].sector <= oldreq_last);
4353 
4354             // Add the second request
4355             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4356 
4357             reqs[outidx].nb_sectors = qiov->size >> 9;
4358             reqs[outidx].qiov = qiov;
4359 
4360             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4361         } else {
4362             outidx++;
4363             reqs[outidx].sector     = reqs[i].sector;
4364             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4365             reqs[outidx].qiov       = reqs[i].qiov;
4366         }
4367     }
4368 
4369     return outidx + 1;
4370 }
4371 
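/*
 * Merge example (illustrative): requests {sector 0, 8 sectors} and
 * {sector 8, 8 sectors} are exactly sequential, so they collapse into
 * a single {sector 0, 16 sectors} request whose qiov concatenates the
 * two original vectors. A request starting at sector 20 would leave a
 * gap after sector 16 and therefore stays separate.
 */
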
4372 /*
4373  * Submit multiple AIO write requests at once.
4374  *
4375  * On success, the function returns 0 and all requests in the reqs array have
4376  * been submitted. On error, this function returns -1, and any of the
4377  * requests may or may not have been submitted yet. In particular, this means
4378  * that the callback will be called for some requests but not for others. The
4379  * caller must check the error field of the BlockRequest to wait for the right
4380  * callbacks (if error != 0, no callback will be called).
4381  *
4382  * The implementation may modify the contents of the reqs array, e.g. to merge
4383  * requests. However, the fields opaque and error are left unmodified as they
4384  * are used to signal failure for a single request to the caller.
4385  */
4386 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4387 {
4388     MultiwriteCB *mcb;
4389     int i;
4390 
4391     /* don't submit writes if we don't have a medium */
4392     if (bs->drv == NULL) {
4393         for (i = 0; i < num_reqs; i++) {
4394             reqs[i].error = -ENOMEDIUM;
4395         }
4396         return -1;
4397     }
4398 
4399     if (num_reqs == 0) {
4400         return 0;
4401     }
4402 
4403     // Create MultiwriteCB structure
4404     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4405     mcb->num_requests = 0;
4406     mcb->num_callbacks = num_reqs;
4407 
4408     for (i = 0; i < num_reqs; i++) {
4409         mcb->callbacks[i].cb = reqs[i].cb;
4410         mcb->callbacks[i].opaque = reqs[i].opaque;
4411     }
4412 
4413     // Check for mergeable requests
4414     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4415 
4416     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4417 
4418     /* Run the aio requests. */
4419     mcb->num_requests = num_reqs;
4420     for (i = 0; i < num_reqs; i++) {
4421         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4422                               reqs[i].nb_sectors, reqs[i].flags,
4423                               multiwrite_cb, mcb,
4424                               true);
4425     }
4426 
4427     return 0;
4428 }
4429 
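/*
 * Usage sketch (illustrative; my_cb/my_ctx0/my_ctx1 are hypothetical
 * caller-provided callback and state; fields per BlockRequest):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_cb, .opaque = my_ctx0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_cb, .opaque = my_ctx1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... check reqs[i].error as described above ...
 *     }
 */
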
4430 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4431 {
4432     acb->aiocb_info->cancel(acb);
4433 }
4434 
4435 /**************************************************************/
4436 /* async block device emulation */
4437 
4438 typedef struct BlockDriverAIOCBSync {
4439     BlockDriverAIOCB common;
4440     QEMUBH *bh;
4441     int ret;
4442     /* vector translation state */
4443     QEMUIOVector *qiov;
4444     uint8_t *bounce;
4445     int is_write;
4446 } BlockDriverAIOCBSync;
4447 
4448 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4449 {
4450     BlockDriverAIOCBSync *acb =
4451         container_of(blockacb, BlockDriverAIOCBSync, common);
4452     qemu_bh_delete(acb->bh);
4453     acb->bh = NULL;
4454     qemu_aio_release(acb);
4455 }
4456 
4457 static const AIOCBInfo bdrv_em_aiocb_info = {
4458     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4459     .cancel             = bdrv_aio_cancel_em,
4460 };
4461 
4462 static void bdrv_aio_bh_cb(void *opaque)
4463 {
4464     BlockDriverAIOCBSync *acb = opaque;
4465 
4466     if (!acb->is_write)
4467         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4468     qemu_vfree(acb->bounce);
4469     acb->common.cb(acb->common.opaque, acb->ret);
4470     qemu_bh_delete(acb->bh);
4471     acb->bh = NULL;
4472     qemu_aio_release(acb);
4473 }
4474 
4475 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4476                                             int64_t sector_num,
4477                                             QEMUIOVector *qiov,
4478                                             int nb_sectors,
4479                                             BlockDriverCompletionFunc *cb,
4480                                             void *opaque,
4481                                             int is_write)
4482 
4483 {
4484     BlockDriverAIOCBSync *acb;
4485 
4486     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4487     acb->is_write = is_write;
4488     acb->qiov = qiov;
4489     acb->bounce = qemu_blockalign(bs, qiov->size);
4490     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4491 
4492     if (is_write) {
4493         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4494         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4495     } else {
4496         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4497     }
4498 
4499     qemu_bh_schedule(acb->bh);
4500 
4501     return &acb->common;
4502 }
4503 
4504 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4505         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4506         BlockDriverCompletionFunc *cb, void *opaque)
4507 {
4508     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4509 }
4510 
4511 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4512         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4513         BlockDriverCompletionFunc *cb, void *opaque)
4514 {
4515     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4516 }
4517 
4518 
4519 typedef struct BlockDriverAIOCBCoroutine {
4520     BlockDriverAIOCB common;
4521     BlockRequest req;
4522     bool is_write;
4523     bool *done;
4524     QEMUBH* bh;
4525 } BlockDriverAIOCBCoroutine;
4526 
4527 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4528 {
4529     BlockDriverAIOCBCoroutine *acb =
4530         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4531     bool done = false;
4532 
4533     acb->done = &done;
4534     while (!done) {
4535         qemu_aio_wait();
4536     }
4537 }
4538 
4539 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4540     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4541     .cancel             = bdrv_aio_co_cancel_em,
4542 };
4543 
4544 static void bdrv_co_em_bh(void *opaque)
4545 {
4546     BlockDriverAIOCBCoroutine *acb = opaque;
4547 
4548     acb->common.cb(acb->common.opaque, acb->req.error);
4549 
4550     if (acb->done) {
4551         *acb->done = true;
4552     }
4553 
4554     qemu_bh_delete(acb->bh);
4555     qemu_aio_release(acb);
4556 }
4557 
4558 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4559 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4560 {
4561     BlockDriverAIOCBCoroutine *acb = opaque;
4562     BlockDriverState *bs = acb->common.bs;
4563 
4564     if (!acb->is_write) {
4565         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4566             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4567     } else {
4568         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4569             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4570     }
4571 
4572     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4573     qemu_bh_schedule(acb->bh);
4574 }
4575 
4576 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4577                                                int64_t sector_num,
4578                                                QEMUIOVector *qiov,
4579                                                int nb_sectors,
4580                                                BdrvRequestFlags flags,
4581                                                BlockDriverCompletionFunc *cb,
4582                                                void *opaque,
4583                                                bool is_write)
4584 {
4585     Coroutine *co;
4586     BlockDriverAIOCBCoroutine *acb;
4587 
4588     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4589     acb->req.sector = sector_num;
4590     acb->req.nb_sectors = nb_sectors;
4591     acb->req.qiov = qiov;
4592     acb->req.flags = flags;
4593     acb->is_write = is_write;
4594     acb->done = NULL;
4595 
4596     co = qemu_coroutine_create(bdrv_co_do_rw);
4597     qemu_coroutine_enter(co, acb);
4598 
4599     return &acb->common;
4600 }
4601 
4602 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4603 {
4604     BlockDriverAIOCBCoroutine *acb = opaque;
4605     BlockDriverState *bs = acb->common.bs;
4606 
4607     acb->req.error = bdrv_co_flush(bs);
4608     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4609     qemu_bh_schedule(acb->bh);
4610 }
4611 
4612 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4613         BlockDriverCompletionFunc *cb, void *opaque)
4614 {
4615     trace_bdrv_aio_flush(bs, opaque);
4616 
4617     Coroutine *co;
4618     BlockDriverAIOCBCoroutine *acb;
4619 
4620     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4621     acb->done = NULL;
4622 
4623     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4624     qemu_coroutine_enter(co, acb);
4625 
4626     return &acb->common;
4627 }
4628 
4629 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4630 {
4631     BlockDriverAIOCBCoroutine *acb = opaque;
4632     BlockDriverState *bs = acb->common.bs;
4633 
4634     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4635     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4636     qemu_bh_schedule(acb->bh);
4637 }
4638 
4639 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4640         int64_t sector_num, int nb_sectors,
4641         BlockDriverCompletionFunc *cb, void *opaque)
4642 {
4643     Coroutine *co;
4644     BlockDriverAIOCBCoroutine *acb;
4645 
4646     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4647 
4648     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4649     acb->req.sector = sector_num;
4650     acb->req.nb_sectors = nb_sectors;
4651     acb->done = NULL;
4652     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4653     qemu_coroutine_enter(co, acb);
4654 
4655     return &acb->common;
4656 }
4657 
4658 void bdrv_init(void)
4659 {
4660     module_call_init(MODULE_INIT_BLOCK);
4661 }
4662 
4663 void bdrv_init_with_whitelist(void)
4664 {
4665     use_bdrv_whitelist = 1;
4666     bdrv_init();
4667 }
4668 
4669 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4670                    BlockDriverCompletionFunc *cb, void *opaque)
4671 {
4672     BlockDriverAIOCB *acb;
4673 
4674     acb = g_slice_alloc(aiocb_info->aiocb_size);
4675     acb->aiocb_info = aiocb_info;
4676     acb->bs = bs;
4677     acb->cb = cb;
4678     acb->opaque = opaque;
4679     return acb;
4680 }
4681 
4682 void qemu_aio_release(void *p)
4683 {
4684     BlockDriverAIOCB *acb = p;
4685     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4686 }
4687 
4688 /**************************************************************/
4689 /* Coroutine block device emulation */
4690 
4691 typedef struct CoroutineIOCompletion {
4692     Coroutine *coroutine;
4693     int ret;
4694 } CoroutineIOCompletion;
4695 
4696 static void bdrv_co_io_em_complete(void *opaque, int ret)
4697 {
4698     CoroutineIOCompletion *co = opaque;
4699 
4700     co->ret = ret;
4701     qemu_coroutine_enter(co->coroutine, NULL);
4702 }
4703 
4704 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4705                                       int nb_sectors, QEMUIOVector *iov,
4706                                       bool is_write)
4707 {
4708     CoroutineIOCompletion co = {
4709         .coroutine = qemu_coroutine_self(),
4710     };
4711     BlockDriverAIOCB *acb;
4712 
4713     if (is_write) {
4714         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4715                                        bdrv_co_io_em_complete, &co);
4716     } else {
4717         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4718                                       bdrv_co_io_em_complete, &co);
4719     }
4720 
4721     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4722     if (!acb) {
4723         return -EIO;
4724     }
4725     qemu_coroutine_yield();
4726 
4727     return co.ret;
4728 }
4729 
4730 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4731                                          int64_t sector_num, int nb_sectors,
4732                                          QEMUIOVector *iov)
4733 {
4734     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4735 }
4736 
4737 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4738                                          int64_t sector_num, int nb_sectors,
4739                                          QEMUIOVector *iov)
4740 {
4741     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4742 }
4743 
4744 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4745 {
4746     RwCo *rwco = opaque;
4747 
4748     rwco->ret = bdrv_co_flush(rwco->bs);
4749 }
4750 
4751 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4752 {
4753     int ret;
4754 
4755     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4756         return 0;
4757     }
4758 
4759     /* Write back cached data to the OS even with cache=unsafe */
4760     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4761     if (bs->drv->bdrv_co_flush_to_os) {
4762         ret = bs->drv->bdrv_co_flush_to_os(bs);
4763         if (ret < 0) {
4764             return ret;
4765         }
4766     }
4767 
4768     /* But don't actually force it to the disk with cache=unsafe */
4769     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4770         goto flush_parent;
4771     }
4772 
4773     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4774     if (bs->drv->bdrv_co_flush_to_disk) {
4775         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4776     } else if (bs->drv->bdrv_aio_flush) {
4777         BlockDriverAIOCB *acb;
4778         CoroutineIOCompletion co = {
4779             .coroutine = qemu_coroutine_self(),
4780         };
4781 
4782         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4783         if (acb == NULL) {
4784             ret = -EIO;
4785         } else {
4786             qemu_coroutine_yield();
4787             ret = co.ret;
4788         }
4789     } else {
4790         /*
4791          * Some block drivers always operate in either writethrough or unsafe
4792          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4793          * know how the server works (because the behaviour is hardcoded or
4794          * depends on server-side configuration), so we can't ensure that
4795          * everything is safe on disk. Returning an error doesn't work because
4796          * that would break guests even if the server operates in writethrough
4797          * mode.
4798          *
4799          * Let's hope the user knows what he's doing.
4800          */
4801         ret = 0;
4802     }
4803     if (ret < 0) {
4804         return ret;
4805     }
4806 
4807     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4808      * in the case of cache=unsafe, so there are no useless flushes.
4809      */
4810 flush_parent:
4811     return bdrv_co_flush(bs->file);
4812 }
4813 
4814 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4815 {
4816     Error *local_err = NULL;
4817     int ret;
4818 
4819     if (!bs->drv)  {
4820         return;
4821     }
4822 
4823     if (bs->drv->bdrv_invalidate_cache) {
4824         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4825     } else if (bs->file) {
4826         bdrv_invalidate_cache(bs->file, &local_err);
4827     }
4828     if (local_err) {
4829         error_propagate(errp, local_err);
4830         return;
4831     }
4832 
4833     ret = refresh_total_sectors(bs, bs->total_sectors);
4834     if (ret < 0) {
4835         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4836         return;
4837     }
4838 }
4839 
4840 void bdrv_invalidate_cache_all(Error **errp)
4841 {
4842     BlockDriverState *bs;
4843     Error *local_err = NULL;
4844 
4845     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4846         bdrv_invalidate_cache(bs, &local_err);
4847         if (local_err) {
4848             error_propagate(errp, local_err);
4849             return;
4850         }
4851     }
4852 }
4853 
4854 void bdrv_clear_incoming_migration_all(void)
4855 {
4856     BlockDriverState *bs;
4857 
4858     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4859         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4860     }
4861 }
4862 
4863 int bdrv_flush(BlockDriverState *bs)
4864 {
4865     Coroutine *co;
4866     RwCo rwco = {
4867         .bs = bs,
4868         .ret = NOT_DONE,
4869     };
4870 
4871     if (qemu_in_coroutine()) {
4872         /* Fast-path if already in coroutine context */
4873         bdrv_flush_co_entry(&rwco);
4874     } else {
4875         co = qemu_coroutine_create(bdrv_flush_co_entry);
4876         qemu_coroutine_enter(co, &rwco);
4877         while (rwco.ret == NOT_DONE) {
4878             qemu_aio_wait();
4879         }
4880     }
4881 
4882     return rwco.ret;
4883 }
4884 
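/*
 * The pattern above -- create a coroutine for the entry function,
 * then pump qemu_aio_wait() until it signals completion -- is how all
 * of the synchronous wrappers in this file are built; generically:
 *
 *     co = qemu_coroutine_create(entry_fn);
 *     qemu_coroutine_enter(co, &data);
 *     while (!data.done) {
 *         qemu_aio_wait();
 *     }
 */
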
4885 typedef struct DiscardCo {
4886     BlockDriverState *bs;
4887     int64_t sector_num;
4888     int nb_sectors;
4889     int ret;
4890 } DiscardCo;
4891 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4892 {
4893     DiscardCo *rwco = opaque;
4894 
4895     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4896 }
4897 
4898 /* If no limit is specified in the BlockLimits, use a default
4899  * of 32768 512-byte sectors (16 MiB) per request.
4900  */
4901 #define MAX_DISCARD_DEFAULT 32768
4902 
4903 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4904                                  int nb_sectors)
4905 {
4906     int max_discard;
4907 
4908     if (!bs->drv) {
4909         return -ENOMEDIUM;
4910     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4911         return -EIO;
4912     } else if (bs->read_only) {
4913         return -EROFS;
4914     }
4915 
4916     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4917 
4918     /* Do nothing if disabled.  */
4919     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4920         return 0;
4921     }
4922 
4923     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4924         return 0;
4925     }
4926 
4927     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4928     while (nb_sectors > 0) {
4929         int ret;
4930         int num = nb_sectors;
4931 
4932         /* align request */
4933         if (bs->bl.discard_alignment &&
4934             num >= bs->bl.discard_alignment &&
4935             sector_num % bs->bl.discard_alignment) {
4936             if (num > bs->bl.discard_alignment) {
4937                 num = bs->bl.discard_alignment;
4938             }
4939             num -= sector_num % bs->bl.discard_alignment;
4940         }
4941 
4942         /* limit request size */
4943         if (num > max_discard) {
4944             num = max_discard;
4945         }
4946 
4947         if (bs->drv->bdrv_co_discard) {
4948             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4949         } else {
4950             BlockDriverAIOCB *acb;
4951             CoroutineIOCompletion co = {
4952                 .coroutine = qemu_coroutine_self(),
4953             };
4954 
4955             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4956                                             bdrv_co_io_em_complete, &co);
4957             if (acb == NULL) {
4958                 return -EIO;
4959             } else {
4960                 qemu_coroutine_yield();
4961                 ret = co.ret;
4962             }
4963         }
4964         if (ret && ret != -ENOTSUP) {
4965             return ret;
4966         }
4967 
4968         sector_num += num;
4969         nb_sectors -= num;
4970     }
4971     return 0;
4972 }
4973 
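/*
 * Splitting example (illustrative): with bs->bl.discard_alignment = 8
 * and a request starting at sector 10 for 100 sectors, the first
 * iteration clamps num to 8 and subtracts 10 % 8 = 2, issuing a
 * 6-sector discard so that the next chunk starts on the 8-sector
 * boundary at 16; the remaining chunks are capped only by max_discard.
 */
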
4974 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4975 {
4976     Coroutine *co;
4977     DiscardCo rwco = {
4978         .bs = bs,
4979         .sector_num = sector_num,
4980         .nb_sectors = nb_sectors,
4981         .ret = NOT_DONE,
4982     };
4983 
4984     if (qemu_in_coroutine()) {
4985         /* Fast-path if already in coroutine context */
4986         bdrv_discard_co_entry(&rwco);
4987     } else {
4988         co = qemu_coroutine_create(bdrv_discard_co_entry);
4989         qemu_coroutine_enter(co, &rwco);
4990         while (rwco.ret == NOT_DONE) {
4991             qemu_aio_wait();
4992         }
4993     }
4994 
4995     return rwco.ret;
4996 }
4997 
4998 /**************************************************************/
4999 /* removable device support */
5000 
5001 /**
5002  * Return TRUE if the media is present
5003  */
5004 int bdrv_is_inserted(BlockDriverState *bs)
5005 {
5006     BlockDriver *drv = bs->drv;
5007 
5008     if (!drv)
5009         return 0;
5010     if (!drv->bdrv_is_inserted)
5011         return 1;
5012     return drv->bdrv_is_inserted(bs);
5013 }
5014 
5015 /**
5016  * Return whether the media changed since the last call to this
5017  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5018  */
5019 int bdrv_media_changed(BlockDriverState *bs)
5020 {
5021     BlockDriver *drv = bs->drv;
5022 
5023     if (drv && drv->bdrv_media_changed) {
5024         return drv->bdrv_media_changed(bs);
5025     }
5026     return -ENOTSUP;
5027 }
5028 
5029 /**
5030  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5031  */
5032 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5033 {
5034     BlockDriver *drv = bs->drv;
5035 
5036     if (drv && drv->bdrv_eject) {
5037         drv->bdrv_eject(bs, eject_flag);
5038     }
5039 
5040     if (bs->device_name[0] != '\0') {
5041         bdrv_emit_qmp_eject_event(bs, eject_flag);
5042     }
5043 }
5044 
5045 /**
5046  * Lock or unlock the media (if it is locked, the user won't be able
5047  * to eject it manually).
5048  */
5049 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5050 {
5051     BlockDriver *drv = bs->drv;
5052 
5053     trace_bdrv_lock_medium(bs, locked);
5054 
5055     if (drv && drv->bdrv_lock_medium) {
5056         drv->bdrv_lock_medium(bs, locked);
5057     }
5058 }
5059 
5060 /* needed for generic scsi interface */
5061 
5062 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5063 {
5064     BlockDriver *drv = bs->drv;
5065 
5066     if (drv && drv->bdrv_ioctl)
5067         return drv->bdrv_ioctl(bs, req, buf);
5068     return -ENOTSUP;
5069 }
5070 
5071 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5072         unsigned long int req, void *buf,
5073         BlockDriverCompletionFunc *cb, void *opaque)
5074 {
5075     BlockDriver *drv = bs->drv;
5076 
5077     if (drv && drv->bdrv_aio_ioctl)
5078         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5079     return NULL;
5080 }
5081 
5082 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5083 {
5084     bs->guest_block_size = align;
5085 }
5086 
5087 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5088 {
5089     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5090 }
5091 
5092 /*
5093  * Check if all memory in this vector is sector aligned.
5094  */
5095 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5096 {
5097     int i;
5098     size_t alignment = bdrv_opt_mem_align(bs);
5099 
5100     for (i = 0; i < qiov->niov; i++) {
5101         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5102             return false;
5103         }
5104         if (qiov->iov[i].iov_len % alignment) {
5105             return false;
5106         }
5107     }
5108 
5109     return true;
5110 }
5111 
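/*
 * Usage sketch (illustrative): building a vector that satisfies this
 * check, with 'len' assumed to be a multiple of bdrv_opt_mem_align(bs):
 *
 *     void *buf = qemu_blockalign(bs, len);
 *     struct iovec iov = { .iov_base = buf, .iov_len = len };
 *     QEMUIOVector qiov;
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     ...
 *     qemu_vfree(buf);
 */
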
5112 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5113                                           Error **errp)
5114 {
5115     int64_t bitmap_size;
5116     BdrvDirtyBitmap *bitmap;
5117 
5118     assert((granularity & (granularity - 1)) == 0);
5119 
5120     granularity >>= BDRV_SECTOR_BITS;
5121     assert(granularity);
5122     bitmap_size = bdrv_getlength(bs);
5123     if (bitmap_size < 0) {
5124         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5125         errno = -bitmap_size;
5126         return NULL;
5127     }
5128     bitmap_size >>= BDRV_SECTOR_BITS;
5129     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5130     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5131     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5132     return bitmap;
5133 }
5134 
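/*
 * Usage sketch (illustrative): track writes at 64 KiB granularity.
 * Per the assertions above, the granularity must be a power of two
 * and at least BDRV_SECTOR_SIZE.
 *
 *     Error *local_err = NULL;
 *     BdrvDirtyBitmap *bm = bdrv_create_dirty_bitmap(bs, 65536, &local_err);
 *     ...
 *     int64_t dirty = bdrv_get_dirty_count(bs, bm);
 *     bdrv_release_dirty_bitmap(bs, bm);
 */
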
5135 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5136 {
5137     BdrvDirtyBitmap *bm, *next;
5138     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5139         if (bm == bitmap) {
5140             QLIST_REMOVE(bitmap, list);
5141             hbitmap_free(bitmap->bitmap);
5142             g_free(bitmap);
5143             return;
5144         }
5145     }
5146 }
5147 
5148 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5149 {
5150     BdrvDirtyBitmap *bm;
5151     BlockDirtyInfoList *list = NULL;
5152     BlockDirtyInfoList **plist = &list;
5153 
5154     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5155         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5156         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5157         info->count = bdrv_get_dirty_count(bs, bm);
5158         info->granularity =
5159             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5160         entry->value = info;
5161         *plist = entry;
5162         plist = &entry->next;
5163     }
5164 
5165     return list;
5166 }
5167 
5168 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                        int64_t sector)
5169 {
5170     if (bitmap) {
5171         return hbitmap_get(bitmap->bitmap, sector);
5172     } else {
5173         return 0;
5174     }
5175 }
5176 
5177 void bdrv_dirty_iter_init(BlockDriverState *bs,
5178                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5179 {
5180     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5181 }
5182 
5183 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5184                     int nr_sectors)
5185 {
5186     BdrvDirtyBitmap *bitmap;
5187     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5188         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5189     }
5190 }
5191 
5192 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5193 {
5194     BdrvDirtyBitmap *bitmap;
5195     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5196         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5197     }
5198 }
5199 
5200 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5201 {
5202     return hbitmap_count(bitmap->bitmap);
5203 }
5204 
5205 /* Get a reference to bs */
5206 void bdrv_ref(BlockDriverState *bs)
5207 {
5208     bs->refcnt++;
5209 }
5210 
5211 /* Release a previously grabbed reference to bs.
5212  * If the reference count drops to zero, the BlockDriverState is deleted. */
5214 void bdrv_unref(BlockDriverState *bs)
5215 {
5216     assert(bs->refcnt > 0);
5217     if (--bs->refcnt == 0) {
5218         bdrv_delete(bs);
5219     }
5220 }
5221 
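     /* A minimal sketch (hypothetical caller, not compiled): hold a reference
      * for as long as outstanding work may still touch bs, so that bs cannot
      * be deleted underneath it. */
     #if 0
     static void refcount_example(BlockDriverState *bs)
     {
         bdrv_ref(bs);             /* keep bs alive while we use it */
         /* ... start and complete some long-running work on bs ... */
         bdrv_unref(bs);           /* deletes bs if this was the last ref */
     }
     #endif
     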
5222 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5223 {
5224     assert(bs->in_use != in_use);
5225     bs->in_use = in_use;
5226 }
5227 
5228 int bdrv_in_use(BlockDriverState *bs)
5229 {
5230     return bs->in_use;
5231 }
5232 
5233 void bdrv_iostatus_enable(BlockDriverState *bs)
5234 {
5235     bs->iostatus_enabled = true;
5236     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5237 }
5238 
5239 /* The I/O status is only enabled if the drive explicitly
5240  * enables it _and_ the VM is configured to stop on errors */
5241 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5242 {
5243     return (bs->iostatus_enabled &&
5244            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5245             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5246             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5247 }
5248 
5249 void bdrv_iostatus_disable(BlockDriverState *bs)
5250 {
5251     bs->iostatus_enabled = false;
5252 }
5253 
5254 void bdrv_iostatus_reset(BlockDriverState *bs)
5255 {
5256     if (bdrv_iostatus_is_enabled(bs)) {
5257         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5258         if (bs->job) {
5259             block_job_iostatus_reset(bs->job);
5260         }
5261     }
5262 }
5263 
5264 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5265 {
5266     assert(bdrv_iostatus_is_enabled(bs));
5267     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5268         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5269                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5270     }
5271 }
5272 
5273 void
5274 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5275         enum BlockAcctType type)
5276 {
5277     assert(type < BDRV_MAX_IOTYPE);
5278 
5279     cookie->bytes = bytes;
5280     cookie->start_time_ns = get_clock();
5281     cookie->type = type;
5282 }
5283 
5284 void
5285 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5286 {
5287     assert(cookie->type < BDRV_MAX_IOTYPE);
5288 
5289     bs->nr_bytes[cookie->type] += cookie->bytes;
5290     bs->nr_ops[cookie->type]++;
5291     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5292 }
5293 
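     /* A minimal sketch (hypothetical device-emulation caller, not compiled):
      * bracket an I/O operation with an accounting cookie so that bytes, op
      * count and latency are all recorded against the right BlockAcctType. */
     #if 0
     static void accounting_example(BlockDriverState *bs, int64_t bytes)
     {
         BlockAcctCookie cookie;
     
         bdrv_acct_start(bs, &cookie, bytes, BDRV_ACCT_READ);
         /* ... issue the read and wait for its completion ... */
         bdrv_acct_done(bs, &cookie);
     }
     #endif
     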
5294 void bdrv_img_create(const char *filename, const char *fmt,
5295                      const char *base_filename, const char *base_fmt,
5296                      char *options, uint64_t img_size, int flags,
5297                      Error **errp, bool quiet)
5298 {
5299     QEMUOptionParameter *param = NULL, *create_options = NULL;
5300     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5301     BlockDriver *drv, *proto_drv;
5302     BlockDriver *backing_drv = NULL;
5303     Error *local_err = NULL;
5304     int ret = 0;
5305 
5306     /* Find driver and parse its options */
5307     drv = bdrv_find_format(fmt);
5308     if (!drv) {
5309         error_setg(errp, "Unknown file format '%s'", fmt);
5310         return;
5311     }
5312 
5313     proto_drv = bdrv_find_protocol(filename, true);
5314     if (!proto_drv) {
5315         error_setg(errp, "Unknown protocol '%s'", filename);
5316         return;
5317     }
5318 
5319     create_options = append_option_parameters(create_options,
5320                                               drv->create_options);
5321     create_options = append_option_parameters(create_options,
5322                                               proto_drv->create_options);
5323 
5324     /* Create parameter list with default values */
5325     param = parse_option_parameters("", create_options, param);
5326 
5327     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5328 
5329     /* Parse -o options */
5330     if (options) {
5331         param = parse_option_parameters(options, create_options, param);
5332         if (param == NULL) {
5333             error_setg(errp, "Invalid options for file format '%s'.", fmt);
5334             goto out;
5335         }
5336     }
5337 
5338     if (base_filename) {
5339         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5340                                  base_filename)) {
5341             error_setg(errp, "Backing file not supported for file format '%s'",
5342                        fmt);
5343             goto out;
5344         }
5345     }
5346 
5347     if (base_fmt) {
5348         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5349             error_setg(errp, "Backing file format not supported for file "
5350                              "format '%s'", fmt);
5351             goto out;
5352         }
5353     }
5354 
5355     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5356     if (backing_file && backing_file->value.s) {
5357         if (!strcmp(filename, backing_file->value.s)) {
5358             error_setg(errp, "Error: Trying to create an image with the "
5359                              "same filename as the backing file");
5360             goto out;
5361         }
5362     }
5363 
5364     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5365     if (backing_fmt && backing_fmt->value.s) {
5366         backing_drv = bdrv_find_format(backing_fmt->value.s);
5367         if (!backing_drv) {
5368             error_setg(errp, "Unknown backing file format '%s'",
5369                        backing_fmt->value.s);
5370             goto out;
5371         }
5372     }
5373 
5374     /* The size for the image must always be specified, with one exception:
5375      * if we are using a backing file, we can obtain the size from there. */
5376     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5377     if (size && size->value.n == -1) {
5378         if (backing_file && backing_file->value.s) {
5379             BlockDriverState *bs;
5380             uint64_t backing_size;
5381             char buf[32];
5382             int back_flags;
5383 
5384             /* backing files always opened read-only */
5385             back_flags =
5386                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5387 
5388             bs = NULL;
5389             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5390                             backing_drv, &local_err);
5391             if (ret < 0) {
5392                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5393                                  backing_file->value.s,
5394                                  error_get_pretty(local_err));
5395                 error_free(local_err);
5396                 local_err = NULL;
5397                 goto out;
5398             }
5399             bdrv_get_geometry(bs, &backing_size);
5400             backing_size *= BDRV_SECTOR_SIZE;
5401 
5402             snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
5403             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5404 
5405             bdrv_unref(bs);
5406         } else {
5407             error_setg(errp, "Image creation needs a size parameter");
5408             goto out;
5409         }
5410     }
5411 
5412     if (!quiet) {
5413         printf("Formatting '%s', fmt=%s ", filename, fmt);
5414         print_option_parameters(param);
5415         puts("");
5416     }
5417     ret = bdrv_create(drv, filename, param, &local_err);
5418     if (ret == -EFBIG) {
5419         /* This is generally a better message than whatever the driver would
5420          * deliver (especially because of the cluster_size_hint), since that
5421          * is most probably not much different from "image too large". */
5422         const char *cluster_size_hint = "";
5423         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5424             cluster_size_hint = " (try using a larger cluster size)";
5425         }
5426         error_setg(errp, "The image size is too large for file format '%s'"
5427                    "%s", fmt, cluster_size_hint);
5428         error_free(local_err);
5429         local_err = NULL;
5430     }
5431 
5432 out:
5433     free_option_parameters(create_options);
5434     free_option_parameters(param);
5435 
5436     if (local_err) {
5437         error_propagate(errp, local_err);
5438     }
5439 }
5440 
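     /* A minimal sketch (hypothetical caller, not compiled): create a 1 GiB
      * qcow2 image with default options, roughly what "qemu-img create -f
      * qcow2 test.qcow2 1G" does; the filename is made up for illustration. */
     #if 0
     static void img_create_example(Error **errp)
     {
         bdrv_img_create("test.qcow2", "qcow2",
                         NULL, NULL,           /* no backing file or format */
                         NULL,                 /* no extra -o options */
                         (uint64_t)1 << 30,    /* 1 GiB */
                         0, errp, true);       /* flags = 0, quiet */
     }
     #endif
     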
5441 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5442 {
5443     /* Currently BlockDriverState always uses the main loop AioContext */
5444     return qemu_get_aio_context();
5445 }
5446 
5447 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5448                                     NotifierWithReturn *notifier)
5449 {
5450     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5451 }
5452 
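     /* A minimal sketch (hypothetical user, not compiled), loosely modeled on
      * the backup job: the callback runs before each guest write reaches the
      * driver; returning non-zero fails the write. */
     #if 0
     static int before_write_example_cb(NotifierWithReturn *notifier,
                                        void *opaque)
     {
         /* opaque is the in-flight request; inspect or copy data out here */
         return 0;
     }
     
     static void install_before_write_example(BlockDriverState *bs,
                                              NotifierWithReturn *n)
     {
         n->notify = before_write_example_cb;
         bdrv_add_before_write_notifier(bs, n);
     }
     #endif
     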
5453 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5454 {
5455     if (!bs->drv || !bs->drv->bdrv_amend_options) {
5456         return -ENOTSUP;
5457     }
5458     return bs->drv->bdrv_amend_options(bs, options);
5459 }
5460 
5461 /* This function will be called by the bdrv_recurse_is_first_non_filter method
5462  * of block filters and by bdrv_is_first_non_filter.
5463  * It is used to test whether the given bs is the candidate, or to recurse
5464  * further into the node graph.
5465  */
5466 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5467                                       BlockDriverState *candidate)
5468 {
5469     /* return false if basic checks fail */
5470     if (!bs || !bs->drv) {
5471         return false;
5472     }
5473 
5474     /* the code reached a driver that is not a block filter -> check
5475      * whether bs is the candidate; this terminates the recursion.
5476      */
5477     if (!bs->drv->is_filter) {
5478         return bs == candidate;
5479     }
5480     /* Down this path the driver is a block filter driver */
5481 
5482     /* If the block filter defines the recursion method, use it to recurse
5483      * down the node graph.
5484      */
5485     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5486         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5487     }
5488 
5489     /* the driver is a block filter but does not allow recursion ->
5490      * return false */
5491     return false;
5492 }
5493 
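     /* A minimal sketch (hypothetical pass-through filter, not compiled): a
      * filter with a single child would typically forward the recursion to
      * the bs it filters. */
     #if 0
     static bool filter_recurse_example(BlockDriverState *bs,
                                        BlockDriverState *candidate)
     {
         return bdrv_recurse_is_first_non_filter(bs->file, candidate);
     }
     #endif
     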
5494 /* This function checks whether the candidate is the first non-filter bs down
5495  * its bs chain. Since we don't have pointers to parents, it explores all bs
5496  * chains from the top. Some filters can choose not to pass down the recursion.
5497  */
5498 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5499 {
5500     BlockDriverState *bs;
5501 
5502     /* walk down the bs forest recursively */
5503     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5504         bool perm;
5505 
5506         /* try to recurse in this top-level bs */
5507         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5508 
5509         /* candidate is the first non-filter node */
5510         if (perm) {
5511             return true;
5512         }
5513     }
5514 
5515     return false;
5516 }
5517