xref: /openbmc/qemu/block.c (revision 0f20ba62c35e6a779ba4ea00616192ef2abb6896)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "block/qapi.h"
36 #include "qmp-commands.h"
37 #include "qemu/timer.h"
38 
39 #ifdef CONFIG_BSD
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/queue.h>
44 #ifndef __DragonFly__
45 #include <sys/disk.h>
46 #endif
47 #endif
48 
49 #ifdef _WIN32
50 #include <windows.h>
51 #endif
52 
53 struct BdrvDirtyBitmap {
54     HBitmap *bitmap;
55     QLIST_ENTRY(BdrvDirtyBitmap) list;
56 };
57 
58 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59 
60 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
61 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
63         BlockDriverCompletionFunc *cb, void *opaque);
64 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
66         BlockDriverCompletionFunc *cb, void *opaque);
67 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68                                          int64_t sector_num, int nb_sectors,
69                                          QEMUIOVector *iov);
70 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71                                          int64_t sector_num, int nb_sectors,
72                                          QEMUIOVector *iov);
73 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
75     BdrvRequestFlags flags);
76 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
78     BdrvRequestFlags flags);
79 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80                                                int64_t sector_num,
81                                                QEMUIOVector *qiov,
82                                                int nb_sectors,
83                                                BdrvRequestFlags flags,
84                                                BlockDriverCompletionFunc *cb,
85                                                void *opaque,
86                                                bool is_write);
87 static void coroutine_fn bdrv_co_do_rw(void *opaque);
88 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
89     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
90 
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92     QTAILQ_HEAD_INITIALIZER(bdrv_states);
93 
94 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95     QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96 
97 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98     QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
102 
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
105 {
106     return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107              (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108             filename[1] == ':');
109 }
110 
111 int is_windows_drive(const char *filename)
112 {
113     if (is_windows_drive_prefix(filename) &&
114         filename[2] == '\0')
115         return 1;
116     if (strstart(filename, "\\\\.\\", NULL) ||
117         strstart(filename, "//./", NULL))
118         return 1;
119     return 0;
120 }
121 #endif
122 
123 /* throttling disk I/O limits */
124 void bdrv_set_io_limits(BlockDriverState *bs,
125                         ThrottleConfig *cfg)
126 {
127     int i;
128 
129     throttle_config(&bs->throttle_state, cfg);
130 
131     for (i = 0; i < 2; i++) {
132         qemu_co_enter_next(&bs->throttled_reqs[i]);
133     }
134 }
135 
136 /* this function drains all throttled I/O requests */
137 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138 {
139     bool drained = false;
140     bool enabled = bs->io_limits_enabled;
141     int i;
142 
143     bs->io_limits_enabled = false;
144 
145     for (i = 0; i < 2; i++) {
146         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147             drained = true;
148         }
149     }
150 
151     bs->io_limits_enabled = enabled;
152 
153     return drained;
154 }
155 
156 void bdrv_io_limits_disable(BlockDriverState *bs)
157 {
158     bs->io_limits_enabled = false;
159 
160     bdrv_start_throttled_reqs(bs);
161 
162     throttle_destroy(&bs->throttle_state);
163 }
164 
165 static void bdrv_throttle_read_timer_cb(void *opaque)
166 {
167     BlockDriverState *bs = opaque;
168     qemu_co_enter_next(&bs->throttled_reqs[0]);
169 }
170 
171 static void bdrv_throttle_write_timer_cb(void *opaque)
172 {
173     BlockDriverState *bs = opaque;
174     qemu_co_enter_next(&bs->throttled_reqs[1]);
175 }
176 
177 /* must be called before bdrv_set_io_limits() if a limit is to be set */
178 void bdrv_io_limits_enable(BlockDriverState *bs)
179 {
180     assert(!bs->io_limits_enabled);
181     throttle_init(&bs->throttle_state,
182                   QEMU_CLOCK_VIRTUAL,
183                   bdrv_throttle_read_timer_cb,
184                   bdrv_throttle_write_timer_cb,
185                   bs);
186     bs->io_limits_enabled = true;
187 }
188 
189 /* This function makes an I/O request wait if needed
190  *
191  * @bytes:    the number of bytes of the I/O request
192  * @is_write: whether the I/O request is a write
193  */
194 static void bdrv_io_limits_intercept(BlockDriverState *bs,
195                                      unsigned int bytes,
196                                      bool is_write)
197 {
198     /* does this I/O request have to wait? */
199     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
200 
201     /* if it must wait, or any request of this type is already throttled, queue this I/O */
202     if (must_wait ||
203         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
205     }
206 
207     /* the IO will be executed, do the accounting */
208     throttle_account(&bs->throttle_state, is_write, bytes);
209 
210 
211     /* if the next request must wait -> do nothing */
212     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213         return;
214     }
215 
216     /* else queue next request for execution */
217     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
218 }
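
/*
 * Illustrative sketch, not part of the original file: the call order the
 * comments above require for throttling.  The config values are made up.
 */
static void G_GNUC_UNUSED bdrv_io_limits_example(BlockDriverState *bs)
{
    ThrottleConfig cfg = {
        .buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024, /* 1 MB/s total */
    };

    bdrv_io_limits_enable(bs);    /* sets up timers; must come first */
    bdrv_set_io_limits(bs, &cfg);
    /* subsequent reads/writes now go through bdrv_io_limits_intercept() */
}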
219 
220 size_t bdrv_opt_mem_align(BlockDriverState *bs)
221 {
222     if (!bs || !bs->drv) {
223         /* 4k should be on the safe side */
224         return 4096;
225     }
226 
227     return bs->bl.opt_mem_alignment;
228 }
229 
230 /* check if the path starts with "<protocol>:" */
231 static int path_has_protocol(const char *path)
232 {
233     const char *p;
234 
235 #ifdef _WIN32
236     if (is_windows_drive(path) ||
237         is_windows_drive_prefix(path)) {
238         return 0;
239     }
240     p = path + strcspn(path, ":/\\");
241 #else
242     p = path + strcspn(path, ":/");
243 #endif
244 
245     return *p == ':';
246 }
247 
248 int path_is_absolute(const char *path)
249 {
250 #ifdef _WIN32
251     /* specific case for names like: "\\.\d:" */
252     if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
253         return 1;
254     }
255     return (*path == '/' || *path == '\\');
256 #else
257     return (*path == '/');
258 #endif
259 }
260 
261 /* if filename is absolute, just copy it to dest. Otherwise, build a
262    path to it, treating it as relative to base_path. URLs are
263    supported. */
264 void path_combine(char *dest, int dest_size,
265                   const char *base_path,
266                   const char *filename)
267 {
268     const char *p, *p1;
269     int len;
270 
271     if (dest_size <= 0)
272         return;
273     if (path_is_absolute(filename)) {
274         pstrcpy(dest, dest_size, filename);
275     } else {
276         p = strchr(base_path, ':');
277         if (p)
278             p++;
279         else
280             p = base_path;
281         p1 = strrchr(base_path, '/');
282 #ifdef _WIN32
283         {
284             const char *p2;
285             p2 = strrchr(base_path, '\\');
286             if (!p1 || p2 > p1)
287                 p1 = p2;
288         }
289 #endif
290         if (p1)
291             p1++;
292         else
293             p1 = base_path;
294         if (p1 > p)
295             p = p1;
296         len = p - base_path;
297         if (len > dest_size - 1)
298             len = dest_size - 1;
299         memcpy(dest, base_path, len);
300         dest[len] = '\0';
301         pstrcat(dest, dest_size, filename);
302     }
303 }
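
/*
 * Illustrative sketch, not part of the original file: how path_combine()
 * behaves for relative and absolute names.  The sample paths are made up.
 */
static void G_GNUC_UNUSED path_combine_example(void)
{
    char dest[PATH_MAX];

    /* relative name: appended to the directory of the base path */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.qcow2");
    assert(!strcmp(dest, "/images/backing.qcow2"));

    /* absolute name: copied to dest unchanged */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "/other/image.raw");
    assert(!strcmp(dest, "/other/image.raw"));
}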
304 
305 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
306 {
307     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
308         pstrcpy(dest, sz, bs->backing_file);
309     } else {
310         path_combine(dest, sz, bs->filename, bs->backing_file);
311     }
312 }
313 
314 void bdrv_register(BlockDriver *bdrv)
315 {
316     /* Block drivers without coroutine functions need emulation */
317     if (!bdrv->bdrv_co_readv) {
318         bdrv->bdrv_co_readv = bdrv_co_readv_em;
319         bdrv->bdrv_co_writev = bdrv_co_writev_em;
320 
321         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
322          * the block driver lacks aio we need to emulate that too.
323          */
324         if (!bdrv->bdrv_aio_readv) {
325             /* add AIO emulation layer */
326             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
327             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
328         }
329     }
330 
331     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
332 }
333 
334 /* create a new block device (by default it is empty) */
335 BlockDriverState *bdrv_new(const char *device_name)
336 {
337     BlockDriverState *bs;
338 
339     bs = g_malloc0(sizeof(BlockDriverState));
340     QLIST_INIT(&bs->dirty_bitmaps);
341     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
342     if (device_name[0] != '\0') {
343         QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
344     }
345     bdrv_iostatus_disable(bs);
346     notifier_list_init(&bs->close_notifiers);
347     notifier_with_return_list_init(&bs->before_write_notifiers);
348     qemu_co_queue_init(&bs->throttled_reqs[0]);
349     qemu_co_queue_init(&bs->throttled_reqs[1]);
350     bs->refcnt = 1;
351 
352     return bs;
353 }
354 
355 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
356 {
357     notifier_list_add(&bs->close_notifiers, notify);
358 }
359 
360 BlockDriver *bdrv_find_format(const char *format_name)
361 {
362     BlockDriver *drv1;
363     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
364         if (!strcmp(drv1->format_name, format_name)) {
365             return drv1;
366         }
367     }
368     return NULL;
369 }
370 
371 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
372 {
373     static const char *whitelist_rw[] = {
374         CONFIG_BDRV_RW_WHITELIST
375     };
376     static const char *whitelist_ro[] = {
377         CONFIG_BDRV_RO_WHITELIST
378     };
379     const char **p;
380 
381     if (!whitelist_rw[0] && !whitelist_ro[0]) {
382         return 1;               /* no whitelist, anything goes */
383     }
384 
385     for (p = whitelist_rw; *p; p++) {
386         if (!strcmp(drv->format_name, *p)) {
387             return 1;
388         }
389     }
390     if (read_only) {
391         for (p = whitelist_ro; *p; p++) {
392             if (!strcmp(drv->format_name, *p)) {
393                 return 1;
394             }
395         }
396     }
397     return 0;
398 }
399 
400 BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
401                                           bool read_only)
402 {
403     BlockDriver *drv = bdrv_find_format(format_name);
404     return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
405 }
406 
407 typedef struct CreateCo {
408     BlockDriver *drv;
409     char *filename;
410     QEMUOptionParameter *options;
411     int ret;
412     Error *err;
413 } CreateCo;
414 
415 static void coroutine_fn bdrv_create_co_entry(void *opaque)
416 {
417     Error *local_err = NULL;
418     int ret;
419 
420     CreateCo *cco = opaque;
421     assert(cco->drv);
422 
423     ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
424     if (local_err) {
425         error_propagate(&cco->err, local_err);
426     }
427     cco->ret = ret;
428 }
429 
430 int bdrv_create(BlockDriver *drv, const char* filename,
431     QEMUOptionParameter *options, Error **errp)
432 {
433     int ret;
434 
435     Coroutine *co;
436     CreateCo cco = {
437         .drv = drv,
438         .filename = g_strdup(filename),
439         .options = options,
440         .ret = NOT_DONE,
441         .err = NULL,
442     };
443 
444     if (!drv->bdrv_create) {
445         error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
446         ret = -ENOTSUP;
447         goto out;
448     }
449 
450     if (qemu_in_coroutine()) {
451         /* Fast-path if already in coroutine context */
452         bdrv_create_co_entry(&cco);
453     } else {
454         co = qemu_coroutine_create(bdrv_create_co_entry);
455         qemu_coroutine_enter(co, &cco);
456         while (cco.ret == NOT_DONE) {
457             qemu_aio_wait();
458         }
459     }
460 
461     ret = cco.ret;
462     if (ret < 0) {
463         if (cco.err) {
464             error_propagate(errp, cco.err);
465         } else {
466             error_setg_errno(errp, -ret, "Could not create image");
467         }
468     }
469 
470 out:
471     g_free(cco.filename);
472     return ret;
473 }
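
/*
 * Illustrative sketch, not part of the original file: creating a small qcow2
 * image with bdrv_create(), mirroring the temporary overlay creation in
 * bdrv_open() below.  The file name and size are made up.
 */
static int G_GNUC_UNUSED bdrv_create_example(Error **errp)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *create_options;
    int ret;

    if (!drv) {
        return -ENOENT;
    }

    create_options = parse_option_parameters("", drv->create_options, NULL);
    set_option_parameter_int(create_options, BLOCK_OPT_SIZE, 1024 * 1024);

    ret = bdrv_create(drv, "/tmp/example.qcow2", create_options, errp);
    free_option_parameters(create_options);
    return ret;
}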
474 
475 int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
476                      Error **errp)
477 {
478     BlockDriver *drv;
479     Error *local_err = NULL;
480     int ret;
481 
482     drv = bdrv_find_protocol(filename, true);
483     if (drv == NULL) {
484         error_setg(errp, "Could not find protocol for file '%s'", filename);
485         return -ENOENT;
486     }
487 
488     ret = bdrv_create(drv, filename, options, &local_err);
489     if (local_err) {
490         error_propagate(errp, local_err);
491     }
492     return ret;
493 }
494 
495 int bdrv_refresh_limits(BlockDriverState *bs)
496 {
497     BlockDriver *drv = bs->drv;
498 
499     memset(&bs->bl, 0, sizeof(bs->bl));
500 
501     if (!drv) {
502         return 0;
503     }
504 
505     /* Take some limits from the children as a default */
506     if (bs->file) {
507         bdrv_refresh_limits(bs->file);
508         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
509         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
510     } else {
511         bs->bl.opt_mem_alignment = 512;
512     }
513 
514     if (bs->backing_hd) {
515         bdrv_refresh_limits(bs->backing_hd);
516         bs->bl.opt_transfer_length =
517             MAX(bs->bl.opt_transfer_length,
518                 bs->backing_hd->bl.opt_transfer_length);
519         bs->bl.opt_mem_alignment =
520             MAX(bs->bl.opt_mem_alignment,
521                 bs->backing_hd->bl.opt_mem_alignment);
522     }
523 
524     /* Then let the driver override it */
525     if (drv->bdrv_refresh_limits) {
526         return drv->bdrv_refresh_limits(bs);
527     }
528 
529     return 0;
530 }
531 
532 /*
533  * Create a uniquely-named empty temporary file.
534  * Return 0 upon success, otherwise a negative errno value.
535  */
536 int get_tmp_filename(char *filename, int size)
537 {
538 #ifdef _WIN32
539     char temp_dir[MAX_PATH];
540     /* GetTempFileName requires that its output buffer (4th param)
541        have length MAX_PATH or greater.  */
542     assert(size >= MAX_PATH);
543     return (GetTempPath(MAX_PATH, temp_dir)
544             && GetTempFileName(temp_dir, "qem", 0, filename)
545             ? 0 : -GetLastError());
546 #else
547     int fd;
548     const char *tmpdir;
549     tmpdir = getenv("TMPDIR");
550     if (!tmpdir) {
551         tmpdir = "/var/tmp";
552     }
553     if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
554         return -EOVERFLOW;
555     }
556     fd = mkstemp(filename);
557     if (fd < 0) {
558         return -errno;
559     }
560     if (close(fd) != 0) {
561         unlink(filename);
562         return -errno;
563     }
564     return 0;
565 #endif
566 }
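
/*
 * Illustrative sketch, not part of the original file: typical use of
 * get_tmp_filename().  The caller owns the resulting (empty) file and must
 * unlink it when done, as bdrv_open() does for snapshot overlays below.
 */
static int G_GNUC_UNUSED tmp_filename_example(void)
{
    char tmp_filename[PATH_MAX + 1];
    int ret;

    ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
    if (ret < 0) {
        return ret;             /* negative errno, e.g. -EOVERFLOW */
    }
    /* tmp_filename now names a unique, empty temporary file */
    unlink(tmp_filename);
    return 0;
}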
567 
568 /*
569  * Detect host devices. By convention, /dev/cdrom[N] is always
570  * recognized as a host CDROM.
571  */
572 static BlockDriver *find_hdev_driver(const char *filename)
573 {
574     int score_max = 0, score;
575     BlockDriver *drv = NULL, *d;
576 
577     QLIST_FOREACH(d, &bdrv_drivers, list) {
578         if (d->bdrv_probe_device) {
579             score = d->bdrv_probe_device(filename);
580             if (score > score_max) {
581                 score_max = score;
582                 drv = d;
583             }
584         }
585     }
586 
587     return drv;
588 }
589 
590 BlockDriver *bdrv_find_protocol(const char *filename,
591                                 bool allow_protocol_prefix)
592 {
593     BlockDriver *drv1;
594     char protocol[128];
595     int len;
596     const char *p;
597 
598     /* TODO Drivers without bdrv_file_open must be specified explicitly */
599 
600     /*
601      * XXX(hch): we really should not let host device detection
602      * override an explicit protocol specification, but moving this
603      * later breaks access to device names with colons in them.
604      * Thanks to the brain-dead persistent naming schemes on udev-
605      * based Linux systems those actually are quite common.
606      */
607     drv1 = find_hdev_driver(filename);
608     if (drv1) {
609         return drv1;
610     }
611 
612     if (!path_has_protocol(filename) || !allow_protocol_prefix) {
613         return bdrv_find_format("file");
614     }
615 
616     p = strchr(filename, ':');
617     assert(p != NULL);
618     len = p - filename;
619     if (len > sizeof(protocol) - 1)
620         len = sizeof(protocol) - 1;
621     memcpy(protocol, filename, len);
622     protocol[len] = '\0';
623     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
624         if (drv1->protocol_name &&
625             !strcmp(drv1->protocol_name, protocol)) {
626             return drv1;
627         }
628     }
629     return NULL;
630 }
631 
632 static int find_image_format(BlockDriverState *bs, const char *filename,
633                              BlockDriver **pdrv, Error **errp)
634 {
635     int score, score_max;
636     BlockDriver *drv1, *drv;
637     uint8_t buf[2048];
638     int ret = 0;
639 
640     /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
641     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
642         drv = bdrv_find_format("raw");
643         if (!drv) {
644             error_setg(errp, "Could not find raw image format");
645             ret = -ENOENT;
646         }
647         *pdrv = drv;
648         return ret;
649     }
650 
651     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
652     if (ret < 0) {
653         error_setg_errno(errp, -ret, "Could not read image for determining its "
654                          "format");
655         *pdrv = NULL;
656         return ret;
657     }
658 
659     score_max = 0;
660     drv = NULL;
661     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
662         if (drv1->bdrv_probe) {
663             score = drv1->bdrv_probe(buf, ret, filename);
664             if (score > score_max) {
665                 score_max = score;
666                 drv = drv1;
667             }
668         }
669     }
670     if (!drv) {
671         error_setg(errp, "Could not determine image format: No compatible "
672                    "driver found");
673         ret = -ENOENT;
674     }
675     *pdrv = drv;
676     return ret;
677 }
678 
679 /**
680  * Set the current 'total_sectors' value
681  */
682 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
683 {
684     BlockDriver *drv = bs->drv;
685 
686     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
687     if (bs->sg)
688         return 0;
689 
690     /* query actual device if possible, otherwise just trust the hint */
691     if (drv->bdrv_getlength) {
692         int64_t length = drv->bdrv_getlength(bs);
693         if (length < 0) {
694             return length;
695         }
696         hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
697     }
698 
699     bs->total_sectors = hint;
700     return 0;
701 }
702 
703 /**
704  * Set open flags for a given discard mode
705  *
706  * Return 0 on success, -1 if the discard mode was invalid.
707  */
708 int bdrv_parse_discard_flags(const char *mode, int *flags)
709 {
710     *flags &= ~BDRV_O_UNMAP;
711 
712     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
713         /* do nothing */
714     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
715         *flags |= BDRV_O_UNMAP;
716     } else {
717         return -1;
718     }
719 
720     return 0;
721 }
722 
723 /**
724  * Set open flags for a given cache mode
725  *
726  * Return 0 on success, -1 if the cache mode was invalid.
727  */
728 int bdrv_parse_cache_flags(const char *mode, int *flags)
729 {
730     *flags &= ~BDRV_O_CACHE_MASK;
731 
732     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
733         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
734     } else if (!strcmp(mode, "directsync")) {
735         *flags |= BDRV_O_NOCACHE;
736     } else if (!strcmp(mode, "writeback")) {
737         *flags |= BDRV_O_CACHE_WB;
738     } else if (!strcmp(mode, "unsafe")) {
739         *flags |= BDRV_O_CACHE_WB;
740         *flags |= BDRV_O_NO_FLUSH;
741     } else if (!strcmp(mode, "writethrough")) {
742         /* this is the default */
743     } else {
744         return -1;
745     }
746 
747     return 0;
748 }
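
/*
 * Illustrative sketch, not part of the original file: the flag combinations
 * produced by bdrv_parse_cache_flags() for a few cache= modes.
 */
static void G_GNUC_UNUSED parse_cache_flags_example(void)
{
    int flags = 0;

    /* "none" means O_DIRECT access plus a guest-visible write cache */
    bdrv_parse_cache_flags("none", &flags);
    assert(flags == (BDRV_O_NOCACHE | BDRV_O_CACHE_WB));

    /* "writethrough" is the default and sets no extra flags */
    flags = 0;
    bdrv_parse_cache_flags("writethrough", &flags);
    assert(flags == 0);

    /* unknown modes are rejected */
    assert(bdrv_parse_cache_flags("bogus", &flags) == -1);
}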
749 
750 /**
751  * The copy-on-read flag is actually a reference count so multiple users may
752  * use the feature without worrying about clobbering its previous state.
753  * Copy-on-read stays enabled until all users have called to disable it.
754  */
755 void bdrv_enable_copy_on_read(BlockDriverState *bs)
756 {
757     bs->copy_on_read++;
758 }
759 
760 void bdrv_disable_copy_on_read(BlockDriverState *bs)
761 {
762     assert(bs->copy_on_read > 0);
763     bs->copy_on_read--;
764 }
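
/*
 * Illustrative sketch, not part of the original file: because copy-on-read
 * is reference counted, independent users compose without clobbering each
 * other's state.
 */
static void G_GNUC_UNUSED copy_on_read_example(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);   /* user A */
    bdrv_enable_copy_on_read(bs);   /* user B */

    bdrv_disable_copy_on_read(bs);  /* A is done; still on for B */
    assert(bs->copy_on_read > 0);

    bdrv_disable_copy_on_read(bs);  /* B is done; feature off again */
    assert(bs->copy_on_read == 0);
}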
765 
766 static int bdrv_open_flags(BlockDriverState *bs, int flags)
767 {
768     int open_flags = flags | BDRV_O_CACHE_WB;
769 
770     /*
771      * Clear flags that are internal to the block layer before opening the
772      * image.
773      */
774     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
775 
776     /*
777      * Snapshots should be writable.
778      */
779     if (bs->is_temporary) {
780         open_flags |= BDRV_O_RDWR;
781     }
782 
783     return open_flags;
784 }
785 
786 static int bdrv_assign_node_name(BlockDriverState *bs,
787                                  const char *node_name,
788                                  Error **errp)
789 {
790     if (!node_name) {
791         return 0;
792     }
793 
794     /* empty string node name is invalid */
795     if (node_name[0] == '\0') {
796         error_setg(errp, "Empty node name");
797         return -EINVAL;
798     }
799 
800     /* takes care of avoiding namespace collisions */
801     if (bdrv_find(node_name)) {
802         error_setg(errp, "node-name=%s conflicts with a device id",
803                    node_name);
804         return -EINVAL;
805     }
806 
807     /* takes care of avoiding duplicate node names */
808     if (bdrv_find_node(node_name)) {
809         error_setg(errp, "Duplicate node name");
810         return -EINVAL;
811     }
812 
813     /* copy node name into the bs and insert it into the graph list */
814     pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
815     QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
816 
817     return 0;
818 }
819 
820 /*
821  * Common part for opening disk images and files
822  *
823  * Removes all processed options from *options.
824  */
825 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
826     QDict *options, int flags, BlockDriver *drv, Error **errp)
827 {
828     int ret, open_flags;
829     const char *filename;
830     const char *node_name = NULL;
831     Error *local_err = NULL;
832 
833     assert(drv != NULL);
834     assert(bs->file == NULL);
835     assert(options != NULL && bs->options != options);
836 
837     if (file != NULL) {
838         filename = file->filename;
839     } else {
840         filename = qdict_get_try_str(options, "filename");
841     }
842 
843     if (drv->bdrv_needs_filename && !filename) {
844         error_setg(errp, "The '%s' block driver requires a file name",
845                    drv->format_name);
846         return -EINVAL;
847     }
848 
849     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
850 
851     node_name = qdict_get_try_str(options, "node-name");
852     ret = bdrv_assign_node_name(bs, node_name, errp);
853     if (ret < 0) {
854         return ret;
855     }
856     qdict_del(options, "node-name");
857 
858     /* bdrv_open() was called directly with a protocol driver as drv. That
859      * layer is already opened, so assign it to bs (while file becomes a
860      * closed BlockDriverState) and return immediately. */
861     if (file != NULL && drv->bdrv_file_open) {
862         bdrv_swap(file, bs);
863         return 0;
864     }
865 
866     bs->open_flags = flags;
867     bs->guest_block_size = 512;
868     bs->request_alignment = 512;
869     bs->zero_beyond_eof = true;
870     open_flags = bdrv_open_flags(bs, flags);
871     bs->read_only = !(open_flags & BDRV_O_RDWR);
872 
873     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
874         error_setg(errp,
875                    !bs->read_only && bdrv_is_whitelisted(drv, true)
876                         ? "Driver '%s' can only be used for read-only devices"
877                         : "Driver '%s' is not whitelisted",
878                    drv->format_name);
879         return -ENOTSUP;
880     }
881 
882     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
883     if (flags & BDRV_O_COPY_ON_READ) {
884         if (!bs->read_only) {
885             bdrv_enable_copy_on_read(bs);
886         } else {
887             error_setg(errp, "Can't use copy-on-read on read-only device");
888             return -EINVAL;
889         }
890     }
891 
892     if (filename != NULL) {
893         pstrcpy(bs->filename, sizeof(bs->filename), filename);
894     } else {
895         bs->filename[0] = '\0';
896     }
897 
898     bs->drv = drv;
899     bs->opaque = g_malloc0(drv->instance_size);
900 
901     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
902 
903     /* Open the image, either directly or using a protocol */
904     if (drv->bdrv_file_open) {
905         assert(file == NULL);
906         assert(!drv->bdrv_needs_filename || filename != NULL);
907         ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
908     } else {
909         if (file == NULL) {
910             error_setg(errp, "Can't use '%s' as a block driver for the "
911                        "protocol level", drv->format_name);
912             ret = -EINVAL;
913             goto free_and_fail;
914         }
915         bs->file = file;
916         ret = drv->bdrv_open(bs, options, open_flags, &local_err);
917     }
918 
919     if (ret < 0) {
920         if (local_err) {
921             error_propagate(errp, local_err);
922         } else if (bs->filename[0]) {
923             error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
924         } else {
925             error_setg_errno(errp, -ret, "Could not open image");
926         }
927         goto free_and_fail;
928     }
929 
930     ret = refresh_total_sectors(bs, bs->total_sectors);
931     if (ret < 0) {
932         error_setg_errno(errp, -ret, "Could not refresh total sector count");
933         goto free_and_fail;
934     }
935 
936     bdrv_refresh_limits(bs);
937     assert(bdrv_opt_mem_align(bs) != 0);
938     assert(bs->request_alignment != 0);
939 
940 #ifndef _WIN32
941     if (bs->is_temporary) {
942         assert(bs->filename[0] != '\0');
943         unlink(bs->filename);
944     }
945 #endif
946     return 0;
947 
948 free_and_fail:
949     bs->file = NULL;
950     g_free(bs->opaque);
951     bs->opaque = NULL;
952     bs->drv = NULL;
953     return ret;
954 }
955 
956 /*
957  * Opens a file using a protocol (file, host_device, nbd, ...)
958  *
959  * options is an indirect pointer to a QDict of options to pass to the block
960  * drivers, or pointer to NULL for an empty set of options. If this function
961  * takes ownership of the QDict reference, it will set *options to NULL;
962  * otherwise, it will contain unused/unrecognized options after this function
963  * returns. Then, the caller is responsible for freeing it. If it intends to
964  * reuse the QDict, QINCREF() should be called beforehand.
965  */
966 static int bdrv_file_open(BlockDriverState *bs, const char *filename,
967                           QDict **options, int flags, Error **errp)
968 {
969     BlockDriver *drv;
970     const char *drvname;
971     bool allow_protocol_prefix = false;
972     Error *local_err = NULL;
973     int ret;
974 
975     /* Fetch the file name from the options QDict if necessary */
976     if (!filename) {
977         filename = qdict_get_try_str(*options, "filename");
978     } else if (filename && !qdict_haskey(*options, "filename")) {
979         qdict_put(*options, "filename", qstring_from_str(filename));
980         allow_protocol_prefix = true;
981     } else {
982         error_setg(errp, "Can't specify 'file' and 'filename' options at the "
983                    "same time");
984         ret = -EINVAL;
985         goto fail;
986     }
987 
988     /* Find the right block driver */
989     drvname = qdict_get_try_str(*options, "driver");
990     if (drvname) {
991         drv = bdrv_find_format(drvname);
992         if (!drv) {
993             error_setg(errp, "Unknown driver '%s'", drvname);
994         }
995         qdict_del(*options, "driver");
996     } else if (filename) {
997         drv = bdrv_find_protocol(filename, allow_protocol_prefix);
998         if (!drv) {
999             error_setg(errp, "Unknown protocol");
1000         }
1001     } else {
1002         error_setg(errp, "Must specify either driver or file");
1003         drv = NULL;
1004     }
1005 
1006     if (!drv) {
1007         /* errp has been set already */
1008         ret = -ENOENT;
1009         goto fail;
1010     }
1011 
1012     /* Parse the filename and open it */
1013     if (drv->bdrv_parse_filename && filename) {
1014         drv->bdrv_parse_filename(filename, *options, &local_err);
1015         if (local_err) {
1016             error_propagate(errp, local_err);
1017             ret = -EINVAL;
1018             goto fail;
1019         }
1020         qdict_del(*options, "filename");
1021     }
1022 
1023     if (!drv->bdrv_file_open) {
1024         ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
1025         *options = NULL;
1026     } else {
1027         ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
1028     }
1029     if (ret < 0) {
1030         error_propagate(errp, local_err);
1031         goto fail;
1032     }
1033 
1034     bs->growable = 1;
1035     return 0;
1036 
1037 fail:
1038     return ret;
1039 }
1040 
1041 /*
1042  * Opens the backing file for a BlockDriverState if not yet open
1043  *
1044  * options is a QDict of options to pass to the block drivers, or NULL for an
1045  * empty set of options. The reference to the QDict is transferred to this
1046  * function (even on failure), so if the caller intends to reuse the dictionary,
1047  * it needs to use QINCREF() before calling bdrv_open_backing_file.
1048  */
1049 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
1050 {
1051     char backing_filename[PATH_MAX];
1052     int back_flags, ret;
1053     BlockDriver *back_drv = NULL;
1054     Error *local_err = NULL;
1055 
1056     if (bs->backing_hd != NULL) {
1057         QDECREF(options);
1058         return 0;
1059     }
1060 
1061     /* NULL means an empty set of options */
1062     if (options == NULL) {
1063         options = qdict_new();
1064     }
1065 
1066     bs->open_flags &= ~BDRV_O_NO_BACKING;
1067     if (qdict_haskey(options, "file.filename")) {
1068         backing_filename[0] = '\0';
1069     } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
1070         QDECREF(options);
1071         return 0;
1072     } else {
1073         bdrv_get_full_backing_filename(bs, backing_filename,
1074                                        sizeof(backing_filename));
1075     }
1076 
1077     if (bs->backing_format[0] != '\0') {
1078         back_drv = bdrv_find_format(bs->backing_format);
1079     }
1080 
1081     /* backing files always opened read-only */
1082     back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
1083                                     BDRV_O_COPY_ON_READ);
1084 
1085     assert(bs->backing_hd == NULL);
1086     ret = bdrv_open(&bs->backing_hd,
1087                     *backing_filename ? backing_filename : NULL, NULL, options,
1088                     back_flags, back_drv, &local_err);
1089     if (ret < 0) {
1090         bs->backing_hd = NULL;
1091         bs->open_flags |= BDRV_O_NO_BACKING;
1092         error_setg(errp, "Could not open backing file: %s",
1093                    error_get_pretty(local_err));
1094         error_free(local_err);
1095         return ret;
1096     }
1097 
1098     if (bs->backing_hd->file) {
1099         pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1100                 bs->backing_hd->file->filename);
1101     }
1102 
1103     /* Recalculate the BlockLimits with the backing file */
1104     bdrv_refresh_limits(bs);
1105 
1106     return 0;
1107 }
1108 
1109 /*
1110  * Opens a disk image whose options are given as BlockdevRef in another block
1111  * device's options.
1112  *
1113  * If allow_none is true, no image will be opened if filename is NULL and no
1114  * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1115  *
1116  * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
1117  * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1118  * itself, all options starting with "${bdref_key}." are considered part of the
1119  * BlockdevRef.
1120  *
1121  * The BlockdevRef will be removed from the options QDict.
1122  *
1123  * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
1124  */
1125 int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1126                     QDict *options, const char *bdref_key, int flags,
1127                     bool allow_none, Error **errp)
1128 {
1129     QDict *image_options;
1130     int ret;
1131     char *bdref_key_dot;
1132     const char *reference;
1133 
1134     assert(pbs);
1135     assert(*pbs == NULL);
1136 
1137     bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1138     qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1139     g_free(bdref_key_dot);
1140 
1141     reference = qdict_get_try_str(options, bdref_key);
1142     if (!filename && !reference && !qdict_size(image_options)) {
1143         if (allow_none) {
1144             ret = 0;
1145         } else {
1146             error_setg(errp, "A block device must be specified for \"%s\"",
1147                        bdref_key);
1148             ret = -EINVAL;
1149         }
1150         goto done;
1151     }
1152 
1153     ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
1154 
1155 done:
1156     qdict_del(options, bdref_key);
1157     return ret;
1158 }
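
/*
 * Illustrative sketch, not part of the original file: what a flattened
 * BlockdevRef looks like in the options QDict before bdrv_open_image()
 * extracts the "file." prefix.  The driver name and path are made up;
 * *file must be NULL on entry, and on success the caller owns a reference
 * to *file.
 */
static int G_GNUC_UNUSED bdrv_open_image_example(BlockDriverState **file,
                                                 Error **errp)
{
    QDict *opts = qdict_new();
    int ret;

    /* every key starting with "file." belongs to the referenced device */
    qdict_put(opts, "file.driver", qstring_from_str("file"));
    qdict_put(opts, "file.filename", qstring_from_str("/tmp/test.img"));

    ret = bdrv_open_image(file, NULL, opts, "file",
                          BDRV_O_RDWR | BDRV_O_PROTOCOL, false, errp);
    QDECREF(opts);
    return ret;
}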
1159 
1160 /*
1161  * Opens a disk image (raw, qcow2, vmdk, ...)
1162  *
1163  * options is a QDict of options to pass to the block drivers, or NULL for an
1164  * empty set of options. The reference to the QDict belongs to the block layer
1165  * after the call (even on failure), so if the caller intends to reuse the
1166  * dictionary, it needs to use QINCREF() before calling bdrv_open.
1167  *
1168  * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1169  * If it is not NULL, the referenced BDS will be reused.
1170  *
1171  * The reference parameter may be used to specify an existing block device which
1172  * should be opened. If specified, neither options nor a filename may be given,
1173  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
1174  */
1175 int bdrv_open(BlockDriverState **pbs, const char *filename,
1176               const char *reference, QDict *options, int flags,
1177               BlockDriver *drv, Error **errp)
1178 {
1179     int ret;
1180     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1181     char tmp_filename[PATH_MAX + 1];
1182     BlockDriverState *file = NULL, *bs;
1183     const char *drvname;
1184     Error *local_err = NULL;
1185 
1186     assert(pbs);
1187 
1188     if (reference) {
1189         bool options_non_empty = options ? qdict_size(options) : false;
1190         QDECREF(options);
1191 
1192         if (*pbs) {
1193             error_setg(errp, "Cannot reuse an existing BDS when referencing "
1194                        "another block device");
1195             return -EINVAL;
1196         }
1197 
1198         if (filename || options_non_empty) {
1199             error_setg(errp, "Cannot reference an existing block device with "
1200                        "additional options or a new filename");
1201             return -EINVAL;
1202         }
1203 
1204         bs = bdrv_lookup_bs(reference, reference, errp);
1205         if (!bs) {
1206             return -ENODEV;
1207         }
1208         bdrv_ref(bs);
1209         *pbs = bs;
1210         return 0;
1211     }
1212 
1213     if (*pbs) {
1214         bs = *pbs;
1215     } else {
1216         bs = bdrv_new("");
1217     }
1218 
1219     /* NULL means an empty set of options */
1220     if (options == NULL) {
1221         options = qdict_new();
1222     }
1223 
1224     bs->options = options;
1225     options = qdict_clone_shallow(options);
1226 
1227     if (flags & BDRV_O_PROTOCOL) {
1228         assert(!drv);
1229         ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
1230                              &local_err);
1231         if (!ret) {
1232             goto done;
1233         } else if (bs->drv) {
1234             goto close_and_fail;
1235         } else {
1236             goto fail;
1237         }
1238     }
1239 
1240     /* For snapshot=on, create a temporary qcow2 overlay */
1241     if (flags & BDRV_O_SNAPSHOT) {
1242         BlockDriverState *bs1;
1243         int64_t total_size;
1244         BlockDriver *bdrv_qcow2;
1245         QEMUOptionParameter *create_options;
1246         QDict *snapshot_options;
1247 
1248         /* if snapshot, we create a temporary backing file and open it
1249            instead of opening 'filename' directly */
1250 
1251         /* Get the required size from the image */
1252         QINCREF(options);
1253         bs1 = NULL;
1254         ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
1255                         drv, &local_err);
1256         if (ret < 0) {
1257             goto fail;
1258         }
1259         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
1260 
1261         bdrv_unref(bs1);
1262 
1263         /* Create the temporary image */
1264         ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
1265         if (ret < 0) {
1266             error_setg_errno(errp, -ret, "Could not get temporary filename");
1267             goto fail;
1268         }
1269 
1270         bdrv_qcow2 = bdrv_find_format("qcow2");
1271         create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1272                                                  NULL);
1273 
1274         set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1275 
1276         ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1277         free_option_parameters(create_options);
1278         if (ret < 0) {
1279             error_setg_errno(errp, -ret, "Could not create temporary overlay "
1280                              "'%s': %s", tmp_filename,
1281                              error_get_pretty(local_err));
1282             error_free(local_err);
1283             local_err = NULL;
1284             goto fail;
1285         }
1286 
1287         /* Prepare a new options QDict for the temporary file, where user
1288          * options refer to the backing file */
1289         if (filename) {
1290             qdict_put(options, "file.filename", qstring_from_str(filename));
1291         }
1292         if (drv) {
1293             qdict_put(options, "driver", qstring_from_str(drv->format_name));
1294         }
1295 
1296         snapshot_options = qdict_new();
1297         qdict_put(snapshot_options, "backing", options);
1298         qdict_flatten(snapshot_options);
1299 
1300         bs->options = snapshot_options;
1301         options = qdict_clone_shallow(bs->options);
1302 
1303         filename = tmp_filename;
1304         drv = bdrv_qcow2;
1305         bs->is_temporary = 1;
1306     }
1307 
1308     /* Open image file without format layer */
1309     if (flags & BDRV_O_RDWR) {
1310         flags |= BDRV_O_ALLOW_RDWR;
1311     }
1312 
1313     assert(file == NULL);
1314     ret = bdrv_open_image(&file, filename, options, "file",
1315                           bdrv_open_flags(bs, flags | BDRV_O_UNMAP) |
1316                           BDRV_O_PROTOCOL, true, &local_err);
1317     if (ret < 0) {
1318         goto fail;
1319     }
1320 
1321     /* Find the right image format driver */
1322     drvname = qdict_get_try_str(options, "driver");
1323     if (drvname) {
1324         drv = bdrv_find_format(drvname);
1325         qdict_del(options, "driver");
1326         if (!drv) {
1327             error_setg(errp, "Invalid driver: '%s'", drvname);
1328             ret = -EINVAL;
1329             goto unlink_and_fail;
1330         }
1331     }
1332 
1333     if (!drv) {
1334         if (file) {
1335             ret = find_image_format(file, filename, &drv, &local_err);
1336         } else {
1337             error_setg(errp, "Must specify either driver or file");
1338             ret = -EINVAL;
1339             goto unlink_and_fail;
1340         }
1341     }
1342 
1343     if (!drv) {
1344         goto unlink_and_fail;
1345     }
1346 
1347     /* Open the image */
1348     ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
1349     if (ret < 0) {
1350         goto unlink_and_fail;
1351     }
1352 
1353     if (file && (bs->file != file)) {
1354         bdrv_unref(file);
1355         file = NULL;
1356     }
1357 
1358     /* If there is a backing file, use it */
1359     if ((flags & BDRV_O_NO_BACKING) == 0) {
1360         QDict *backing_options;
1361 
1362         qdict_extract_subqdict(options, &backing_options, "backing.");
1363         ret = bdrv_open_backing_file(bs, backing_options, &local_err);
1364         if (ret < 0) {
1365             goto close_and_fail;
1366         }
1367     }
1368 
1369 done:
1370     /* Check if any unknown options were used */
1371     if (options && (qdict_size(options) != 0)) {
1372         const QDictEntry *entry = qdict_first(options);
1373         if (flags & BDRV_O_PROTOCOL) {
1374             error_setg(errp, "Block protocol '%s' doesn't support the option "
1375                        "'%s'", drv->format_name, entry->key);
1376         } else {
1377             error_setg(errp, "Block format '%s' used by device '%s' doesn't "
1378                        "support the option '%s'", drv->format_name,
1379                        bs->device_name, entry->key);
1380         }
1381 
1382         ret = -EINVAL;
1383         goto close_and_fail;
1384     }
1385     QDECREF(options);
1386 
1387     if (!bdrv_key_required(bs)) {
1388         bdrv_dev_change_media_cb(bs, true);
1389     }
1390 
1391     *pbs = bs;
1392     return 0;
1393 
1394 unlink_and_fail:
1395     if (file != NULL) {
1396         bdrv_unref(file);
1397     }
1398     if (bs->is_temporary) {
1399         unlink(filename);
1400     }
1401 fail:
1402     QDECREF(bs->options);
1403     QDECREF(options);
1404     bs->options = NULL;
1405     if (!*pbs) {
1406         /* If *pbs is NULL, a new BDS has been created in this function and
1407            needs to be freed now. Otherwise, it does not need to be closed,
1408            since it has not really been opened yet. */
1409         bdrv_unref(bs);
1410     }
1411     if (local_err) {
1412         error_propagate(errp, local_err);
1413     }
1414     return ret;
1415 
1416 close_and_fail:
1417     /* See fail path, but now the BDS always has to be closed */
1418     if (*pbs) {
1419         bdrv_close(bs);
1420     } else {
1421         bdrv_unref(bs);
1422     }
1423     QDECREF(options);
1424     if (local_err) {
1425         error_propagate(errp, local_err);
1426     }
1427     return ret;
1428 }
1429 
1430 typedef struct BlockReopenQueueEntry {
1431      bool prepared;
1432      BDRVReopenState state;
1433      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1434 } BlockReopenQueueEntry;
1435 
1436 /*
1437  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1438  * reopen of multiple devices.
1439  *
1440  * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLEQ_INIT
1441  * already performed, or alternatively may be NULL, in which case a new one will
1442  * be created and initialized. This newly created BlockReopenQueue should be
1443  * passed back in for subsequent calls that are intended to be of the same
1444  * atomic 'set'.
1445  *
1446  * bs is the BlockDriverState to add to the reopen queue.
1447  *
1448  * flags contains the open flags for the associated bs
1449  *
1450  * returns a pointer to bs_queue, which is either the newly allocated
1451  * bs_queue, or the existing bs_queue being used.
1452  *
1453  */
1454 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1455                                     BlockDriverState *bs, int flags)
1456 {
1457     assert(bs != NULL);
1458 
1459     BlockReopenQueueEntry *bs_entry;
1460     if (bs_queue == NULL) {
1461         bs_queue = g_new0(BlockReopenQueue, 1);
1462         QSIMPLEQ_INIT(bs_queue);
1463     }
1464 
1465     if (bs->file) {
1466         bdrv_reopen_queue(bs_queue, bs->file, flags);
1467     }
1468 
1469     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1470     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1471 
1472     bs_entry->state.bs = bs;
1473     bs_entry->state.flags = flags;
1474 
1475     return bs_queue;
1476 }
1477 
1478 /*
1479  * Reopen multiple BlockDriverStates atomically & transactionally.
1480  *
1481  * The queue passed in (bs_queue) must have been built up previously
1482  * via bdrv_reopen_queue().
1483  *
1484  * Reopens all BDS specified in the queue, with the appropriate
1485  * flags.  All devices are prepared for reopen, and failure of any
1486  * device will cause all device changes to be abandoned, and intermediate
1487  * data cleaned up.
1488  *
1489  * If all devices prepare successfully, then the changes are committed
1490  * to all devices.
1491  *
1492  */
1493 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1494 {
1495     int ret = -1;
1496     BlockReopenQueueEntry *bs_entry, *next;
1497     Error *local_err = NULL;
1498 
1499     assert(bs_queue != NULL);
1500 
1501     bdrv_drain_all();
1502 
1503     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1504         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1505             error_propagate(errp, local_err);
1506             goto cleanup;
1507         }
1508         bs_entry->prepared = true;
1509     }
1510 
1511     /* If we reach this point, we have success and just need to apply the
1512      * changes
1513      */
1514     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1515         bdrv_reopen_commit(&bs_entry->state);
1516     }
1517 
1518     ret = 0;
1519 
1520 cleanup:
1521     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1522         if (ret && bs_entry->prepared) {
1523             bdrv_reopen_abort(&bs_entry->state);
1524         }
1525         g_free(bs_entry);
1526     }
1527     g_free(bs_queue);
1528     return ret;
1529 }
1530 
1531 
1532 /* Reopen a single BlockDriverState with the specified flags. */
1533 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1534 {
1535     int ret = -1;
1536     Error *local_err = NULL;
1537     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1538 
1539     ret = bdrv_reopen_multiple(queue, &local_err);
1540     if (local_err != NULL) {
1541         error_propagate(errp, local_err);
1542     }
1543     return ret;
1544 }
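
/*
 * Illustrative sketch, not part of the original file: atomically reopening
 * a device read-only.  bdrv_reopen_queue() also queues bs->file, and
 * bdrv_reopen_multiple() frees the queue, so this mirrors bdrv_reopen()
 * above with modified flags.
 */
static int G_GNUC_UNUSED reopen_read_only_example(BlockDriverState *bs,
                                                  Error **errp)
{
    int flags = bs->open_flags & ~BDRV_O_RDWR;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, flags);

    return bdrv_reopen_multiple(queue, errp);
}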
1545 
1546 
1547 /*
1548  * Prepares a BlockDriverState for reopen. All changes are staged in the
1549  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1550  * the block driver's .bdrv_reopen_prepare() implementation
1551  *
1552  * bs is the BlockDriverState to reopen
1553  * flags are the new open flags
1554  * queue is the reopen queue
1555  *
1556  * Returns 0 on success, non-zero on error.  On error errp will be set
1557  * as well.
1558  *
1559  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1560  * It is the responsibility of the caller to then call bdrv_reopen_abort() or
1561  * bdrv_reopen_commit() for any other BDS that have been left in a prepare() state.
1562  *
1563  */
1564 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1565                         Error **errp)
1566 {
1567     int ret = -1;
1568     Error *local_err = NULL;
1569     BlockDriver *drv;
1570 
1571     assert(reopen_state != NULL);
1572     assert(reopen_state->bs->drv != NULL);
1573     drv = reopen_state->bs->drv;
1574 
1575     /* if we are to stay read-only, do not allow permission change
1576      * to r/w */
1577     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1578         reopen_state->flags & BDRV_O_RDWR) {
1579         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1580                   reopen_state->bs->device_name);
1581         goto error;
1582     }
1583 
1584 
1585     ret = bdrv_flush(reopen_state->bs);
1586     if (ret) {
1587         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1588                   strerror(-ret));
1589         goto error;
1590     }
1591 
1592     if (drv->bdrv_reopen_prepare) {
1593         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1594         if (ret) {
1595             if (local_err != NULL) {
1596                 error_propagate(errp, local_err);
1597             } else {
1598                 error_setg(errp, "failed while preparing to reopen image '%s'",
1599                            reopen_state->bs->filename);
1600             }
1601             goto error;
1602         }
1603     } else {
1604         /* It is currently mandatory to have a bdrv_reopen_prepare()
1605          * handler for each supported drv. */
1606         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1607                   drv->format_name, reopen_state->bs->device_name,
1608                   "reopening of file");
1609         ret = -1;
1610         goto error;
1611     }
1612 
1613     ret = 0;
1614 
1615 error:
1616     return ret;
1617 }
1618 
1619 /*
1620  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1621  * makes them final by swapping the staging BlockDriverState contents into
1622  * the active BlockDriverState contents.
1623  */
1624 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1625 {
1626     BlockDriver *drv;
1627 
1628     assert(reopen_state != NULL);
1629     drv = reopen_state->bs->drv;
1630     assert(drv != NULL);
1631 
1632     /* If there are any driver level actions to take */
1633     if (drv->bdrv_reopen_commit) {
1634         drv->bdrv_reopen_commit(reopen_state);
1635     }
1636 
1637     /* set BDS specific flags now */
1638     reopen_state->bs->open_flags         = reopen_state->flags;
1639     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1640                                               BDRV_O_CACHE_WB);
1641     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1642 
1643     bdrv_refresh_limits(reopen_state->bs);
1644 }
1645 
1646 /*
1647  * Abort the reopen, and delete and free the staged changes in
1648  * reopen_state
1649  */
1650 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1651 {
1652     BlockDriver *drv;
1653 
1654     assert(reopen_state != NULL);
1655     drv = reopen_state->bs->drv;
1656     assert(drv != NULL);
1657 
1658     if (drv->bdrv_reopen_abort) {
1659         drv->bdrv_reopen_abort(reopen_state);
1660     }
1661 }
1662 
1663 
1664 void bdrv_close(BlockDriverState *bs)
1665 {
1666     if (bs->job) {
1667         block_job_cancel_sync(bs->job);
1668     }
1669     bdrv_drain_all(); /* complete I/O */
1670     bdrv_flush(bs);
1671     bdrv_drain_all(); /* in case flush left pending I/O */
1672     notifier_list_notify(&bs->close_notifiers, bs);
1673 
1674     if (bs->drv) {
1675         if (bs->backing_hd) {
1676             bdrv_unref(bs->backing_hd);
1677             bs->backing_hd = NULL;
1678         }
1679         bs->drv->bdrv_close(bs);
1680         g_free(bs->opaque);
1681 #ifdef _WIN32
1682         if (bs->is_temporary) {
1683             unlink(bs->filename);
1684         }
1685 #endif
1686         bs->opaque = NULL;
1687         bs->drv = NULL;
1688         bs->copy_on_read = 0;
1689         bs->backing_file[0] = '\0';
1690         bs->backing_format[0] = '\0';
1691         bs->total_sectors = 0;
1692         bs->encrypted = 0;
1693         bs->valid_key = 0;
1694         bs->sg = 0;
1695         bs->growable = 0;
1696         bs->zero_beyond_eof = false;
1697         QDECREF(bs->options);
1698         bs->options = NULL;
1699 
1700         if (bs->file != NULL) {
1701             bdrv_unref(bs->file);
1702             bs->file = NULL;
1703         }
1704     }
1705 
1706     bdrv_dev_change_media_cb(bs, false);
1707 
1708     /* throttling disk I/O limits */
1709     if (bs->io_limits_enabled) {
1710         bdrv_io_limits_disable(bs);
1711     }
1712 }
1713 
1714 void bdrv_close_all(void)
1715 {
1716     BlockDriverState *bs;
1717 
1718     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1719         bdrv_close(bs);
1720     }
1721 }
1722 
1723 /* Check if any requests are in-flight (including throttled requests) */
1724 static bool bdrv_requests_pending(BlockDriverState *bs)
1725 {
1726     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1727         return true;
1728     }
1729     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1730         return true;
1731     }
1732     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1733         return true;
1734     }
1735     if (bs->file && bdrv_requests_pending(bs->file)) {
1736         return true;
1737     }
1738     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1739         return true;
1740     }
1741     return false;
1742 }
1743 
1744 static bool bdrv_requests_pending_all(void)
1745 {
1746     BlockDriverState *bs;
1747     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1748         if (bdrv_requests_pending(bs)) {
1749             return true;
1750         }
1751     }
1752     return false;
1753 }
1754 
1755 /*
1756  * Wait for pending requests to complete across all BlockDriverStates
1757  *
1758  * This function does not flush data to disk, use bdrv_flush_all() for that
1759  * after calling this function.
1760  *
1761  * Note that completion of an asynchronous I/O operation can trigger any
1762  * number of other I/O operations on other devices---for example a coroutine
1763  * can be arbitrarily complex and a constant flow of I/O can come until the
1764  * coroutine is complete.  Because of this, it is not possible to have a
1765  * function to drain a single device's I/O queue.
1766  */
1767 void bdrv_drain_all(void)
1768 {
1769     /* Always run first iteration so any pending completion BHs run */
1770     bool busy = true;
1771     BlockDriverState *bs;
1772 
1773     while (busy) {
1774         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1775             bdrv_start_throttled_reqs(bs);
1776         }
1777 
1778         busy = bdrv_requests_pending_all();
1779         busy |= aio_poll(qemu_get_aio_context(), busy);
1780     }
1781 }
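
/*
 * Illustrative usage (added for this edit; not part of the original
 * source): a global operation that needs a quiescent, durable view of
 * all images follows the same drain-then-flush order that bdrv_close()
 * uses above:
 */
static void example_quiesce_all(void)
{
    bdrv_drain_all();  /* let all in-flight requests complete */
    bdrv_flush_all();  /* then push completed writes to stable storage */
}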
1782 
1783 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
1784  * and graph_bdrv_states lists.  Also, NUL-terminate device_name and
1785  * node_name to prevent double removal. */
1786 void bdrv_make_anon(BlockDriverState *bs)
1787 {
1788     if (bs->device_name[0] != '\0') {
1789         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1790     }
1791     bs->device_name[0] = '\0';
1792     if (bs->node_name[0] != '\0') {
1793         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1794     }
1795     bs->node_name[0] = '\0';
1796 }
1797 
1798 static void bdrv_rebind(BlockDriverState *bs)
1799 {
1800     if (bs->drv && bs->drv->bdrv_rebind) {
1801         bs->drv->bdrv_rebind(bs);
1802     }
1803 }
1804 
1805 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1806                                      BlockDriverState *bs_src)
1807 {
1808     /* move some fields that need to stay attached to the device */
1809     bs_dest->open_flags         = bs_src->open_flags;
1810 
1811     /* dev info */
1812     bs_dest->dev_ops            = bs_src->dev_ops;
1813     bs_dest->dev_opaque         = bs_src->dev_opaque;
1814     bs_dest->dev                = bs_src->dev;
1815     bs_dest->guest_block_size   = bs_src->guest_block_size;
1816     bs_dest->copy_on_read       = bs_src->copy_on_read;
1817 
1818     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1819 
1820     /* i/o throttled req */
1821     memcpy(&bs_dest->throttle_state,
1822            &bs_src->throttle_state,
1823            sizeof(ThrottleState));
1824     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1825     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1826     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1827 
1828     /* r/w error */
1829     bs_dest->on_read_error      = bs_src->on_read_error;
1830     bs_dest->on_write_error     = bs_src->on_write_error;
1831 
1832     /* i/o status */
1833     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1834     bs_dest->iostatus           = bs_src->iostatus;
1835 
1836     /* dirty bitmap */
1837     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1838 
1839     /* reference count */
1840     bs_dest->refcnt             = bs_src->refcnt;
1841 
1842     /* job */
1843     bs_dest->in_use             = bs_src->in_use;
1844     bs_dest->job                = bs_src->job;
1845 
1846     /* keep the same entry in bdrv_states */
1847     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1848             bs_src->device_name);
1849     bs_dest->device_list = bs_src->device_list;
1850 
1851     /* keep the same entry in graph_bdrv_states
1852      * We do want to swap the name, but not the linked-list entries
1853      */
1854     bs_dest->node_list   = bs_src->node_list;
1855 }
1856 
1857 /*
1858  * Swap bs contents for two image chains while they are live,
1859  * while keeping required fields on the BlockDriverState that is
1860  * actually attached to a device.
1861  *
1862  * This will modify the BlockDriverState fields, and swap contents
1863  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1864  *
1865  * bs_new is required to be anonymous.
1866  *
1867  * This function does not create any image files.
1868  */
1869 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1870 {
1871     BlockDriverState tmp;
1872 
1873     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1874     assert(bs_new->device_name[0] == '\0');
1875     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1876     assert(bs_new->job == NULL);
1877     assert(bs_new->dev == NULL);
1878     assert(bs_new->in_use == 0);
1879     assert(bs_new->io_limits_enabled == false);
1880     assert(!throttle_have_timer(&bs_new->throttle_state));
1881 
1882     tmp = *bs_new;
1883     *bs_new = *bs_old;
1884     *bs_old = tmp;
1885 
1886     /* there are some fields that should not be swapped, move them back */
1887     bdrv_move_feature_fields(&tmp, bs_old);
1888     bdrv_move_feature_fields(bs_old, bs_new);
1889     bdrv_move_feature_fields(bs_new, &tmp);
1890 
1891     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1892     assert(bs_new->device_name[0] == '\0');
1893 
1894     /* Check a few fields that should remain attached to the device */
1895     assert(bs_new->dev == NULL);
1896     assert(bs_new->job == NULL);
1897     assert(bs_new->in_use == 0);
1898     assert(bs_new->io_limits_enabled == false);
1899     assert(!throttle_have_timer(&bs_new->throttle_state));
1900 
1901     bdrv_rebind(bs_new);
1902     bdrv_rebind(bs_old);
1903 }
1904 
1905 /*
1906  * Add new bs contents at the top of an image chain while the chain is
1907  * live, while keeping required fields on the top layer.
1908  *
1909  * This will modify the BlockDriverState fields, and swap contents
1910  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1911  *
1912  * bs_new is required to be anonymous.
1913  *
1914  * This function does not create any image files.
1915  */
1916 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1917 {
1918     bdrv_swap(bs_new, bs_top);
1919 
1920     /* After the swap, bs_new holds what used to be bs_top's contents,
1921      * so it becomes the backing file of the new top. */
1922     bs_top->backing_hd = bs_new;
1923     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1924     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1925             bs_new->filename);
1926     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1927             bs_new->drv ? bs_new->drv->format_name : "");
1928 }
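
/*
 * Illustrative usage (added for this edit; not part of the original
 * source): bdrv_append() is how a live external snapshot is taken.
 * 'overlay' is assumed to be a freshly opened, anonymous BDS whose
 * backing file is the active image:
 */
static void example_take_external_snapshot(BlockDriverState *active,
                                           BlockDriverState *overlay)
{
    /* After this call the device still sees 'active', but its contents
     * are those of the new overlay; the old contents live on in
     * 'overlay', now installed as active->backing_hd. */
    bdrv_append(overlay, active);
}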
1929 
1930 static void bdrv_delete(BlockDriverState *bs)
1931 {
1932     assert(!bs->dev);
1933     assert(!bs->job);
1934     assert(!bs->in_use);
1935     assert(!bs->refcnt);
1936     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1937 
1938     bdrv_close(bs);
1939 
1940     /* remove from list, if necessary */
1941     bdrv_make_anon(bs);
1942 
1943     g_free(bs);
1944 }
1945 
1946 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1947 /* TODO change to DeviceState *dev when all users are qdevified */
1948 {
1949     if (bs->dev) {
1950         return -EBUSY;
1951     }
1952     bs->dev = dev;
1953     bdrv_iostatus_reset(bs);
1954     return 0;
1955 }
1956 
1957 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1958 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1959 {
1960     if (bdrv_attach_dev(bs, dev) < 0) {
1961         abort();
1962     }
1963 }
1964 
1965 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1966 /* TODO change to DeviceState *dev when all users are qdevified */
1967 {
1968     assert(bs->dev == dev);
1969     bs->dev = NULL;
1970     bs->dev_ops = NULL;
1971     bs->dev_opaque = NULL;
1972     bs->guest_block_size = 512;
1973 }
1974 
1975 /* TODO change to return DeviceState * when all users are qdevified */
1976 void *bdrv_get_attached_dev(BlockDriverState *bs)
1977 {
1978     return bs->dev;
1979 }
1980 
1981 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1982                       void *opaque)
1983 {
1984     bs->dev_ops = ops;
1985     bs->dev_opaque = opaque;
1986 }
1987 
1988 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1989                                enum MonitorEvent ev,
1990                                BlockErrorAction action, bool is_read)
1991 {
1992     QObject *data;
1993     const char *action_str;
1994 
1995     switch (action) {
1996     case BDRV_ACTION_REPORT:
1997         action_str = "report";
1998         break;
1999     case BDRV_ACTION_IGNORE:
2000         action_str = "ignore";
2001         break;
2002     case BDRV_ACTION_STOP:
2003         action_str = "stop";
2004         break;
2005     default:
2006         abort();
2007     }
2008 
2009     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2010                               bdrv->device_name,
2011                               action_str,
2012                               is_read ? "read" : "write");
2013     monitor_protocol_event(ev, data);
2014 
2015     qobject_decref(data);
2016 }
2017 
2018 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2019 {
2020     QObject *data;
2021 
2022     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2023                               bdrv_get_device_name(bs), ejected);
2024     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2025 
2026     qobject_decref(data);
2027 }
2028 
2029 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2030 {
2031     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2032         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2033         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2034         if (tray_was_closed) {
2035             /* tray open */
2036             bdrv_emit_qmp_eject_event(bs, true);
2037         }
2038         if (load) {
2039             /* tray close */
2040             bdrv_emit_qmp_eject_event(bs, false);
2041         }
2042     }
2043 }
2044 
2045 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2046 {
2047     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2048 }
2049 
2050 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2051 {
2052     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2053         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2054     }
2055 }
2056 
2057 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2058 {
2059     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2060         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2061     }
2062     return false;
2063 }
2064 
2065 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2066 {
2067     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2068         bs->dev_ops->resize_cb(bs->dev_opaque);
2069     }
2070 }
2071 
2072 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2073 {
2074     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2075         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2076     }
2077     return false;
2078 }
2079 
2080 /*
2081  * Run consistency checks on an image
2082  *
2083  * Returns 0 if the check could be completed (it doesn't mean that the image is
2084  * free of errors) or -errno when an internal error occurred. The results of the
2085  * check are stored in res.
2086  */
2087 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2088 {
2089     if (bs->drv->bdrv_check == NULL) {
2090         return -ENOTSUP;
2091     }
2092 
2093     memset(res, 0, sizeof(*res));
2094     return bs->drv->bdrv_check(bs, res, fix);
2095 }
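
/*
 * Illustrative usage (added for this edit): a repairing check, assuming
 * the BDRV_FIX_* BdrvCheckMode flags from block.h:
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);
 */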
2096 
2097 #define COMMIT_BUF_SECTORS 2048
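/* 2048 sectors * 512 bytes per sector = 1 MiB copied per loop iteration */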
2098 
2099 /* commit COW file into the raw image */
2100 int bdrv_commit(BlockDriverState *bs)
2101 {
2102     BlockDriver *drv = bs->drv;
2103     int64_t sector, total_sectors, length, backing_length;
2104     int n, ro, open_flags;
2105     int ret = 0;
2106     uint8_t *buf = NULL;
2107     char filename[PATH_MAX];
2108 
2109     if (!drv)
2110         return -ENOMEDIUM;
2111 
2112     if (!bs->backing_hd) {
2113         return -ENOTSUP;
2114     }
2115 
2116     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2117         return -EBUSY;
2118     }
2119 
2120     ro = bs->backing_hd->read_only;
2121     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2122     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2123     open_flags =  bs->backing_hd->open_flags;
2124 
2125     if (ro) {
2126         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2127             return -EACCES;
2128         }
2129     }
2130 
2131     length = bdrv_getlength(bs);
2132     if (length < 0) {
2133         ret = length;
2134         goto ro_cleanup;
2135     }
2136 
2137     backing_length = bdrv_getlength(bs->backing_hd);
2138     if (backing_length < 0) {
2139         ret = backing_length;
2140         goto ro_cleanup;
2141     }
2142 
2143     /* If our top snapshot is larger than the backing file image,
2144      * grow the backing file image if possible.  If not possible,
2145      * we must return an error */
2146     if (length > backing_length) {
2147         ret = bdrv_truncate(bs->backing_hd, length);
2148         if (ret < 0) {
2149             goto ro_cleanup;
2150         }
2151     }
2152 
2153     total_sectors = length >> BDRV_SECTOR_BITS;
2154     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2155 
2156     for (sector = 0; sector < total_sectors; sector += n) {
2157         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2158         if (ret < 0) {
2159             goto ro_cleanup;
2160         }
2161         if (ret) {
2162             ret = bdrv_read(bs, sector, buf, n);
2163             if (ret < 0) {
2164                 goto ro_cleanup;
2165             }
2166 
2167             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2168             if (ret < 0) {
2169                 goto ro_cleanup;
2170             }
2171         }
2172     }
2173 
2174     if (drv->bdrv_make_empty) {
2175         ret = drv->bdrv_make_empty(bs);
2176         if (ret < 0) {
2177             goto ro_cleanup;
2178         }
2179         bdrv_flush(bs);
2180     }
2181 
2182     /*
2183      * Make sure all data we wrote to the backing device is actually
2184      * stable on disk.
2185      */
2186     if (bs->backing_hd) {
2187         bdrv_flush(bs->backing_hd);
2188     }
2189 
2190     ret = 0;
2191 ro_cleanup:
2192     g_free(buf);
2193 
2194     if (ro) {
2195         /* ignoring error return here */
2196         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2197     }
2198 
2199     return ret;
2200 }
2201 
2202 int bdrv_commit_all(void)
2203 {
2204     BlockDriverState *bs;
2205 
2206     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2207         if (bs->drv && bs->backing_hd) {
2208             int ret = bdrv_commit(bs);
2209             if (ret < 0) {
2210                 return ret;
2211             }
2212         }
2213     }
2214     return 0;
2215 }
2216 
2217 /**
2218  * Remove an active request from the tracked requests list
2219  *
2220  * This function should be called when a tracked request is completing.
2221  */
2222 static void tracked_request_end(BdrvTrackedRequest *req)
2223 {
2224     if (req->serialising) {
2225         req->bs->serialising_in_flight--;
2226     }
2227 
2228     QLIST_REMOVE(req, list);
2229     qemu_co_queue_restart_all(&req->wait_queue);
2230 }
2231 
2232 /**
2233  * Add an active request to the tracked requests list
2234  */
2235 static void tracked_request_begin(BdrvTrackedRequest *req,
2236                                   BlockDriverState *bs,
2237                                   int64_t offset,
2238                                   unsigned int bytes, bool is_write)
2239 {
2240     *req = (BdrvTrackedRequest){
2241         .bs = bs,
2242         .offset         = offset,
2243         .bytes          = bytes,
2244         .is_write       = is_write,
2245         .co             = qemu_coroutine_self(),
2246         .serialising    = false,
2247         .overlap_offset = offset,
2248         .overlap_bytes  = bytes,
2249     };
2250 
2251     qemu_co_queue_init(&req->wait_queue);
2252 
2253     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2254 }
2255 
2256 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2257 {
2258     int64_t overlap_offset = req->offset & ~(align - 1);
2259     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2260                                - overlap_offset;
2261 
2262     if (!req->serialising) {
2263         req->bs->serialising_in_flight++;
2264         req->serialising = true;
2265     }
2266 
2267     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2268     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2269 }
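
/*
 * Worked example (added for this edit): with align = 4096, a request at
 * offset 5000 for 100 bytes rounds to overlap_offset = 4096 and
 * overlap_bytes = ROUND_UP(5100, 4096) - 4096 = 4096, i.e. it
 * serialises against anything touching the same 4 KiB block.
 */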
2270 
2271 /**
2272  * Round a region to cluster boundaries
2273  */
2274 void bdrv_round_to_clusters(BlockDriverState *bs,
2275                             int64_t sector_num, int nb_sectors,
2276                             int64_t *cluster_sector_num,
2277                             int *cluster_nb_sectors)
2278 {
2279     BlockDriverInfo bdi;
2280 
2281     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2282         *cluster_sector_num = sector_num;
2283         *cluster_nb_sectors = nb_sectors;
2284     } else {
2285         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2286         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2287         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2288                                             nb_sectors, c);
2289     }
2290 }
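
/*
 * Worked example (added for this edit): with a 64 KiB cluster size,
 * c = 65536 / 512 = 128 sectors, so a request for sectors [130, 134)
 * rounds out to the whole cluster [128, 256):
 * QEMU_ALIGN_DOWN(130, 128) = 128 and QEMU_ALIGN_UP(6, 128) = 128.
 */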
2291 
2292 static int bdrv_get_cluster_size(BlockDriverState *bs)
2293 {
2294     BlockDriverInfo bdi;
2295     int ret;
2296 
2297     ret = bdrv_get_info(bs, &bdi);
2298     if (ret < 0 || bdi.cluster_size == 0) {
2299         return bs->request_alignment;
2300     } else {
2301         return bdi.cluster_size;
2302     }
2303 }
2304 
2305 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2306                                      int64_t offset, unsigned int bytes)
2307 {
2308     /*        aaaa   bbbb */
2309     if (offset >= req->overlap_offset + req->overlap_bytes) {
2310         return false;
2311     }
2312     /* bbbb   aaaa        */
2313     if (req->overlap_offset >= offset + bytes) {
2314         return false;
2315     }
2316     return true;
2317 }
2318 
2319 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2320 {
2321     BlockDriverState *bs = self->bs;
2322     BdrvTrackedRequest *req;
2323     bool retry;
2324     bool waited = false;
2325 
2326     if (!bs->serialising_in_flight) {
2327         return false;
2328     }
2329 
2330     do {
2331         retry = false;
2332         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2333             if (req == self || (!req->serialising && !self->serialising)) {
2334                 continue;
2335             }
2336             if (tracked_request_overlaps(req, self->overlap_offset,
2337                                          self->overlap_bytes))
2338             {
2339                 /* Hitting this means there was a reentrant request, for
2340                  * example, a block driver issuing nested requests.  This must
2341                  * never happen since it means deadlock.
2342                  */
2343                 assert(qemu_coroutine_self() != req->co);
2344 
2345                 /* If the request is already (indirectly) waiting for us, or
2346                  * will wait for us as soon as it wakes up, then just go on
2347                  * (instead of producing a deadlock in the former case). */
2348                 if (!req->waiting_for) {
2349                     self->waiting_for = req;
2350                     qemu_co_queue_wait(&req->wait_queue);
2351                     self->waiting_for = NULL;
2352                     retry = true;
2353                     waited = true;
2354                     break;
2355                 }
2356             }
2357         }
2358     } while (retry);
2359 
2360     return waited;
2361 }
2362 
2363 /*
2364  * Return values:
2365  * 0        - success
2366  * -EINVAL  - backing format specified, but no file
2367  * -ENOSPC  - can't update the backing file because no space is left in the
2368  *            image file header
2369  * -ENOTSUP - format driver doesn't support changing the backing file
2370  */
2371 int bdrv_change_backing_file(BlockDriverState *bs,
2372     const char *backing_file, const char *backing_fmt)
2373 {
2374     BlockDriver *drv = bs->drv;
2375     int ret;
2376 
2377     /* Backing file format doesn't make sense without a backing file */
2378     if (backing_fmt && !backing_file) {
2379         return -EINVAL;
2380     }
2381 
2382     if (drv->bdrv_change_backing_file != NULL) {
2383         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2384     } else {
2385         ret = -ENOTSUP;
2386     }
2387 
2388     if (ret == 0) {
2389         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2390         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2391     }
2392     return ret;
2393 }
2394 
2395 /*
2396  * Finds the image layer in the chain that has 'bs' as its backing file.
2397  *
2398  * active is the current topmost image.
2399  *
2400  * Returns NULL if bs is not found in active's image chain,
2401  * or if active == bs.
2402  */
2403 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2404                                     BlockDriverState *bs)
2405 {
2406     BlockDriverState *overlay = NULL;
2407     BlockDriverState *intermediate;
2408 
2409     assert(active != NULL);
2410     assert(bs != NULL);
2411 
2412     /* if bs is the same as active, then by definition it has no overlay
2413      */
2414     if (active == bs) {
2415         return NULL;
2416     }
2417 
2418     intermediate = active;
2419     while (intermediate->backing_hd) {
2420         if (intermediate->backing_hd == bs) {
2421             overlay = intermediate;
2422             break;
2423         }
2424         intermediate = intermediate->backing_hd;
2425     }
2426 
2427     return overlay;
2428 }
2429 
2430 typedef struct BlkIntermediateStates {
2431     BlockDriverState *bs;
2432     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2433 } BlkIntermediateStates;
2434 
2435 
2436 /*
2437  * Drops images above 'base' up to and including 'top', and sets the image
2438  * above 'top' to have base as its backing file.
2439  *
2440  * Requires that the overlay of 'top' is opened r/w, so that its backing
2441  * file information can be properly updated.
2442  *
2443  * E.g., this will convert the following chain:
2444  * bottom <- base <- intermediate <- top <- active
2445  *
2446  * to
2447  *
2448  * bottom <- base <- active
2449  *
2450  * It is allowed for bottom==base, in which case it converts:
2451  *
2452  * base <- intermediate <- top <- active
2453  *
2454  * to
2455  *
2456  * base <- active
2457  *
2458  * Error conditions:
2459  *  if active == top, that is considered an error
2460  *
2461  */
2462 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2463                            BlockDriverState *base)
2464 {
2465     BlockDriverState *intermediate;
2466     BlockDriverState *base_bs = NULL;
2467     BlockDriverState *new_top_bs = NULL;
2468     BlkIntermediateStates *intermediate_state, *next;
2469     int ret = -EIO;
2470 
2471     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2472     QSIMPLEQ_INIT(&states_to_delete);
2473 
2474     if (!top->drv || !base->drv) {
2475         goto exit;
2476     }
2477 
2478     new_top_bs = bdrv_find_overlay(active, top);
2479 
2480     if (new_top_bs == NULL) {
2481         /* we could not find the image above 'top', this is an error */
2482         goto exit;
2483     }
2484 
2485     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2486      * to do, no intermediate images */
2487     if (new_top_bs->backing_hd == base) {
2488         ret = 0;
2489         goto exit;
2490     }
2491 
2492     intermediate = top;
2493 
2494     /* now we will go down through the list, and add each BDS we find
2495      * into our deletion queue, until we hit the 'base'
2496      */
2497     while (intermediate) {
2498         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2499         intermediate_state->bs = intermediate;
2500         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2501 
2502         if (intermediate->backing_hd == base) {
2503             base_bs = intermediate->backing_hd;
2504             break;
2505         }
2506         intermediate = intermediate->backing_hd;
2507     }
2508     if (base_bs == NULL) {
2509         /* something went wrong: we did not end at the base; safely
2510          * unwind everything and exit with an error */
2511         goto exit;
2512     }
2513 
2514     /* success - we can delete the intermediate states, and link top->base */
2515     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2516                                    base_bs->drv ? base_bs->drv->format_name : "");
2517     if (ret) {
2518         goto exit;
2519     }
2520     new_top_bs->backing_hd = base_bs;
2521 
2522     bdrv_refresh_limits(new_top_bs);
2523 
2524     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2525         /* so that bdrv_close() does not recursively close the chain */
2526         intermediate_state->bs->backing_hd = NULL;
2527         bdrv_unref(intermediate_state->bs);
2528     }
2529     ret = 0;
2530 
2531 exit:
2532     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2533         g_free(intermediate_state);
2534     }
2535     return ret;
2536 }
2537 
2538 
2539 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2540                                    size_t size)
2541 {
2542     int64_t len;
2543 
2544     if (!bdrv_is_inserted(bs))
2545         return -ENOMEDIUM;
2546 
2547     if (bs->growable)
2548         return 0;
2549 
2550     len = bdrv_getlength(bs);
2551 
2552     if (offset < 0)
2553         return -EIO;
2554 
2555     if ((offset > len) || (len - offset < size))
2556         return -EIO;
2557 
2558     return 0;
2559 }
2560 
2561 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2562                               int nb_sectors)
2563 {
2564     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2565                                    nb_sectors * BDRV_SECTOR_SIZE);
2566 }
2567 
2568 typedef struct RwCo {
2569     BlockDriverState *bs;
2570     int64_t offset;
2571     QEMUIOVector *qiov;
2572     bool is_write;
2573     int ret;
2574     BdrvRequestFlags flags;
2575 } RwCo;
2576 
2577 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2578 {
2579     RwCo *rwco = opaque;
2580 
2581     if (!rwco->is_write) {
2582         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2583                                       rwco->qiov->size, rwco->qiov,
2584                                       rwco->flags);
2585     } else {
2586         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2587                                        rwco->qiov->size, rwco->qiov,
2588                                        rwco->flags);
2589     }
2590 }
2591 
2592 /*
2593  * Process a vectored synchronous request using coroutines
2594  */
2595 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2596                         QEMUIOVector *qiov, bool is_write,
2597                         BdrvRequestFlags flags)
2598 {
2599     Coroutine *co;
2600     RwCo rwco = {
2601         .bs = bs,
2602         .offset = offset,
2603         .qiov = qiov,
2604         .is_write = is_write,
2605         .ret = NOT_DONE,
2606         .flags = flags,
2607     };
2608 
2609     /**
2610      * In sync call context, when the vcpu is blocked, the throttling timer
2611      * will not fire, so I/O throttling has to be disabled here if it has
2612      * been enabled.
2613      */
2614     if (bs->io_limits_enabled) {
2615         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2616                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2617         bdrv_io_limits_disable(bs);
2618     }
2619 
2620     if (qemu_in_coroutine()) {
2621         /* Fast-path if already in coroutine context */
2622         bdrv_rw_co_entry(&rwco);
2623     } else {
2624         co = qemu_coroutine_create(bdrv_rw_co_entry);
2625         qemu_coroutine_enter(co, &rwco);
2626         while (rwco.ret == NOT_DONE) {
2627             qemu_aio_wait();
2628         }
2629     }
2630     return rwco.ret;
2631 }
2632 
2633 /*
2634  * Process a synchronous request using coroutines
2635  */
2636 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2637                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2638 {
2639     QEMUIOVector qiov;
2640     struct iovec iov = {
2641         .iov_base = (void *)buf,
2642         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2643     };
2644 
2645     qemu_iovec_init_external(&qiov, &iov, 1);
2646     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2647                         &qiov, is_write, flags);
2648 }
2649 
2650 /* return < 0 if error. See bdrv_write() for the return codes */
2651 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2652               uint8_t *buf, int nb_sectors)
2653 {
2654     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2655 }
2656 
2657 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2658 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2659                           uint8_t *buf, int nb_sectors)
2660 {
2661     bool enabled;
2662     int ret;
2663 
2664     enabled = bs->io_limits_enabled;
2665     bs->io_limits_enabled = false;
2666     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2667     bs->io_limits_enabled = enabled;
2668     return ret;
2669 }
2670 
2671 /* Return < 0 if error. Important errors are:
2672   -EIO         generic I/O error (may be returned for any failure)
2673   -ENOMEDIUM   No media inserted.
2674   -EINVAL      Invalid sector number or nb_sectors
2675   -EACCES      Trying to write a read-only device
2676 */
2677 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2678                const uint8_t *buf, int nb_sectors)
2679 {
2680     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2681 }
2682 
2683 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2684                       int nb_sectors, BdrvRequestFlags flags)
2685 {
2686     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2687                       BDRV_REQ_ZERO_WRITE | flags);
2688 }
2689 
2690 /*
2691  * Completely zero out a block device with the help of bdrv_write_zeroes.
2692  * The operation is sped up by checking the block status and only writing
2693  * zeroes to sectors that do not already read back as zeroes. Optional
2694  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2695  *
2696  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2697  */
2698 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2699 {
2700     int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2701     int64_t ret, nb_sectors, sector_num = 0;
2702     int n;
2703 
2704     for (;;) {
2705         nb_sectors = target_size - sector_num;
2706         if (nb_sectors <= 0) {
2707             return 0;
2708         }
2709         if (nb_sectors > INT_MAX) {
2710             nb_sectors = INT_MAX;
2711         }
2712         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2713         if (ret < 0) {
2714             error_report("error getting block status at sector %" PRId64 ": %s",
2715                          sector_num, strerror(-ret));
2716             return ret;
2717         }
2718         if (ret & BDRV_BLOCK_ZERO) {
2719             sector_num += n;
2720             continue;
2721         }
2722         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2723         if (ret < 0) {
2724             error_report("error writing zeroes at sector %" PRId64 ": %s",
2725                          sector_num, strerror(-ret));
2726             return ret;
2727         }
2728         sector_num += n;
2729     }
2730 }
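
/*
 * Illustrative usage (added for this edit): zeroing a whole device
 * while allowing the driver to discard instead of writing, where the
 * format supports it:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */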
2731 
2732 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2733 {
2734     QEMUIOVector qiov;
2735     struct iovec iov = {
2736         .iov_base = (void *)buf,
2737         .iov_len = bytes,
2738     };
2739     int ret;
2740 
2741     if (bytes < 0) {
2742         return -EINVAL;
2743     }
2744 
2745     qemu_iovec_init_external(&qiov, &iov, 1);
2746     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2747     if (ret < 0) {
2748         return ret;
2749     }
2750 
2751     return bytes;
2752 }
2753 
2754 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2755 {
2756     int ret;
2757 
2758     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2759     if (ret < 0) {
2760         return ret;
2761     }
2762 
2763     return qiov->size;
2764 }
2765 
2766 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2767                 const void *buf, int bytes)
2768 {
2769     QEMUIOVector qiov;
2770     struct iovec iov = {
2771         .iov_base   = (void *) buf,
2772         .iov_len    = bytes,
2773     };
2774 
2775     if (bytes < 0) {
2776         return -EINVAL;
2777     }
2778 
2779     qemu_iovec_init_external(&qiov, &iov, 1);
2780     return bdrv_pwritev(bs, offset, &qiov);
2781 }
2782 
2783 /*
2784  * Writes to the file and ensures that no writes are reordered across this
2785  * request (acts as a barrier)
2786  *
2787  * Returns 0 on success, -errno in error cases.
2788  */
2789 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2790     const void *buf, int count)
2791 {
2792     int ret;
2793 
2794     ret = bdrv_pwrite(bs, offset, buf, count);
2795     if (ret < 0) {
2796         return ret;
2797     }
2798 
2799     /* No flush needed for cache modes that already do it */
2800     if (bs->enable_write_cache) {
2801         bdrv_flush(bs);
2802     }
2803 
2804     return 0;
2805 }
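
/*
 * Illustrative usage (added for this edit; "offset 0 holds a one-byte
 * dirty flag" is a hypothetical format layout): a metadata update that
 * must be stable before any later write is issued:
 */
static int example_set_dirty_flag(BlockDriverState *bs, uint8_t flag)
{
    /* The barrier semantics above guarantee the flag reaches the disk
     * before subsequent writes can be reordered ahead of it. */
    return bdrv_pwrite_sync(bs, 0, &flag, 1);
}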
2806 
2807 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2808         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2809 {
2810     /* Perform I/O through a temporary buffer so that users who scribble over
2811      * their read buffer while the operation is in progress do not end up
2812      * modifying the image file.  This is critical for zero-copy guest I/O
2813      * where anything might happen inside guest memory.
2814      */
2815     void *bounce_buffer;
2816 
2817     BlockDriver *drv = bs->drv;
2818     struct iovec iov;
2819     QEMUIOVector bounce_qiov;
2820     int64_t cluster_sector_num;
2821     int cluster_nb_sectors;
2822     size_t skip_bytes;
2823     int ret;
2824 
2825     /* Cover the entire cluster so no additional backing file I/O is
2826      * required when allocating a cluster in the image file.
2827      */
2828     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2829                            &cluster_sector_num, &cluster_nb_sectors);
2830 
2831     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2832                                    cluster_sector_num, cluster_nb_sectors);
2833 
2834     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2835     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2836     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2837 
2838     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2839                              &bounce_qiov);
2840     if (ret < 0) {
2841         goto err;
2842     }
2843 
2844     if (drv->bdrv_co_write_zeroes &&
2845         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2846         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2847                                       cluster_nb_sectors, 0);
2848     } else {
2849         /* This does not change the data on the disk, so it is not
2850          * necessary to flush even in cache=writethrough mode.
2851          */
2852         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2853                                   &bounce_qiov);
2854     }
2855 
2856     if (ret < 0) {
2857         /* It might be okay to ignore write errors for guest requests.  If this
2858          * is a deliberate copy-on-read then we don't want to ignore the error.
2859          * Simply report it in all cases.
2860          */
2861         goto err;
2862     }
2863 
2864     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2865     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2866                         nb_sectors * BDRV_SECTOR_SIZE);
2867 
2868 err:
2869     qemu_vfree(bounce_buffer);
2870     return ret;
2871 }
2872 
2873 /*
2874  * Forwards an already correctly aligned request to the BlockDriver. This
2875  * handles copy on read and zeroing after EOF; any other features must be
2876  * implemented by the caller.
2877  */
2878 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2879     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2880     int64_t align, QEMUIOVector *qiov, int flags)
2881 {
2882     BlockDriver *drv = bs->drv;
2883     int ret;
2884 
2885     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2886     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2887 
2888     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2889     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2890 
2891     /* Handle Copy on Read and associated serialisation */
2892     if (flags & BDRV_REQ_COPY_ON_READ) {
2893         /* If we touch the same cluster it counts as an overlap.  This
2894          * guarantees that allocating writes will be serialized and not race
2895          * with each other for the same cluster.  For example, in copy-on-read
2896          * it ensures that the CoR read and write operations are atomic and
2897          * guest writes cannot interleave between them. */
2898         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2899     }
2900 
2901     wait_serialising_requests(req);
2902 
2903     if (flags & BDRV_REQ_COPY_ON_READ) {
2904         int pnum;
2905 
2906         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2907         if (ret < 0) {
2908             goto out;
2909         }
2910 
2911         if (!ret || pnum != nb_sectors) {
2912             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2913             goto out;
2914         }
2915     }
2916 
2917     /* Forward the request to the BlockDriver */
2918     if (!(bs->zero_beyond_eof && bs->growable)) {
2919         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2920     } else {
2921         /* Read zeroes after EOF of growable BDSes */
2922         int64_t len, total_sectors, max_nb_sectors;
2923 
2924         len = bdrv_getlength(bs);
2925         if (len < 0) {
2926             ret = len;
2927             goto out;
2928         }
2929 
2930         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2931         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2932                                   align >> BDRV_SECTOR_BITS);
2933         if (max_nb_sectors > 0) {
2934             ret = drv->bdrv_co_readv(bs, sector_num,
2935                                      MIN(nb_sectors, max_nb_sectors), qiov);
2936         } else {
2937             ret = 0;
2938         }
2939 
2940         /* Reading beyond end of file is supposed to produce zeroes */
2941         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2942             uint64_t offset = MAX(0, total_sectors - sector_num);
2943             uint64_t bytes = (sector_num + nb_sectors - offset) *
2944                               BDRV_SECTOR_SIZE;
2945             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2946         }
2947     }
2948 
2949 out:
2950     return ret;
2951 }
2952 
2953 /*
2954  * Handle a read request in coroutine context
2955  */
2956 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2957     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2958     BdrvRequestFlags flags)
2959 {
2960     BlockDriver *drv = bs->drv;
2961     BdrvTrackedRequest req;
2962 
2963     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2964     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2965     uint8_t *head_buf = NULL;
2966     uint8_t *tail_buf = NULL;
2967     QEMUIOVector local_qiov;
2968     bool use_local_qiov = false;
2969     int ret;
2970 
2971     if (!drv) {
2972         return -ENOMEDIUM;
2973     }
2974     if (bdrv_check_byte_request(bs, offset, bytes)) {
2975         return -EIO;
2976     }
2977 
2978     if (bs->copy_on_read) {
2979         flags |= BDRV_REQ_COPY_ON_READ;
2980     }
2981 
2982     /* throttling disk I/O */
2983     if (bs->io_limits_enabled) {
2984         bdrv_io_limits_intercept(bs, bytes, false);
2985     }
2986 
2987     /* Align read if necessary by padding qiov */
2988     if (offset & (align - 1)) {
2989         head_buf = qemu_blockalign(bs, align);
2990         qemu_iovec_init(&local_qiov, qiov->niov + 2);
2991         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
2992         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
2993         use_local_qiov = true;
2994 
2995         bytes += offset & (align - 1);
2996         offset = offset & ~(align - 1);
2997     }
2998 
2999     if ((offset + bytes) & (align - 1)) {
3000         if (!use_local_qiov) {
3001             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3002             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3003             use_local_qiov = true;
3004         }
3005         tail_buf = qemu_blockalign(bs, align);
3006         qemu_iovec_add(&local_qiov, tail_buf,
3007                        align - ((offset + bytes) & (align - 1)));
3008 
3009         bytes = ROUND_UP(bytes, align);
3010     }
3011 
3012     tracked_request_begin(&req, bs, offset, bytes, false);
3013     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3014                               use_local_qiov ? &local_qiov : qiov,
3015                               flags);
3016     tracked_request_end(&req);
3017 
3018     if (use_local_qiov) {
3019         qemu_iovec_destroy(&local_qiov);
3020         qemu_vfree(head_buf);
3021         qemu_vfree(tail_buf);
3022     }
3023 
3024     return ret;
3025 }
3026 
3027 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3028     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3029     BdrvRequestFlags flags)
3030 {
3031     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3032         return -EINVAL;
3033     }
3034 
3035     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3036                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3037 }
3038 
3039 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3040     int nb_sectors, QEMUIOVector *qiov)
3041 {
3042     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3043 
3044     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3045 }
3046 
3047 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3048     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3049 {
3050     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3051 
3052     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3053                             BDRV_REQ_COPY_ON_READ);
3054 }
3055 
3056 /* if no limit is specified in the BlockLimits use a default
3057  * of 32768 512-byte sectors (16 MiB) per request.
3058  */
3059 #define MAX_WRITE_ZEROES_DEFAULT 32768
3060 
3061 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3062     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3063 {
3064     BlockDriver *drv = bs->drv;
3065     QEMUIOVector qiov;
3066     struct iovec iov = {0};
3067     int ret = 0;
3068 
3069     int max_write_zeroes = bs->bl.max_write_zeroes ?
3070                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3071 
3072     while (nb_sectors > 0 && !ret) {
3073         int num = nb_sectors;
3074 
3075         /* Align request.  Block drivers can expect the "bulk" of the request
3076          * to be aligned.
3077          */
3078         if (bs->bl.write_zeroes_alignment
3079             && num > bs->bl.write_zeroes_alignment) {
3080             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3081                 /* Make a small request up to the first aligned sector.  */
3082                 num = bs->bl.write_zeroes_alignment;
3083                 num -= sector_num % bs->bl.write_zeroes_alignment;
3084             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3085                 /* Shorten the request to the last aligned sector.  num cannot
3086                  * underflow because num > bs->bl.write_zeroes_alignment.
3087                  */
3088                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3089             }
3090         }
3091 
3092         /* limit request size */
3093         if (num > max_write_zeroes) {
3094             num = max_write_zeroes;
3095         }
3096 
3097         ret = -ENOTSUP;
3098         /* First try the efficient write zeroes operation */
3099         if (drv->bdrv_co_write_zeroes) {
3100             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3101         }
3102 
3103         if (ret == -ENOTSUP) {
3104             /* Fall back to bounce buffer if write zeroes is unsupported */
3105             iov.iov_len = num * BDRV_SECTOR_SIZE;
3106             if (iov.iov_base == NULL) {
3107                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3108                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3109             }
3110             qemu_iovec_init_external(&qiov, &iov, 1);
3111 
3112             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3113 
3114             /* Keep the bounce buffer around if it is big enough for
3115              * all future requests.
3116              */
3117             if (num < max_write_zeroes) {
3118                 qemu_vfree(iov.iov_base);
3119                 iov.iov_base = NULL;
3120             }
3121         }
3122 
3123         sector_num += num;
3124         nb_sectors -= num;
3125     }
3126 
3127     qemu_vfree(iov.iov_base);
3128     return ret;
3129 }
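
/*
 * Worked example (added for this edit): with write_zeroes_alignment = 8,
 * a request for sectors [5, 25) is split into three driver calls:
 * [5, 8) to reach the first aligned sector, the aligned bulk [8, 24),
 * and the unaligned tail [24, 25).
 */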
3130 
3131 /*
3132  * Forwards an already correctly aligned write request to the BlockDriver.
3133  */
3134 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3135     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3136     QEMUIOVector *qiov, int flags)
3137 {
3138     BlockDriver *drv = bs->drv;
3139     bool waited;
3140     int ret;
3141 
3142     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3143     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3144 
3145     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3146     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3147 
3148     waited = wait_serialising_requests(req);
3149     assert(!waited || !req->serialising);
3150     assert(req->overlap_offset <= offset);
3151     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3152 
3153     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3154 
3155     if (ret < 0) {
3156         /* Do nothing, write notifier decided to fail this request */
3157     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3158         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3159         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3160     } else {
3161         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3162         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3163     }
3164     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3165 
3166     if (ret == 0 && !bs->enable_write_cache) {
3167         ret = bdrv_co_flush(bs);
3168     }
3169 
3170     bdrv_set_dirty(bs, sector_num, nb_sectors);
3171 
3172     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3173         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3174     }
3175     if (bs->growable && ret >= 0) {
3176         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3177     }
3178 
3179     return ret;
3180 }
3181 
3182 /*
3183  * Handle a write request in coroutine context
3184  */
3185 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3186     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3187     BdrvRequestFlags flags)
3188 {
3189     BdrvTrackedRequest req;
3190     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3191     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3192     uint8_t *head_buf = NULL;
3193     uint8_t *tail_buf = NULL;
3194     QEMUIOVector local_qiov;
3195     bool use_local_qiov = false;
3196     int ret;
3197 
3198     if (!bs->drv) {
3199         return -ENOMEDIUM;
3200     }
3201     if (bs->read_only) {
3202         return -EACCES;
3203     }
3204     if (bdrv_check_byte_request(bs, offset, bytes)) {
3205         return -EIO;
3206     }
3207 
3208     /* throttling disk I/O */
3209     if (bs->io_limits_enabled) {
3210         bdrv_io_limits_intercept(bs, bytes, true);
3211     }
3212 
3213     /*
3214      * Align write if necessary by performing a read-modify-write cycle.
3215      * Pad qiov with the read parts and be sure to have a tracked request not
3216      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3217      */
3218     tracked_request_begin(&req, bs, offset, bytes, true);
3219 
3220     if (offset & (align - 1)) {
3221         QEMUIOVector head_qiov;
3222         struct iovec head_iov;
3223 
3224         mark_request_serialising(&req, align);
3225         wait_serialising_requests(&req);
3226 
3227         head_buf = qemu_blockalign(bs, align);
3228         head_iov = (struct iovec) {
3229             .iov_base   = head_buf,
3230             .iov_len    = align,
3231         };
3232         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3233 
3234         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3235         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3236                                   align, &head_qiov, 0);
3237         if (ret < 0) {
3238             goto fail;
3239         }
3240         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3241 
3242         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3243         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3244         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3245         use_local_qiov = true;
3246 
3247         bytes += offset & (align - 1);
3248         offset = offset & ~(align - 1);
3249     }
3250 
3251     if ((offset + bytes) & (align - 1)) {
3252         QEMUIOVector tail_qiov;
3253         struct iovec tail_iov;
3254         size_t tail_bytes;
3255         bool waited;
3256 
3257         mark_request_serialising(&req, align);
3258         waited = wait_serialising_requests(&req);
3259         assert(!waited || !use_local_qiov);
3260 
3261         tail_buf = qemu_blockalign(bs, align);
3262         tail_iov = (struct iovec) {
3263             .iov_base   = tail_buf,
3264             .iov_len    = align,
3265         };
3266         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3267 
3268         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3269         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3270                                   align, &tail_qiov, 0);
3271         if (ret < 0) {
3272             goto fail;
3273         }
3274         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3275 
3276         if (!use_local_qiov) {
3277             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3278             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3279             use_local_qiov = true;
3280         }
3281 
3282         tail_bytes = (offset + bytes) & (align - 1);
3283         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3284 
3285         bytes = ROUND_UP(bytes, align);
3286     }
3287 
3288     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3289                                use_local_qiov ? &local_qiov : qiov,
3290                                flags);
3291 
3292 fail:
3293     tracked_request_end(&req);
3294 
3295     if (use_local_qiov) {
3296         qemu_iovec_destroy(&local_qiov);
3297     }
3298     qemu_vfree(head_buf);
3299     qemu_vfree(tail_buf);
3300 
3301     return ret;
3302 }
3303 
3304 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3305     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3306     BdrvRequestFlags flags)
3307 {
3308     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3309         return -EINVAL;
3310     }
3311 
3312     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3313                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3314 }
3315 
3316 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3317     int nb_sectors, QEMUIOVector *qiov)
3318 {
3319     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3320 
3321     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3322 }
3323 
3324 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3325                                       int64_t sector_num, int nb_sectors,
3326                                       BdrvRequestFlags flags)
3327 {
3328     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3329 
3330     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3331         flags &= ~BDRV_REQ_MAY_UNMAP;
3332     }
3333 
3334     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3335                              BDRV_REQ_ZERO_WRITE | flags);
3336 }
3337 
3338 /**
3339  * Truncate file to 'offset' bytes (needed only for file protocols)
3340  */
3341 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3342 {
3343     BlockDriver *drv = bs->drv;
3344     int ret;
3345     if (!drv)
3346         return -ENOMEDIUM;
3347     if (!drv->bdrv_truncate)
3348         return -ENOTSUP;
3349     if (bs->read_only)
3350         return -EACCES;
3351     if (bdrv_in_use(bs))
3352         return -EBUSY;
3353     ret = drv->bdrv_truncate(bs, offset);
3354     if (ret == 0) {
3355         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3356         bdrv_dev_resize_cb(bs);
3357     }
3358     return ret;
3359 }
3360 
3361 /**
3362  * Length of an allocated file in bytes. Sparse files are counted by actual
3363  * allocated space. Return < 0 if error or unknown.
3364  */
3365 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3366 {
3367     BlockDriver *drv = bs->drv;
3368     if (!drv) {
3369         return -ENOMEDIUM;
3370     }
3371     if (drv->bdrv_get_allocated_file_size) {
3372         return drv->bdrv_get_allocated_file_size(bs);
3373     }
3374     if (bs->file) {
3375         return bdrv_get_allocated_file_size(bs->file);
3376     }
3377     return -ENOTSUP;
3378 }
3379 
3380 /**
3381  * Length of a file in bytes. Return < 0 if error or unknown.
3382  */
3383 int64_t bdrv_getlength(BlockDriverState *bs)
3384 {
3385     BlockDriver *drv = bs->drv;
3386     if (!drv)
3387         return -ENOMEDIUM;
3388 
3389     if (drv->has_variable_length) {
3390         int ret = refresh_total_sectors(bs, bs->total_sectors);
3391         if (ret < 0) {
3392             return ret;
3393         }
3394     }
3395     return bs->total_sectors * BDRV_SECTOR_SIZE;
3396 }
3397 
3398 /* Return 0 as the number of sectors if no device is present or on error */
3399 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3400 {
3401     int64_t length;
3402     length = bdrv_getlength(bs);
3403     if (length < 0)
3404         length = 0;
3405     else
3406         length = length >> BDRV_SECTOR_BITS;
3407     *nb_sectors_ptr = length;
3408 }
3409 
3410 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3411                        BlockdevOnError on_write_error)
3412 {
3413     bs->on_read_error = on_read_error;
3414     bs->on_write_error = on_write_error;
3415 }
3416 
3417 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3418 {
3419     return is_read ? bs->on_read_error : bs->on_write_error;
3420 }
3421 
3422 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3423 {
3424     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3425 
3426     switch (on_err) {
3427     case BLOCKDEV_ON_ERROR_ENOSPC:
3428         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3429     case BLOCKDEV_ON_ERROR_STOP:
3430         return BDRV_ACTION_STOP;
3431     case BLOCKDEV_ON_ERROR_REPORT:
3432         return BDRV_ACTION_REPORT;
3433     case BLOCKDEV_ON_ERROR_IGNORE:
3434         return BDRV_ACTION_IGNORE;
3435     default:
3436         abort();
3437     }
3438 }
3439 
3440 /* Emitting the event and stopping the VM is left to device models because,
3441  * while the block layer knows about the error, it does not know whether an
3442  * operation comes from the device or from the block layer itself (from a
3443  * job, for example). */
3444 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3445                        bool is_read, int error)
3446 {
3447     assert(error >= 0);
3448     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3449     if (action == BDRV_ACTION_STOP) {
3450         vm_stop(RUN_STATE_IO_ERROR);
3451         bdrv_iostatus_set_err(bs, error);
3452     }
3453 }
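
/*
 * Illustrative sketch (not part of the original file): how a device model
 * might combine the two helpers above when a request completes with a
 * failure.  'error' is a positive errno value, as bdrv_error_action()
 * asserts; the retry/complete steps are hypothetical comments only.
 */
static void example_handle_rw_error(BlockDriverState *bs, bool is_read,
                                    int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    if (action == BDRV_ACTION_REPORT) {
        /* complete the guest request with an error status */
    } else if (action == BDRV_ACTION_STOP) {
        /* queue the request for retry once the VM is resumed */
    } else {
        /* BDRV_ACTION_IGNORE: complete the request as if it succeeded */
    }

    bdrv_error_action(bs, action, is_read, error);
}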
3454 
3455 int bdrv_is_read_only(BlockDriverState *bs)
3456 {
3457     return bs->read_only;
3458 }
3459 
3460 int bdrv_is_sg(BlockDriverState *bs)
3461 {
3462     return bs->sg;
3463 }
3464 
3465 int bdrv_enable_write_cache(BlockDriverState *bs)
3466 {
3467     return bs->enable_write_cache;
3468 }
3469 
3470 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3471 {
3472     bs->enable_write_cache = wce;
3473 
3474     /* so a reopen() will preserve wce */
3475     if (wce) {
3476         bs->open_flags |= BDRV_O_CACHE_WB;
3477     } else {
3478         bs->open_flags &= ~BDRV_O_CACHE_WB;
3479     }
3480 }
3481 
3482 int bdrv_is_encrypted(BlockDriverState *bs)
3483 {
3484     if (bs->backing_hd && bs->backing_hd->encrypted)
3485         return 1;
3486     return bs->encrypted;
3487 }
3488 
3489 int bdrv_key_required(BlockDriverState *bs)
3490 {
3491     BlockDriverState *backing_hd = bs->backing_hd;
3492 
3493     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3494         return 1;
3495     return (bs->encrypted && !bs->valid_key);
3496 }
3497 
3498 int bdrv_set_key(BlockDriverState *bs, const char *key)
3499 {
3500     int ret;
3501     if (bs->backing_hd && bs->backing_hd->encrypted) {
3502         ret = bdrv_set_key(bs->backing_hd, key);
3503         if (ret < 0)
3504             return ret;
3505         if (!bs->encrypted)
3506             return 0;
3507     }
3508     if (!bs->encrypted) {
3509         return -EINVAL;
3510     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3511         return -ENOMEDIUM;
3512     }
3513     ret = bs->drv->bdrv_set_key(bs, key);
3514     if (ret < 0) {
3515         bs->valid_key = 0;
3516     } else if (!bs->valid_key) {
3517         bs->valid_key = 1;
3518         /* call the change callback now, we skipped it on open */
3519         bdrv_dev_change_media_cb(bs, true);
3520     }
3521     return ret;
3522 }
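
/*
 * Illustrative sketch (not part of the original file): unlocking an
 * encrypted image before first use.  bdrv_set_key() recurses into the
 * backing file first, as above, so a single call covers the whole chain.
 */
static int example_unlock_image(BlockDriverState *bs, const char *password)
{
    if (!bdrv_key_required(bs)) {
        return 0;                       /* nothing is locked */
    }
    return bdrv_set_key(bs, password);  /* negative errno on failure */
}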
3523 
3524 const char *bdrv_get_format_name(BlockDriverState *bs)
3525 {
3526     return bs->drv ? bs->drv->format_name : NULL;
3527 }
3528 
3529 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3530                          void *opaque)
3531 {
3532     BlockDriver *drv;
3533 
3534     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3535         it(opaque, drv->format_name);
3536     }
3537 }
3538 
3539 /* Find a block backend by its device name */
3540 BlockDriverState *bdrv_find(const char *name)
3541 {
3542     BlockDriverState *bs;
3543 
3544     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3545         if (!strcmp(name, bs->device_name)) {
3546             return bs;
3547         }
3548     }
3549     return NULL;
3550 }
3551 
3552 /* Find a named node in the graph of BlockDriverStates */
3553 BlockDriverState *bdrv_find_node(const char *node_name)
3554 {
3555     BlockDriverState *bs;
3556 
3557     assert(node_name);
3558 
3559     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3560         if (!strcmp(node_name, bs->node_name)) {
3561             return bs;
3562         }
3563     }
3564     return NULL;
3565 }
3566 
3567 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3568 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3569 {
3570     BlockDeviceInfoList *list, *entry;
3571     BlockDriverState *bs;
3572 
3573     list = NULL;
3574     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3575         entry = g_malloc0(sizeof(*entry));
3576         entry->value = bdrv_block_device_info(bs);
3577         entry->next = list;
3578         list = entry;
3579     }
3580 
3581     return list;
3582 }
3583 
3584 BlockDriverState *bdrv_lookup_bs(const char *device,
3585                                  const char *node_name,
3586                                  Error **errp)
3587 {
3588     BlockDriverState *bs = NULL;
3589 
3590     if (device) {
3591         bs = bdrv_find(device);
3592 
3593         if (bs) {
3594             return bs;
3595         }
3596     }
3597 
3598     if (node_name) {
3599         bs = bdrv_find_node(node_name);
3600 
3601         if (bs) {
3602             return bs;
3603         }
3604     }
3605 
3606     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3607                      device ? device : "",
3608                      node_name ? node_name : "");
3609     return NULL;
3610 }
3611 
3612 BlockDriverState *bdrv_next(BlockDriverState *bs)
3613 {
3614     if (!bs) {
3615         return QTAILQ_FIRST(&bdrv_states);
3616     }
3617     return QTAILQ_NEXT(bs, device_list);
3618 }
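
/*
 * Illustrative sketch (not part of the original file): bdrv_next() is a
 * cursor-style iterator.  Passing NULL yields the first device, and a
 * NULL return value marks the end of the list.
 */
static int example_count_devices(void)
{
    BlockDriverState *bs = NULL;
    int count = 0;

    while ((bs = bdrv_next(bs)) != NULL) {
        count++;
    }
    return count;
}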
3619 
3620 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3621 {
3622     BlockDriverState *bs;
3623 
3624     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3625         it(opaque, bs);
3626     }
3627 }
3628 
3629 const char *bdrv_get_device_name(BlockDriverState *bs)
3630 {
3631     return bs->device_name;
3632 }
3633 
3634 int bdrv_get_flags(BlockDriverState *bs)
3635 {
3636     return bs->open_flags;
3637 }
3638 
3639 int bdrv_flush_all(void)
3640 {
3641     BlockDriverState *bs;
3642     int result = 0;
3643 
3644     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3645         int ret = bdrv_flush(bs);
3646         if (ret < 0 && !result) {
3647             result = ret;
3648         }
3649     }
3650 
3651     return result;
3652 }
3653 
3654 int bdrv_has_zero_init_1(BlockDriverState *bs)
3655 {
3656     return 1;
3657 }
3658 
3659 int bdrv_has_zero_init(BlockDriverState *bs)
3660 {
3661     assert(bs->drv);
3662 
3663     /* If BS is a copy-on-write image, it is initialized to
3664        the contents of the base image, which may not be zeroes.  */
3665     if (bs->backing_hd) {
3666         return 0;
3667     }
3668     if (bs->drv->bdrv_has_zero_init) {
3669         return bs->drv->bdrv_has_zero_init(bs);
3670     }
3671 
3672     /* safe default */
3673     return 0;
3674 }
3675 
3676 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3677 {
3678     BlockDriverInfo bdi;
3679 
3680     if (bs->backing_hd) {
3681         return false;
3682     }
3683 
3684     if (bdrv_get_info(bs, &bdi) == 0) {
3685         return bdi.unallocated_blocks_are_zero;
3686     }
3687 
3688     return false;
3689 }
3690 
3691 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3692 {
3693     BlockDriverInfo bdi;
3694 
3695     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3696         return false;
3697     }
3698 
3699     if (bdrv_get_info(bs, &bdi) == 0) {
3700         return bdi.can_write_zeroes_with_unmap;
3701     }
3702 
3703     return false;
3704 }
3705 
3706 typedef struct BdrvCoGetBlockStatusData {
3707     BlockDriverState *bs;
3708     BlockDriverState *base;
3709     int64_t sector_num;
3710     int nb_sectors;
3711     int *pnum;
3712     int64_t ret;
3713     bool done;
3714 } BdrvCoGetBlockStatusData;
3715 
3716 /*
3717  * Returns the allocation status (a mask of BDRV_BLOCK_* flags) of the given
3718  * sectors. Drivers that do not implement this callback are assumed to not
3719  * support backing files, hence all their sectors are reported as allocated.
3720  *
3721  * If 'sector_num' is beyond the end of the disk image the return value is 0
3722  * and 'pnum' is set to 0.
3723  *
3724  * 'pnum' is set to the number of sectors (including and immediately following
3725  * the specified sector) that are known to be in the same
3726  * allocated/unallocated state.
3727  *
3728  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3729  * beyond the end of the disk image it will be clamped.
3730  */
3731 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3732                                                      int64_t sector_num,
3733                                                      int nb_sectors, int *pnum)
3734 {
3735     int64_t length;
3736     int64_t n;
3737     int64_t ret, ret2;
3738 
3739     length = bdrv_getlength(bs);
3740     if (length < 0) {
3741         return length;
3742     }
3743 
3744     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3745         *pnum = 0;
3746         return 0;
3747     }
3748 
3749     n = bs->total_sectors - sector_num;
3750     if (n < nb_sectors) {
3751         nb_sectors = n;
3752     }
3753 
3754     if (!bs->drv->bdrv_co_get_block_status) {
3755         *pnum = nb_sectors;
3756         ret = BDRV_BLOCK_DATA;
3757         if (bs->drv->protocol_name) {
3758             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3759         }
3760         return ret;
3761     }
3762 
3763     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3764     if (ret < 0) {
3765         *pnum = 0;
3766         return ret;
3767     }
3768 
3769     if (ret & BDRV_BLOCK_RAW) {
3770         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3771         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3772                                      *pnum, pnum);
3773     }
3774 
3775     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3776         if (bdrv_unallocated_blocks_are_zero(bs)) {
3777             ret |= BDRV_BLOCK_ZERO;
3778         } else if (bs->backing_hd) {
3779             BlockDriverState *bs2 = bs->backing_hd;
3780             int64_t length2 = bdrv_getlength(bs2);
3781             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3782                 ret |= BDRV_BLOCK_ZERO;
3783             }
3784         }
3785     }
3786 
3787     if (bs->file &&
3788         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3789         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3790         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3791                                         *pnum, pnum);
3792         if (ret2 >= 0) {
3793             /* Ignore errors.  This just provides extra information; it
3794              * is useful but not necessary.
3795              */
3796             ret |= (ret2 & BDRV_BLOCK_ZERO);
3797         }
3798     }
3799 
3800     return ret;
3801 }
3802 
3803 /* Coroutine wrapper for bdrv_get_block_status() */
3804 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3805 {
3806     BdrvCoGetBlockStatusData *data = opaque;
3807     BlockDriverState *bs = data->bs;
3808 
3809     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3810                                          data->pnum);
3811     data->done = true;
3812 }
3813 
3814 /*
3815  * Synchronous wrapper around bdrv_co_get_block_status().
3816  *
3817  * See bdrv_co_get_block_status() for details.
3818  */
3819 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3820                               int nb_sectors, int *pnum)
3821 {
3822     Coroutine *co;
3823     BdrvCoGetBlockStatusData data = {
3824         .bs = bs,
3825         .sector_num = sector_num,
3826         .nb_sectors = nb_sectors,
3827         .pnum = pnum,
3828         .done = false,
3829     };
3830 
3831     if (qemu_in_coroutine()) {
3832         /* Fast-path if already in coroutine context */
3833         bdrv_get_block_status_co_entry(&data);
3834     } else {
3835         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3836         qemu_coroutine_enter(co, &data);
3837         while (!data.done) {
3838             qemu_aio_wait();
3839         }
3840     }
3841     return data.ret;
3842 }
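
/*
 * Illustrative sketch (not part of the original file): walking the
 * allocation map of an image with the synchronous wrapper above.  Each
 * call advances by *pnum sectors; the pnum == 0 check is defensive,
 * guarding against a driver that fails to make progress.
 */
static int64_t example_count_data_sectors(BlockDriverState *bs)
{
    int64_t length = bdrv_getlength(bs);
    int64_t sector_num = 0, data = 0, total;

    if (length < 0) {
        return length;
    }
    total = length >> BDRV_SECTOR_BITS;

    while (sector_num < total) {
        int pnum;
        int nb = (total - sector_num > INT_MAX) ? INT_MAX
                                                : (int)(total - sector_num);
        int64_t ret = bdrv_get_block_status(bs, sector_num, nb, &pnum);
        if (ret < 0) {
            return ret;
        }
        if (pnum == 0) {
            break;
        }
        if (ret & BDRV_BLOCK_DATA) {
            data += pnum;
        }
        sector_num += pnum;
    }
    return data;
}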
3843 
3844 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3845                                    int nb_sectors, int *pnum)
3846 {
3847     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3848     if (ret < 0) {
3849         return ret;
3850     }
3851     return
3852         (ret & BDRV_BLOCK_DATA) ||
3853         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3854 }
3855 
3856 /*
3857  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3858  *
3859  * Return true if the given sector is allocated in any image between
3860  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3861  * sector is allocated in any image of the chain.  Return false otherwise.
3862  *
3863  * 'pnum' is set to the number of sectors (including and immediately following
3864  *  the specified sector) that are known to be in the same
3865  *  allocated/unallocated state.
3866  *
3867  */
3868 int bdrv_is_allocated_above(BlockDriverState *top,
3869                             BlockDriverState *base,
3870                             int64_t sector_num,
3871                             int nb_sectors, int *pnum)
3872 {
3873     BlockDriverState *intermediate;
3874     int ret, n = nb_sectors;
3875 
3876     intermediate = top;
3877     while (intermediate && intermediate != base) {
3878         int pnum_inter;
3879         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3880                                 &pnum_inter);
3881         if (ret < 0) {
3882             return ret;
3883         } else if (ret) {
3884             *pnum = pnum_inter;
3885             return 1;
3886         }
3887 
3888         /*
3889          * [sector_num, sector_num + nb_sectors) is unallocated on top, but
3890          * the intermediate image might still have
3891          *
3892          * [sector_num + x, sector_num + nb_sectors) allocated.
3893          */
3894         if (n > pnum_inter &&
3895             (intermediate == top ||
3896              sector_num + pnum_inter < intermediate->total_sectors)) {
3897             n = pnum_inter;
3898         }
3899 
3900         intermediate = intermediate->backing_hd;
3901     }
3902 
3903     *pnum = n;
3904     return 0;
3905 }
3906 
3907 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3908 {
3909     if (bs->backing_hd && bs->backing_hd->encrypted)
3910         return bs->backing_file;
3911     else if (bs->encrypted)
3912         return bs->filename;
3913     else
3914         return NULL;
3915 }
3916 
3917 void bdrv_get_backing_filename(BlockDriverState *bs,
3918                                char *filename, int filename_size)
3919 {
3920     pstrcpy(filename, filename_size, bs->backing_file);
3921 }
3922 
3923 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3924                           const uint8_t *buf, int nb_sectors)
3925 {
3926     BlockDriver *drv = bs->drv;
3927     if (!drv)
3928         return -ENOMEDIUM;
3929     if (!drv->bdrv_write_compressed)
3930         return -ENOTSUP;
3931     if (bdrv_check_request(bs, sector_num, nb_sectors))
3932         return -EIO;
3933 
3934     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3935 
3936     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3937 }
3938 
3939 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3940 {
3941     BlockDriver *drv = bs->drv;
3942     if (!drv)
3943         return -ENOMEDIUM;
3944     if (!drv->bdrv_get_info)
3945         return -ENOTSUP;
3946     memset(bdi, 0, sizeof(*bdi));
3947     return drv->bdrv_get_info(bs, bdi);
3948 }
3949 
3950 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3951 {
3952     BlockDriver *drv = bs->drv;
3953     if (drv && drv->bdrv_get_specific_info) {
3954         return drv->bdrv_get_specific_info(bs);
3955     }
3956     return NULL;
3957 }
3958 
3959 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3960                       int64_t pos, int size)
3961 {
3962     QEMUIOVector qiov;
3963     struct iovec iov = {
3964         .iov_base   = (void *) buf,
3965         .iov_len    = size,
3966     };
3967 
3968     qemu_iovec_init_external(&qiov, &iov, 1);
3969     return bdrv_writev_vmstate(bs, &qiov, pos);
3970 }
3971 
3972 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3973 {
3974     BlockDriver *drv = bs->drv;
3975 
3976     if (!drv) {
3977         return -ENOMEDIUM;
3978     } else if (drv->bdrv_save_vmstate) {
3979         return drv->bdrv_save_vmstate(bs, qiov, pos);
3980     } else if (bs->file) {
3981         return bdrv_writev_vmstate(bs->file, qiov, pos);
3982     }
3983 
3984     return -ENOTSUP;
3985 }
3986 
3987 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3988                       int64_t pos, int size)
3989 {
3990     BlockDriver *drv = bs->drv;
3991     if (!drv)
3992         return -ENOMEDIUM;
3993     if (drv->bdrv_load_vmstate)
3994         return drv->bdrv_load_vmstate(bs, buf, pos, size);
3995     if (bs->file)
3996         return bdrv_load_vmstate(bs->file, buf, pos, size);
3997     return -ENOTSUP;
3998 }
3999 
4000 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4001 {
4002     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4003         return;
4004     }
4005 
4006     bs->drv->bdrv_debug_event(bs, event);
4007 }
4008 
4009 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4010                           const char *tag)
4011 {
4012     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4013         bs = bs->file;
4014     }
4015 
4016     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4017         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4018     }
4019 
4020     return -ENOTSUP;
4021 }
4022 
4023 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4024 {
4025     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4026         bs = bs->file;
4027     }
4028 
4029     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4030         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4031     }
4032 
4033     return -ENOTSUP;
4034 }
4035 
4036 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4037 {
4038     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4039         bs = bs->file;
4040     }
4041 
4042     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4043         return bs->drv->bdrv_debug_resume(bs, tag);
4044     }
4045 
4046     return -ENOTSUP;
4047 }
4048 
4049 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4050 {
4051     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4052         bs = bs->file;
4053     }
4054 
4055     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4056         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4057     }
4058 
4059     return false;
4060 }
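
/*
 * Illustrative sketch (not part of the original file): the blkdebug-style
 * round trip for the four hooks above.  The event name and tag are
 * assumptions for the example; real names come from the blkdebug rule
 * file used by the test.
 */
static void example_debug_roundtrip(BlockDriverState *bs)
{
    if (bdrv_debug_breakpoint(bs, "flush_to_os", "tag0") < 0) {
        return;   /* no layer in this chain implements breakpoints */
    }
    /* ... trigger I/O that generates the event ... */
    while (!bdrv_debug_is_suspended(bs, "tag0")) {
        /* wait, e.g. by iterating the main loop */
    }
    bdrv_debug_resume(bs, "tag0");
}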
4061 
4062 int bdrv_is_snapshot(BlockDriverState *bs)
4063 {
4064     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4065 }
4066 
4067 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4068  * relative, it must be relative to the chain.  So, passing in bs->filename
4069  * from a BDS as backing_file should not be done, as that may be relative to
4070  * the CWD rather than the chain. */
4071 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4072         const char *backing_file)
4073 {
4074     char *filename_full = NULL;
4075     char *backing_file_full = NULL;
4076     char *filename_tmp = NULL;
4077     int is_protocol = 0;
4078     BlockDriverState *curr_bs = NULL;
4079     BlockDriverState *retval = NULL;
4080 
4081     if (!bs || !bs->drv || !backing_file) {
4082         return NULL;
4083     }
4084 
4085     filename_full     = g_malloc(PATH_MAX);
4086     backing_file_full = g_malloc(PATH_MAX);
4087     filename_tmp      = g_malloc(PATH_MAX);
4088 
4089     is_protocol = path_has_protocol(backing_file);
4090 
4091     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4092 
4093         /* If either of the filename paths is actually a protocol, then
4094          * compare unmodified paths; otherwise make paths relative */
4095         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4096             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4097                 retval = curr_bs->backing_hd;
4098                 break;
4099             }
4100         } else {
4101             /* If not an absolute filename path, make it relative to the current
4102              * image's filename path */
4103             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4104                          backing_file);
4105 
4106             /* We are going to compare absolute pathnames */
4107             if (!realpath(filename_tmp, filename_full)) {
4108                 continue;
4109             }
4110 
4111             /* We need to make sure the backing filename we are comparing against
4112              * is relative to the current image filename (or absolute) */
4113             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4114                          curr_bs->backing_file);
4115 
4116             if (!realpath(filename_tmp, backing_file_full)) {
4117                 continue;
4118             }
4119 
4120             if (strcmp(backing_file_full, filename_full) == 0) {
4121                 retval = curr_bs->backing_hd;
4122                 break;
4123             }
4124         }
4125     }
4126 
4127     g_free(filename_full);
4128     g_free(backing_file_full);
4129     g_free(filename_tmp);
4130     return retval;
4131 }
4132 
4133 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4134 {
4135     if (!bs->drv) {
4136         return 0;
4137     }
4138 
4139     if (!bs->backing_hd) {
4140         return 0;
4141     }
4142 
4143     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4144 }
4145 
4146 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4147 {
4148     BlockDriverState *curr_bs = NULL;
4149 
4150     if (!bs) {
4151         return NULL;
4152     }
4153 
4154     curr_bs = bs;
4155 
4156     while (curr_bs->backing_hd) {
4157         curr_bs = curr_bs->backing_hd;
4158     }
4159     return curr_bs;
4160 }
4161 
4162 /**************************************************************/
4163 /* async I/Os */
4164 
4165 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4166                                  QEMUIOVector *qiov, int nb_sectors,
4167                                  BlockDriverCompletionFunc *cb, void *opaque)
4168 {
4169     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4170 
4171     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4172                                  cb, opaque, false);
4173 }
4174 
4175 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4176                                   QEMUIOVector *qiov, int nb_sectors,
4177                                   BlockDriverCompletionFunc *cb, void *opaque)
4178 {
4179     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4180 
4181     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4182                                  cb, opaque, true);
4183 }
4184 
4185 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4186         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4187         BlockDriverCompletionFunc *cb, void *opaque)
4188 {
4189     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4190 
4191     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4192                                  BDRV_REQ_ZERO_WRITE | flags,
4193                                  cb, opaque, true);
4194 }
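
/*
 * Illustrative sketch (not part of the original file): the callback-based
 * read pattern.  The completion function and its opaque cookie are
 * hypothetical; 'ret' is 0 on success or a negative errno.
 */
static void example_read_cb(void *opaque, int ret)
{
    int *result = opaque;

    *result = ret;
}

static void example_submit_read(BlockDriverState *bs, QEMUIOVector *qiov,
                                int64_t sector_num, int nb_sectors,
                                int *result)
{
    BlockDriverAIOCB *acb;

    acb = bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                         example_read_cb, result);
    /* acb may be handed to bdrv_aio_cancel() while the request is in
     * flight; completion is signalled only through the callback. */
    (void)acb;
}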
4195 
4196 
4197 typedef struct MultiwriteCB {
4198     int error;
4199     int num_requests;
4200     int num_callbacks;
4201     struct {
4202         BlockDriverCompletionFunc *cb;
4203         void *opaque;
4204         QEMUIOVector *free_qiov;
4205     } callbacks[];
4206 } MultiwriteCB;
4207 
4208 static void multiwrite_user_cb(MultiwriteCB *mcb)
4209 {
4210     int i;
4211 
4212     for (i = 0; i < mcb->num_callbacks; i++) {
4213         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4214         if (mcb->callbacks[i].free_qiov) {
4215             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4216         }
4217         g_free(mcb->callbacks[i].free_qiov);
4218     }
4219 }
4220 
4221 static void multiwrite_cb(void *opaque, int ret)
4222 {
4223     MultiwriteCB *mcb = opaque;
4224 
4225     trace_multiwrite_cb(mcb, ret);
4226 
4227     if (ret < 0 && !mcb->error) {
4228         mcb->error = ret;
4229     }
4230 
4231     mcb->num_requests--;
4232     if (mcb->num_requests == 0) {
4233         multiwrite_user_cb(mcb);
4234         g_free(mcb);
4235     }
4236 }
4237 
4238 static int multiwrite_req_compare(const void *a, const void *b)
4239 {
4240     const BlockRequest *req1 = a, *req2 = b;
4241 
4242     /*
4243      * Note that we can't simply subtract req2->sector from req1->sector
4244      * here as that could overflow the return value.
4245      */
4246     if (req1->sector > req2->sector) {
4247         return 1;
4248     } else if (req1->sector < req2->sector) {
4249         return -1;
4250     } else {
4251         return 0;
4252     }
4253 }
4254 
4255 /*
4256  * Takes a bunch of requests and tries to merge them. Returns the number of
4257  * requests that remain after merging.
4258  */
4259 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4260     int num_reqs, MultiwriteCB *mcb)
4261 {
4262     int i, outidx;
4263 
4264     // Sort requests by start sector
4265     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4266 
4267     // Check if adjacent requests touch the same clusters. If so, combine them,
4268     // filling up gaps with zero sectors.
4269     outidx = 0;
4270     for (i = 1; i < num_reqs; i++) {
4271         int merge = 0;
4272         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4273 
4274         // Handle exactly sequential writes and overlapping writes.
4275         if (reqs[i].sector <= oldreq_last) {
4276             merge = 1;
4277         }
4278 
4279         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4280             merge = 0;
4281         }
4282 
4283         if (merge) {
4284             size_t size;
4285             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4286             qemu_iovec_init(qiov,
4287                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4288 
4289             // Add the first request to the merged one. If the requests are
4290             // overlapping, drop the last sectors of the first request.
4291             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4292             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4293 
4294             // We shouldn't need to add any zeros between the two requests
4295             assert(reqs[i].sector <= oldreq_last);
4296 
4297             // Add the second request
4298             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4299 
4300             reqs[outidx].nb_sectors = qiov->size >> 9;
4301             reqs[outidx].qiov = qiov;
4302 
4303             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4304         } else {
4305             outidx++;
4306             reqs[outidx].sector     = reqs[i].sector;
4307             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4308             reqs[outidx].qiov       = reqs[i].qiov;
4309         }
4310     }
4311 
4312     return outidx + 1;
4313 }
4314 
4315 /*
4316  * Submit multiple AIO write requests at once.
4317  *
4318  * On success, the function returns 0 and all requests in the reqs array have
4319  * been submitted. On error, this function returns -1, and any of the
4320  * requests may or may not have been submitted yet. In particular, the
4321  * callback will be called for some of the requests but not for others. The
4322  * caller must check the error field of the BlockRequest to wait for the right
4323  * callbacks (if error != 0, no callback will be called).
4324  *
4325  * The implementation may modify the contents of the reqs array, e.g. to merge
4326  * requests. However, the fields opaque and error are left unmodified as they
4327  * are used to signal failure for a single request to the caller.
4328  */
4329 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4330 {
4331     MultiwriteCB *mcb;
4332     int i;
4333 
4334     /* don't submit writes if we don't have a medium */
4335     if (bs->drv == NULL) {
4336         for (i = 0; i < num_reqs; i++) {
4337             reqs[i].error = -ENOMEDIUM;
4338         }
4339         return -1;
4340     }
4341 
4342     if (num_reqs == 0) {
4343         return 0;
4344     }
4345 
4346     // Create MultiwriteCB structure
4347     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4348     mcb->num_requests = 0;
4349     mcb->num_callbacks = num_reqs;
4350 
4351     for (i = 0; i < num_reqs; i++) {
4352         mcb->callbacks[i].cb = reqs[i].cb;
4353         mcb->callbacks[i].opaque = reqs[i].opaque;
4354     }
4355 
4356     // Check for mergeable requests
4357     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4358 
4359     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4360 
4361     /* Run the aio requests. */
4362     mcb->num_requests = num_reqs;
4363     for (i = 0; i < num_reqs; i++) {
4364         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4365                               reqs[i].nb_sectors, reqs[i].flags,
4366                               multiwrite_cb, mcb,
4367                               true);
4368     }
4369 
4370     return 0;
4371 }
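
/*
 * Illustrative sketch (not part of the original file): submitting two
 * back-to-back writes through bdrv_aio_multiwrite().  Because the second
 * request begins exactly where the first one ends, multiwrite_merge()
 * would coalesce them before submission.  The callback and the two I/O
 * vectors (whose sizes must be sector multiples) are hypothetical.
 */
static int example_submit_adjacent_writes(BlockDriverState *bs,
                                          QEMUIOVector *qiov0,
                                          QEMUIOVector *qiov1,
                                          BlockDriverCompletionFunc *cb,
                                          void *opaque)
{
    BlockRequest reqs[2] = {
        {
            .sector     = 0,
            .nb_sectors = (int)(qiov0->size >> 9),
            .qiov       = qiov0,
            .cb         = cb,
            .opaque     = opaque,
        },
        {
            .sector     = (int64_t)(qiov0->size >> 9),
            .nb_sectors = (int)(qiov1->size >> 9),
            .qiov       = qiov1,
            .cb         = cb,
            .opaque     = opaque,
        },
    };

    return bdrv_aio_multiwrite(bs, reqs, 2);
}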
4372 
4373 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4374 {
4375     acb->aiocb_info->cancel(acb);
4376 }
4377 
4378 /**************************************************************/
4379 /* async block device emulation */
4380 
4381 typedef struct BlockDriverAIOCBSync {
4382     BlockDriverAIOCB common;
4383     QEMUBH *bh;
4384     int ret;
4385     /* vector translation state */
4386     QEMUIOVector *qiov;
4387     uint8_t *bounce;
4388     int is_write;
4389 } BlockDriverAIOCBSync;
4390 
4391 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4392 {
4393     BlockDriverAIOCBSync *acb =
4394         container_of(blockacb, BlockDriverAIOCBSync, common);
4395     qemu_bh_delete(acb->bh);
4396     acb->bh = NULL;
4397     qemu_aio_release(acb);
4398 }
4399 
4400 static const AIOCBInfo bdrv_em_aiocb_info = {
4401     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4402     .cancel             = bdrv_aio_cancel_em,
4403 };
4404 
4405 static void bdrv_aio_bh_cb(void *opaque)
4406 {
4407     BlockDriverAIOCBSync *acb = opaque;
4408 
4409     if (!acb->is_write)
4410         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4411     qemu_vfree(acb->bounce);
4412     acb->common.cb(acb->common.opaque, acb->ret);
4413     qemu_bh_delete(acb->bh);
4414     acb->bh = NULL;
4415     qemu_aio_release(acb);
4416 }
4417 
4418 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4419                                             int64_t sector_num,
4420                                             QEMUIOVector *qiov,
4421                                             int nb_sectors,
4422                                             BlockDriverCompletionFunc *cb,
4423                                             void *opaque,
4424                                             int is_write)
4425 
4426 {
4427     BlockDriverAIOCBSync *acb;
4428 
4429     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4430     acb->is_write = is_write;
4431     acb->qiov = qiov;
4432     acb->bounce = qemu_blockalign(bs, qiov->size);
4433     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4434 
4435     if (is_write) {
4436         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4437         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4438     } else {
4439         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4440     }
4441 
4442     qemu_bh_schedule(acb->bh);
4443 
4444     return &acb->common;
4445 }
4446 
4447 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4448         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4449         BlockDriverCompletionFunc *cb, void *opaque)
4450 {
4451     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4452 }
4453 
4454 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4455         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4456         BlockDriverCompletionFunc *cb, void *opaque)
4457 {
4458     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4459 }
4460 
4461 
4462 typedef struct BlockDriverAIOCBCoroutine {
4463     BlockDriverAIOCB common;
4464     BlockRequest req;
4465     bool is_write;
4466     bool *done;
4467     QEMUBH* bh;
4468 } BlockDriverAIOCBCoroutine;
4469 
4470 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4471 {
4472     BlockDriverAIOCBCoroutine *acb =
4473         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4474     bool done = false;
4475 
4476     acb->done = &done;
4477     while (!done) {
4478         qemu_aio_wait();
4479     }
4480 }
4481 
4482 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4483     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4484     .cancel             = bdrv_aio_co_cancel_em,
4485 };
4486 
4487 static void bdrv_co_em_bh(void *opaque)
4488 {
4489     BlockDriverAIOCBCoroutine *acb = opaque;
4490 
4491     acb->common.cb(acb->common.opaque, acb->req.error);
4492 
4493     if (acb->done) {
4494         *acb->done = true;
4495     }
4496 
4497     qemu_bh_delete(acb->bh);
4498     qemu_aio_release(acb);
4499 }
4500 
4501 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4502 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4503 {
4504     BlockDriverAIOCBCoroutine *acb = opaque;
4505     BlockDriverState *bs = acb->common.bs;
4506 
4507     if (!acb->is_write) {
4508         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4509             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4510     } else {
4511         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4512             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4513     }
4514 
4515     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4516     qemu_bh_schedule(acb->bh);
4517 }
4518 
4519 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4520                                                int64_t sector_num,
4521                                                QEMUIOVector *qiov,
4522                                                int nb_sectors,
4523                                                BdrvRequestFlags flags,
4524                                                BlockDriverCompletionFunc *cb,
4525                                                void *opaque,
4526                                                bool is_write)
4527 {
4528     Coroutine *co;
4529     BlockDriverAIOCBCoroutine *acb;
4530 
4531     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4532     acb->req.sector = sector_num;
4533     acb->req.nb_sectors = nb_sectors;
4534     acb->req.qiov = qiov;
4535     acb->req.flags = flags;
4536     acb->is_write = is_write;
4537     acb->done = NULL;
4538 
4539     co = qemu_coroutine_create(bdrv_co_do_rw);
4540     qemu_coroutine_enter(co, acb);
4541 
4542     return &acb->common;
4543 }
4544 
4545 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4546 {
4547     BlockDriverAIOCBCoroutine *acb = opaque;
4548     BlockDriverState *bs = acb->common.bs;
4549 
4550     acb->req.error = bdrv_co_flush(bs);
4551     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4552     qemu_bh_schedule(acb->bh);
4553 }
4554 
4555 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4556         BlockDriverCompletionFunc *cb, void *opaque)
4557 {
4558     trace_bdrv_aio_flush(bs, opaque);
4559 
4560     Coroutine *co;
4561     BlockDriverAIOCBCoroutine *acb;
4562 
4563     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4564     acb->done = NULL;
4565 
4566     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4567     qemu_coroutine_enter(co, acb);
4568 
4569     return &acb->common;
4570 }
4571 
4572 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4573 {
4574     BlockDriverAIOCBCoroutine *acb = opaque;
4575     BlockDriverState *bs = acb->common.bs;
4576 
4577     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4578     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4579     qemu_bh_schedule(acb->bh);
4580 }
4581 
4582 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4583         int64_t sector_num, int nb_sectors,
4584         BlockDriverCompletionFunc *cb, void *opaque)
4585 {
4586     Coroutine *co;
4587     BlockDriverAIOCBCoroutine *acb;
4588 
4589     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4590 
4591     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4592     acb->req.sector = sector_num;
4593     acb->req.nb_sectors = nb_sectors;
4594     acb->done = NULL;
4595     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4596     qemu_coroutine_enter(co, acb);
4597 
4598     return &acb->common;
4599 }
4600 
4601 void bdrv_init(void)
4602 {
4603     module_call_init(MODULE_INIT_BLOCK);
4604 }
4605 
4606 void bdrv_init_with_whitelist(void)
4607 {
4608     use_bdrv_whitelist = 1;
4609     bdrv_init();
4610 }
4611 
4612 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4613                    BlockDriverCompletionFunc *cb, void *opaque)
4614 {
4615     BlockDriverAIOCB *acb;
4616 
4617     acb = g_slice_alloc(aiocb_info->aiocb_size);
4618     acb->aiocb_info = aiocb_info;
4619     acb->bs = bs;
4620     acb->cb = cb;
4621     acb->opaque = opaque;
4622     return acb;
4623 }
4624 
4625 void qemu_aio_release(void *p)
4626 {
4627     BlockDriverAIOCB *acb = p;
4628     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4629 }
4630 
4631 /**************************************************************/
4632 /* Coroutine block device emulation */
4633 
4634 typedef struct CoroutineIOCompletion {
4635     Coroutine *coroutine;
4636     int ret;
4637 } CoroutineIOCompletion;
4638 
4639 static void bdrv_co_io_em_complete(void *opaque, int ret)
4640 {
4641     CoroutineIOCompletion *co = opaque;
4642 
4643     co->ret = ret;
4644     qemu_coroutine_enter(co->coroutine, NULL);
4645 }
4646 
4647 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4648                                       int nb_sectors, QEMUIOVector *iov,
4649                                       bool is_write)
4650 {
4651     CoroutineIOCompletion co = {
4652         .coroutine = qemu_coroutine_self(),
4653     };
4654     BlockDriverAIOCB *acb;
4655 
4656     if (is_write) {
4657         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4658                                        bdrv_co_io_em_complete, &co);
4659     } else {
4660         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4661                                       bdrv_co_io_em_complete, &co);
4662     }
4663 
4664     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4665     if (!acb) {
4666         return -EIO;
4667     }
4668     qemu_coroutine_yield();
4669 
4670     return co.ret;
4671 }
4672 
4673 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4674                                          int64_t sector_num, int nb_sectors,
4675                                          QEMUIOVector *iov)
4676 {
4677     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4678 }
4679 
4680 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4681                                          int64_t sector_num, int nb_sectors,
4682                                          QEMUIOVector *iov)
4683 {
4684     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4685 }
4686 
4687 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4688 {
4689     RwCo *rwco = opaque;
4690 
4691     rwco->ret = bdrv_co_flush(rwco->bs);
4692 }
4693 
4694 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4695 {
4696     int ret;
4697 
4698     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4699         return 0;
4700     }
4701 
4702     /* Write back cached data to the OS even with cache=unsafe */
4703     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4704     if (bs->drv->bdrv_co_flush_to_os) {
4705         ret = bs->drv->bdrv_co_flush_to_os(bs);
4706         if (ret < 0) {
4707             return ret;
4708         }
4709     }
4710 
4711     /* But don't actually force it to the disk with cache=unsafe */
4712     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4713         goto flush_parent;
4714     }
4715 
4716     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4717     if (bs->drv->bdrv_co_flush_to_disk) {
4718         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4719     } else if (bs->drv->bdrv_aio_flush) {
4720         BlockDriverAIOCB *acb;
4721         CoroutineIOCompletion co = {
4722             .coroutine = qemu_coroutine_self(),
4723         };
4724 
4725         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4726         if (acb == NULL) {
4727             ret = -EIO;
4728         } else {
4729             qemu_coroutine_yield();
4730             ret = co.ret;
4731         }
4732     } else {
4733         /*
4734          * Some block drivers always operate in either writethrough or unsafe
4735          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4736          * know how the server works (because the behaviour is hardcoded or
4737          * depends on server-side configuration), so we can't ensure that
4738          * everything is safe on disk. Returning an error doesn't work because
4739          * that would break guests even if the server operates in writethrough
4740          * mode.
4741          *
4742          * Let's hope the user knows what they're doing.
4743          */
4744         ret = 0;
4745     }
4746     if (ret < 0) {
4747         return ret;
4748     }
4749 
4750     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4751      * in the case of cache=unsafe, so there are no useless flushes.
4752      */
4753 flush_parent:
4754     return bdrv_co_flush(bs->file);
4755 }
4756 
4757 void bdrv_invalidate_cache(BlockDriverState *bs)
4758 {
4759     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4760         bs->drv->bdrv_invalidate_cache(bs);
4761     }
4762 }
4763 
4764 void bdrv_invalidate_cache_all(void)
4765 {
4766     BlockDriverState *bs;
4767 
4768     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4769         bdrv_invalidate_cache(bs);
4770     }
4771 }
4772 
4773 void bdrv_clear_incoming_migration_all(void)
4774 {
4775     BlockDriverState *bs;
4776 
4777     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4778         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4779     }
4780 }
4781 
4782 int bdrv_flush(BlockDriverState *bs)
4783 {
4784     Coroutine *co;
4785     RwCo rwco = {
4786         .bs = bs,
4787         .ret = NOT_DONE,
4788     };
4789 
4790     if (qemu_in_coroutine()) {
4791         /* Fast-path if already in coroutine context */
4792         bdrv_flush_co_entry(&rwco);
4793     } else {
4794         co = qemu_coroutine_create(bdrv_flush_co_entry);
4795         qemu_coroutine_enter(co, &rwco);
4796         while (rwco.ret == NOT_DONE) {
4797             qemu_aio_wait();
4798         }
4799     }
4800 
4801     return rwco.ret;
4802 }
4803 
4804 typedef struct DiscardCo {
4805     BlockDriverState *bs;
4806     int64_t sector_num;
4807     int nb_sectors;
4808     int ret;
4809 } DiscardCo;
4810 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4811 {
4812     DiscardCo *rwco = opaque;
4813 
4814     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4815 }
4816 
4817 /* If no limit is specified in the BlockLimits, use a default
4818  * of 32768 512-byte sectors (16 MiB) per request.
4819  */
4820 #define MAX_DISCARD_DEFAULT 32768
4821 
4822 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4823                                  int nb_sectors)
4824 {
4825     int max_discard;
4826 
4827     if (!bs->drv) {
4828         return -ENOMEDIUM;
4829     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4830         return -EIO;
4831     } else if (bs->read_only) {
4832         return -EROFS;
4833     }
4834 
4835     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4836 
4837     /* Do nothing if disabled.  */
4838     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4839         return 0;
4840     }
4841 
4842     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4843         return 0;
4844     }
4845 
4846     max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4847     while (nb_sectors > 0) {
4848         int ret;
4849         int num = nb_sectors;
4850 
4851         /* align request */
4852         if (bs->bl.discard_alignment &&
4853             num >= bs->bl.discard_alignment &&
4854             sector_num % bs->bl.discard_alignment) {
4855             if (num > bs->bl.discard_alignment) {
4856                 num = bs->bl.discard_alignment;
4857             }
4858             num -= sector_num % bs->bl.discard_alignment;
4859         }
4860 
4861         /* limit request size */
4862         if (num > max_discard) {
4863             num = max_discard;
4864         }
4865 
4866         if (bs->drv->bdrv_co_discard) {
4867             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4868         } else {
4869             BlockDriverAIOCB *acb;
4870             CoroutineIOCompletion co = {
4871                 .coroutine = qemu_coroutine_self(),
4872             };
4873 
4874             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4875                                             bdrv_co_io_em_complete, &co);
4876             if (acb == NULL) {
4877                 return -EIO;
4878             } else {
4879                 qemu_coroutine_yield();
4880                 ret = co.ret;
4881             }
4882         }
4883         if (ret && ret != -ENOTSUP) {
4884             return ret;
4885         }
4886 
4887         sector_num += num;
4888         nb_sectors -= num;
4889     }
4890     return 0;
4891 }
4892 
4893 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4894 {
4895     Coroutine *co;
4896     DiscardCo rwco = {
4897         .bs = bs,
4898         .sector_num = sector_num,
4899         .nb_sectors = nb_sectors,
4900         .ret = NOT_DONE,
4901     };
4902 
4903     if (qemu_in_coroutine()) {
4904         /* Fast-path if already in coroutine context */
4905         bdrv_discard_co_entry(&rwco);
4906     } else {
4907         co = qemu_coroutine_create(bdrv_discard_co_entry);
4908         qemu_coroutine_enter(co, &rwco);
4909         while (rwco.ret == NOT_DONE) {
4910             qemu_aio_wait();
4911         }
4912     }
4913 
4914     return rwco.ret;
4915 }
4916 
4917 /**************************************************************/
4918 /* removable device support */
4919 
4920 /**
4921  * Return TRUE if the media is present
4922  */
4923 int bdrv_is_inserted(BlockDriverState *bs)
4924 {
4925     BlockDriver *drv = bs->drv;
4926 
4927     if (!drv)
4928         return 0;
4929     if (!drv->bdrv_is_inserted)
4930         return 1;
4931     return drv->bdrv_is_inserted(bs);
4932 }
4933 
4934 /**
4935  * Return whether the media changed since the last call to this
4936  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4937  */
4938 int bdrv_media_changed(BlockDriverState *bs)
4939 {
4940     BlockDriver *drv = bs->drv;
4941 
4942     if (drv && drv->bdrv_media_changed) {
4943         return drv->bdrv_media_changed(bs);
4944     }
4945     return -ENOTSUP;
4946 }
4947 
4948 /**
4949  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4950  */
4951 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4952 {
4953     BlockDriver *drv = bs->drv;
4954 
4955     if (drv && drv->bdrv_eject) {
4956         drv->bdrv_eject(bs, eject_flag);
4957     }
4958 
4959     if (bs->device_name[0] != '\0') {
4960         bdrv_emit_qmp_eject_event(bs, eject_flag);
4961     }
4962 }
4963 
4964 /**
4965  * Lock or unlock the media (if it is locked, the user won't be able
4966  * to eject it manually).
4967  */
4968 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4969 {
4970     BlockDriver *drv = bs->drv;
4971 
4972     trace_bdrv_lock_medium(bs, locked);
4973 
4974     if (drv && drv->bdrv_lock_medium) {
4975         drv->bdrv_lock_medium(bs, locked);
4976     }
4977 }
4978 
4979 /* needed for generic scsi interface */
4980 
4981 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4982 {
4983     BlockDriver *drv = bs->drv;
4984 
4985     if (drv && drv->bdrv_ioctl)
4986         return drv->bdrv_ioctl(bs, req, buf);
4987     return -ENOTSUP;
4988 }
4989 
4990 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4991         unsigned long int req, void *buf,
4992         BlockDriverCompletionFunc *cb, void *opaque)
4993 {
4994     BlockDriver *drv = bs->drv;
4995 
4996     if (drv && drv->bdrv_aio_ioctl)
4997         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4998     return NULL;
4999 }
5000 
5001 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5002 {
5003     bs->guest_block_size = align;
5004 }
5005 
5006 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5007 {
5008     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5009 }
5010 
5011 /*
5012  * Check if all memory in this vector is sector aligned.
5013  */
5014 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5015 {
5016     int i;
5017     size_t alignment = bdrv_opt_mem_align(bs);
5018 
5019     for (i = 0; i < qiov->niov; i++) {
5020         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5021             return false;
5022         }
5023         if (qiov->iov[i].iov_len % alignment) {
5024             return false;
5025         }
5026     }
5027 
5028     return true;
5029 }
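
/*
 * Illustrative sketch (not part of the original file): allocating a buffer
 * that passes bdrv_qiov_is_aligned().  qemu_blockalign() aligns the start
 * address; the length must also be a multiple of the alignment, hence the
 * ROUND_UP().
 */
static void example_aligned_vector(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);
    void *buf;
    QEMUIOVector qiov;
    struct iovec iov;

    size = ROUND_UP(size, align);       /* length must be aligned too */
    buf = qemu_blockalign(bs, size);    /* start address is aligned */

    iov.iov_base = buf;
    iov.iov_len  = size;
    qemu_iovec_init_external(&qiov, &iov, 1);
    assert(bdrv_qiov_is_aligned(bs, &qiov));

    qemu_vfree(buf);
}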
5030 
5031 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
5032 {
5033     int64_t bitmap_size;
5034     BdrvDirtyBitmap *bitmap;
5035 
5036     assert((granularity & (granularity - 1)) == 0);
5037 
5038     granularity >>= BDRV_SECTOR_BITS;
5039     assert(granularity);
5040     bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
5041     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5042     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5043     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5044     return bitmap;
5045 }
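
/*
 * Worked example (illustrative): a granularity of 65536 bytes becomes
 * 65536 >> BDRV_SECTOR_BITS == 128 sectors, so hbitmap_alloc() is called
 * with ffs(128) - 1 == 7, i.e. one bitmap bit tracks 2^7 == 128 sectors
 * (64 KiB) of the image.
 */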
5046 
5047 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5048 {
5049     BdrvDirtyBitmap *bm, *next;
5050     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5051         if (bm == bitmap) {
5052             QLIST_REMOVE(bitmap, list);
5053             hbitmap_free(bitmap->bitmap);
5054             g_free(bitmap);
5055             return;
5056         }
5057     }
5058 }
5059 
5060 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5061 {
5062     BdrvDirtyBitmap *bm;
5063     BlockDirtyInfoList *list = NULL;
5064     BlockDirtyInfoList **plist = &list;
5065 
5066     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5067         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5068         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5069         info->count = bdrv_get_dirty_count(bs, bm);
5070         info->granularity =
5071             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5072         entry->value = info;
5073         *plist = entry;
5074         plist = &entry->next;
5075     }
5076 
5077     return list;
5078 }
5079 
5080 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5081 {
5082     if (bitmap) {
5083         return hbitmap_get(bitmap->bitmap, sector);
5084     } else {
5085         return 0;
5086     }
5087 }
5088 
5089 void bdrv_dirty_iter_init(BlockDriverState *bs,
5090                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5091 {
5092     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5093 }
5094 
5095 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5096                     int nr_sectors)
5097 {
5098     BdrvDirtyBitmap *bitmap;
5099     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5100         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5101     }
5102 }
5103 
5104 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5105 {
5106     BdrvDirtyBitmap *bitmap;
5107     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5108         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5109     }
5110 }
5111 
5112 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5113 {
5114     return hbitmap_count(bitmap->bitmap);
5115 }
5116 
5117 /* Get a reference to bs */
5118 void bdrv_ref(BlockDriverState *bs)
5119 {
5120     bs->refcnt++;
5121 }
5122 
5123 /* Release a previously grabbed reference to bs.
5124  * If, after releasing, the reference count drops to zero, the
5125  * BlockDriverState is deleted. */
5126 void bdrv_unref(BlockDriverState *bs)
5127 {
5128     assert(bs->refcnt > 0);
5129     if (--bs->refcnt == 0) {
5130         bdrv_delete(bs);
5131     }
5132 }

/* Mark @bs busy (e.g. owned by a block job) or release it again.  Setting
 * the flag to its current value is a programming error, hence the assert. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

/* Return the device to the OK state, e.g. when the VM is resumed after a
 * stop-on-error; any block job error state is reset as well. */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
        if (bs->job) {
            block_job_iostatus_reset(bs->job);
        }
    }
}

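/* Latch the first error into the I/O status: ENOSPC is reported as
 * "nospace", anything else as "failed".  Later errors do not overwrite the
 * first one until bdrv_iostatus_reset() is called. */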
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    assert(bdrv_iostatus_is_enabled(bs));
    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

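/* Begin accounting for one I/O request: remember the request size, type
 * and start timestamp in @cookie. */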
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
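/* Typical use in a device model (sketch; the request state names are
 * illustrative):
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, qiov->size, BDRV_ACCT_READ);
 *     ... submit the request and wait for completion ...
 *     bdrv_acct_done(bs, &cookie);
 *
 * The accumulated totals feed the "query-blockstats" QMP command.
 */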

/* Create a new image file.  @options holds a "key=value,..." string as
 * accepted by "qemu-img create -o"; errors come back through @errp, and
 * @quiet suppresses the "Formatting ..." banner. */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'", fmt);
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            uint64_t backing_size;
            char buf[32];
            int back_flags;

            /* backing files are always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= BDRV_SECTOR_SIZE;

            snprintf(buf, sizeof(buf), "%" PRId64, backing_size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (local_err) {
        error_propagate(errp, local_err);
    }
}
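/* For reference, the rough equivalent of "qemu-img create -f qcow2
 * test.qcow2 1G" would be (sketch; flags and error handling elided):
 *
 *     Error *local_err = NULL;
 *
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                     (uint64_t)1 << 30, 0, &local_err, false);
 *     if (local_err) {
 *         ... report with error_get_pretty() and error_free() ...
 *     }
 */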

AioContext *bdrv_get_aio_context(BlockDriverState *bs)
{
    /* Currently BlockDriverState always uses the main loop AioContext */
    return qemu_get_aio_context();
}

/* Register @notifier to run before each guest write to @bs; a negative
 * return value from the notifier fails the write. */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
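/* The notifier's data argument is the BdrvTrackedRequest about to be
 * written; this is how the backup block job implements copy-before-write.
 * Sketch of a handler:
 *
 *     static int before_write_notify(NotifierWithReturn *notifier,
 *                                    void *opaque)
 *     {
 *         BdrvTrackedRequest *req = opaque;
 *
 *         ... copy out the old contents of the affected sectors ...
 *         return 0;            (or -errno to fail the guest write)
 *     }
 */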

/* Let the driver rewrite the options of an existing image in place (this
 * backs "qemu-img amend").  Returns -ENOTSUP if the format cannot. */
int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
{
    if (bs->drv->bdrv_amend_options == NULL) {
        return -ENOTSUP;
    }
    return bs->drv->bdrv_amend_options(bs, options);
}

/* Used to recurse on single child block filters.
 * Single child block filters store their child in bs->file.
 */
bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (!bs->drv) {
        return false;
    }

    /* A non-filter driver matches iff it is the candidate itself */
    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
        return bs == candidate;
    }

    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
        return false;
    }

    if (!bs->file) {
        return false;
    }

    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
}

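/* Recursion dispatcher: drivers with more than one child (e.g. quorum)
 * provide their own bdrv_recurse_is_first_non_filter; everything else
 * falls back to the generic single-child walk above. */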
bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
                                      BlockDriverState *candidate)
{
    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
    }

    return bdrv_generic_is_first_non_filter(bs, candidate);
}

/* This function checks if the candidate is the first non-filter bs down its
 * bs chain.  Since we don't have pointers to parents it explores all bs
 * chains from the top.  Some filters can choose not to pass down the
 * recursion.
 */
bool bdrv_is_first_non_filter(BlockDriverState *candidate)
{
    BlockDriverState *bs;

    /* walk down the bs forest recursively */
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bool perm;

        perm = bdrv_recurse_is_first_non_filter(bs, candidate);

        /* candidate is the first non-filter */
        if (perm) {
            return true;
        }
    }

    return false;
}