/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
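
/* A typical caller pairs the two functions above: enable throttling first,
 * then apply a configuration. Minimal sketch (hypothetical caller, not part
 * of this file; the ThrottleConfig field names are assumptions from the
 * throttle API and the limit value is illustrative):
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1024 * 1024;   // ~1 MB/s
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 */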

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or if any request of this type is already throttled,
     * queue the I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);


    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}
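
/* I/O buffers for a BDS are commonly allocated with this alignment; a
 * minimal usage sketch (hypothetical caller, not from this file):
 *
 *     void *buf = qemu_memalign(bdrv_opt_mem_align(bs), len);
 *     ...
 *     qemu_vfree(buf);
 */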

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
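
/* Worked example of the rule above (values are illustrative):
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "snap.qcow2");
 *     // dest == "/images/snap.qcow2" (relative name resolved next to base)
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs.qcow2");
 *     // dest == "/abs.qcow2" (absolute names are copied through)
 */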

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
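
/* Drivers normally register themselves from a module constructor; a minimal
 * sketch of the usual pattern (the driver name and state type below are
 * hypothetical):
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name   = "mydrv",
 *         .instance_size = sizeof(BDRVMyDrvState),
 *         // either .bdrv_co_readv/.bdrv_co_writev or the aio variants;
 *         // bdrv_register() fills in the missing emulation layers
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *
 *     block_init(bdrv_mydrv_init);
 */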

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
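
/* Typical use, mirroring the snapshot path in bdrv_open() below: look up a
 * format driver, build its creation options, and create the image (error
 * handling elided; the size value is illustrative):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024);
 *     ret = bdrv_create(drv, "overlay.qcow2", opts, &local_err);
 *     free_option_parameters(opts);
 */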

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
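
/* Usage sketch (this mirrors the snapshot path in bdrv_open() below):
 *
 *     char tmp_filename[PATH_MAX + 1];
 *     ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 *     if (ret < 0) {
 *         error_setg_errno(errp, -ret, "Could not get temporary filename");
 *     }
 */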

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
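
/* Examples of the lookup above (illustrative; assumes the respective drivers
 * are compiled in):
 *
 *     bdrv_find_protocol("nbd:localhost:10809", true);  // the "nbd" driver
 *     bdrv_find_protocol("/tmp/disk.img", true);        // the "file" driver
 *     bdrv_find_protocol("nbd:localhost:10809", false); // prefix ignored ->
 *                                                       // the "file" driver
 */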

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
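
/* For example (follows directly from the code above):
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("none", &flags);
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     bdrv_parse_cache_flags("unsafe", &flags);
 *     // flags == BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *     bdrv_parse_cache_flags("bogus", &flags);  // returns -1
 */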

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol as drv. This layer is
     * already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
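
/* Usage sketch, modeled on the "file" BlockdevRef handling in bdrv_open()
 * below; options such as "file.driver=..." or a "file=<reference>" entry are
 * consumed from the options QDict:
 *
 *     BlockDriverState *file = NULL;
 *     ret = bdrv_open_image(&file, filename, options, "file",
 *                           flags | BDRV_O_PROTOCOL, true, &local_err);
 */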

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        QINCREF(options);
        bs1 = NULL;
        ret = bdrv_open(&bs1, filename, NULL, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP) |
                          BDRV_O_PROTOCOL, true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be closed in any case */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
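
/* A minimal open with format probing (illustrative; a real caller may also
 * pass an options QDict carrying "driver", "node-name", "backing.*", etc.,
 * which are handled above):
 *
 *     BlockDriverState *bs = NULL;
 *     ret = bdrv_open(&bs, "/tmp/disk.qcow2", NULL, NULL, BDRV_O_RDWR,
 *                     NULL, &local_err);
 *     if (ret == 0) {
 *         ...
 *         bdrv_unref(bs);
 *     }
 */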

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
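
/* Sketch of the canonical calling sequence (bdrv_reopen() below wraps exactly
 * this pattern for a single device): queue the device, letting
 * bdrv_reopen_queue() pull in bs->file automatically, then apply the whole
 * set transactionally:
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, new_flags);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 */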

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags         = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}


void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}

1729 /* Check if any requests are in-flight (including throttled requests) */
1730 static bool bdrv_requests_pending(BlockDriverState *bs)
1731 {
1732     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1733         return true;
1734     }
1735     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1736         return true;
1737     }
1738     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1739         return true;
1740     }
1741     if (bs->file && bdrv_requests_pending(bs->file)) {
1742         return true;
1743     }
1744     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1745         return true;
1746     }
1747     return false;
1748 }
1749 
1750 static bool bdrv_requests_pending_all(void)
1751 {
1752     BlockDriverState *bs;
1753     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1754         if (bdrv_requests_pending(bs)) {
1755             return true;
1756         }
1757     }
1758     return false;
1759 }
1760 
1761 /*
1762  * Wait for pending requests to complete across all BlockDriverStates
1763  *
1764  * This function does not flush data to disk, use bdrv_flush_all() for that
1765  * after calling this function.
1766  *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices; for example, a coroutine
 * can be arbitrarily complex, and a steady flow of I/O can continue until
 * the coroutine completes.  Because of this, it is not possible to drain a
 * single device's I/O queue in isolation.
1772  */
1773 void bdrv_drain_all(void)
1774 {
1775     /* Always run first iteration so any pending completion BHs run */
1776     bool busy = true;
1777     BlockDriverState *bs;
1778 
1779     while (busy) {
1780         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1781             bdrv_start_throttled_reqs(bs);
1782         }
1783 
1784         busy = bdrv_requests_pending_all();
1785         busy |= aio_poll(qemu_get_aio_context(), busy);
1786     }
1787 }
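
/*
 * Typical quiesce pattern built on the helper above (sketch only): drain
 * first so no requests are in flight, then flush; this matches the doc
 * comment's advice and what bdrv_close() does for a single device.
 *
 *     bdrv_drain_all();   // wait for all pending requests
 *     bdrv_flush_all();   // then make the data stable on disk
 */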
1788 
/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists.  Also, clear device_name and node_name so that a
 * second removal is a harmless no-op. */
1792 void bdrv_make_anon(BlockDriverState *bs)
1793 {
1794     if (bs->device_name[0] != '\0') {
1795         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1796     }
1797     bs->device_name[0] = '\0';
1798     if (bs->node_name[0] != '\0') {
1799         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1800     }
1801     bs->node_name[0] = '\0';
1802 }
1803 
1804 static void bdrv_rebind(BlockDriverState *bs)
1805 {
1806     if (bs->drv && bs->drv->bdrv_rebind) {
1807         bs->drv->bdrv_rebind(bs);
1808     }
1809 }
1810 
1811 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1812                                      BlockDriverState *bs_src)
1813 {
1814     /* move some fields that need to stay attached to the device */
1815     bs_dest->open_flags         = bs_src->open_flags;
1816 
1817     /* dev info */
1818     bs_dest->dev_ops            = bs_src->dev_ops;
1819     bs_dest->dev_opaque         = bs_src->dev_opaque;
1820     bs_dest->dev                = bs_src->dev;
1821     bs_dest->guest_block_size   = bs_src->guest_block_size;
1822     bs_dest->copy_on_read       = bs_src->copy_on_read;
1823 
1824     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1825 
1826     /* i/o throttled req */
1827     memcpy(&bs_dest->throttle_state,
1828            &bs_src->throttle_state,
1829            sizeof(ThrottleState));
1830     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1831     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1832     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1833 
1834     /* r/w error */
1835     bs_dest->on_read_error      = bs_src->on_read_error;
1836     bs_dest->on_write_error     = bs_src->on_write_error;
1837 
1838     /* i/o status */
1839     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1840     bs_dest->iostatus           = bs_src->iostatus;
1841 
1842     /* dirty bitmap */
1843     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1844 
1845     /* reference count */
1846     bs_dest->refcnt             = bs_src->refcnt;
1847 
1848     /* job */
1849     bs_dest->in_use             = bs_src->in_use;
1850     bs_dest->job                = bs_src->job;
1851 
1852     /* keep the same entry in bdrv_states */
1853     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1854             bs_src->device_name);
1855     bs_dest->device_list = bs_src->device_list;
1856 }
1857 
1858 /*
1859  * Swap bs contents for two image chains while they are live,
1860  * while keeping required fields on the BlockDriverState that is
1861  * actually attached to a device.
1862  *
1863  * This will modify the BlockDriverState fields, and swap contents
1864  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1865  *
1866  * bs_new is required to be anonymous.
1867  *
1868  * This function does not create any image files.
1869  */
1870 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1871 {
1872     BlockDriverState tmp;
1873 
    /* The code needs to swap the node_name, but simply swapping node_list
     * won't work, so first remove the nodes from the graph list, do the swap,
     * and then insert them back if needed.
     */
1878     if (bs_new->node_name[0] != '\0') {
1879         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
1880     }
1881     if (bs_old->node_name[0] != '\0') {
1882         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
1883     }
1884 
1885     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1886     assert(bs_new->device_name[0] == '\0');
1887     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
1888     assert(bs_new->job == NULL);
1889     assert(bs_new->dev == NULL);
1890     assert(bs_new->in_use == 0);
1891     assert(bs_new->io_limits_enabled == false);
1892     assert(!throttle_have_timer(&bs_new->throttle_state));
1893 
1894     tmp = *bs_new;
1895     *bs_new = *bs_old;
1896     *bs_old = tmp;
1897 
1898     /* there are some fields that should not be swapped, move them back */
1899     bdrv_move_feature_fields(&tmp, bs_old);
1900     bdrv_move_feature_fields(bs_old, bs_new);
1901     bdrv_move_feature_fields(bs_new, &tmp);
1902 
1903     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1904     assert(bs_new->device_name[0] == '\0');
1905 
1906     /* Check a few fields that should remain attached to the device */
1907     assert(bs_new->dev == NULL);
1908     assert(bs_new->job == NULL);
1909     assert(bs_new->in_use == 0);
1910     assert(bs_new->io_limits_enabled == false);
1911     assert(!throttle_have_timer(&bs_new->throttle_state));
1912 
1913     /* insert the nodes back into the graph node list if needed */
1914     if (bs_new->node_name[0] != '\0') {
1915         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
1916     }
1917     if (bs_old->node_name[0] != '\0') {
1918         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
1919     }
1920 
1921     bdrv_rebind(bs_new);
1922     bdrv_rebind(bs_old);
1923 }
1924 
1925 /*
1926  * Add new bs contents at the top of an image chain while the chain is
1927  * live, while keeping required fields on the top layer.
1928  *
1929  * This will modify the BlockDriverState fields, and swap contents
1930  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1931  *
1932  * bs_new is required to be anonymous.
1933  *
1934  * This function does not create any image files.
1935  */
1936 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1937 {
1938     bdrv_swap(bs_new, bs_top);
1939 
    /* After the swap, bs_new holds the old contents of bs_top (the
     * previous top layer), so make it the new top's backing file. */
1942     bs_top->backing_hd = bs_new;
1943     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1944     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1945             bs_new->filename);
1946     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1947             bs_new->drv ? bs_new->drv->format_name : "");
1948 }
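
/*
 * Sketch of how bdrv_append() is used for a live external snapshot (the
 * file name below is hypothetical): a freshly opened, anonymous overlay
 * becomes the new top of the chain and the old top becomes its backing
 * file, while the device stays attached to the same BlockDriverState.
 *
 *     BlockDriverState *new_bs = bdrv_new("");   // anonymous BDS
 *     // ... open "overlay.qcow2" into new_bs with BDRV_O_NO_BACKING ...
 *     bdrv_append(new_bs, bs);                   // bs is still the device
 */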
1949 
1950 static void bdrv_delete(BlockDriverState *bs)
1951 {
1952     assert(!bs->dev);
1953     assert(!bs->job);
1954     assert(!bs->in_use);
1955     assert(!bs->refcnt);
1956     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1957 
1958     bdrv_close(bs);
1959 
1960     /* remove from list, if necessary */
1961     bdrv_make_anon(bs);
1962 
1963     g_free(bs);
1964 }
1965 
1966 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1967 /* TODO change to DeviceState *dev when all users are qdevified */
1968 {
1969     if (bs->dev) {
1970         return -EBUSY;
1971     }
1972     bs->dev = dev;
1973     bdrv_iostatus_reset(bs);
1974     return 0;
1975 }
1976 
1977 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1978 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1979 {
1980     if (bdrv_attach_dev(bs, dev) < 0) {
1981         abort();
1982     }
1983 }
1984 
1985 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1986 /* TODO change to DeviceState *dev when all users are qdevified */
1987 {
1988     assert(bs->dev == dev);
1989     bs->dev = NULL;
1990     bs->dev_ops = NULL;
1991     bs->dev_opaque = NULL;
1992     bs->guest_block_size = 512;
1993 }
1994 
1995 /* TODO change to return DeviceState * when all users are qdevified */
1996 void *bdrv_get_attached_dev(BlockDriverState *bs)
1997 {
1998     return bs->dev;
1999 }
2000 
2001 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2002                       void *opaque)
2003 {
2004     bs->dev_ops = ops;
2005     bs->dev_opaque = opaque;
2006 }
2007 
2008 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2009                                enum MonitorEvent ev,
2010                                BlockErrorAction action, bool is_read)
2011 {
2012     QObject *data;
2013     const char *action_str;
2014 
2015     switch (action) {
2016     case BDRV_ACTION_REPORT:
2017         action_str = "report";
2018         break;
2019     case BDRV_ACTION_IGNORE:
2020         action_str = "ignore";
2021         break;
2022     case BDRV_ACTION_STOP:
2023         action_str = "stop";
2024         break;
2025     default:
2026         abort();
2027     }
2028 
2029     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2030                               bdrv->device_name,
2031                               action_str,
2032                               is_read ? "read" : "write");
2033     monitor_protocol_event(ev, data);
2034 
2035     qobject_decref(data);
2036 }
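
/*
 * For illustration, the QMP event emitted by the function above for a
 * hypothetical device "ide0-hd0" whose write failed with a "stop" policy
 * looks like this (timestamp omitted):
 *
 *     { "event": "BLOCK_IO_ERROR",
 *       "data": { "device": "ide0-hd0", "action": "stop",
 *                 "operation": "write" } }
 */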
2037 
2038 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2039 {
2040     QObject *data;
2041 
2042     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2043                               bdrv_get_device_name(bs), ejected);
2044     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2045 
2046     qobject_decref(data);
2047 }
2048 
2049 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2050 {
2051     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2052         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2053         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2054         if (tray_was_closed) {
2055             /* tray open */
2056             bdrv_emit_qmp_eject_event(bs, true);
2057         }
2058         if (load) {
2059             /* tray close */
2060             bdrv_emit_qmp_eject_event(bs, false);
2061         }
2062     }
2063 }
2064 
2065 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2066 {
2067     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2068 }
2069 
2070 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2071 {
2072     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2073         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2074     }
2075 }
2076 
2077 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2078 {
2079     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2080         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2081     }
2082     return false;
2083 }
2084 
2085 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2086 {
2087     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2088         bs->dev_ops->resize_cb(bs->dev_opaque);
2089     }
2090 }
2091 
2092 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2093 {
2094     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2095         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2096     }
2097     return false;
2098 }
2099 
2100 /*
2101  * Run consistency checks on an image
2102  *
2103  * Returns 0 if the check could be completed (it doesn't mean that the image is
2104  * free of errors) or -errno when an internal error occurred. The results of the
2105  * check are stored in res.
2106  */
2107 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2108 {
    if (bs->drv == NULL) {
        return -ENOMEDIUM;
    }
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }
2112 
2113     memset(res, 0, sizeof(*res));
2114     return bs->drv->bdrv_check(bs, res, fix);
2115 }
2116 
2117 #define COMMIT_BUF_SECTORS 2048
2118 
2119 /* commit COW file into the raw image */
2120 int bdrv_commit(BlockDriverState *bs)
2121 {
2122     BlockDriver *drv = bs->drv;
2123     int64_t sector, total_sectors, length, backing_length;
2124     int n, ro, open_flags;
2125     int ret = 0;
2126     uint8_t *buf = NULL;
2127     char filename[PATH_MAX];
2128 
    if (!drv) {
        return -ENOMEDIUM;
    }
2131 
2132     if (!bs->backing_hd) {
2133         return -ENOTSUP;
2134     }
2135 
2136     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2137         return -EBUSY;
2138     }
2139 
2140     ro = bs->backing_hd->read_only;
2141     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2142     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;
2144 
2145     if (ro) {
2146         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2147             return -EACCES;
2148         }
2149     }
2150 
2151     length = bdrv_getlength(bs);
2152     if (length < 0) {
2153         ret = length;
2154         goto ro_cleanup;
2155     }
2156 
2157     backing_length = bdrv_getlength(bs->backing_hd);
2158     if (backing_length < 0) {
2159         ret = backing_length;
2160         goto ro_cleanup;
2161     }
2162 
    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error. */
2166     if (length > backing_length) {
2167         ret = bdrv_truncate(bs->backing_hd, length);
2168         if (ret < 0) {
2169             goto ro_cleanup;
2170         }
2171     }
2172 
2173     total_sectors = length >> BDRV_SECTOR_BITS;
2174     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2175 
2176     for (sector = 0; sector < total_sectors; sector += n) {
2177         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2178         if (ret < 0) {
2179             goto ro_cleanup;
2180         }
2181         if (ret) {
2182             ret = bdrv_read(bs, sector, buf, n);
2183             if (ret < 0) {
2184                 goto ro_cleanup;
2185             }
2186 
2187             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2188             if (ret < 0) {
2189                 goto ro_cleanup;
2190             }
2191         }
2192     }
2193 
2194     if (drv->bdrv_make_empty) {
2195         ret = drv->bdrv_make_empty(bs);
2196         if (ret < 0) {
2197             goto ro_cleanup;
2198         }
2199         bdrv_flush(bs);
2200     }
2201 
2202     /*
2203      * Make sure all data we wrote to the backing device is actually
2204      * stable on disk.
2205      */
2206     if (bs->backing_hd) {
2207         bdrv_flush(bs->backing_hd);
2208     }
2209 
2210     ret = 0;
2211 ro_cleanup:
2212     g_free(buf);
2213 
2214     if (ro) {
2215         /* ignoring error return here */
2216         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2217     }
2218 
2219     return ret;
2220 }
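
/*
 * Illustration of the copy loop above: with COMMIT_BUF_SECTORS == 2048,
 * each iteration queries bdrv_is_allocated() for up to 2048 sectors (1 MiB).
 * Runs allocated in the top image are read and written into the backing
 * file; unallocated runs are skipped entirely, so committing a mostly-empty
 * overlay is fast.
 */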
2221 
2222 int bdrv_commit_all(void)
2223 {
2224     BlockDriverState *bs;
2225 
2226     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2227         if (bs->drv && bs->backing_hd) {
2228             int ret = bdrv_commit(bs);
2229             if (ret < 0) {
2230                 return ret;
2231             }
2232         }
2233     }
2234     return 0;
2235 }
2236 
2237 /**
2238  * Remove an active request from the tracked requests list
2239  *
2240  * This function should be called when a tracked request is completing.
2241  */
2242 static void tracked_request_end(BdrvTrackedRequest *req)
2243 {
2244     if (req->serialising) {
2245         req->bs->serialising_in_flight--;
2246     }
2247 
2248     QLIST_REMOVE(req, list);
2249     qemu_co_queue_restart_all(&req->wait_queue);
2250 }
2251 
2252 /**
2253  * Add an active request to the tracked requests list
2254  */
2255 static void tracked_request_begin(BdrvTrackedRequest *req,
2256                                   BlockDriverState *bs,
2257                                   int64_t offset,
2258                                   unsigned int bytes, bool is_write)
2259 {
2260     *req = (BdrvTrackedRequest){
        .bs             = bs,
2262         .offset         = offset,
2263         .bytes          = bytes,
2264         .is_write       = is_write,
2265         .co             = qemu_coroutine_self(),
2266         .serialising    = false,
2267         .overlap_offset = offset,
2268         .overlap_bytes  = bytes,
2269     };
2270 
2271     qemu_co_queue_init(&req->wait_queue);
2272 
2273     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2274 }
2275 
2276 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2277 {
2278     int64_t overlap_offset = req->offset & ~(align - 1);
2279     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2280                                - overlap_offset;
2281 
2282     if (!req->serialising) {
2283         req->bs->serialising_in_flight++;
2284         req->serialising = true;
2285     }
2286 
2287     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2288     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2289 }
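
/*
 * Worked example for the rounding above: with align == 4096, a request with
 * offset == 1536 and bytes == 1024 is serialised against the byte range
 * [0, 4096):
 *
 *     overlap_offset = 1536 & ~4095                    = 0
 *     overlap_bytes  = ROUND_UP(1536 + 1024, 4096) - 0 = 4096
 */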
2290 
2291 /**
2292  * Round a region to cluster boundaries
2293  */
2294 void bdrv_round_to_clusters(BlockDriverState *bs,
2295                             int64_t sector_num, int nb_sectors,
2296                             int64_t *cluster_sector_num,
2297                             int *cluster_nb_sectors)
2298 {
2299     BlockDriverInfo bdi;
2300 
2301     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2302         *cluster_sector_num = sector_num;
2303         *cluster_nb_sectors = nb_sectors;
2304     } else {
2305         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2306         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2307         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2308                                             nb_sectors, c);
2309     }
2310 }
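
/*
 * Worked example: with a 64 KiB cluster size (c == 128 sectors), a request
 * for sectors [100, 110) is widened to whole clusters:
 *
 *     *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)        = 0
 *     *cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 10, 128) = 128
 *
 * i.e. the covering range is sectors [0, 128).
 */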
2311 
2312 static int bdrv_get_cluster_size(BlockDriverState *bs)
2313 {
2314     BlockDriverInfo bdi;
2315     int ret;
2316 
2317     ret = bdrv_get_info(bs, &bdi);
2318     if (ret < 0 || bdi.cluster_size == 0) {
2319         return bs->request_alignment;
2320     } else {
2321         return bdi.cluster_size;
2322     }
2323 }
2324 
2325 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2326                                      int64_t offset, unsigned int bytes)
2327 {
2328     /*        aaaa   bbbb */
2329     if (offset >= req->overlap_offset + req->overlap_bytes) {
2330         return false;
2331     }
2332     /* bbbb   aaaa        */
2333     if (req->overlap_offset >= offset + bytes) {
2334         return false;
2335     }
2336     return true;
2337 }
2338 
2339 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2340 {
2341     BlockDriverState *bs = self->bs;
2342     BdrvTrackedRequest *req;
2343     bool retry;
2344     bool waited = false;
2345 
2346     if (!bs->serialising_in_flight) {
2347         return false;
2348     }
2349 
2350     do {
2351         retry = false;
2352         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2353             if (req == self || (!req->serialising && !self->serialising)) {
2354                 continue;
2355             }
2356             if (tracked_request_overlaps(req, self->overlap_offset,
2357                                          self->overlap_bytes))
2358             {
2359                 /* Hitting this means there was a reentrant request, for
2360                  * example, a block driver issuing nested requests.  This must
2361                  * never happen since it means deadlock.
2362                  */
2363                 assert(qemu_coroutine_self() != req->co);
2364 
2365                 /* If the request is already (indirectly) waiting for us, or
2366                  * will wait for us as soon as it wakes up, then just go on
2367                  * (instead of producing a deadlock in the former case). */
2368                 if (!req->waiting_for) {
2369                     self->waiting_for = req;
2370                     qemu_co_queue_wait(&req->wait_queue);
2371                     self->waiting_for = NULL;
2372                     retry = true;
2373                     waited = true;
2374                     break;
2375                 }
2376             }
2377         }
2378     } while (retry);
2379 
2380     return waited;
2381 }
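
/*
 * Deadlock-avoidance example for the waiting_for check above: if request A
 * is already blocked in qemu_co_queue_wait() on request B, and B now finds
 * that it overlaps A, B must not wait for A in turn.  B sees
 * A->waiting_for != NULL, skips the wait, and finishes; completing B then
 * wakes A through B's wait_queue.
 */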
2382 
2383 /*
2384  * Return values:
2385  * 0        - success
2386  * -EINVAL  - backing format specified, but no file
2387  * -ENOSPC  - can't update the backing file because no space is left in the
2388  *            image file header
2389  * -ENOTSUP - format driver doesn't support changing the backing file
2390  */
2391 int bdrv_change_backing_file(BlockDriverState *bs,
2392     const char *backing_file, const char *backing_fmt)
2393 {
2394     BlockDriver *drv = bs->drv;
2395     int ret;
2396 
2397     /* Backing file format doesn't make sense without a backing file */
2398     if (backing_fmt && !backing_file) {
2399         return -EINVAL;
2400     }
2401 
2402     if (drv->bdrv_change_backing_file != NULL) {
2403         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2404     } else {
2405         ret = -ENOTSUP;
2406     }
2407 
2408     if (ret == 0) {
2409         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2410         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2411     }
2412     return ret;
2413 }
2414 
2415 /*
2416  * Finds the image layer in the chain that has 'bs' as its backing file.
2417  *
2418  * active is the current topmost image.
2419  *
2420  * Returns NULL if bs is not found in active's image chain,
2421  * or if active == bs.
2422  */
2423 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2424                                     BlockDriverState *bs)
2425 {
2426     BlockDriverState *overlay = NULL;
2427     BlockDriverState *intermediate;
2428 
2429     assert(active != NULL);
2430     assert(bs != NULL);
2431 
    /* if bs is the same as active, then by definition it has no overlay */
2434     if (active == bs) {
2435         return NULL;
2436     }
2437 
2438     intermediate = active;
2439     while (intermediate->backing_hd) {
2440         if (intermediate->backing_hd == bs) {
2441             overlay = intermediate;
2442             break;
2443         }
2444         intermediate = intermediate->backing_hd;
2445     }
2446 
2447     return overlay;
2448 }
2449 
2450 typedef struct BlkIntermediateStates {
2451     BlockDriverState *bs;
2452     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2453 } BlkIntermediateStates;
2454 
2455 
2456 /*
2457  * Drops images above 'base' up to and including 'top', and sets the image
2458  * above 'top' to have base as its backing file.
2459  *
 * Requires that the overlay to 'top' is opened r/w, so that its backing file
 * information can be properly updated.
2462  *
2463  * E.g., this will convert the following chain:
2464  * bottom <- base <- intermediate <- top <- active
2465  *
2466  * to
2467  *
2468  * bottom <- base <- active
2469  *
2470  * It is allowed for bottom==base, in which case it converts:
2471  *
2472  * base <- intermediate <- top <- active
2473  *
2474  * to
2475  *
2476  * base <- active
2477  *
2478  * Error conditions:
2479  *  if active == top, that is considered an error
2480  *
2481  */
2482 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2483                            BlockDriverState *base)
2484 {
2485     BlockDriverState *intermediate;
2486     BlockDriverState *base_bs = NULL;
2487     BlockDriverState *new_top_bs = NULL;
2488     BlkIntermediateStates *intermediate_state, *next;
2489     int ret = -EIO;
2490 
2491     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2492     QSIMPLEQ_INIT(&states_to_delete);
2493 
2494     if (!top->drv || !base->drv) {
2495         goto exit;
2496     }
2497 
2498     new_top_bs = bdrv_find_overlay(active, top);
2499 
2500     if (new_top_bs == NULL) {
2501         /* we could not find the image above 'top', this is an error */
2502         goto exit;
2503     }
2504 
2505     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2506      * to do, no intermediate images */
2507     if (new_top_bs->backing_hd == base) {
2508         ret = 0;
2509         goto exit;
2510     }
2511 
2512     intermediate = top;
2513 
2514     /* now we will go down through the list, and add each BDS we find
2515      * into our deletion queue, until we hit the 'base'
2516      */
2517     while (intermediate) {
2518         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2519         intermediate_state->bs = intermediate;
2520         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2521 
2522         if (intermediate->backing_hd == base) {
2523             base_bs = intermediate->backing_hd;
2524             break;
2525         }
2526         intermediate = intermediate->backing_hd;
2527     }
2528     if (base_bs == NULL) {
        /* Something went wrong: we did not end at the base.  Safely
         * unravel everything and exit with an error. */
2531         goto exit;
2532     }
2533 
2534     /* success - we can delete the intermediate states, and link top->base */
2535     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2536                                    base_bs->drv ? base_bs->drv->format_name : "");
2537     if (ret) {
2538         goto exit;
2539     }
2540     new_top_bs->backing_hd = base_bs;
2541 
2542     bdrv_refresh_limits(new_top_bs);
2543 
2544     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2545         /* so that bdrv_close() does not recursively close the chain */
2546         intermediate_state->bs->backing_hd = NULL;
2547         bdrv_unref(intermediate_state->bs);
2548     }
2549     ret = 0;
2550 
2551 exit:
2552     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2553         g_free(intermediate_state);
2554     }
2555     return ret;
2556 }
2557 
2558 
2559 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2560                                    size_t size)
2561 {
2562     int64_t len;
2563 
    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (bs->growable) {
        return 0;
    }

    len = bdrv_getlength(bs);

    if (offset < 0) {
        return -EIO;
    }

    if ((offset > len) || (len - offset < size)) {
        return -EIO;
    }
2577 
2578     return 0;
2579 }
2580 
2581 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2582                               int nb_sectors)
2583 {
2584     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2585                                    nb_sectors * BDRV_SECTOR_SIZE);
2586 }
2587 
2588 typedef struct RwCo {
2589     BlockDriverState *bs;
2590     int64_t offset;
2591     QEMUIOVector *qiov;
2592     bool is_write;
2593     int ret;
2594     BdrvRequestFlags flags;
2595 } RwCo;
2596 
2597 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2598 {
2599     RwCo *rwco = opaque;
2600 
2601     if (!rwco->is_write) {
2602         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2603                                       rwco->qiov->size, rwco->qiov,
2604                                       rwco->flags);
2605     } else {
2606         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2607                                        rwco->qiov->size, rwco->qiov,
2608                                        rwco->flags);
2609     }
2610 }
2611 
2612 /*
2613  * Process a vectored synchronous request using coroutines
2614  */
2615 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2616                         QEMUIOVector *qiov, bool is_write,
2617                         BdrvRequestFlags flags)
2618 {
2619     Coroutine *co;
2620     RwCo rwco = {
2621         .bs = bs,
2622         .offset = offset,
2623         .qiov = qiov,
2624         .is_write = is_write,
2625         .ret = NOT_DONE,
2626         .flags = flags,
2627     };
2628 
2629     /**
     * In a synchronous call context, while the vcpu is blocked, the
     * throttling timer cannot fire, so I/O throttling has to be disabled
     * here if it has been enabled.
2633      */
2634     if (bs->io_limits_enabled) {
2635         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2636                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2637         bdrv_io_limits_disable(bs);
2638     }
2639 
2640     if (qemu_in_coroutine()) {
2641         /* Fast-path if already in coroutine context */
2642         bdrv_rw_co_entry(&rwco);
2643     } else {
2644         co = qemu_coroutine_create(bdrv_rw_co_entry);
2645         qemu_coroutine_enter(co, &rwco);
2646         while (rwco.ret == NOT_DONE) {
2647             qemu_aio_wait();
2648         }
2649     }
2650     return rwco.ret;
2651 }
2652 
2653 /*
2654  * Process a synchronous request using coroutines
2655  */
2656 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2657                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2658 {
2659     QEMUIOVector qiov;
2660     struct iovec iov = {
2661         .iov_base = (void *)buf,
2662         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2663     };
2664 
2665     qemu_iovec_init_external(&qiov, &iov, 1);
2666     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2667                         &qiov, is_write, flags);
2668 }
2669 
2670 /* return < 0 if error. See bdrv_write() for the return codes */
2671 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2672               uint8_t *buf, int nb_sectors)
2673 {
2674     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2675 }
2676 
2677 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2678 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2679                           uint8_t *buf, int nb_sectors)
2680 {
2681     bool enabled;
2682     int ret;
2683 
2684     enabled = bs->io_limits_enabled;
2685     bs->io_limits_enabled = false;
2686     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2687     bs->io_limits_enabled = enabled;
2688     return ret;
2689 }
2690 
/* Return < 0 on error. Important errors are:
 *  -EIO         generic I/O error (may happen for all errors)
 *  -ENOMEDIUM   no media inserted
 *  -EINVAL      invalid sector number or nb_sectors
 *  -EACCES      trying to write to a read-only device
 */
2697 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2698                const uint8_t *buf, int nb_sectors)
2699 {
2700     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2701 }
2702 
2703 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2704                       int nb_sectors, BdrvRequestFlags flags)
2705 {
2706     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2707                       BDRV_REQ_ZERO_WRITE | flags);
2708 }
2709 
2710 /*
2711  * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and writing zeroes
 * only to areas that do not already read back as zeroes. Optional
2714  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2715  *
2716  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2717  */
2718 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2719 {
    int64_t target_size = bdrv_getlength(bs);
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    if (target_size < 0) {
        /* propagate the bdrv_getlength() error instead of dividing it */
        return target_size;
    }
    target_size /= BDRV_SECTOR_SIZE;
2723 
2724     for (;;) {
2725         nb_sectors = target_size - sector_num;
2726         if (nb_sectors <= 0) {
2727             return 0;
2728         }
2729         if (nb_sectors > INT_MAX) {
2730             nb_sectors = INT_MAX;
2731         }
2732         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2733         if (ret < 0) {
2734             error_report("error getting block status at sector %" PRId64 ": %s",
2735                          sector_num, strerror(-ret));
2736             return ret;
2737         }
2738         if (ret & BDRV_BLOCK_ZERO) {
2739             sector_num += n;
2740             continue;
2741         }
2742         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2743         if (ret < 0) {
2744             error_report("error writing zeroes at sector %" PRId64 ": %s",
2745                          sector_num, strerror(-ret));
2746             return ret;
2747         }
2748         sector_num += n;
2749     }
2750 }
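
/*
 * Usage sketch (hypothetical caller): zero an entire device before copying
 * data into it, allowing the driver to unmap rather than write where it can:
 *
 *     int ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         // the device may be only partially zeroed at this point
 *     }
 */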
2751 
2752 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2753 {
2754     QEMUIOVector qiov;
2755     struct iovec iov = {
2756         .iov_base = (void *)buf,
2757         .iov_len = bytes,
2758     };
2759     int ret;
2760 
2761     if (bytes < 0) {
2762         return -EINVAL;
2763     }
2764 
2765     qemu_iovec_init_external(&qiov, &iov, 1);
2766     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2767     if (ret < 0) {
2768         return ret;
2769     }
2770 
2771     return bytes;
2772 }
2773 
2774 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2775 {
2776     int ret;
2777 
2778     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2779     if (ret < 0) {
2780         return ret;
2781     }
2782 
2783     return qiov->size;
2784 }
2785 
2786 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2787                 const void *buf, int bytes)
2788 {
2789     QEMUIOVector qiov;
2790     struct iovec iov = {
2791         .iov_base   = (void *) buf,
2792         .iov_len    = bytes,
2793     };
2794 
2795     if (bytes < 0) {
2796         return -EINVAL;
2797     }
2798 
2799     qemu_iovec_init_external(&qiov, &iov, 1);
2800     return bdrv_pwritev(bs, offset, &qiov);
2801 }
2802 
2803 /*
2804  * Writes to the file and ensures that no writes are reordered across this
2805  * request (acts as a barrier)
2806  *
2807  * Returns 0 on success, -errno in error cases.
2808  */
2809 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2810     const void *buf, int count)
2811 {
2812     int ret;
2813 
2814     ret = bdrv_pwrite(bs, offset, buf, count);
2815     if (ret < 0) {
2816         return ret;
2817     }
2818 
2819     /* No flush needed for cache modes that already do it */
2820     if (bs->enable_write_cache) {
2821         bdrv_flush(bs);
2822     }
2823 
2824     return 0;
2825 }
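
/*
 * Sketch of the barrier use case served by bdrv_pwrite_sync(): ordered
 * metadata updates where a later write must not overtake an earlier one.
 * The offsets and buffers below are hypothetical:
 *
 *     bdrv_pwrite_sync(bs->file, table_offset, table, table_size);
 *     // the header update cannot be reordered before the table write
 *     bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 */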
2826 
2827 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2828         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2829 {
2830     /* Perform I/O through a temporary buffer so that users who scribble over
2831      * their read buffer while the operation is in progress do not end up
2832      * modifying the image file.  This is critical for zero-copy guest I/O
2833      * where anything might happen inside guest memory.
2834      */
2835     void *bounce_buffer;
2836 
2837     BlockDriver *drv = bs->drv;
2838     struct iovec iov;
2839     QEMUIOVector bounce_qiov;
2840     int64_t cluster_sector_num;
2841     int cluster_nb_sectors;
2842     size_t skip_bytes;
2843     int ret;
2844 
    /* Cover the entire cluster so no additional backing file I/O is required
     * when allocating a cluster in the image file.
     */
2848     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2849                            &cluster_sector_num, &cluster_nb_sectors);
2850 
2851     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2852                                    cluster_sector_num, cluster_nb_sectors);
2853 
2854     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2855     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2856     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2857 
2858     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2859                              &bounce_qiov);
2860     if (ret < 0) {
2861         goto err;
2862     }
2863 
2864     if (drv->bdrv_co_write_zeroes &&
2865         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2866         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2867                                       cluster_nb_sectors, 0);
2868     } else {
        /* This does not change the data on the disk, so it is not necessary
         * to flush even in cache=writethrough mode.
         */
2872         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2873                                   &bounce_qiov);
2874     }
2875 
2876     if (ret < 0) {
2877         /* It might be okay to ignore write errors for guest requests.  If this
2878          * is a deliberate copy-on-read then we don't want to ignore the error.
2879          * Simply report it in all cases.
2880          */
2881         goto err;
2882     }
2883 
2884     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2885     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2886                         nb_sectors * BDRV_SECTOR_SIZE);
2887 
2888 err:
2889     qemu_vfree(bounce_buffer);
2890     return ret;
2891 }
2892 
2893 /*
2894  * Forwards an already correctly aligned request to the BlockDriver. This
2895  * handles copy on read and zeroing after EOF; any other features must be
2896  * implemented by the caller.
2897  */
2898 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2899     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2900     int64_t align, QEMUIOVector *qiov, int flags)
2901 {
2902     BlockDriver *drv = bs->drv;
2903     int ret;
2904 
2905     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2906     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2907 
2908     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2909     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2910 
2911     /* Handle Copy on Read and associated serialisation */
2912     if (flags & BDRV_REQ_COPY_ON_READ) {
2913         /* If we touch the same cluster it counts as an overlap.  This
2914          * guarantees that allocating writes will be serialized and not race
2915          * with each other for the same cluster.  For example, in copy-on-read
2916          * it ensures that the CoR read and write operations are atomic and
2917          * guest writes cannot interleave between them. */
2918         mark_request_serialising(req, bdrv_get_cluster_size(bs));
2919     }
2920 
2921     wait_serialising_requests(req);
2922 
2923     if (flags & BDRV_REQ_COPY_ON_READ) {
2924         int pnum;
2925 
2926         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2927         if (ret < 0) {
2928             goto out;
2929         }
2930 
2931         if (!ret || pnum != nb_sectors) {
2932             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2933             goto out;
2934         }
2935     }
2936 
2937     /* Forward the request to the BlockDriver */
2938     if (!(bs->zero_beyond_eof && bs->growable)) {
2939         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2940     } else {
        /* Read zeroes after EOF of growable BDSes */
2942         int64_t len, total_sectors, max_nb_sectors;
2943 
2944         len = bdrv_getlength(bs);
2945         if (len < 0) {
2946             ret = len;
2947             goto out;
2948         }
2949 
2950         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2951         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2952                                   align >> BDRV_SECTOR_BITS);
2953         if (max_nb_sectors > 0) {
2954             ret = drv->bdrv_co_readv(bs, sector_num,
2955                                      MIN(nb_sectors, max_nb_sectors), qiov);
2956         } else {
2957             ret = 0;
2958         }
2959 
2960         /* Reading beyond end of file is supposed to produce zeroes */
2961         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2962             uint64_t offset = MAX(0, total_sectors - sector_num);
            /* offset is relative to the request, so only the tail of the
             * qiov needs zeroing */
            uint64_t bytes = (nb_sectors - offset) * BDRV_SECTOR_SIZE;
2965             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2966         }
2967     }
2968 
2969 out:
2970     return ret;
2971 }
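
/*
 * Worked example for the zero-beyond-EOF path above: with total_sectors ==
 * 100 and a read of sectors [90, 110), the driver is asked for at most the
 * 10 sectors that actually exist (rounded up to the request alignment), and
 * qemu_iovec_memset() then zero-fills the tail of the qiov starting at byte
 * 10 * BDRV_SECTOR_SIZE, so the guest reads zeroes past EOF.
 */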
2972 
2973 /*
2974  * Handle a read request in coroutine context
2975  */
2976 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2977     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2978     BdrvRequestFlags flags)
2979 {
2980     BlockDriver *drv = bs->drv;
2981     BdrvTrackedRequest req;
2982 
2983     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2984     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2985     uint8_t *head_buf = NULL;
2986     uint8_t *tail_buf = NULL;
2987     QEMUIOVector local_qiov;
2988     bool use_local_qiov = false;
2989     int ret;
2990 
2991     if (!drv) {
2992         return -ENOMEDIUM;
2993     }
2994     if (bdrv_check_byte_request(bs, offset, bytes)) {
2995         return -EIO;
2996     }
2997 
2998     if (bs->copy_on_read) {
2999         flags |= BDRV_REQ_COPY_ON_READ;
3000     }
3001 
3002     /* throttling disk I/O */
3003     if (bs->io_limits_enabled) {
3004         bdrv_io_limits_intercept(bs, bytes, false);
3005     }
3006 
3007     /* Align read if necessary by padding qiov */
3008     if (offset & (align - 1)) {
3009         head_buf = qemu_blockalign(bs, align);
3010         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3011         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3012         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3013         use_local_qiov = true;
3014 
3015         bytes += offset & (align - 1);
3016         offset = offset & ~(align - 1);
3017     }
3018 
3019     if ((offset + bytes) & (align - 1)) {
3020         if (!use_local_qiov) {
3021             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3022             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3023             use_local_qiov = true;
3024         }
3025         tail_buf = qemu_blockalign(bs, align);
3026         qemu_iovec_add(&local_qiov, tail_buf,
3027                        align - ((offset + bytes) & (align - 1)));
3028 
3029         bytes = ROUND_UP(bytes, align);
3030     }
3031 
3032     tracked_request_begin(&req, bs, offset, bytes, false);
3033     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3034                               use_local_qiov ? &local_qiov : qiov,
3035                               flags);
3036     tracked_request_end(&req);
3037 
3038     if (use_local_qiov) {
3039         qemu_iovec_destroy(&local_qiov);
3040         qemu_vfree(head_buf);
3041         qemu_vfree(tail_buf);
3042     }
3043 
3044     return ret;
3045 }
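
/*
 * Worked example of the read padding above, assuming align == 4096: a guest
 * read of bytes [1000, 3000) is turned into an aligned read of [0, 4096).
 * head_buf receives bytes [0, 1000), the caller's qiov receives
 * [1000, 3000), and tail_buf receives [3000, 4096); head and tail buffers
 * are simply freed afterwards, since reads need no read-modify-write cycle.
 */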
3046 
3047 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3048     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3049     BdrvRequestFlags flags)
3050 {
3051     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3052         return -EINVAL;
3053     }
3054 
3055     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3056                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3057 }
3058 
3059 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3060     int nb_sectors, QEMUIOVector *qiov)
3061 {
3062     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3063 
3064     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3065 }
3066 
3067 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3068     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3069 {
3070     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3071 
3072     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3073                             BDRV_REQ_COPY_ON_READ);
3074 }
3075 
/* If no limit is specified in the BlockLimits, use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
3079 #define MAX_WRITE_ZEROES_DEFAULT 32768
3080 
3081 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3082     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3083 {
3084     BlockDriver *drv = bs->drv;
3085     QEMUIOVector qiov;
3086     struct iovec iov = {0};
3087     int ret = 0;
3088 
3089     int max_write_zeroes = bs->bl.max_write_zeroes ?
3090                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3091 
3092     while (nb_sectors > 0 && !ret) {
3093         int num = nb_sectors;
3094 
3095         /* Align request.  Block drivers can expect the "bulk" of the request
3096          * to be aligned.
3097          */
3098         if (bs->bl.write_zeroes_alignment
3099             && num > bs->bl.write_zeroes_alignment) {
3100             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3101                 /* Make a small request up to the first aligned sector.  */
3102                 num = bs->bl.write_zeroes_alignment;
3103                 num -= sector_num % bs->bl.write_zeroes_alignment;
3104             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3105                 /* Shorten the request to the last aligned sector.  num cannot
3106                  * underflow because num > bs->bl.write_zeroes_alignment.
3107                  */
3108                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3109             }
3110         }
3111 
3112         /* limit request size */
3113         if (num > max_write_zeroes) {
3114             num = max_write_zeroes;
3115         }
3116 
3117         ret = -ENOTSUP;
3118         /* First try the efficient write zeroes operation */
3119         if (drv->bdrv_co_write_zeroes) {
3120             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3121         }
3122 
3123         if (ret == -ENOTSUP) {
3124             /* Fall back to bounce buffer if write zeroes is unsupported */
3125             iov.iov_len = num * BDRV_SECTOR_SIZE;
3126             if (iov.iov_base == NULL) {
3127                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3128                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3129             }
3130             qemu_iovec_init_external(&qiov, &iov, 1);
3131 
3132             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3133 
            /* Keep the bounce buffer around if it is big enough for
             * all future requests.
             */
3137             if (num < max_write_zeroes) {
3138                 qemu_vfree(iov.iov_base);
3139                 iov.iov_base = NULL;
3140             }
3141         }
3142 
3143         sector_num += num;
3144         nb_sectors -= num;
3145     }
3146 
3147     qemu_vfree(iov.iov_base);
3148     return ret;
3149 }
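
/*
 * Worked example of the alignment splitting above: with
 * bs->bl.write_zeroes_alignment == 8, a request for sectors [5, 25) is
 * issued as three driver calls: [5, 8) to reach the first aligned sector,
 * the aligned bulk [8, 24), and the unaligned tail [24, 25).
 */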
3150 
3151 /*
3152  * Forwards an already correctly aligned write request to the BlockDriver.
3153  */
3154 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3155     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3156     QEMUIOVector *qiov, int flags)
3157 {
3158     BlockDriver *drv = bs->drv;
3159     bool waited;
3160     int ret;
3161 
3162     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3163     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3164 
3165     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3166     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3167 
3168     waited = wait_serialising_requests(req);
3169     assert(!waited || !req->serialising);
3170     assert(req->overlap_offset <= offset);
3171     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3172 
3173     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3174 
3175     if (ret < 0) {
3176         /* Do nothing, write notifier decided to fail this request */
3177     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3178         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3179         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3180     } else {
3181         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3182         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3183     }
3184     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3185 
3186     if (ret == 0 && !bs->enable_write_cache) {
3187         ret = bdrv_co_flush(bs);
3188     }
3189 
3190     bdrv_set_dirty(bs, sector_num, nb_sectors);
3191 
3192     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3193         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3194     }
3195     if (bs->growable && ret >= 0) {
3196         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3197     }
3198 
3199     return ret;
3200 }
3201 
3202 /*
3203  * Handle a write request in coroutine context
3204  */
3205 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3206     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3207     BdrvRequestFlags flags)
3208 {
3209     BdrvTrackedRequest req;
3210     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3211     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3212     uint8_t *head_buf = NULL;
3213     uint8_t *tail_buf = NULL;
3214     QEMUIOVector local_qiov;
3215     bool use_local_qiov = false;
3216     int ret;
3217 
3218     if (!bs->drv) {
3219         return -ENOMEDIUM;
3220     }
3221     if (bs->read_only) {
3222         return -EACCES;
3223     }
3224     if (bdrv_check_byte_request(bs, offset, bytes)) {
3225         return -EIO;
3226     }
3227 
3228     /* throttling disk I/O */
3229     if (bs->io_limits_enabled) {
3230         bdrv_io_limits_intercept(bs, bytes, true);
3231     }
3232 
3233     /*
3234      * Align write if necessary by performing a read-modify-write cycle.
3235      * Pad qiov with the read parts and be sure to have a tracked request not
3236      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3237      */
3238     tracked_request_begin(&req, bs, offset, bytes, true);
3239 
3240     if (offset & (align - 1)) {
3241         QEMUIOVector head_qiov;
3242         struct iovec head_iov;
3243 
3244         mark_request_serialising(&req, align);
3245         wait_serialising_requests(&req);
3246 
3247         head_buf = qemu_blockalign(bs, align);
3248         head_iov = (struct iovec) {
3249             .iov_base   = head_buf,
3250             .iov_len    = align,
3251         };
3252         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3253 
3254         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3255         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3256                                   align, &head_qiov, 0);
3257         if (ret < 0) {
3258             goto fail;
3259         }
3260         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3261 
3262         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3263         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3264         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3265         use_local_qiov = true;
3266 
3267         bytes += offset & (align - 1);
3268         offset = offset & ~(align - 1);
3269     }
3270 
3271     if ((offset + bytes) & (align - 1)) {
3272         QEMUIOVector tail_qiov;
3273         struct iovec tail_iov;
3274         size_t tail_bytes;
3275         bool waited;
3276 
3277         mark_request_serialising(&req, align);
3278         waited = wait_serialising_requests(&req);
3279         assert(!waited || !use_local_qiov);
3280 
3281         tail_buf = qemu_blockalign(bs, align);
3282         tail_iov = (struct iovec) {
3283             .iov_base   = tail_buf,
3284             .iov_len    = align,
3285         };
3286         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3287 
3288         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3289         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3290                                   align, &tail_qiov, 0);
3291         if (ret < 0) {
3292             goto fail;
3293         }
3294         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3295 
3296         if (!use_local_qiov) {
3297             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3298             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3299             use_local_qiov = true;
3300         }
3301 
3302         tail_bytes = (offset + bytes) & (align - 1);
3303         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3304 
3305         bytes = ROUND_UP(bytes, align);
3306     }
3307 
3308     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3309                                use_local_qiov ? &local_qiov : qiov,
3310                                flags);
3311 
3312 fail:
3313     tracked_request_end(&req);
3314 
3315     if (use_local_qiov) {
3316         qemu_iovec_destroy(&local_qiov);
3317     }
3318     qemu_vfree(head_buf);
3319     qemu_vfree(tail_buf);
3320 
3321     return ret;
3322 }
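
/*
 * Worked example of the read-modify-write cycle above, assuming align ==
 * 4096: a guest write of bytes [1000, 3000) reads the aligned head
 * [0, 4096) into head_buf; since the request fits within one alignment
 * unit, the tail read covers the same range into tail_buf.  The final
 * bdrv_aligned_pwritev() then writes [0, 4096) assembled from head bytes
 * [0, 1000), the guest data, and tail bytes [3000, 4096).  The request is
 * marked serialising so no concurrent write can slip in between the reads
 * and the write.
 */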
3323 
3324 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3325     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3326     BdrvRequestFlags flags)
3327 {
3328     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3329         return -EINVAL;
3330     }
3331 
3332     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3333                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3334 }
3335 
3336 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3337     int nb_sectors, QEMUIOVector *qiov)
3338 {
3339     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3340 
3341     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3342 }
3343 
3344 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3345                                       int64_t sector_num, int nb_sectors,
3346                                       BdrvRequestFlags flags)
3347 {
3348     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3349 
3350     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3351         flags &= ~BDRV_REQ_MAY_UNMAP;
3352     }
3353 
3354     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3355                              BDRV_REQ_ZERO_WRITE | flags);
3356 }
3357 
3358 /**
3359  * Truncate file to 'offset' bytes (needed only for file protocols)
3360  */
3361 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3362 {
3363     BlockDriver *drv = bs->drv;
3364     int ret;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!drv->bdrv_truncate) {
        return -ENOTSUP;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_in_use(bs)) {
        return -EBUSY;
    }
3373     ret = drv->bdrv_truncate(bs, offset);
3374     if (ret == 0) {
3375         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3376         bdrv_dev_resize_cb(bs);
3377     }
3378     return ret;
3379 }
3380 
3381 /**
3382  * Length of an allocated file in bytes.  Sparse files are counted by the
3383  * space they actually occupy on disk.  Return < 0 on error or if unknown.
3384  */
3385 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3386 {
3387     BlockDriver *drv = bs->drv;
3388     if (!drv) {
3389         return -ENOMEDIUM;
3390     }
3391     if (drv->bdrv_get_allocated_file_size) {
3392         return drv->bdrv_get_allocated_file_size(bs);
3393     }
3394     if (bs->file) {
3395         return bdrv_get_allocated_file_size(bs->file);
3396     }
3397     return -ENOTSUP;
3398 }
3399 
3400 /**
3401  * Length of a file in bytes. Return < 0 if error or unknown.
3402  */
3403 int64_t bdrv_getlength(BlockDriverState *bs)
3404 {
3405     BlockDriver *drv = bs->drv;
3406     if (!drv)
3407         return -ENOMEDIUM;
3408 
3409     if (drv->has_variable_length) {
3410         int ret = refresh_total_sectors(bs, bs->total_sectors);
3411         if (ret < 0) {
3412             return ret;
3413         }
3414     }
3415     return bs->total_sectors * BDRV_SECTOR_SIZE;
3416 }
3417 
3418 /* Return 0 as the number of sectors if no device is present or on error */
3419 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3420 {
3421     int64_t length;
3422     length = bdrv_getlength(bs);
3423     if (length < 0)
3424         length = 0;
3425     else
3426         length = length >> BDRV_SECTOR_BITS;
3427     *nb_sectors_ptr = length;
3428 }
3429 
3430 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3431                        BlockdevOnError on_write_error)
3432 {
3433     bs->on_read_error = on_read_error;
3434     bs->on_write_error = on_write_error;
3435 }
3436 
3437 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3438 {
3439     return is_read ? bs->on_read_error : bs->on_write_error;
3440 }
3441 
3442 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3443 {
3444     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3445 
3446     switch (on_err) {
3447     case BLOCKDEV_ON_ERROR_ENOSPC:
3448         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3449     case BLOCKDEV_ON_ERROR_STOP:
3450         return BDRV_ACTION_STOP;
3451     case BLOCKDEV_ON_ERROR_REPORT:
3452         return BDRV_ACTION_REPORT;
3453     case BLOCKDEV_ON_ERROR_IGNORE:
3454         return BDRV_ACTION_IGNORE;
3455     default:
3456         abort();
3457     }
3458 }
3459 
3460 /* This is done by device models because, while the block layer knows
3461  * about the error, it does not know whether an operation comes from
3462  * the device or the block layer (from a job, for example).
3463  */
3464 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3465                        bool is_read, int error)
3466 {
3467     assert(error >= 0);
3468     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3469     if (action == BDRV_ACTION_STOP) {
3470         vm_stop(RUN_STATE_IO_ERROR);
3471         bdrv_iostatus_set_err(bs, error);
3472     }
3473 }
3474 
3475 int bdrv_is_read_only(BlockDriverState *bs)
3476 {
3477     return bs->read_only;
3478 }
3479 
3480 int bdrv_is_sg(BlockDriverState *bs)
3481 {
3482     return bs->sg;
3483 }
3484 
3485 int bdrv_enable_write_cache(BlockDriverState *bs)
3486 {
3487     return bs->enable_write_cache;
3488 }
3489 
3490 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3491 {
3492     bs->enable_write_cache = wce;
3493 
3494     /* so a reopen() will preserve wce */
3495     if (wce) {
3496         bs->open_flags |= BDRV_O_CACHE_WB;
3497     } else {
3498         bs->open_flags &= ~BDRV_O_CACHE_WB;
3499     }
3500 }
3501 
3502 int bdrv_is_encrypted(BlockDriverState *bs)
3503 {
3504     if (bs->backing_hd && bs->backing_hd->encrypted)
3505         return 1;
3506     return bs->encrypted;
3507 }
3508 
3509 int bdrv_key_required(BlockDriverState *bs)
3510 {
3511     BlockDriverState *backing_hd = bs->backing_hd;
3512 
3513     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3514         return 1;
3515     return (bs->encrypted && !bs->valid_key);
3516 }
3517 
3518 int bdrv_set_key(BlockDriverState *bs, const char *key)
3519 {
3520     int ret;
3521     if (bs->backing_hd && bs->backing_hd->encrypted) {
3522         ret = bdrv_set_key(bs->backing_hd, key);
3523         if (ret < 0)
3524             return ret;
3525         if (!bs->encrypted)
3526             return 0;
3527     }
3528     if (!bs->encrypted) {
3529         return -EINVAL;
3530     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3531         return -ENOMEDIUM;
3532     }
3533     ret = bs->drv->bdrv_set_key(bs, key);
3534     if (ret < 0) {
3535         bs->valid_key = 0;
3536     } else if (!bs->valid_key) {
3537         bs->valid_key = 1;
3538         /* call the change callback now, we skipped it on open */
3539         bdrv_dev_change_media_cb(bs, true);
3540     }
3541     return ret;
3542 }
3543 
3544 const char *bdrv_get_format_name(BlockDriverState *bs)
3545 {
3546     return bs->drv ? bs->drv->format_name : NULL;
3547 }
3548 
3549 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3550                          void *opaque)
3551 {
3552     BlockDriver *drv;
3553 
3554     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3555         it(opaque, drv->format_name);
3556     }
3557 }
3558 
3559 /* Find a block backend by its device name */
3560 BlockDriverState *bdrv_find(const char *name)
3561 {
3562     BlockDriverState *bs;
3563 
3564     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3565         if (!strcmp(name, bs->device_name)) {
3566             return bs;
3567         }
3568     }
3569     return NULL;
3570 }
3571 
3572 /* Find a node by name in the graph of BlockDriverStates */
3573 BlockDriverState *bdrv_find_node(const char *node_name)
3574 {
3575     BlockDriverState *bs;
3576 
3577     assert(node_name);
3578 
3579     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3580         if (!strcmp(node_name, bs->node_name)) {
3581             return bs;
3582         }
3583     }
3584     return NULL;
3585 }
3586 
3587 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3588 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3589 {
3590     BlockDeviceInfoList *list, *entry;
3591     BlockDriverState *bs;
3592 
3593     list = NULL;
3594     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3595         entry = g_malloc0(sizeof(*entry));
3596         entry->value = bdrv_block_device_info(bs);
3597         entry->next = list;
3598         list = entry;
3599     }
3600 
3601     return list;
3602 }
3603 
3604 BlockDriverState *bdrv_lookup_bs(const char *device,
3605                                  const char *node_name,
3606                                  Error **errp)
3607 {
3608     BlockDriverState *bs = NULL;
3609 
3610     if (device) {
3611         bs = bdrv_find(device);
3612 
3613         if (bs) {
3614             return bs;
3615         }
3616     }
3617 
3618     if (node_name) {
3619         bs = bdrv_find_node(node_name);
3620 
3621         if (bs) {
3622             return bs;
3623         }
3624     }
3625 
3626     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3627                      device ? device : "",
3628                      node_name ? node_name : "");
3629     return NULL;
3630 }
3631 
3632 BlockDriverState *bdrv_next(BlockDriverState *bs)
3633 {
3634     if (!bs) {
3635         return QTAILQ_FIRST(&bdrv_states);
3636     }
3637     return QTAILQ_NEXT(bs, device_list);
3638 }
3639 
3640 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3641 {
3642     BlockDriverState *bs;
3643 
3644     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3645         it(opaque, bs);
3646     }
3647 }
3648 
3649 const char *bdrv_get_device_name(BlockDriverState *bs)
3650 {
3651     return bs->device_name;
3652 }
3653 
3654 int bdrv_get_flags(BlockDriverState *bs)
3655 {
3656     return bs->open_flags;
3657 }
3658 
3659 int bdrv_flush_all(void)
3660 {
3661     BlockDriverState *bs;
3662     int result = 0;
3663 
3664     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3665         int ret = bdrv_flush(bs);
3666         if (ret < 0 && !result) {
3667             result = ret;
3668         }
3669     }
3670 
3671     return result;
3672 }
3673 
3674 int bdrv_has_zero_init_1(BlockDriverState *bs)
3675 {
3676     return 1;
3677 }
3678 
3679 int bdrv_has_zero_init(BlockDriverState *bs)
3680 {
3681     assert(bs->drv);
3682 
3683     /* If bs is a copy-on-write image, it is initialized to
3684        the contents of the base image, which may not be zeroes.  */
3685     if (bs->backing_hd) {
3686         return 0;
3687     }
3688     if (bs->drv->bdrv_has_zero_init) {
3689         return bs->drv->bdrv_has_zero_init(bs);
3690     }
3691 
3692     /* safe default */
3693     return 0;
3694 }
3695 
3696 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3697 {
3698     BlockDriverInfo bdi;
3699 
3700     if (bs->backing_hd) {
3701         return false;
3702     }
3703 
3704     if (bdrv_get_info(bs, &bdi) == 0) {
3705         return bdi.unallocated_blocks_are_zero;
3706     }
3707 
3708     return false;
3709 }
3710 
3711 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3712 {
3713     BlockDriverInfo bdi;
3714 
3715     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3716         return false;
3717     }
3718 
3719     if (bdrv_get_info(bs, &bdi) == 0) {
3720         return bdi.can_write_zeroes_with_unmap;
3721     }
3722 
3723     return false;
3724 }
3725 
3726 typedef struct BdrvCoGetBlockStatusData {
3727     BlockDriverState *bs;
3728     BlockDriverState *base;
3729     int64_t sector_num;
3730     int nb_sectors;
3731     int *pnum;
3732     int64_t ret;
3733     bool done;
3734 } BdrvCoGetBlockStatusData;
3735 
3736 /*
3737  * Returns the allocation status (BDRV_BLOCK_* flags) of the specified
3738  * sectors.  Drivers not implementing the functionality are assumed to not
3739  * support backing files, hence all their sectors are reported as allocated.
3740  *
3741  * If 'sector_num' is beyond the end of the disk image the return value is 0
3742  * and 'pnum' is set to 0.
3743  *
3744  * 'pnum' is set to the number of sectors (including and immediately following
3745  * the specified sector) that are known to be in the same
3746  * allocated/unallocated state.
3747  *
3748  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3749  * beyond the end of the disk image it will be clamped.
3750  */
3751 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3752                                                      int64_t sector_num,
3753                                                      int nb_sectors, int *pnum)
3754 {
3755     int64_t length;
3756     int64_t n;
3757     int64_t ret, ret2;
3758 
3759     length = bdrv_getlength(bs);
3760     if (length < 0) {
3761         return length;
3762     }
3763 
3764     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3765         *pnum = 0;
3766         return 0;
3767     }
3768 
3769     n = bs->total_sectors - sector_num;
3770     if (n < nb_sectors) {
3771         nb_sectors = n;
3772     }
3773 
3774     if (!bs->drv->bdrv_co_get_block_status) {
3775         *pnum = nb_sectors;
3776         ret = BDRV_BLOCK_DATA;
3777         if (bs->drv->protocol_name) {
3778             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3779         }
3780         return ret;
3781     }
3782 
3783     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3784     if (ret < 0) {
3785         *pnum = 0;
3786         return ret;
3787     }
3788 
3789     if (ret & BDRV_BLOCK_RAW) {
3790         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3791         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3792                                      *pnum, pnum);
3793     }
3794 
3795     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3796         if (bdrv_unallocated_blocks_are_zero(bs)) {
3797             ret |= BDRV_BLOCK_ZERO;
3798         } else if (bs->backing_hd) {
3799             BlockDriverState *bs2 = bs->backing_hd;
3800             int64_t length2 = bdrv_getlength(bs2);
3801             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3802                 ret |= BDRV_BLOCK_ZERO;
3803             }
3804         }
3805     }
3806 
3807     if (bs->file &&
3808         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3809         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3810         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3811                                         *pnum, pnum);
3812         if (ret2 >= 0) {
3813             /* Ignore errors.  This is just providing extra information, it
3814              * is useful but not necessary.
3815              */
3816             ret |= (ret2 & BDRV_BLOCK_ZERO);
3817         }
3818     }
3819 
3820     return ret;
3821 }
3822 
3823 /* Coroutine wrapper for bdrv_get_block_status() */
3824 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3825 {
3826     BdrvCoGetBlockStatusData *data = opaque;
3827     BlockDriverState *bs = data->bs;
3828 
3829     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3830                                          data->pnum);
3831     data->done = true;
3832 }
3833 
3834 /*
3835  * Synchronous wrapper around bdrv_co_get_block_status().
3836  *
3837  * See bdrv_co_get_block_status() for details.
3838  */
3839 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3840                               int nb_sectors, int *pnum)
3841 {
3842     Coroutine *co;
3843     BdrvCoGetBlockStatusData data = {
3844         .bs = bs,
3845         .sector_num = sector_num,
3846         .nb_sectors = nb_sectors,
3847         .pnum = pnum,
3848         .done = false,
3849     };
3850 
3851     if (qemu_in_coroutine()) {
3852         /* Fast-path if already in coroutine context */
3853         bdrv_get_block_status_co_entry(&data);
3854     } else {
3855         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3856         qemu_coroutine_enter(co, &data);
3857         while (!data.done) {
3858             qemu_aio_wait();
3859         }
3860     }
3861     return data.ret;
3862 }
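
/* Illustrative sketch (not part of the block layer): walking a whole image
 * with bdrv_get_block_status(), relying on the 'pnum' contract documented
 * above to advance by whole extents.  The helper name is made up. */
static void example_dump_block_status(BlockDriverState *bs)
{
    int64_t total_sectors = bdrv_getlength(bs);
    int64_t sector = 0;

    if (total_sectors < 0) {
        return;
    }
    total_sectors >>= BDRV_SECTOR_BITS;

    while (sector < total_sectors) {
        int pnum;
        int nb = MIN(total_sectors - sector, INT_MAX >> BDRV_SECTOR_BITS);
        int64_t ret = bdrv_get_block_status(bs, sector, nb, &pnum);

        if (ret < 0 || pnum == 0) {
            break;      /* error, or no progress can be made */
        }
        printf("sectors %" PRId64 "+%d:%s%s\n", sector, pnum,
               (ret & BDRV_BLOCK_DATA) ? " data" : "",
               (ret & BDRV_BLOCK_ZERO) ? " zero" : "");
        sector += pnum; /* all 'pnum' sectors share the same state */
    }
}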
3863 
3864 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3865                                    int nb_sectors, int *pnum)
3866 {
3867     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3868     if (ret < 0) {
3869         return ret;
3870     }
3871     return
3872         (ret & BDRV_BLOCK_DATA) ||
3873         ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3874 }
3875 
3876 /*
3877  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3878  *
3879  * Return true if the given sector is allocated in any image between
3880  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3881  * sector is allocated in any image of the chain.  Return false otherwise.
3882  *
3883  * 'pnum' is set to the number of sectors (including and immediately following
3884  *  the specified sector) that are known to be in the same
3885  *  allocated/unallocated state.
3886  *
3887  */
3888 int bdrv_is_allocated_above(BlockDriverState *top,
3889                             BlockDriverState *base,
3890                             int64_t sector_num,
3891                             int nb_sectors, int *pnum)
3892 {
3893     BlockDriverState *intermediate;
3894     int ret, n = nb_sectors;
3895 
3896     intermediate = top;
3897     while (intermediate && intermediate != base) {
3898         int pnum_inter;
3899         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3900                                 &pnum_inter);
3901         if (ret < 0) {
3902             return ret;
3903         } else if (ret) {
3904             *pnum = pnum_inter;
3905             return 1;
3906         }
3907 
3908         /*
3909          * [sector_num, sector_num + nb_sectors) is unallocated on top but
3910          * an intermediate image might still have
3911          *
3912          * [sector_num + x, sector_num + nb_sectors) allocated.
3913          */
3914         if (n > pnum_inter &&
3915             (intermediate == top ||
3916              sector_num + pnum_inter < intermediate->total_sectors)) {
3917             n = pnum_inter;
3918         }
3919 
3920         intermediate = intermediate->backing_hd;
3921     }
3922 
3923     *pnum = n;
3924     return 0;
3925 }
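
/* Illustrative sketch (not part of the block layer): counting how many of
 * 'nb_sectors' live in images above 'base', the way a commit job might.
 * The helper name is made up. */
static int example_count_allocated_above(BlockDriverState *top,
                                         BlockDriverState *base,
                                         int64_t sector_num, int nb_sectors)
{
    int allocated = 0;

    while (nb_sectors > 0) {
        int pnum;
        int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
                                          &pnum);
        if (ret < 0) {
            return ret;         /* error while querying the chain */
        }
        if (pnum == 0) {
            break;              /* past the end of the image */
        }
        if (ret) {
            allocated += pnum;  /* these sectors would need copying */
        }
        sector_num += pnum;     /* 'pnum' sectors share the same state */
        nb_sectors -= pnum;
    }
    return allocated;
}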
3926 
3927 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3928 {
3929     if (bs->backing_hd && bs->backing_hd->encrypted)
3930         return bs->backing_file;
3931     else if (bs->encrypted)
3932         return bs->filename;
3933     else
3934         return NULL;
3935 }
3936 
3937 void bdrv_get_backing_filename(BlockDriverState *bs,
3938                                char *filename, int filename_size)
3939 {
3940     pstrcpy(filename, filename_size, bs->backing_file);
3941 }
3942 
3943 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3944                           const uint8_t *buf, int nb_sectors)
3945 {
3946     BlockDriver *drv = bs->drv;
3947     if (!drv)
3948         return -ENOMEDIUM;
3949     if (!drv->bdrv_write_compressed)
3950         return -ENOTSUP;
3951     if (bdrv_check_request(bs, sector_num, nb_sectors))
3952         return -EIO;
3953 
3954     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3955 
3956     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3957 }
3958 
3959 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3960 {
3961     BlockDriver *drv = bs->drv;
3962     if (!drv)
3963         return -ENOMEDIUM;
3964     if (!drv->bdrv_get_info)
3965         return -ENOTSUP;
3966     memset(bdi, 0, sizeof(*bdi));
3967     return drv->bdrv_get_info(bs, bdi);
3968 }
3969 
3970 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3971 {
3972     BlockDriver *drv = bs->drv;
3973     if (drv && drv->bdrv_get_specific_info) {
3974         return drv->bdrv_get_specific_info(bs);
3975     }
3976     return NULL;
3977 }
3978 
3979 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3980                       int64_t pos, int size)
3981 {
3982     QEMUIOVector qiov;
3983     struct iovec iov = {
3984         .iov_base   = (void *) buf,
3985         .iov_len    = size,
3986     };
3987 
3988     qemu_iovec_init_external(&qiov, &iov, 1);
3989     return bdrv_writev_vmstate(bs, &qiov, pos);
3990 }
3991 
3992 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3993 {
3994     BlockDriver *drv = bs->drv;
3995 
3996     if (!drv) {
3997         return -ENOMEDIUM;
3998     } else if (drv->bdrv_save_vmstate) {
3999         return drv->bdrv_save_vmstate(bs, qiov, pos);
4000     } else if (bs->file) {
4001         return bdrv_writev_vmstate(bs->file, qiov, pos);
4002     }
4003 
4004     return -ENOTSUP;
4005 }
4006 
4007 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4008                       int64_t pos, int size)
4009 {
4010     BlockDriver *drv = bs->drv;
4011     if (!drv)
4012         return -ENOMEDIUM;
4013     if (drv->bdrv_load_vmstate)
4014         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4015     if (bs->file)
4016         return bdrv_load_vmstate(bs->file, buf, pos, size);
4017     return -ENOTSUP;
4018 }
4019 
4020 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4021 {
4022     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4023         return;
4024     }
4025 
4026     bs->drv->bdrv_debug_event(bs, event);
4027 }
4028 
4029 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4030                           const char *tag)
4031 {
4032     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4033         bs = bs->file;
4034     }
4035 
4036     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4037         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4038     }
4039 
4040     return -ENOTSUP;
4041 }
4042 
4043 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4044 {
4045     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4046         bs = bs->file;
4047     }
4048 
4049     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4050         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4051     }
4052 
4053     return -ENOTSUP;
4054 }
4055 
4056 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4057 {
4058     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4059         bs = bs->file;
4060     }
4061 
4062     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4063         return bs->drv->bdrv_debug_resume(bs, tag);
4064     }
4065 
4066     return -ENOTSUP;
4067 }
4068 
4069 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4070 {
4071     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4072         bs = bs->file;
4073     }
4074 
4075     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4076         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4077     }
4078 
4079     return false;
4080 }
4081 
4082 int bdrv_is_snapshot(BlockDriverState *bs)
4083 {
4084     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4085 }
4086 
4087 /* backing_file can be relative, absolute, or a protocol.  If it is
4088  * relative, it must be relative to the chain.  Passing in bs->filename
4089  * from a BDS as backing_file should therefore be avoided, as it may be
4090  * relative to the CWD rather than to the chain. */
4091 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4092         const char *backing_file)
4093 {
4094     char *filename_full = NULL;
4095     char *backing_file_full = NULL;
4096     char *filename_tmp = NULL;
4097     int is_protocol = 0;
4098     BlockDriverState *curr_bs = NULL;
4099     BlockDriverState *retval = NULL;
4100 
4101     if (!bs || !bs->drv || !backing_file) {
4102         return NULL;
4103     }
4104 
4105     filename_full     = g_malloc(PATH_MAX);
4106     backing_file_full = g_malloc(PATH_MAX);
4107     filename_tmp      = g_malloc(PATH_MAX);
4108 
4109     is_protocol = path_has_protocol(backing_file);
4110 
4111     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4112 
4113         /* If either of the filename paths is actually a protocol, then
4114          * compare unmodified paths; otherwise make paths relative */
4115         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4116             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4117                 retval = curr_bs->backing_hd;
4118                 break;
4119             }
4120         } else {
4121             /* If not an absolute filename path, make it relative to the current
4122              * image's filename path */
4123             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4124                          backing_file);
4125 
4126             /* We are going to compare absolute pathnames */
4127             if (!realpath(filename_tmp, filename_full)) {
4128                 continue;
4129             }
4130 
4131             /* We need to make sure the backing filename we are comparing against
4132              * is relative to the current image filename (or absolute) */
4133             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4134                          curr_bs->backing_file);
4135 
4136             if (!realpath(filename_tmp, backing_file_full)) {
4137                 continue;
4138             }
4139 
4140             if (strcmp(backing_file_full, filename_full) == 0) {
4141                 retval = curr_bs->backing_hd;
4142                 break;
4143             }
4144         }
4145     }
4146 
4147     g_free(filename_full);
4148     g_free(backing_file_full);
4149     g_free(filename_tmp);
4150     return retval;
4151 }
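
/* Illustrative sketch (not part of the block layer): per the warning above,
 * a string handed to bdrv_find_backing_image() should be a protocol, an
 * absolute path, or a path relative to the chain -- for example a
 * user-supplied backing file name from a QMP command -- never bs->filename
 * of some BDS, which may be relative to the CWD.  The helper name is made
 * up. */
static BlockDriverState *example_resolve_base(BlockDriverState *top,
                                              const char *user_backing_file)
{
    return bdrv_find_backing_image(top, user_backing_file);
}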
4152 
4153 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4154 {
4155     if (!bs->drv) {
4156         return 0;
4157     }
4158 
4159     if (!bs->backing_hd) {
4160         return 0;
4161     }
4162 
4163     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4164 }
4165 
4166 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4167 {
4168     BlockDriverState *curr_bs = NULL;
4169 
4170     if (!bs) {
4171         return NULL;
4172     }
4173 
4174     curr_bs = bs;
4175 
4176     while (curr_bs->backing_hd) {
4177         curr_bs = curr_bs->backing_hd;
4178     }
4179     return curr_bs;
4180 }
4181 
4182 /**************************************************************/
4183 /* async I/Os */
4184 
4185 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4186                                  QEMUIOVector *qiov, int nb_sectors,
4187                                  BlockDriverCompletionFunc *cb, void *opaque)
4188 {
4189     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4190 
4191     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4192                                  cb, opaque, false);
4193 }
4194 
4195 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4196                                   QEMUIOVector *qiov, int nb_sectors,
4197                                   BlockDriverCompletionFunc *cb, void *opaque)
4198 {
4199     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4200 
4201     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4202                                  cb, opaque, true);
4203 }
4204 
4205 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4206         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4207         BlockDriverCompletionFunc *cb, void *opaque)
4208 {
4209     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4210 
4211     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4212                                  BDRV_REQ_ZERO_WRITE | flags,
4213                                  cb, opaque, true);
4214 }
4215 
4216 
4217 typedef struct MultiwriteCB {
4218     int error;
4219     int num_requests;
4220     int num_callbacks;
4221     struct {
4222         BlockDriverCompletionFunc *cb;
4223         void *opaque;
4224         QEMUIOVector *free_qiov;
4225     } callbacks[];
4226 } MultiwriteCB;
4227 
4228 static void multiwrite_user_cb(MultiwriteCB *mcb)
4229 {
4230     int i;
4231 
4232     for (i = 0; i < mcb->num_callbacks; i++) {
4233         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4234         if (mcb->callbacks[i].free_qiov) {
4235             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4236         }
4237         g_free(mcb->callbacks[i].free_qiov);
4238     }
4239 }
4240 
4241 static void multiwrite_cb(void *opaque, int ret)
4242 {
4243     MultiwriteCB *mcb = opaque;
4244 
4245     trace_multiwrite_cb(mcb, ret);
4246 
4247     if (ret < 0 && !mcb->error) {
4248         mcb->error = ret;
4249     }
4250 
4251     mcb->num_requests--;
4252     if (mcb->num_requests == 0) {
4253         multiwrite_user_cb(mcb);
4254         g_free(mcb);
4255     }
4256 }
4257 
4258 static int multiwrite_req_compare(const void *a, const void *b)
4259 {
4260     const BlockRequest *req1 = a, *req2 = b;
4261 
4262     /*
4263      * Note that we can't simply subtract req2->sector from req1->sector
4264      * here as that could overflow the return value.
4265      */
4266     if (req1->sector > req2->sector) {
4267         return 1;
4268     } else if (req1->sector < req2->sector) {
4269         return -1;
4270     } else {
4271         return 0;
4272     }
4273 }
4274 
4275 /*
4276  * Takes a bunch of requests and tries to merge them. Returns the number of
4277  * requests that remain after merging.
4278  */
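/*
 * Worked example (illustrative): requests A = {sector 0, 8 sectors} and
 * B = {sector 6, 8 sectors} merge because B.sector (6) <= A's old end (8).
 * The merged qiov takes (6 - 0) << 9 bytes from A, dropping A's two
 * overlapping sectors, then all of B, yielding one request of 14 sectors
 * starting at sector 0.
 */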
4279 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4280     int num_reqs, MultiwriteCB *mcb)
4281 {
4282     int i, outidx;
4283 
4284     // Sort requests by start sector
4285     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4286 
4287     // Check if adjacent requests touch the same clusters. If so, combine them,
4288     // filling up gaps with zero sectors.
4289     outidx = 0;
4290     for (i = 1; i < num_reqs; i++) {
4291         int merge = 0;
4292         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4293 
4294         // Handle exactly sequential writes and overlapping writes.
4295         if (reqs[i].sector <= oldreq_last) {
4296             merge = 1;
4297         }
4298 
4299         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4300             merge = 0;
4301         }
4302 
4303         if (merge) {
4304             size_t size;
4305             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4306             qemu_iovec_init(qiov,
4307                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4308 
4309             // Add the first request to the merged one. If the requests are
4310             // overlapping, drop the last sectors of the first request.
4311             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4312             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4313 
4314             // We shouldn't need to add any zeros between the two requests
4315             assert(reqs[i].sector <= oldreq_last);
4316 
4317             // Add the second request
4318             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4319 
4320             reqs[outidx].nb_sectors = qiov->size >> 9;
4321             reqs[outidx].qiov = qiov;
4322 
4323             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4324         } else {
4325             outidx++;
4326             reqs[outidx].sector     = reqs[i].sector;
4327             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4328             reqs[outidx].qiov       = reqs[i].qiov;
4329         }
4330     }
4331 
4332     return outidx + 1;
4333 }
4334 
4335 /*
4336  * Submit multiple AIO write requests at once.
4337  *
4338  * On success, the function returns 0 and all requests in the reqs array have
4339  * been submitted.  On error, this function returns -1 and the individual
4340  * requests may or may not have been submitted yet.  In particular, the
4341  * callback will be called for some of the requests but not for others.  The
4342  * caller must check the error field of each BlockRequest to know which
4343  * callbacks to wait for (if error != 0, no callback will be called).
4344  *
4345  * The implementation may modify the contents of the reqs array, e.g. to merge
4346  * requests. However, the fields opaque and error are left unmodified as they
4347  * are used to signal failure for a single request to the caller.
4348  */
4349 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4350 {
4351     MultiwriteCB *mcb;
4352     int i;
4353 
4354     /* don't submit writes if we don't have a medium */
4355     if (bs->drv == NULL) {
4356         for (i = 0; i < num_reqs; i++) {
4357             reqs[i].error = -ENOMEDIUM;
4358         }
4359         return -1;
4360     }
4361 
4362     if (num_reqs == 0) {
4363         return 0;
4364     }
4365 
4366     // Create MultiwriteCB structure
4367     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4368     mcb->num_requests = 0;
4369     mcb->num_callbacks = num_reqs;
4370 
4371     for (i = 0; i < num_reqs; i++) {
4372         mcb->callbacks[i].cb = reqs[i].cb;
4373         mcb->callbacks[i].opaque = reqs[i].opaque;
4374     }
4375 
4376     // Check for mergeable requests
4377     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4378 
4379     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4380 
4381     /* Run the aio requests. */
4382     mcb->num_requests = num_reqs;
4383     for (i = 0; i < num_reqs; i++) {
4384         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4385                               reqs[i].nb_sectors, reqs[i].flags,
4386                               multiwrite_cb, mcb,
4387                               true);
4388     }
4389 
4390     return 0;
4391 }
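
/* Illustrative sketch (not part of the block layer): submitting two adjacent
 * writes in one bdrv_aio_multiwrite() call so that they may be merged.  The
 * qiovs must stay alive until the callbacks fire; the names are made up. */
static void example_submit_pair(BlockDriverState *bs,
                                QEMUIOVector *qiov1, QEMUIOVector *qiov2,
                                BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        {
            .sector     = 0,
            .nb_sectors = qiov1->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov1,
            .cb         = cb,
            .opaque     = opaque,
        }, {
            .sector     = qiov1->size >> BDRV_SECTOR_BITS, /* adjacent */
            .nb_sectors = qiov2->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov2,
            .cb         = cb,
            .opaque     = opaque,
        },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* only the requests with reqs[i].error == 0 will get a callback */
    }
}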
4392 
4393 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4394 {
4395     acb->aiocb_info->cancel(acb);
4396 }
4397 
4398 /**************************************************************/
4399 /* async block device emulation */
4400 
4401 typedef struct BlockDriverAIOCBSync {
4402     BlockDriverAIOCB common;
4403     QEMUBH *bh;
4404     int ret;
4405     /* vector translation state */
4406     QEMUIOVector *qiov;
4407     uint8_t *bounce;
4408     int is_write;
4409 } BlockDriverAIOCBSync;
4410 
4411 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4412 {
4413     BlockDriverAIOCBSync *acb =
4414         container_of(blockacb, BlockDriverAIOCBSync, common);
4415     qemu_bh_delete(acb->bh);
4416     acb->bh = NULL;
4417     qemu_aio_release(acb);
4418 }
4419 
4420 static const AIOCBInfo bdrv_em_aiocb_info = {
4421     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4422     .cancel             = bdrv_aio_cancel_em,
4423 };
4424 
4425 static void bdrv_aio_bh_cb(void *opaque)
4426 {
4427     BlockDriverAIOCBSync *acb = opaque;
4428 
4429     if (!acb->is_write)
4430         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4431     qemu_vfree(acb->bounce);
4432     acb->common.cb(acb->common.opaque, acb->ret);
4433     qemu_bh_delete(acb->bh);
4434     acb->bh = NULL;
4435     qemu_aio_release(acb);
4436 }
4437 
4438 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4439                                             int64_t sector_num,
4440                                             QEMUIOVector *qiov,
4441                                             int nb_sectors,
4442                                             BlockDriverCompletionFunc *cb,
4443                                             void *opaque,
4444                                             int is_write)
4446 {
4447     BlockDriverAIOCBSync *acb;
4448 
4449     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4450     acb->is_write = is_write;
4451     acb->qiov = qiov;
4452     acb->bounce = qemu_blockalign(bs, qiov->size);
4453     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4454 
4455     if (is_write) {
4456         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4457         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4458     } else {
4459         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4460     }
4461 
4462     qemu_bh_schedule(acb->bh);
4463 
4464     return &acb->common;
4465 }
4466 
4467 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4468         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4469         BlockDriverCompletionFunc *cb, void *opaque)
4470 {
4471     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4472 }
4473 
4474 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4475         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4476         BlockDriverCompletionFunc *cb, void *opaque)
4477 {
4478     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4479 }
4480 
4481 
4482 typedef struct BlockDriverAIOCBCoroutine {
4483     BlockDriverAIOCB common;
4484     BlockRequest req;
4485     bool is_write;
4486     bool *done;
4487     QEMUBH *bh;
4488 } BlockDriverAIOCBCoroutine;
4489 
4490 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4491 {
4492     BlockDriverAIOCBCoroutine *acb =
4493         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4494     bool done = false;
4495 
4496     acb->done = &done;
4497     while (!done) {
4498         qemu_aio_wait();
4499     }
4500 }
4501 
4502 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4503     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4504     .cancel             = bdrv_aio_co_cancel_em,
4505 };
4506 
4507 static void bdrv_co_em_bh(void *opaque)
4508 {
4509     BlockDriverAIOCBCoroutine *acb = opaque;
4510 
4511     acb->common.cb(acb->common.opaque, acb->req.error);
4512 
4513     if (acb->done) {
4514         *acb->done = true;
4515     }
4516 
4517     qemu_bh_delete(acb->bh);
4518     qemu_aio_release(acb);
4519 }
4520 
4521 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4522 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4523 {
4524     BlockDriverAIOCBCoroutine *acb = opaque;
4525     BlockDriverState *bs = acb->common.bs;
4526 
4527     if (!acb->is_write) {
4528         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4529             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4530     } else {
4531         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4532             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4533     }
4534 
4535     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4536     qemu_bh_schedule(acb->bh);
4537 }
4538 
4539 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4540                                                int64_t sector_num,
4541                                                QEMUIOVector *qiov,
4542                                                int nb_sectors,
4543                                                BdrvRequestFlags flags,
4544                                                BlockDriverCompletionFunc *cb,
4545                                                void *opaque,
4546                                                bool is_write)
4547 {
4548     Coroutine *co;
4549     BlockDriverAIOCBCoroutine *acb;
4550 
4551     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4552     acb->req.sector = sector_num;
4553     acb->req.nb_sectors = nb_sectors;
4554     acb->req.qiov = qiov;
4555     acb->req.flags = flags;
4556     acb->is_write = is_write;
4557     acb->done = NULL;
4558 
4559     co = qemu_coroutine_create(bdrv_co_do_rw);
4560     qemu_coroutine_enter(co, acb);
4561 
4562     return &acb->common;
4563 }
4564 
4565 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4566 {
4567     BlockDriverAIOCBCoroutine *acb = opaque;
4568     BlockDriverState *bs = acb->common.bs;
4569 
4570     acb->req.error = bdrv_co_flush(bs);
4571     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4572     qemu_bh_schedule(acb->bh);
4573 }
4574 
4575 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4576         BlockDriverCompletionFunc *cb, void *opaque)
4577 {
4578     trace_bdrv_aio_flush(bs, opaque);
4579 
4580     Coroutine *co;
4581     BlockDriverAIOCBCoroutine *acb;
4582 
4583     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4584     acb->done = NULL;
4585 
4586     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4587     qemu_coroutine_enter(co, acb);
4588 
4589     return &acb->common;
4590 }
4591 
4592 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4593 {
4594     BlockDriverAIOCBCoroutine *acb = opaque;
4595     BlockDriverState *bs = acb->common.bs;
4596 
4597     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4598     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4599     qemu_bh_schedule(acb->bh);
4600 }
4601 
4602 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4603         int64_t sector_num, int nb_sectors,
4604         BlockDriverCompletionFunc *cb, void *opaque)
4605 {
4606     Coroutine *co;
4607     BlockDriverAIOCBCoroutine *acb;
4608 
4609     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4610 
4611     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4612     acb->req.sector = sector_num;
4613     acb->req.nb_sectors = nb_sectors;
4614     acb->done = NULL;
4615     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4616     qemu_coroutine_enter(co, acb);
4617 
4618     return &acb->common;
4619 }
4620 
4621 void bdrv_init(void)
4622 {
4623     module_call_init(MODULE_INIT_BLOCK);
4624 }
4625 
4626 void bdrv_init_with_whitelist(void)
4627 {
4628     use_bdrv_whitelist = 1;
4629     bdrv_init();
4630 }
4631 
4632 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4633                    BlockDriverCompletionFunc *cb, void *opaque)
4634 {
4635     BlockDriverAIOCB *acb;
4636 
4637     acb = g_slice_alloc(aiocb_info->aiocb_size);
4638     acb->aiocb_info = aiocb_info;
4639     acb->bs = bs;
4640     acb->cb = cb;
4641     acb->opaque = opaque;
4642     return acb;
4643 }
4644 
4645 void qemu_aio_release(void *p)
4646 {
4647     BlockDriverAIOCB *acb = p;
4648     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4649 }
4650 
4651 /**************************************************************/
4652 /* Coroutine block device emulation */
4653 
4654 typedef struct CoroutineIOCompletion {
4655     Coroutine *coroutine;
4656     int ret;
4657 } CoroutineIOCompletion;
4658 
4659 static void bdrv_co_io_em_complete(void *opaque, int ret)
4660 {
4661     CoroutineIOCompletion *co = opaque;
4662 
4663     co->ret = ret;
4664     qemu_coroutine_enter(co->coroutine, NULL);
4665 }
4666 
4667 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4668                                       int nb_sectors, QEMUIOVector *iov,
4669                                       bool is_write)
4670 {
4671     CoroutineIOCompletion co = {
4672         .coroutine = qemu_coroutine_self(),
4673     };
4674     BlockDriverAIOCB *acb;
4675 
4676     if (is_write) {
4677         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4678                                        bdrv_co_io_em_complete, &co);
4679     } else {
4680         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4681                                       bdrv_co_io_em_complete, &co);
4682     }
4683 
4684     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4685     if (!acb) {
4686         return -EIO;
4687     }
4688     qemu_coroutine_yield();
4689 
4690     return co.ret;
4691 }
4692 
4693 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4694                                          int64_t sector_num, int nb_sectors,
4695                                          QEMUIOVector *iov)
4696 {
4697     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4698 }
4699 
4700 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4701                                          int64_t sector_num, int nb_sectors,
4702                                          QEMUIOVector *iov)
4703 {
4704     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4705 }
4706 
4707 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4708 {
4709     RwCo *rwco = opaque;
4710 
4711     rwco->ret = bdrv_co_flush(rwco->bs);
4712 }
4713 
4714 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4715 {
4716     int ret;
4717 
4718     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4719         return 0;
4720     }
4721 
4722     /* Write back cached data to the OS even with cache=unsafe */
4723     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4724     if (bs->drv->bdrv_co_flush_to_os) {
4725         ret = bs->drv->bdrv_co_flush_to_os(bs);
4726         if (ret < 0) {
4727             return ret;
4728         }
4729     }
4730 
4731     /* But don't actually force it to the disk with cache=unsafe */
4732     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4733         goto flush_parent;
4734     }
4735 
4736     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4737     if (bs->drv->bdrv_co_flush_to_disk) {
4738         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4739     } else if (bs->drv->bdrv_aio_flush) {
4740         BlockDriverAIOCB *acb;
4741         CoroutineIOCompletion co = {
4742             .coroutine = qemu_coroutine_self(),
4743         };
4744 
4745         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4746         if (acb == NULL) {
4747             ret = -EIO;
4748         } else {
4749             qemu_coroutine_yield();
4750             ret = co.ret;
4751         }
4752     } else {
4753         /*
4754          * Some block drivers always operate in either writethrough or unsafe
4755          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4756          * know how the server works (because the behaviour is hardcoded or
4757          * depends on server-side configuration), so we can't ensure that
4758          * everything is safe on disk. Returning an error doesn't work because
4759          * that would break guests even if the server operates in writethrough
4760          * mode.
4761          *
4762          * Let's hope the user knows what they're doing.
4763          */
4764         ret = 0;
4765     }
4766     if (ret < 0) {
4767         return ret;
4768     }
4769 
4770     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4771      * in the case of cache=unsafe, so there are no useless flushes.
4772      */
4773 flush_parent:
4774     return bdrv_co_flush(bs->file);
4775 }
4776 
4777 void bdrv_invalidate_cache(BlockDriverState *bs)
4778 {
4779     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4780         bs->drv->bdrv_invalidate_cache(bs);
4781     }
4782 }
4783 
4784 void bdrv_invalidate_cache_all(void)
4785 {
4786     BlockDriverState *bs;
4787 
4788     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4789         bdrv_invalidate_cache(bs);
4790     }
4791 }
4792 
4793 void bdrv_clear_incoming_migration_all(void)
4794 {
4795     BlockDriverState *bs;
4796 
4797     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4798         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4799     }
4800 }
4801 
4802 int bdrv_flush(BlockDriverState *bs)
4803 {
4804     Coroutine *co;
4805     RwCo rwco = {
4806         .bs = bs,
4807         .ret = NOT_DONE,
4808     };
4809 
4810     if (qemu_in_coroutine()) {
4811         /* Fast-path if already in coroutine context */
4812         bdrv_flush_co_entry(&rwco);
4813     } else {
4814         co = qemu_coroutine_create(bdrv_flush_co_entry);
4815         qemu_coroutine_enter(co, &rwco);
4816         while (rwco.ret == NOT_DONE) {
4817             qemu_aio_wait();
4818         }
4819     }
4820 
4821     return rwco.ret;
4822 }
4823 
4824 typedef struct DiscardCo {
4825     BlockDriverState *bs;
4826     int64_t sector_num;
4827     int nb_sectors;
4828     int ret;
4829 } DiscardCo;
4830 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4831 {
4832     DiscardCo *rwco = opaque;
4833 
4834     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4835 }
4836 
4837 /* If no limit is specified in the BlockLimits, use a default
4838  * of 32768 512-byte sectors (16 MiB) per request.
4839  */
4840 #define MAX_DISCARD_DEFAULT 32768
4841 
4842 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4843                                  int nb_sectors)
4844 {
4845     int max_discard;
4846 
4847     if (!bs->drv) {
4848         return -ENOMEDIUM;
4849     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4850         return -EIO;
4851     } else if (bs->read_only) {
4852         return -EROFS;
4853     }
4854 
4855     bdrv_reset_dirty(bs, sector_num, nb_sectors);
4856 
4857     /* Do nothing if disabled.  */
4858     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4859         return 0;
4860     }
4861 
4862     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4863         return 0;
4864     }
4865 
4866     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4867     while (nb_sectors > 0) {
4868         int ret;
4869         int num = nb_sectors;
4870 
4871         /* align request: shorten the first chunk so the next starts aligned */
4872         if (bs->bl.discard_alignment &&
4873             num >= bs->bl.discard_alignment &&
4874             sector_num % bs->bl.discard_alignment) {
4875             if (num > bs->bl.discard_alignment) {
4876                 num = bs->bl.discard_alignment;
4877             }
4878             num -= sector_num % bs->bl.discard_alignment;
4879         }
4880 
4881         /* limit request size */
4882         if (num > max_discard) {
4883             num = max_discard;
4884         }
4885 
4886         if (bs->drv->bdrv_co_discard) {
4887             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4888         } else {
4889             BlockDriverAIOCB *acb;
4890             CoroutineIOCompletion co = {
4891                 .coroutine = qemu_coroutine_self(),
4892             };
4893 
4894             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4895                                             bdrv_co_io_em_complete, &co);
4896             if (acb == NULL) {
4897                 return -EIO;
4898             } else {
4899                 qemu_coroutine_yield();
4900                 ret = co.ret;
4901             }
4902         }
4903         if (ret && ret != -ENOTSUP) {
4904             return ret;
4905         }
4906 
4907         sector_num += num;
4908         nb_sectors -= num;
4909     }
4910     return 0;
4911 }
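
/*
 * Worked example of the chunking above (illustrative): with
 * bs->bl.discard_alignment = 8, a request for sector_num = 5 and
 * nb_sectors = 20 is issued as two chunks: first 3 sectors (5..7),
 * shortened so that the next chunk starts on an 8-sector boundary,
 * then the remaining 17 sectors (8..24).
 */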
4912 
4913 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4914 {
4915     Coroutine *co;
4916     DiscardCo rwco = {
4917         .bs = bs,
4918         .sector_num = sector_num,
4919         .nb_sectors = nb_sectors,
4920         .ret = NOT_DONE,
4921     };
4922 
4923     if (qemu_in_coroutine()) {
4924         /* Fast-path if already in coroutine context */
4925         bdrv_discard_co_entry(&rwco);
4926     } else {
4927         co = qemu_coroutine_create(bdrv_discard_co_entry);
4928         qemu_coroutine_enter(co, &rwco);
4929         while (rwco.ret == NOT_DONE) {
4930             qemu_aio_wait();
4931         }
4932     }
4933 
4934     return rwco.ret;
4935 }
4936 
4937 /**************************************************************/
4938 /* removable device support */
4939 
4940 /**
4941  * Return TRUE if the media is present
4942  */
4943 int bdrv_is_inserted(BlockDriverState *bs)
4944 {
4945     BlockDriver *drv = bs->drv;
4946 
4947     if (!drv)
4948         return 0;
4949     if (!drv->bdrv_is_inserted)
4950         return 1;
4951     return drv->bdrv_is_inserted(bs);
4952 }
4953 
4954 /**
4955  * Return whether the media changed since the last call to this
4956  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4957  */
4958 int bdrv_media_changed(BlockDriverState *bs)
4959 {
4960     BlockDriver *drv = bs->drv;
4961 
4962     if (drv && drv->bdrv_media_changed) {
4963         return drv->bdrv_media_changed(bs);
4964     }
4965     return -ENOTSUP;
4966 }
4967 
4968 /**
4969  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4970  */
4971 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4972 {
4973     BlockDriver *drv = bs->drv;
4974 
4975     if (drv && drv->bdrv_eject) {
4976         drv->bdrv_eject(bs, eject_flag);
4977     }
4978 
4979     if (bs->device_name[0] != '\0') {
4980         bdrv_emit_qmp_eject_event(bs, eject_flag);
4981     }
4982 }
4983 
4984 /**
4985  * Lock or unlock the media (if it is locked, the user won't be able
4986  * to eject it manually).
4987  */
4988 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4989 {
4990     BlockDriver *drv = bs->drv;
4991 
4992     trace_bdrv_lock_medium(bs, locked);
4993 
4994     if (drv && drv->bdrv_lock_medium) {
4995         drv->bdrv_lock_medium(bs, locked);
4996     }
4997 }
4998 
4999 /* needed for generic scsi interface */
5000 
5001 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5002 {
5003     BlockDriver *drv = bs->drv;
5004 
5005     if (drv && drv->bdrv_ioctl)
5006         return drv->bdrv_ioctl(bs, req, buf);
5007     return -ENOTSUP;
5008 }
5009 
5010 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5011         unsigned long int req, void *buf,
5012         BlockDriverCompletionFunc *cb, void *opaque)
5013 {
5014     BlockDriver *drv = bs->drv;
5015 
5016     if (drv && drv->bdrv_aio_ioctl)
5017         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5018     return NULL;
5019 }
5020 
5021 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5022 {
5023     bs->guest_block_size = align;
5024 }
5025 
5026 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5027 {
5028     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5029 }
5030 
5031 /*
5032  * Check if all memory in this vector meets the driver's memory alignment.
5033  */
5034 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5035 {
5036     int i;
5037     size_t alignment = bdrv_opt_mem_align(bs);
5038 
5039     for (i = 0; i < qiov->niov; i++) {
5040         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5041             return false;
5042         }
5043         if (qiov->iov[i].iov_len % alignment) {
5044             return false;
5045         }
5046     }
5047 
5048     return true;
5049 }
5050 
5051 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
5052 {
5053     int64_t bitmap_size;
5054     BdrvDirtyBitmap *bitmap;
5055 
5056     assert((granularity & (granularity - 1)) == 0);
5057 
5058     granularity >>= BDRV_SECTOR_BITS;
5059     assert(granularity);
5060     bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
5061     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5062     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5063     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5064     return bitmap;
5065 }
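
/*
 * Worked example for the conversion above (illustrative): granularity =
 * 65536 bytes becomes 65536 >> BDRV_SECTOR_BITS = 128 sectors, and
 * ffs(128) - 1 = 7, i.e. an HBitmap where each bit covers 2^7 = 128
 * sectors (64 KiB) of the image.
 */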
5066 
5067 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5068 {
5069     BdrvDirtyBitmap *bm, *next;
5070     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5071         if (bm == bitmap) {
5072             QLIST_REMOVE(bitmap, list);
5073             hbitmap_free(bitmap->bitmap);
5074             g_free(bitmap);
5075             return;
5076         }
5077     }
5078 }
5079 
5080 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5081 {
5082     BdrvDirtyBitmap *bm;
5083     BlockDirtyInfoList *list = NULL;
5084     BlockDirtyInfoList **plist = &list;
5085 
5086     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5087         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5088         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5089         info->count = bdrv_get_dirty_count(bs, bm);
5090         info->granularity =
5091             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5092         entry->value = info;
5093         *plist = entry;
5094         plist = &entry->next;
5095     }
5096 
5097     return list;
5098 }
5099 
5100 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5101 {
5102     if (bitmap) {
5103         return hbitmap_get(bitmap->bitmap, sector);
5104     } else {
5105         return 0;
5106     }
5107 }
5108 
5109 void bdrv_dirty_iter_init(BlockDriverState *bs,
5110                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5111 {
5112     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5113 }
5114 
5115 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5116                     int nr_sectors)
5117 {
5118     BdrvDirtyBitmap *bitmap;
5119     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5120         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5121     }
5122 }
5123 
5124 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5125 {
5126     BdrvDirtyBitmap *bitmap;
5127     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5128         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5129     }
5130 }
5131 
5132 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5133 {
5134     return hbitmap_count(bitmap->bitmap);
5135 }
5136 
5137 /* Get a reference to bs */
5138 void bdrv_ref(BlockDriverState *bs)
5139 {
5140     bs->refcnt++;
5141 }
5142 
5143 /* Release a previously grabbed reference to bs.
5144  * If the reference count drops to zero, the BlockDriverState is
5145  * deleted. */
5146 void bdrv_unref(BlockDriverState *bs)
5147 {
5148     assert(bs->refcnt > 0);
5149     if (--bs->refcnt == 0) {
5150         bdrv_delete(bs);
5151     }
5152 }
5153 
5154 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5155 {
5156     assert(bs->in_use != in_use);
5157     bs->in_use = in_use;
5158 }
5159 
5160 int bdrv_in_use(BlockDriverState *bs)
5161 {
5162     return bs->in_use;
5163 }
5164 
5165 void bdrv_iostatus_enable(BlockDriverState *bs)
5166 {
5167     bs->iostatus_enabled = true;
5168     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5169 }
5170 
5171 /* The I/O status is only enabled if the drive explicitly
5172  * enables it _and_ the VM is configured to stop on errors */
5173 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5174 {
5175     return (bs->iostatus_enabled &&
5176            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5177             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5178             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5179 }
5180 
5181 void bdrv_iostatus_disable(BlockDriverState *bs)
5182 {
5183     bs->iostatus_enabled = false;
5184 }
5185 
5186 void bdrv_iostatus_reset(BlockDriverState *bs)
5187 {
5188     if (bdrv_iostatus_is_enabled(bs)) {
5189         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5190         if (bs->job) {
5191             block_job_iostatus_reset(bs->job);
5192         }
5193     }
5194 }
5195 
5196 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5197 {
5198     assert(bdrv_iostatus_is_enabled(bs));
5199     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5200         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5201                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5202     }
5203 }
5204 
5205 void
5206 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5207         enum BlockAcctType type)
5208 {
5209     assert(type < BDRV_MAX_IOTYPE);
5210 
5211     cookie->bytes = bytes;
5212     cookie->start_time_ns = get_clock();
5213     cookie->type = type;
5214 }
5215 
5216 void
5217 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5218 {
5219     assert(cookie->type < BDRV_MAX_IOTYPE);
5220 
5221     bs->nr_bytes[cookie->type] += cookie->bytes;
5222     bs->nr_ops[cookie->type]++;
5223     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5224 }
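
/*
 * Editor's sketch (not part of the original file; the helper is
 * hypothetical): a cookie brackets exactly one I/O operation.  Device
 * models call bdrv_acct_start() before issuing the request and
 * bdrv_acct_done() on completion, which feeds the nr_bytes/nr_ops/
 * total_time_ns statistics above.
 */
#if 0
static int example_accounted_read(BlockDriverState *bs, uint8_t *buf,
                                  int64_t sector_num, int nb_sectors)
{
    BlockAcctCookie cookie;
    int ret;

    bdrv_acct_start(bs, &cookie, (int64_t)nb_sectors * BDRV_SECTOR_SIZE,
                    BDRV_ACCT_READ);
    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
    bdrv_acct_done(bs, &cookie);
    return ret;
}
#endif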
5225 
5226 void bdrv_img_create(const char *filename, const char *fmt,
5227                      const char *base_filename, const char *base_fmt,
5228                      char *options, uint64_t img_size, int flags,
5229                      Error **errp, bool quiet)
5230 {
5231     QEMUOptionParameter *param = NULL, *create_options = NULL;
5232     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5233     BlockDriver *drv, *proto_drv;
5234     BlockDriver *backing_drv = NULL;
5235     Error *local_err = NULL;
5236     int ret = 0;
5237 
5238     /* Find driver and parse its options */
5239     drv = bdrv_find_format(fmt);
5240     if (!drv) {
5241         error_setg(errp, "Unknown file format '%s'", fmt);
5242         return;
5243     }
5244 
5245     proto_drv = bdrv_find_protocol(filename, true);
5246     if (!proto_drv) {
5247         error_setg(errp, "Unknown protocol '%s'", filename);
5248         return;
5249     }
5250 
5251     create_options = append_option_parameters(create_options,
5252                                               drv->create_options);
5253     create_options = append_option_parameters(create_options,
5254                                               proto_drv->create_options);
5255 
5256     /* Create parameter list with default values */
5257     param = parse_option_parameters("", create_options, param);
5258 
5259     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5260 
5261     /* Parse -o options */
5262     if (options) {
5263         param = parse_option_parameters(options, create_options, param);
5264         if (param == NULL) {
5265             error_setg(errp, "Invalid options for file format '%s'", fmt);
5266             goto out;
5267         }
5268     }
5269 
5270     if (base_filename) {
5271         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5272                                  base_filename)) {
5273             error_setg(errp, "Backing file not supported for file format '%s'",
5274                        fmt);
5275             goto out;
5276         }
5277     }
5278 
5279     if (base_fmt) {
5280         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5281             error_setg(errp, "Backing file format not supported for file "
5282                              "format '%s'", fmt);
5283             goto out;
5284         }
5285     }
5286 
5287     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5288     if (backing_file && backing_file->value.s) {
5289         if (!strcmp(filename, backing_file->value.s)) {
5290             error_setg(errp, "Trying to create an image with the "
5291                              "same filename as the backing file");
5292             goto out;
5293         }
5294     }
5295 
5296     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5297     if (backing_fmt && backing_fmt->value.s) {
5298         backing_drv = bdrv_find_format(backing_fmt->value.s);
5299         if (!backing_drv) {
5300             error_setg(errp, "Unknown backing file format '%s'",
5301                        backing_fmt->value.s);
5302             goto out;
5303         }
5304     }
5305 
5306     /* The size for the image must always be specified, with one exception:
5307      * if we are using a backing file, we can obtain the size from there. */
5308     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5309     if (size && size->value.n == -1) {
5310         if (backing_file && backing_file->value.s) {
5311             BlockDriverState *bs;
5312             uint64_t backing_size; /* avoid shadowing 'size' above */
5313             char buf[32];
5314             int back_flags;
5315 
5316             /* backing files always opened read-only */
5317             back_flags =
5318                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5319 
5320             bs = NULL;
5321             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5322                             backing_drv, &local_err);
5323             if (ret < 0) {
5324                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5325                                  backing_file->value.s,
5326                                  error_get_pretty(local_err));
5327                 error_free(local_err);
5328                 local_err = NULL;
5329                 goto out;
5330             }
5331             bdrv_get_geometry(bs, &backing_size);
5332             backing_size *= BDRV_SECTOR_SIZE;
5333 
5334             snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
5335             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5336 
5337             bdrv_unref(bs);
5338         } else {
5339             error_setg(errp, "Image creation needs a size parameter");
5340             goto out;
5341         }
5342     }
5343 
5344     if (!quiet) {
5345         printf("Formatting '%s', fmt=%s ", filename, fmt);
5346         print_option_parameters(param);
5347         puts("");
5348     }
5349     ret = bdrv_create(drv, filename, param, &local_err);
5350     if (ret == -EFBIG) {
5351         /* This is generally a better message than whatever the driver would
5352          * deliver (especially because of the cluster_size_hint), since that
5353          * is most probably not much different from "image too large". */
5354         const char *cluster_size_hint = "";
5355         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5356             cluster_size_hint = " (try using a larger cluster size)";
5357         }
5358         error_setg(errp, "The image size is too large for file format '%s'"
5359                    "%s", fmt, cluster_size_hint);
5360         error_free(local_err);
5361         local_err = NULL;
5362     }
5363 
5364 out:
5365     free_option_parameters(create_options);
5366     free_option_parameters(param);
5367 
5368     if (local_err) {
5369         error_propagate(errp, local_err);
5370     }
5371 }
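
/*
 * Editor's sketch (not part of the original file; filenames and option
 * string are made up): roughly what
 *   qemu-img create -f qcow2 -o backing_file=base.qcow2 overlay.qcow2
 * boils down to.  The size is left at (uint64_t)-1, so it is taken from
 * the backing file as implemented above.
 */
#if 0
static void example_create_overlay(Error **errp)
{
    char opts[] = "backing_file=base.qcow2";

    bdrv_img_create("overlay.qcow2", "qcow2", NULL, NULL, opts,
                    (uint64_t)-1, 0, errp, false);
}
#endif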
5372 
5373 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5374 {
5375     /* Currently BlockDriverState always uses the main loop AioContext */
5376     return qemu_get_aio_context();
5377 }
5378 
5379 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5380                                     NotifierWithReturn *notifier)
5381 {
5382     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5383 }
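
/*
 * Editor's sketch (not part of the original file; callback and names are
 * hypothetical): before-write notifiers fire in the write path before
 * guest data reaches the image, which is how block/backup.c implements
 * copy-before-write.  Returning a negative error from the callback fails
 * the request.
 */
#if 0
static int example_before_write(NotifierWithReturn *notifier, void *opaque)
{
    /* opaque is the BdrvTrackedRequest that is about to be written */
    return 0;
}

static void example_install_notifier(BlockDriverState *bs,
                                     NotifierWithReturn *n)
{
    n->notify = example_before_write;
    bdrv_add_before_write_notifier(bs, n);
}
#endif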
5384 
5385 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5386 {
5387     if (bs->drv->bdrv_amend_options == NULL) {
5388         return -ENOTSUP;
5389     }
5390     return bs->drv->bdrv_amend_options(bs, options);
5391 }
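
/*
 * Editor's sketch (not part of the original file; "compat=1.1" is a
 * qcow2-specific option used purely as an example): "qemu-img amend -o"
 * funnels into bdrv_amend_options(), and drivers without a
 * .bdrv_amend_options callback report -ENOTSUP as above.
 */
#if 0
static int example_amend(BlockDriverState *bs)
{
    QEMUOptionParameter *opts =
        parse_option_parameters("compat=1.1", bs->drv->create_options, NULL);
    int ret = bdrv_amend_options(bs, opts);

    free_option_parameters(opts);
    return ret;
}
#endif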
5392 
5393 /* Used to recurse on single child block filters.
5394  * A single child block filter stores its child in bs->file.
5395  */
5396 bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5397                                       BlockDriverState *candidate)
5398 {
5399     if (!bs->drv) {
5400         return false;
5401     }
5402 
5403     if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5404         /*
5405          * bs is a data-bearing (non-filter) node: the candidate matches
5406          * only if it is this very node.
5407          */
5408         return bs == candidate;
5409     }
5410 
5411     if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5412         return false;
5413     }
5414 
5415     if (!bs->file) {
5416         return false;
5417     }
5418 
5419     return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5420 }
5421 
5422 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5423                                       BlockDriverState *candidate)
5424 {
5425     if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5426         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5427     }
5428 
5429     return bdrv_generic_is_first_non_filter(bs, candidate);
5430 }
5431 
5432 /* This function checks if the candidate is the first non-filter bs down its
5433  * bs chain. Since we don't have pointers to parents, it explores all bs
5434  * chains from the top. Some filters can choose not to pass down the recursion.
5435  */
5436 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5437 {
5438     BlockDriverState *bs;
5439 
5440     /* walk down the bs forest recursively */
5441     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5442         bool perm;
5443 
5444         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5445 
5446         /* candidate is the first non filter */
5447         if (perm) {
5448             return true;
5449         }
5450     }
5451 
5452     return false;
5453 }
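
/*
 * Editor's sketch (not part of the original file; "blkexample" is
 * hypothetical): what a pass-through filter driver would declare so that
 * the recursion above looks through it into bs->file instead of treating
 * it as a data-bearing node.
 */
#if 0
static BlockDriver bdrv_example_filter = {
    .format_name    = "blkexample",
    .authorizations = { [BS_IS_A_FILTER]      = true,
                        [BS_FILTER_PASS_DOWN] = true },
};
#endif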
5454