xref: /openbmc/qemu/block.c (revision 56d1b4d21d444619302d3f1291a133b1c2b9b072)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor/monitor.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "qemu/module.h"
31 #include "qapi/qmp/qjson.h"
32 #include "sysemu/sysemu.h"
33 #include "qemu/notify.h"
34 #include "block/coroutine.h"
35 #include "qmp-commands.h"
36 #include "qemu/timer.h"
37 
38 #ifdef CONFIG_BSD
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <sys/ioctl.h>
42 #include <sys/queue.h>
43 #ifndef __DragonFly__
44 #include <sys/disk.h>
45 #endif
46 #endif
47 
48 #ifdef _WIN32
49 #include <windows.h>
50 #endif
51 
52 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
53 
54 typedef enum {
55     BDRV_REQ_COPY_ON_READ = 0x1,
56     BDRV_REQ_ZERO_WRITE   = 0x2,
57 } BdrvRequestFlags;
58 
59 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
60 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62         BlockDriverCompletionFunc *cb, void *opaque);
63 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
65         BlockDriverCompletionFunc *cb, void *opaque);
66 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67                                          int64_t sector_num, int nb_sectors,
68                                          QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70                                          int64_t sector_num, int nb_sectors,
71                                          QEMUIOVector *iov);
72 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
73     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74     BdrvRequestFlags flags);
75 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
76     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77     BdrvRequestFlags flags);
78 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79                                                int64_t sector_num,
80                                                QEMUIOVector *qiov,
81                                                int nb_sectors,
82                                                BlockDriverCompletionFunc *cb,
83                                                void *opaque,
84                                                bool is_write);
85 static void coroutine_fn bdrv_co_do_rw(void *opaque);
86 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
87     int64_t sector_num, int nb_sectors);
88 
89 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
90         bool is_write, double elapsed_time, uint64_t *wait);
91 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
92         double elapsed_time, uint64_t *wait);
93 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
94         bool is_write, int64_t *wait);
95 
96 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
97     QTAILQ_HEAD_INITIALIZER(bdrv_states);
98 
99 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
100     QLIST_HEAD_INITIALIZER(bdrv_drivers);
101 
102 /* The device to use for VM snapshots */
103 static BlockDriverState *bs_snapshots;
104 
105 /* If non-zero, use only whitelisted block drivers */
106 static int use_bdrv_whitelist;
107 
108 #ifdef _WIN32
109 static int is_windows_drive_prefix(const char *filename)
110 {
111     return (((filename[0] >= 'a' && filename[0] <= 'z') ||
112              (filename[0] >= 'A' && filename[0] <= 'Z')) &&
113             filename[1] == ':');
114 }
115 
116 int is_windows_drive(const char *filename)
117 {
118     if (is_windows_drive_prefix(filename) &&
119         filename[2] == '\0')
120         return 1;
121     if (strstart(filename, "\\\\.\\", NULL) ||
122         strstart(filename, "//./", NULL))
123         return 1;
124     return 0;
125 }
126 #endif
127 
128 /* throttling disk I/O limits */
129 void bdrv_io_limits_disable(BlockDriverState *bs)
130 {
131     bs->io_limits_enabled = false;
132 
133     while (qemu_co_queue_next(&bs->throttled_reqs));
134 
135     if (bs->block_timer) {
136         qemu_del_timer(bs->block_timer);
137         qemu_free_timer(bs->block_timer);
138         bs->block_timer = NULL;
139     }
140 
141     bs->slice_start = 0;
142     bs->slice_end   = 0;
143 }
144 
145 static void bdrv_block_timer(void *opaque)
146 {
147     BlockDriverState *bs = opaque;
148 
149     qemu_co_queue_next(&bs->throttled_reqs);
150 }
151 
152 void bdrv_io_limits_enable(BlockDriverState *bs)
153 {
154     qemu_co_queue_init(&bs->throttled_reqs);
155     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
156     bs->io_limits_enabled = true;
157 }
158 
159 bool bdrv_io_limits_enabled(BlockDriverState *bs)
160 {
161     BlockIOLimit *io_limits = &bs->io_limits;
162     return io_limits->bps[BLOCK_IO_LIMIT_READ]
163          || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
164          || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
165          || io_limits->iops[BLOCK_IO_LIMIT_READ]
166          || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
167          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
168 }
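
/* Editor's sketch (kept under #if 0, not compiled): how a caller might cap a
 * drive at 1 MiB/s aggregate bandwidth and 100 IOPS before enabling
 * throttling.  The BlockIOLimit fields are the ones tested above; the
 * function name and the limit values are hypothetical. */
#if 0
static void example_set_io_limits(BlockDriverState *bs)
{
    bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 1 * 1024 * 1024; /* bytes/s */
    bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = 100;             /* ops/s  */

    if (bdrv_io_limits_enabled(bs)) {   /* any non-zero limit configured? */
        bdrv_io_limits_enable(bs);      /* set up timer and request queue */
    }
}
#endif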
169 
170 static void bdrv_io_limits_intercept(BlockDriverState *bs,
171                                      bool is_write, int nb_sectors)
172 {
173     int64_t wait_time = -1;
174 
175     if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
176         qemu_co_queue_wait(&bs->throttled_reqs);
177     }
178 
179     /* We aim to keep requests in FIFO order: the next throttled request is
180      * not dequeued until the current request has been allowed to proceed.
181      * So if the current request still exceeds the limits, it is re-inserted
182      * at the head of the queue, and all requests behind it remain in the
183      * throttled_reqs queue.
184      */
185 
186     while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
187         qemu_mod_timer(bs->block_timer,
188                        wait_time + qemu_get_clock_ns(vm_clock));
189         qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
190     }
191 
192     qemu_co_queue_next(&bs->throttled_reqs);
193 }
194 
195 /* check if the path starts with "<protocol>:" */
196 static int path_has_protocol(const char *path)
197 {
198     const char *p;
199 
200 #ifdef _WIN32
201     if (is_windows_drive(path) ||
202         is_windows_drive_prefix(path)) {
203         return 0;
204     }
205     p = path + strcspn(path, ":/\\");
206 #else
207     p = path + strcspn(path, ":/");
208 #endif
209 
210     return *p == ':';
211 }
212 
213 int path_is_absolute(const char *path)
214 {
215 #ifdef _WIN32
216     /* specific case for names like: "\\.\d:" */
217     if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
218         return 1;
219     }
220     return (*path == '/' || *path == '\\');
221 #else
222     return (*path == '/');
223 #endif
224 }
225 
226 /* if filename is absolute, just copy it to dest. Otherwise, build a
227    path to it by treating it as relative to base_path. URLs are
228    supported. */
229 void path_combine(char *dest, int dest_size,
230                   const char *base_path,
231                   const char *filename)
232 {
233     const char *p, *p1;
234     int len;
235 
236     if (dest_size <= 0)
237         return;
238     if (path_is_absolute(filename)) {
239         pstrcpy(dest, dest_size, filename);
240     } else {
241         p = strchr(base_path, ':');
242         if (p)
243             p++;
244         else
245             p = base_path;
246         p1 = strrchr(base_path, '/');
247 #ifdef _WIN32
248         {
249             const char *p2;
250             p2 = strrchr(base_path, '\\');
251             if (!p1 || p2 > p1)
252                 p1 = p2;
253         }
254 #endif
255         if (p1)
256             p1++;
257         else
258             p1 = base_path;
259         if (p1 > p)
260             p = p1;
261         len = p - base_path;
262         if (len > dest_size - 1)
263             len = dest_size - 1;
264         memcpy(dest, base_path, len);
265         dest[len] = '\0';
266         pstrcat(dest, dest_size, filename);
267     }
268 }
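
/* Editor's illustration (hypothetical paths, not compiled): path_combine()
 * resolves a backing-file name relative to the image that references it,
 * which is how bdrv_get_full_backing_filename() below uses it. */
#if 0
static void example_path_combine(void)
{
    char dest[PATH_MAX];

    /* relative name: yields "/images/snap.qcow2" */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "snap.qcow2");

    /* absolute name: copied through unchanged */
    path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs/other.raw");
}
#endif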
269 
270 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
271 {
272     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
273         pstrcpy(dest, sz, bs->backing_file);
274     } else {
275         path_combine(dest, sz, bs->filename, bs->backing_file);
276     }
277 }
278 
279 void bdrv_register(BlockDriver *bdrv)
280 {
281     /* Block drivers without coroutine functions need emulation */
282     if (!bdrv->bdrv_co_readv) {
283         bdrv->bdrv_co_readv = bdrv_co_readv_em;
284         bdrv->bdrv_co_writev = bdrv_co_writev_em;
285 
286         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
287          * the block driver lacks aio we need to emulate that too.
288          */
289         if (!bdrv->bdrv_aio_readv) {
290             /* add AIO emulation layer */
291             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
292             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
293         }
294     }
295 
296     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
297 }
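
/* Editor's sketch (hypothetical driver, not compiled): the usual registration
 * pattern.  A driver defines a BlockDriver and registers it from a
 * block_init() constructor; bdrv_register() above then fills in any missing
 * coroutine/AIO emulation callbacks. */
#if 0
static BlockDriver bdrv_example = {
    .format_name   = "example",
    .instance_size = 0,
};

static void bdrv_example_init(void)
{
    bdrv_register(&bdrv_example);
}

block_init(bdrv_example_init);
#endif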
298 
299 /* create a new block device (by default it is empty) */
300 BlockDriverState *bdrv_new(const char *device_name)
301 {
302     BlockDriverState *bs;
303 
304     bs = g_malloc0(sizeof(BlockDriverState));
305     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
306     if (device_name[0] != '\0') {
307         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
308     }
309     bdrv_iostatus_disable(bs);
310     notifier_list_init(&bs->close_notifiers);
311 
312     return bs;
313 }
314 
315 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
316 {
317     notifier_list_add(&bs->close_notifiers, notify);
318 }
319 
320 BlockDriver *bdrv_find_format(const char *format_name)
321 {
322     BlockDriver *drv1;
323     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
324         if (!strcmp(drv1->format_name, format_name)) {
325             return drv1;
326         }
327     }
328     return NULL;
329 }
330 
331 static int bdrv_is_whitelisted(BlockDriver *drv)
332 {
333     static const char *whitelist[] = {
334         CONFIG_BDRV_WHITELIST
335     };
336     const char **p;
337 
338     if (!whitelist[0])
339         return 1;               /* no whitelist, anything goes */
340 
341     for (p = whitelist; *p; p++) {
342         if (!strcmp(drv->format_name, *p)) {
343             return 1;
344         }
345     }
346     return 0;
347 }
348 
349 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
350 {
351     BlockDriver *drv = bdrv_find_format(format_name);
352     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
353 }
354 
355 typedef struct CreateCo {
356     BlockDriver *drv;
357     char *filename;
358     QEMUOptionParameter *options;
359     int ret;
360 } CreateCo;
361 
362 static void coroutine_fn bdrv_create_co_entry(void *opaque)
363 {
364     CreateCo *cco = opaque;
365     assert(cco->drv);
366 
367     cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
368 }
369 
370 int bdrv_create(BlockDriver *drv, const char* filename,
371     QEMUOptionParameter *options)
372 {
373     int ret;
374 
375     Coroutine *co;
376     CreateCo cco = {
377         .drv = drv,
378         .filename = g_strdup(filename),
379         .options = options,
380         .ret = NOT_DONE,
381     };
382 
383     if (!drv->bdrv_create) {
384         ret = -ENOTSUP;
385         goto out;
386     }
387 
388     if (qemu_in_coroutine()) {
389         /* Fast-path if already in coroutine context */
390         bdrv_create_co_entry(&cco);
391     } else {
392         co = qemu_coroutine_create(bdrv_create_co_entry);
393         qemu_coroutine_enter(co, &cco);
394         while (cco.ret == NOT_DONE) {
395             qemu_aio_wait();
396         }
397     }
398 
399     ret = cco.ret;
400 
401 out:
402     g_free(cco.filename);
403     return ret;
404 }
405 
406 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
407 {
408     BlockDriver *drv;
409 
410     drv = bdrv_find_protocol(filename);
411     if (drv == NULL) {
412         return -ENOENT;
413     }
414 
415     return bdrv_create(drv, filename, options);
416 }
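
/* Editor's sketch (not compiled): creating a 1 GiB qcow2 image
 * programmatically.  This mirrors what the snapshot=on path in bdrv_open()
 * does below; the function name and size are hypothetical. */
#if 0
static int example_create_qcow2(const char *filename)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *opts;
    int ret;

    opts = parse_option_parameters("", drv->create_options, NULL);
    set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);

    ret = bdrv_create(drv, filename, opts);
    free_option_parameters(opts);
    return ret; /* 0 on success, negative errno on failure */
}
#endif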
417 
418 /*
419  * Create a uniquely-named empty temporary file.
420  * Return 0 upon success, otherwise a negative errno value.
421  */
422 int get_tmp_filename(char *filename, int size)
423 {
424 #ifdef _WIN32
425     char temp_dir[MAX_PATH];
426     /* GetTempFileName requires that its output buffer (4th param)
427        have length MAX_PATH or greater.  */
428     assert(size >= MAX_PATH);
429     return (GetTempPath(MAX_PATH, temp_dir)
430             && GetTempFileName(temp_dir, "qem", 0, filename)
431             ? 0 : -GetLastError());
432 #else
433     int fd;
434     const char *tmpdir;
435     tmpdir = getenv("TMPDIR");
436     if (!tmpdir)
437         tmpdir = "/tmp";
438     if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
439         return -EOVERFLOW;
440     }
441     fd = mkstemp(filename);
442     if (fd < 0) {
443         return -errno;
444     }
445     if (close(fd) != 0) {
446         unlink(filename);
447         return -errno;
448     }
449     return 0;
450 #endif
451 }
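
/* Editor's sketch (not compiled): typical use of get_tmp_filename(), as in
 * the snapshot=on path of bdrv_open() below.  The caller owns the file and
 * should unlink() it when done. */
#if 0
static int example_tmp_file(void)
{
    char name[PATH_MAX];
    int ret = get_tmp_filename(name, sizeof(name));
    if (ret < 0) {
        return ret; /* negative errno */
    }
    /* ... use 'name' ... */
    unlink(name);
    return 0;
}
#endif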
452 
453 /*
454  * Detect host devices. By convention, /dev/cdrom[N] is always
455  * recognized as a host CDROM.
456  */
457 static BlockDriver *find_hdev_driver(const char *filename)
458 {
459     int score_max = 0, score;
460     BlockDriver *drv = NULL, *d;
461 
462     QLIST_FOREACH(d, &bdrv_drivers, list) {
463         if (d->bdrv_probe_device) {
464             score = d->bdrv_probe_device(filename);
465             if (score > score_max) {
466                 score_max = score;
467                 drv = d;
468             }
469         }
470     }
471 
472     return drv;
473 }
474 
475 BlockDriver *bdrv_find_protocol(const char *filename)
476 {
477     BlockDriver *drv1;
478     char protocol[128];
479     int len;
480     const char *p;
481 
482     /* TODO Drivers without bdrv_file_open must be specified explicitly */
483 
484     /*
485      * XXX(hch): we really should not let host device detection
486      * override an explicit protocol specification, but moving this
487      * later breaks access to device names with colons in them.
488      * Thanks to the brain-dead persistent naming schemes on udev-
489      * based Linux systems those actually are quite common.
490      */
491     drv1 = find_hdev_driver(filename);
492     if (drv1) {
493         return drv1;
494     }
495 
496     if (!path_has_protocol(filename)) {
497         return bdrv_find_format("file");
498     }
499     p = strchr(filename, ':');
500     assert(p != NULL);
501     len = p - filename;
502     if (len > sizeof(protocol) - 1)
503         len = sizeof(protocol) - 1;
504     memcpy(protocol, filename, len);
505     protocol[len] = '\0';
506     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
507         if (drv1->protocol_name &&
508             !strcmp(drv1->protocol_name, protocol)) {
509             return drv1;
510         }
511     }
512     return NULL;
513 }
514 
515 static int find_image_format(BlockDriverState *bs, const char *filename,
516                              BlockDriver **pdrv)
517 {
518     int score, score_max;
519     BlockDriver *drv1, *drv;
520     uint8_t buf[2048];
521     int ret = 0;
522 
523     /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
524     if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
525         drv = bdrv_find_format("raw");
526         if (!drv) {
527             ret = -ENOENT;
528         }
529         *pdrv = drv;
530         return ret;
531     }
532 
533     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
534     if (ret < 0) {
535         *pdrv = NULL;
536         return ret;
537     }
538 
539     score_max = 0;
540     drv = NULL;
541     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
542         if (drv1->bdrv_probe) {
543             score = drv1->bdrv_probe(buf, ret, filename);
544             if (score > score_max) {
545                 score_max = score;
546                 drv = drv1;
547             }
548         }
549     }
550     if (!drv) {
551         ret = -ENOENT;
552     }
553     *pdrv = drv;
554     return ret;
555 }
556 
557 /**
558  * Set the current 'total_sectors' value
559  */
560 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
561 {
562     BlockDriver *drv = bs->drv;
563 
564     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
565     if (bs->sg)
566         return 0;
567 
568     /* query actual device if possible, otherwise just trust the hint */
569     if (drv->bdrv_getlength) {
570         int64_t length = drv->bdrv_getlength(bs);
571         if (length < 0) {
572             return length;
573         }
574         hint = length >> BDRV_SECTOR_BITS;
575     }
576 
577     bs->total_sectors = hint;
578     return 0;
579 }
580 
581 /**
582  * Set open flags for a given discard mode
583  *
584  * Return 0 on success, -1 if the discard mode was invalid.
585  */
586 int bdrv_parse_discard_flags(const char *mode, int *flags)
587 {
588     *flags &= ~BDRV_O_UNMAP;
589 
590     if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
591         /* do nothing */
592     } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
593         *flags |= BDRV_O_UNMAP;
594     } else {
595         return -1;
596     }
597 
598     return 0;
599 }
600 
601 /**
602  * Set open flags for a given cache mode
603  *
604  * Return 0 on success, -1 if the cache mode was invalid.
605  */
606 int bdrv_parse_cache_flags(const char *mode, int *flags)
607 {
608     *flags &= ~BDRV_O_CACHE_MASK;
609 
610     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
611         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
612     } else if (!strcmp(mode, "directsync")) {
613         *flags |= BDRV_O_NOCACHE;
614     } else if (!strcmp(mode, "writeback")) {
615         *flags |= BDRV_O_CACHE_WB;
616     } else if (!strcmp(mode, "unsafe")) {
617         *flags |= BDRV_O_CACHE_WB;
618         *flags |= BDRV_O_NO_FLUSH;
619     } else if (!strcmp(mode, "writethrough")) {
620         /* this is the default */
621     } else {
622         return -1;
623     }
624 
625     return 0;
626 }
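
/* Editor's sketch (not compiled): translating -drive style mode strings into
 * open flags with the two parsers above.  The mode strings are examples. */
#if 0
static int example_parse_modes(int *flags)
{
    if (bdrv_parse_cache_flags("writeback", flags) < 0) {
        return -1; /* invalid cache mode */
    }
    if (bdrv_parse_discard_flags("unmap", flags) < 0) {
        return -1; /* invalid discard mode */
    }
    return 0; /* *flags now has BDRV_O_CACHE_WB and BDRV_O_UNMAP set */
}
#endif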
627 
628 /**
629  * The copy-on-read flag is actually a reference count so multiple users may
630  * use the feature without worrying about clobbering its previous state.
631  * Copy-on-read stays enabled until all users have disabled it again.
632  */
633 void bdrv_enable_copy_on_read(BlockDriverState *bs)
634 {
635     bs->copy_on_read++;
636 }
637 
638 void bdrv_disable_copy_on_read(BlockDriverState *bs)
639 {
640     assert(bs->copy_on_read > 0);
641     bs->copy_on_read--;
642 }
643 
644 static int bdrv_open_flags(BlockDriverState *bs, int flags)
645 {
646     int open_flags = flags | BDRV_O_CACHE_WB;
647 
648     /*
649      * Clear flags that are internal to the block layer before opening the
650      * image.
651      */
652     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
653 
654     /*
655      * Snapshots should be writable.
656      */
657     if (bs->is_temporary) {
658         open_flags |= BDRV_O_RDWR;
659     }
660 
661     return open_flags;
662 }
663 
664 /*
665  * Common part for opening disk images and files
666  *
667  * Removes all processed options from *options.
668  */
669 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
670     QDict *options, int flags, BlockDriver *drv)
671 {
672     int ret, open_flags;
673     const char *filename;
674 
675     assert(drv != NULL);
676     assert(bs->file == NULL);
677     assert(options != NULL && bs->options != options);
678 
679 
681     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
682         return -ENOTSUP;
683     }
684 
685     /* bdrv_open() was called with a protocol driver directly as drv. The
686      * protocol layer is already opened, so assign it to bs (while file becomes
687      * a closed BlockDriverState) and return immediately. */
688     if (file != NULL && drv->bdrv_file_open) {
689         bdrv_swap(file, bs);
690         return 0;
691     }
692 
693     bs->open_flags = flags;
694     bs->buffer_alignment = 512;
695 
696     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
697     if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
698         bdrv_enable_copy_on_read(bs);
699     }
700 
701     if (file != NULL) {
702         filename = file->filename;
703     } else {
704         filename = qdict_get_try_str(options, "filename");
705     }
706 
707     if (filename != NULL) {
708         pstrcpy(bs->filename, sizeof(bs->filename), filename);
709     } else {
710         bs->filename[0] = '\0';
711     }
712     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

713     bs->drv = drv;
714     bs->opaque = g_malloc0(drv->instance_size);
715 
716     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
717     open_flags = bdrv_open_flags(bs, flags);
718 
719     bs->read_only = !(open_flags & BDRV_O_RDWR);
720 
721     /* Open the image, either directly or using a protocol */
722     if (drv->bdrv_file_open) {
723         assert(file == NULL);
724         assert(drv->bdrv_parse_filename || filename != NULL);
725         ret = drv->bdrv_file_open(bs, options, open_flags);
726     } else {
727         if (file == NULL) {
728             qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a "
729                           "block driver for the protocol level",
730                           drv->format_name);
731             ret = -EINVAL;
732             goto free_and_fail;
733         }
734         assert(file != NULL);
735         bs->file = file;
736         ret = drv->bdrv_open(bs, options, open_flags);
737     }
738 
739     if (ret < 0) {
740         goto free_and_fail;
741     }
742 
743     ret = refresh_total_sectors(bs, bs->total_sectors);
744     if (ret < 0) {
745         goto free_and_fail;
746     }
747 
748 #ifndef _WIN32
749     if (bs->is_temporary) {
750         assert(filename != NULL);
751         unlink(filename);
752     }
753 #endif
754     return 0;
755 
756 free_and_fail:
757     bs->file = NULL;
758     g_free(bs->opaque);
759     bs->opaque = NULL;
760     bs->drv = NULL;
761     return ret;
762 }
763 
764 /*
765  * Opens a file using a protocol (file, host_device, nbd, ...)
766  *
767  * options is a QDict of options to pass to the block drivers, or NULL for an
768  * empty set of options. The reference to the QDict belongs to the block layer
769  * after the call (even on failure), so if the caller intends to reuse the
770  * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
771  */
772 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
773                    QDict *options, int flags)
774 {
775     BlockDriverState *bs;
776     BlockDriver *drv;
777     const char *drvname;
778     int ret;
779 
780     /* NULL means an empty set of options */
781     if (options == NULL) {
782         options = qdict_new();
783     }
784 
785     bs = bdrv_new("");
786     bs->options = options;
787     options = qdict_clone_shallow(options);
788 
789     /* Fetch the file name from the options QDict if necessary */
790     if (!filename) {
791         filename = qdict_get_try_str(options, "filename");
792     } else if (!qdict_haskey(options, "filename")) {
793         qdict_put(options, "filename", qstring_from_str(filename));
794     } else {
795         qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and "
796                       "'filename' options at the same time");
797         ret = -EINVAL;
798         goto fail;
799     }
800 
801     /* Find the right block driver */
802     drvname = qdict_get_try_str(options, "driver");
803     if (drvname) {
804         drv = bdrv_find_whitelisted_format(drvname);
805         qdict_del(options, "driver");
806     } else if (filename) {
807         drv = bdrv_find_protocol(filename);
808     } else {
809         qerror_report(ERROR_CLASS_GENERIC_ERROR,
810                       "Must specify either driver or file");
811         drv = NULL;
812     }
813 
814     if (!drv) {
815         ret = -ENOENT;
816         goto fail;
817     }
818 
819     /* Parse the filename and open it */
820     if (drv->bdrv_parse_filename && filename) {
821         Error *local_err = NULL;
822         drv->bdrv_parse_filename(filename, options, &local_err);
823         if (error_is_set(&local_err)) {
824             qerror_report_err(local_err);
825             error_free(local_err);
826             ret = -EINVAL;
827             goto fail;
828         }
829         qdict_del(options, "filename");
830     } else if (!drv->bdrv_parse_filename && !filename) {
831         qerror_report(ERROR_CLASS_GENERIC_ERROR,
832                       "The '%s' block driver requires a file name",
833                       drv->format_name);
834         ret = -EINVAL;
835         goto fail;
836     }
837 
838     ret = bdrv_open_common(bs, NULL, options, flags, drv);
839     if (ret < 0) {
840         goto fail;
841     }
842 
843     /* Check if any unknown options were used */
844     if (qdict_size(options) != 0) {
845         const QDictEntry *entry = qdict_first(options);
846         qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't "
847                       "support the option '%s'",
848                       drv->format_name, entry->key);
849         ret = -EINVAL;
850         goto fail;
851     }
852     QDECREF(options);
853 
854     bs->growable = 1;
855     *pbs = bs;
856     return 0;
857 
858 fail:
859     QDECREF(options);
860     if (!bs->drv) {
861         QDECREF(bs->options);
862     }
863     bdrv_delete(bs);
864     return ret;
865 }
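
/* Editor's sketch (hypothetical path, not compiled): opening a file through
 * the protocol layer with an explicit driver option.  Note that the options
 * QDict reference is consumed by bdrv_file_open() even on failure, hence no
 * QDECREF() in the caller. */
#if 0
static int example_protocol_open(BlockDriverState **pbs)
{
    QDict *opts = qdict_new();

    qdict_put(opts, "driver", qstring_from_str("file"));
    return bdrv_file_open(pbs, "/var/tmp/test.img", opts, BDRV_O_RDWR);
}
#endif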
866 
867 /*
868  * Opens the backing file for a BlockDriverState if not yet open
869  *
870  * options is a QDict of options to pass to the block drivers, or NULL for an
871  * empty set of options. The reference to the QDict is transferred to this
872  * function (even on failure), so if the caller intends to reuse the dictionary,
873  * it needs to use QINCREF() before calling bdrv_open_backing_file.
874  */
875 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options)
876 {
877     char backing_filename[PATH_MAX];
878     int back_flags, ret;
879     BlockDriver *back_drv = NULL;
880 
881     if (bs->backing_hd != NULL) {
882         QDECREF(options);
883         return 0;
884     }
885 
886     /* NULL means an empty set of options */
887     if (options == NULL) {
888         options = qdict_new();
889     }
890 
891     bs->open_flags &= ~BDRV_O_NO_BACKING;
892     if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
893         QDECREF(options);
894         return 0;
895     }
896 
897     bs->backing_hd = bdrv_new("");
898     bdrv_get_full_backing_filename(bs, backing_filename,
899                                    sizeof(backing_filename));
900 
901     if (bs->backing_format[0] != '\0') {
902         back_drv = bdrv_find_format(bs->backing_format);
903     }
904 
905     /* backing files always opened read-only */
906     back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT);
907 
908     ret = bdrv_open(bs->backing_hd,
909                     *backing_filename ? backing_filename : NULL, options,
910                     back_flags, back_drv);
911     if (ret < 0) {
912         bdrv_delete(bs->backing_hd);
913         bs->backing_hd = NULL;
914         bs->open_flags |= BDRV_O_NO_BACKING;
915         return ret;
916     }
917     return 0;
918 }
919 
920 static void extract_subqdict(QDict *src, QDict **dst, const char *start)
921 {
922     const QDictEntry *entry, *next;
923     const char *p;
924 
925     *dst = qdict_new();
926     entry = qdict_first(src);
927 
928     while (entry != NULL) {
929         next = qdict_next(src, entry);
930         if (strstart(entry->key, start, &p)) {
931             qobject_incref(entry->value);
932             qdict_put_obj(*dst, p, entry->value);
933             qdict_del(src, entry->key);
934         }
935         entry = next;
936     }
937 }
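
/* Editor's illustration (hypothetical keys, not compiled): move
 * "file."-prefixed options into their own dictionary, as bdrv_open() does
 * below for the protocol and backing layers. */
#if 0
static void example_extract(void)
{
    QDict *src = qdict_new();
    QDict *file_opts = NULL;

    qdict_put(src, "file.driver", qstring_from_str("file"));
    qdict_put(src, "size", qstring_from_str("1G"));

    extract_subqdict(src, &file_opts, "file.");
    /* file_opts now holds { "driver": "file" }; src keeps { "size": "1G" } */
}
#endif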
938 
939 /*
940  * Opens a disk image (raw, qcow2, vmdk, ...)
941  *
942  * options is a QDict of options to pass to the block drivers, or NULL for an
943  * empty set of options. The reference to the QDict belongs to the block layer
944  * after the call (even on failure), so if the caller intends to reuse the
945  * dictionary, it needs to use QINCREF() before calling bdrv_open.
946  */
947 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
948               int flags, BlockDriver *drv)
949 {
950     int ret;
951     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
952     char tmp_filename[PATH_MAX + 1];
953     BlockDriverState *file = NULL;
954     QDict *file_options = NULL;
955 
956     /* NULL means an empty set of options */
957     if (options == NULL) {
958         options = qdict_new();
959     }
960 
961     bs->options = options;
962     options = qdict_clone_shallow(options);
963 
964     /* For snapshot=on, create a temporary qcow2 overlay */
965     if (flags & BDRV_O_SNAPSHOT) {
966         BlockDriverState *bs1;
967         int64_t total_size;
968         BlockDriver *bdrv_qcow2;
969         QEMUOptionParameter *create_options;
970         char backing_filename[PATH_MAX];
971 
972         if (qdict_size(options) != 0) {
973             error_report("Can't use snapshot=on with driver-specific options");
974             ret = -EINVAL;
975             goto fail;
976         }
977         assert(filename != NULL);
978 
979         /* if snapshot, we create a temporary backing file and open it
980            instead of opening 'filename' directly */
981 
982         /* if there is a backing file, use it */
983         bs1 = bdrv_new("");
984         ret = bdrv_open(bs1, filename, NULL, 0, drv);
985         if (ret < 0) {
986             bdrv_delete(bs1);
987             goto fail;
988         }
989         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
990 
991         bdrv_delete(bs1);
992 
993         ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
994         if (ret < 0) {
995             goto fail;
996         }
997 
998         /* Real path is meaningless for protocols */
999         if (path_has_protocol(filename)) {
1000             snprintf(backing_filename, sizeof(backing_filename),
1001                      "%s", filename);
1002         } else if (!realpath(filename, backing_filename)) {
1003             ret = -errno;
1004             goto fail;
1005         }
1006 
1007         bdrv_qcow2 = bdrv_find_format("qcow2");
1008         create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1009                                                  NULL);
1010 
1011         set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1012         set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE,
1013                              backing_filename);
1014         if (drv) {
1015             set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT,
1016                 drv->format_name);
1017         }
1018 
1019         ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options);
1020         free_option_parameters(create_options);
1021         if (ret < 0) {
1022             goto fail;
1023         }
1024 
1025         filename = tmp_filename;
1026         drv = bdrv_qcow2;
1027         bs->is_temporary = 1;
1028     }
1029 
1030     /* Open image file without format layer */
1031     if (flags & BDRV_O_RDWR) {
1032         flags |= BDRV_O_ALLOW_RDWR;
1033     }
1034 
1035     extract_subqdict(options, &file_options, "file.");
1036 
1037     ret = bdrv_file_open(&file, filename, file_options,
1038                          bdrv_open_flags(bs, flags));
1039     if (ret < 0) {
1040         goto fail;
1041     }
1042 
1043     /* Find the right image format driver */
1044     if (!drv) {
1045         ret = find_image_format(file, filename, &drv);
1046     }
1047 
1048     if (!drv) {
1049         goto unlink_and_fail;
1050     }
1051 
1052     /* Open the image */
1053     ret = bdrv_open_common(bs, file, options, flags, drv);
1054     if (ret < 0) {
1055         goto unlink_and_fail;
1056     }
1057 
1058     if (bs->file != file) {
1059         bdrv_delete(file);
1060         file = NULL;
1061     }
1062 
1063     /* If there is a backing file, use it */
1064     if ((flags & BDRV_O_NO_BACKING) == 0) {
1065         QDict *backing_options;
1066 
1067         extract_subqdict(options, &backing_options, "backing.");
1068         ret = bdrv_open_backing_file(bs, backing_options);
1069         if (ret < 0) {
1070             goto close_and_fail;
1071         }
1072     }
1073 
1074     /* Check if any unknown options were used */
1075     if (qdict_size(options) != 0) {
1076         const QDictEntry *entry = qdict_first(options);
1077         qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by "
1078             "device '%s' doesn't support the option '%s'",
1079             drv->format_name, bs->device_name, entry->key);
1080 
1081         ret = -EINVAL;
1082         goto close_and_fail;
1083     }
1084     QDECREF(options);
1085 
1086     if (!bdrv_key_required(bs)) {
1087         bdrv_dev_change_media_cb(bs, true);
1088     }
1089 
1090     /* throttling disk I/O limits */
1091     if (bs->io_limits_enabled) {
1092         bdrv_io_limits_enable(bs);
1093     }
1094 
1095     return 0;
1096 
1097 unlink_and_fail:
1098     if (file != NULL) {
1099         bdrv_delete(file);
1100     }
1101     if (bs->is_temporary) {
1102         unlink(filename);
1103     }
1104 fail:
1105     QDECREF(bs->options);
1106     QDECREF(options);
1107     bs->options = NULL;
1108     return ret;
1109 
1110 close_and_fail:
1111     bdrv_close(bs);
1112     QDECREF(options);
1113     return ret;
1114 }
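
/* Editor's sketch (hypothetical image name, not compiled): a full-format open
 * that forwards an option to the protocol layer via the "file." prefix
 * handled above.  Passing drv == NULL lets find_image_format() probe. */
#if 0
static int example_image_open(BlockDriverState *bs)
{
    QDict *opts = qdict_new();

    qdict_put(opts, "file.driver", qstring_from_str("file"));
    return bdrv_open(bs, "/var/tmp/test.qcow2", opts,
                     BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL);
}
#endif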
1115 
1116 typedef struct BlockReopenQueueEntry {
1117      bool prepared;
1118      BDRVReopenState state;
1119      QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
1120 } BlockReopenQueueEntry;
1121 
1122 /*
1123  * Adds a BlockDriverState to a simple queue for an atomic, transactional
1124  * reopen of multiple devices.
1125  *
1126  * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
1127  * has already been performed, or it may be NULL, in which case a new
1128  * BlockReopenQueue will be created and initialized. This queue should then be
1129  * passed back in for subsequent calls that are intended to be of the same
1130  * atomic 'set'.
1131  *
1132  * bs is the BlockDriverState to add to the reopen queue.
1133  *
1134  * flags contains the open flags for the associated bs
1135  *
1136  * returns a pointer to bs_queue, which is either the newly allocated
1137  * bs_queue, or the existing bs_queue being used.
1138  *
1139  */
1140 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1141                                     BlockDriverState *bs, int flags)
1142 {
1143     assert(bs != NULL);
1144 
1145     BlockReopenQueueEntry *bs_entry;
1146     if (bs_queue == NULL) {
1147         bs_queue = g_new0(BlockReopenQueue, 1);
1148         QSIMPLEQ_INIT(bs_queue);
1149     }
1150 
1151     if (bs->file) {
1152         bdrv_reopen_queue(bs_queue, bs->file, flags);
1153     }
1154 
1155     bs_entry = g_new0(BlockReopenQueueEntry, 1);
1156     QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1157 
1158     bs_entry->state.bs = bs;
1159     bs_entry->state.flags = flags;
1160 
1161     return bs_queue;
1162 }
1163 
1164 /*
1165  * Reopen multiple BlockDriverStates atomically & transactionally.
1166  *
1167  * The queue passed in (bs_queue) must have been built up previously
1168  * via bdrv_reopen_queue().
1169  *
1170  * Reopens all BDS specified in the queue, with the appropriate
1171  * flags.  All devices are prepared for reopen, and failure of any
1172  * device will cause all device changes to be abandoned, and intermediate
1173  * data cleaned up.
1174  *
1175  * If all devices prepare successfully, then the changes are committed
1176  * to all devices.
1177  *
1178  */
1179 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1180 {
1181     int ret = -1;
1182     BlockReopenQueueEntry *bs_entry, *next;
1183     Error *local_err = NULL;
1184 
1185     assert(bs_queue != NULL);
1186 
1187     bdrv_drain_all();
1188 
1189     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1190         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1191             error_propagate(errp, local_err);
1192             goto cleanup;
1193         }
1194         bs_entry->prepared = true;
1195     }
1196 
1197     /* If we reach this point, we have success and just need to apply the
1198      * changes
1199      */
1200     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1201         bdrv_reopen_commit(&bs_entry->state);
1202     }
1203 
1204     ret = 0;
1205 
1206 cleanup:
1207     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1208         if (ret && bs_entry->prepared) {
1209             bdrv_reopen_abort(&bs_entry->state);
1210         }
1211         g_free(bs_entry);
1212     }
1213     g_free(bs_queue);
1214     return ret;
1215 }
1216 
1217 
1218 /* Reopen a single BlockDriverState with the specified flags. */
1219 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1220 {
1221     int ret = -1;
1222     Error *local_err = NULL;
1223     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1224 
1225     ret = bdrv_reopen_multiple(queue, &local_err);
1226     if (local_err != NULL) {
1227         error_propagate(errp, local_err);
1228     }
1229     return ret;
1230 }
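
/* Editor's sketch (not compiled): the transactional pattern behind
 * bdrv_reopen() above, applied to two devices at once.  Either both are
 * reopened with the new flags or neither is; bdrv_reopen_multiple() frees
 * the queue in all cases. */
#if 0
static int example_reopen_pair(BlockDriverState *a, BlockDriverState *b,
                               Error **errp)
{
    BlockReopenQueue *queue = NULL;

    queue = bdrv_reopen_queue(queue, a, a->open_flags | BDRV_O_RDWR);
    queue = bdrv_reopen_queue(queue, b, b->open_flags | BDRV_O_RDWR);
    return bdrv_reopen_multiple(queue, errp);
}
#endif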
1231 
1232 
1233 /*
1234  * Prepares a BlockDriverState for reopen. All changes are staged in the
1235  * 'opaque' field of the BDRVReopenState, which is allocated and used by
1236  * the block driver's .bdrv_reopen_prepare() implementation.
1237  *
1238  * bs is the BlockDriverState to reopen
1239  * flags are the new open flags
1240  * queue is the reopen queue
1241  *
1242  * Returns 0 on success, non-zero on error.  On error errp will be set
1243  * as well.
1244  *
1245  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1246  * It is the responsibility of the caller to then call abort() or
1247  * commit() for any other BDS that have been left in a prepared state.
1248  *
1249  */
1250 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1251                         Error **errp)
1252 {
1253     int ret = -1;
1254     Error *local_err = NULL;
1255     BlockDriver *drv;
1256 
1257     assert(reopen_state != NULL);
1258     assert(reopen_state->bs->drv != NULL);
1259     drv = reopen_state->bs->drv;
1260 
1261     /* if we are to stay read-only, do not allow permission change
1262      * to r/w */
1263     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1264         reopen_state->flags & BDRV_O_RDWR) {
1265         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1266                   reopen_state->bs->device_name);
1267         goto error;
1268     }
1269 
1270 
1271     ret = bdrv_flush(reopen_state->bs);
1272     if (ret) {
1273         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1274                   strerror(-ret));
1275         goto error;
1276     }
1277 
1278     if (drv->bdrv_reopen_prepare) {
1279         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1280         if (ret) {
1281             if (local_err != NULL) {
1282                 error_propagate(errp, local_err);
1283             } else {
1284                 error_set(errp, QERR_OPEN_FILE_FAILED,
1285                           reopen_state->bs->filename);
1286             }
1287             goto error;
1288         }
1289     } else {
1290         /* It is currently mandatory to have a bdrv_reopen_prepare()
1291          * handler for each supported drv. */
1292         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1293                   drv->format_name, reopen_state->bs->device_name,
1294                  "reopening of file");
1295         ret = -1;
1296         goto error;
1297     }
1298 
1299     ret = 0;
1300 
1301 error:
1302     return ret;
1303 }
1304 
1305 /*
1306  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1307  * makes them final by swapping the staging BlockDriverState contents into
1308  * the active BlockDriverState contents.
1309  */
1310 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1311 {
1312     BlockDriver *drv;
1313 
1314     assert(reopen_state != NULL);
1315     drv = reopen_state->bs->drv;
1316     assert(drv != NULL);
1317 
1318     /* If there are any driver level actions to take */
1319     if (drv->bdrv_reopen_commit) {
1320         drv->bdrv_reopen_commit(reopen_state);
1321     }
1322 
1323     /* set BDS specific flags now */
1324     reopen_state->bs->open_flags         = reopen_state->flags;
1325     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1326                                               BDRV_O_CACHE_WB);
1327     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1328 }
1329 
1330 /*
1331  * Abort the reopen, and delete and free the staged changes in
1332  * reopen_state
1333  */
1334 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1335 {
1336     BlockDriver *drv;
1337 
1338     assert(reopen_state != NULL);
1339     drv = reopen_state->bs->drv;
1340     assert(drv != NULL);
1341 
1342     if (drv->bdrv_reopen_abort) {
1343         drv->bdrv_reopen_abort(reopen_state);
1344     }
1345 }
1346 
1347 
1348 void bdrv_close(BlockDriverState *bs)
1349 {
1350     bdrv_flush(bs);
1351     if (bs->job) {
1352         block_job_cancel_sync(bs->job);
1353     }
1354     bdrv_drain_all();
1355     notifier_list_notify(&bs->close_notifiers, bs);
1356 
1357     if (bs->drv) {
1358         if (bs == bs_snapshots) {
1359             bs_snapshots = NULL;
1360         }
1361         if (bs->backing_hd) {
1362             bdrv_delete(bs->backing_hd);
1363             bs->backing_hd = NULL;
1364         }
1365         bs->drv->bdrv_close(bs);
1366         g_free(bs->opaque);
1367 #ifdef _WIN32
1368         if (bs->is_temporary) {
1369             unlink(bs->filename);
1370         }
1371 #endif
1372         bs->opaque = NULL;
1373         bs->drv = NULL;
1374         bs->copy_on_read = 0;
1375         bs->backing_file[0] = '\0';
1376         bs->backing_format[0] = '\0';
1377         bs->total_sectors = 0;
1378         bs->encrypted = 0;
1379         bs->valid_key = 0;
1380         bs->sg = 0;
1381         bs->growable = 0;
1382         QDECREF(bs->options);
1383         bs->options = NULL;
1384 
1385         if (bs->file != NULL) {
1386             bdrv_delete(bs->file);
1387             bs->file = NULL;
1388         }
1389     }
1390 
1391     bdrv_dev_change_media_cb(bs, false);
1392 
1393     /* throttling disk I/O limits */
1394     if (bs->io_limits_enabled) {
1395         bdrv_io_limits_disable(bs);
1396     }
1397 }
1398 
1399 void bdrv_close_all(void)
1400 {
1401     BlockDriverState *bs;
1402 
1403     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1404         bdrv_close(bs);
1405     }
1406 }
1407 
1408 /*
1409  * Wait for pending requests to complete across all BlockDriverStates
1410  *
1411  * This function does not flush data to disk, use bdrv_flush_all() for that
1412  * after calling this function.
1413  *
1414  * Note that completion of an asynchronous I/O operation can trigger any
1415  * number of other I/O operations on other devices---for example a coroutine
1416  * can be arbitrarily complex and a constant flow of I/O can come until the
1417  * coroutine is complete.  Because of this, it is not possible to have a
1418  * function to drain a single device's I/O queue.
1419  */
1420 void bdrv_drain_all(void)
1421 {
1422     BlockDriverState *bs;
1423     bool busy;
1424 
1425     do {
1426         busy = qemu_aio_wait();
1427 
1428         /* FIXME: We do not have timer support here, so this is effectively
1429          * a busy wait.
1430          */
1431         QTAILQ_FOREACH(bs, &bdrv_states, list) {
1432             if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
1433                 qemu_co_queue_restart_all(&bs->throttled_reqs);
1434                 busy = true;
1435             }
1436         }
1437     } while (busy);
1438 
1439     /* If requests are still pending there is a bug somewhere */
1440     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1441         assert(QLIST_EMPTY(&bs->tracked_requests));
1442         assert(qemu_co_queue_empty(&bs->throttled_reqs));
1443     }
1444 }
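
/* Editor's note (not compiled): the quiesce-then-flush pattern the comment
 * above refers to. */
#if 0
static void example_quiesce_all(void)
{
    bdrv_drain_all();  /* wait for all in-flight requests to complete */
    bdrv_flush_all();  /* then push completed writes to stable storage */
}
#endif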
1445 
1446 /* make a BlockDriverState anonymous by removing it from the bdrv_states list.
1447    Also, clear device_name so that a second removal is not attempted */
1448 void bdrv_make_anon(BlockDriverState *bs)
1449 {
1450     if (bs->device_name[0] != '\0') {
1451         QTAILQ_REMOVE(&bdrv_states, bs, list);
1452     }
1453     bs->device_name[0] = '\0';
1454 }
1455 
1456 static void bdrv_rebind(BlockDriverState *bs)
1457 {
1458     if (bs->drv && bs->drv->bdrv_rebind) {
1459         bs->drv->bdrv_rebind(bs);
1460     }
1461 }
1462 
1463 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1464                                      BlockDriverState *bs_src)
1465 {
1466     /* move some fields that need to stay attached to the device */
1467     bs_dest->open_flags         = bs_src->open_flags;
1468 
1469     /* dev info */
1470     bs_dest->dev_ops            = bs_src->dev_ops;
1471     bs_dest->dev_opaque         = bs_src->dev_opaque;
1472     bs_dest->dev                = bs_src->dev;
1473     bs_dest->buffer_alignment   = bs_src->buffer_alignment;
1474     bs_dest->copy_on_read       = bs_src->copy_on_read;
1475 
1476     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1477 
1478     /* i/o timing parameters */
1479     bs_dest->slice_start        = bs_src->slice_start;
1480     bs_dest->slice_end          = bs_src->slice_end;
1481     bs_dest->slice_submitted    = bs_src->slice_submitted;
1482     bs_dest->io_limits          = bs_src->io_limits;
1483     bs_dest->throttled_reqs     = bs_src->throttled_reqs;
1484     bs_dest->block_timer        = bs_src->block_timer;
1485     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1486 
1487     /* r/w error */
1488     bs_dest->on_read_error      = bs_src->on_read_error;
1489     bs_dest->on_write_error     = bs_src->on_write_error;
1490 
1491     /* i/o status */
1492     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1493     bs_dest->iostatus           = bs_src->iostatus;
1494 
1495     /* dirty bitmap */
1496     bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;
1497 
1498     /* job */
1499     bs_dest->in_use             = bs_src->in_use;
1500     bs_dest->job                = bs_src->job;
1501 
1502     /* keep the same entry in bdrv_states */
1503     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1504             bs_src->device_name);
1505     bs_dest->list = bs_src->list;
1506 }
1507 
1508 /*
1509  * Swap bs contents for two image chains while they are live,
1510  * while keeping required fields on the BlockDriverState that is
1511  * actually attached to a device.
1512  *
1513  * This will modify the BlockDriverState fields, and swap contents
1514  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1515  *
1516  * bs_new is required to be anonymous.
1517  *
1518  * This function does not create any image files.
1519  */
1520 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1521 {
1522     BlockDriverState tmp;
1523 
1524     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1525     assert(bs_new->device_name[0] == '\0');
1526     assert(bs_new->dirty_bitmap == NULL);
1527     assert(bs_new->job == NULL);
1528     assert(bs_new->dev == NULL);
1529     assert(bs_new->in_use == 0);
1530     assert(bs_new->io_limits_enabled == false);
1531     assert(bs_new->block_timer == NULL);
1532 
1533     tmp = *bs_new;
1534     *bs_new = *bs_old;
1535     *bs_old = tmp;
1536 
1537     /* there are some fields that should not be swapped, move them back */
1538     bdrv_move_feature_fields(&tmp, bs_old);
1539     bdrv_move_feature_fields(bs_old, bs_new);
1540     bdrv_move_feature_fields(bs_new, &tmp);
1541 
1542     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1543     assert(bs_new->device_name[0] == '\0');
1544 
1545     /* Check a few fields that should remain attached to the device */
1546     assert(bs_new->dev == NULL);
1547     assert(bs_new->job == NULL);
1548     assert(bs_new->in_use == 0);
1549     assert(bs_new->io_limits_enabled == false);
1550     assert(bs_new->block_timer == NULL);
1551 
1552     bdrv_rebind(bs_new);
1553     bdrv_rebind(bs_old);
1554 }
1555 
1556 /*
1557  * Add new bs contents at the top of an image chain while the chain is
1558  * live, while keeping required fields on the top layer.
1559  *
1560  * This will modify the BlockDriverState fields, and swap contents
1561  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1562  *
1563  * bs_new is required to be anonymous.
1564  *
1565  * This function does not create any image files.
1566  */
1567 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1568 {
1569     bdrv_swap(bs_new, bs_top);
1570 
1571     /* The contents of 'tmp' will become bs_top, as we are
1572      * swapping bs_new and bs_top contents. */
1573     bs_top->backing_hd = bs_new;
1574     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1575     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1576             bs_new->filename);
1577     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1578             bs_new->drv ? bs_new->drv->format_name : "");
1579 }
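
/* Editor's sketch (not compiled): the external-snapshot pattern built on
 * bdrv_append().  A freshly opened, anonymous overlay is spliced on top of
 * the chain; afterwards the device keeps its BlockDriverState pointer but
 * presents the overlay, with the old top as its backing file. */
#if 0
static void example_take_snapshot(BlockDriverState *bs_top,
                                  BlockDriverState *overlay)
{
    /* 'overlay' must come from bdrv_new("") and already be opened */
    bdrv_append(overlay, bs_top);
}
#endif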
1580 
1581 void bdrv_delete(BlockDriverState *bs)
1582 {
1583     assert(!bs->dev);
1584     assert(!bs->job);
1585     assert(!bs->in_use);
1586 
1587     /* remove from list, if necessary */
1588     bdrv_make_anon(bs);
1589 
1590     bdrv_close(bs);
1591 
1592     assert(bs != bs_snapshots);
1593     g_free(bs);
1594 }
1595 
1596 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1597 /* TODO change to DeviceState *dev when all users are qdevified */
1598 {
1599     if (bs->dev) {
1600         return -EBUSY;
1601     }
1602     bs->dev = dev;
1603     bdrv_iostatus_reset(bs);
1604     return 0;
1605 }
1606 
1607 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1608 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1609 {
1610     if (bdrv_attach_dev(bs, dev) < 0) {
1611         abort();
1612     }
1613 }
1614 
1615 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1616 /* TODO change to DeviceState *dev when all users are qdevified */
1617 {
1618     assert(bs->dev == dev);
1619     bs->dev = NULL;
1620     bs->dev_ops = NULL;
1621     bs->dev_opaque = NULL;
1622     bs->buffer_alignment = 512;
1623 }
1624 
1625 /* TODO change to return DeviceState * when all users are qdevified */
1626 void *bdrv_get_attached_dev(BlockDriverState *bs)
1627 {
1628     return bs->dev;
1629 }
1630 
1631 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1632                       void *opaque)
1633 {
1634     bs->dev_ops = ops;
1635     bs->dev_opaque = opaque;
1636     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1637         bs_snapshots = NULL;
1638     }
1639 }
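
/* Editor's sketch (hypothetical device, not compiled): a device model wiring
 * up the removable-media callbacks consumed by the helpers below.  The
 * callback signatures are inferred from the call sites in this file. */
#if 0
static void my_change_media_cb(void *opaque, bool load)
{
    /* update device state for medium insertion/removal */
}

static bool my_is_tray_open(void *opaque)
{
    return false;
}

static const BlockDevOps my_block_ops = {
    .change_media_cb = my_change_media_cb,
    .is_tray_open    = my_is_tray_open,
};

/* during device init: bdrv_set_dev_ops(bs, &my_block_ops, mydev); */
#endif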
1640 
1641 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1642                                enum MonitorEvent ev,
1643                                BlockErrorAction action, bool is_read)
1644 {
1645     QObject *data;
1646     const char *action_str;
1647 
1648     switch (action) {
1649     case BDRV_ACTION_REPORT:
1650         action_str = "report";
1651         break;
1652     case BDRV_ACTION_IGNORE:
1653         action_str = "ignore";
1654         break;
1655     case BDRV_ACTION_STOP:
1656         action_str = "stop";
1657         break;
1658     default:
1659         abort();
1660     }
1661 
1662     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1663                               bdrv->device_name,
1664                               action_str,
1665                               is_read ? "read" : "write");
1666     monitor_protocol_event(ev, data);
1667 
1668     qobject_decref(data);
1669 }
1670 
1671 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1672 {
1673     QObject *data;
1674 
1675     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1676                               bdrv_get_device_name(bs), ejected);
1677     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1678 
1679     qobject_decref(data);
1680 }
1681 
1682 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1683 {
1684     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1685         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1686         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1687         if (tray_was_closed) {
1688             /* tray open */
1689             bdrv_emit_qmp_eject_event(bs, true);
1690         }
1691         if (load) {
1692             /* tray close */
1693             bdrv_emit_qmp_eject_event(bs, false);
1694         }
1695     }
1696 }
1697 
1698 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1699 {
1700     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1701 }
1702 
1703 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1704 {
1705     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1706         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1707     }
1708 }
1709 
1710 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1711 {
1712     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1713         return bs->dev_ops->is_tray_open(bs->dev_opaque);
1714     }
1715     return false;
1716 }
1717 
1718 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1719 {
1720     if (bs->dev_ops && bs->dev_ops->resize_cb) {
1721         bs->dev_ops->resize_cb(bs->dev_opaque);
1722     }
1723 }
1724 
1725 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1726 {
1727     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1728         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1729     }
1730     return false;
1731 }
1732 
1733 /*
1734  * Run consistency checks on an image
1735  *
1736  * Returns 0 if the check could be completed (it doesn't mean that the image is
1737  * free of errors) or -errno when an internal error occurred. The results of the
1738  * check are stored in res.
1739  */
1740 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1741 {
1742     if (bs->drv->bdrv_check == NULL) {
1743         return -ENOTSUP;
1744     }
1745 
1746     memset(res, 0, sizeof(*res));
1747     return bs->drv->bdrv_check(bs, res, fix);
1748 }
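
/*
 * Illustrative caller (a sketch, not part of this file): run a report-only
 * check and escalate to a repairing pass.  The corruptions/leaks counters
 * and the BDRV_FIX_LEAKS/BDRV_FIX_ERRORS flags are assumed to come from the
 * BdrvCheckResult/BdrvCheckMode definitions in the block headers.
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, 0);   // 0: report only, no repair
 *     if (ret < 0) {
 *         // the check itself failed, e.g. -ENOTSUP for drivers without one
 *     } else if (result.corruptions || result.leaks) {
 *         ret = bdrv_check(bs, &result, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);
 *     }
 */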
1749 
1750 #define COMMIT_BUF_SECTORS 2048
1751 
1752 /* commit COW file into the raw image */
1753 int bdrv_commit(BlockDriverState *bs)
1754 {
1755     BlockDriver *drv = bs->drv;
1756     int64_t sector, total_sectors;
1757     int n, ro, open_flags;
1758     int ret = 0;
1759     uint8_t *buf;
1760     char filename[PATH_MAX];
1761 
1762     if (!drv)
1763         return -ENOMEDIUM;
1764 
1765     if (!bs->backing_hd) {
1766         return -ENOTSUP;
1767     }
1768 
1769     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1770         return -EBUSY;
1771     }
1772 
1773     ro = bs->backing_hd->read_only;
1774     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1775     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1776     open_flags = bs->backing_hd->open_flags;
1777 
1778     if (ro) {
1779         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1780             return -EACCES;
1781         }
1782     }
1783 
1784     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1785     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1786 
1787     for (sector = 0; sector < total_sectors; sector += n) {
1788         if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1789 
1790             if (bdrv_read(bs, sector, buf, n) != 0) {
1791                 ret = -EIO;
1792                 goto ro_cleanup;
1793             }
1794 
1795             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1796                 ret = -EIO;
1797                 goto ro_cleanup;
1798             }
1799         }
1800     }
1801 
1802     if (drv->bdrv_make_empty) {
1803         ret = drv->bdrv_make_empty(bs);
1804         bdrv_flush(bs);
1805     }
1806 
1807     /*
1808      * Make sure all data we wrote to the backing device is actually
1809      * stable on disk.
1810      */
1811     if (bs->backing_hd)
1812         bdrv_flush(bs->backing_hd);
1813 
1814 ro_cleanup:
1815     g_free(buf);
1816 
1817     if (ro) {
1818         /* ignoring error return here */
1819         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
1820     }
1821 
1822     return ret;
1823 }
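
/*
 * Typical use (illustrative only): fold the COW overlay into its backing
 * file and map the error codes returned by the function above.
 *
 *     int ret = bdrv_commit(bs);
 *     switch (ret) {
 *     case 0:         // data merged into bs->backing_hd and flushed
 *         break;
 *     case -ENOTSUP:  // bs has no backing file to commit into
 *     case -EBUSY:    // bs or its backing file is in use, e.g. by a job
 *     case -EACCES:   // read-only backing file could not be reopened r/w
 *     default:        // -EIO etc.: report the failure to the caller
 *         break;
 *     }
 */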
1824 
1825 int bdrv_commit_all(void)
1826 {
1827     BlockDriverState *bs;
1828 
1829     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1830         if (bs->drv && bs->backing_hd) {
1831             int ret = bdrv_commit(bs);
1832             if (ret < 0) {
1833                 return ret;
1834             }
1835         }
1836     }
1837     return 0;
1838 }
1839 
1840 struct BdrvTrackedRequest {
1841     BlockDriverState *bs;
1842     int64_t sector_num;
1843     int nb_sectors;
1844     bool is_write;
1845     QLIST_ENTRY(BdrvTrackedRequest) list;
1846     Coroutine *co; /* owner, used for deadlock detection */
1847     CoQueue wait_queue; /* coroutines blocked on this request */
1848 };
1849 
1850 /**
1851  * Remove an active request from the tracked requests list
1852  *
1853  * This function should be called when a tracked request is completing.
1854  */
1855 static void tracked_request_end(BdrvTrackedRequest *req)
1856 {
1857     QLIST_REMOVE(req, list);
1858     qemu_co_queue_restart_all(&req->wait_queue);
1859 }
1860 
1861 /**
1862  * Add an active request to the tracked requests list
1863  */
1864 static void tracked_request_begin(BdrvTrackedRequest *req,
1865                                   BlockDriverState *bs,
1866                                   int64_t sector_num,
1867                                   int nb_sectors, bool is_write)
1868 {
1869     *req = (BdrvTrackedRequest){
1870         .bs = bs,
1871         .sector_num = sector_num,
1872         .nb_sectors = nb_sectors,
1873         .is_write = is_write,
1874         .co = qemu_coroutine_self(),
1875     };
1876 
1877     qemu_co_queue_init(&req->wait_queue);
1878 
1879     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1880 }
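
/*
 * Intended pairing (sketch): a request is tracked for exactly the duration
 * of the driver call, so that overlapping requests can find it and block on
 * wait_queue.  This mirrors what bdrv_co_do_readv/bdrv_co_do_writev below
 * actually do.
 *
 *     BdrvTrackedRequest req;
 *
 *     tracked_request_begin(&req, bs, sector_num, nb_sectors, is_write);
 *     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 *     tracked_request_end(&req);   // wakes coroutines queued on the request
 */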
1881 
1882 /**
1883  * Round a region to cluster boundaries
1884  */
1885 void bdrv_round_to_clusters(BlockDriverState *bs,
1886                             int64_t sector_num, int nb_sectors,
1887                             int64_t *cluster_sector_num,
1888                             int *cluster_nb_sectors)
1889 {
1890     BlockDriverInfo bdi;
1891 
1892     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1893         *cluster_sector_num = sector_num;
1894         *cluster_nb_sectors = nb_sectors;
1895     } else {
1896         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1897         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1898         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1899                                             nb_sectors, c);
1900     }
1901 }
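
/*
 * Worked example (assuming bdi.cluster_size == 65536, i.e. 128 sectors of
 * 512 bytes):
 *
 *     int64_t start;
 *     int count;
 *     bdrv_round_to_clusters(bs, 130, 10, &start, &count);
 *     // start == 128, count == 128: the request [130, 140) is widened to
 *     // the whole cluster [128, 256)
 *
 * When no cluster information is available (bdrv_get_info() fails or
 * cluster_size is 0), the region is returned unchanged.
 */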
1902 
1903 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1904                                      int64_t sector_num, int nb_sectors) {
1905     /*        aaaa   bbbb */
1906     if (sector_num >= req->sector_num + req->nb_sectors) {
1907         return false;
1908     }
1909     /* bbbb   aaaa        */
1910     if (req->sector_num >= sector_num + nb_sectors) {
1911         return false;
1912     }
1913     return true;
1914 }
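
/*
 * The intervals are half-open in sector units: a tracked request covering
 * sectors [0, 8) does not overlap the candidate range [8, 16), but it does
 * overlap [7, 9).  Illustration:
 *
 *     req.sector_num = 0; req.nb_sectors = 8;
 *     tracked_request_overlaps(&req, 8, 8);   // false: they only touch at 8
 *     tracked_request_overlaps(&req, 7, 2);   // true: sector 7 is shared
 */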
1915 
1916 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1917         int64_t sector_num, int nb_sectors)
1918 {
1919     BdrvTrackedRequest *req;
1920     int64_t cluster_sector_num;
1921     int cluster_nb_sectors;
1922     bool retry;
1923 
1924     /* If we touch the same cluster it counts as an overlap.  This guarantees
1925      * that allocating writes will be serialized and not race with each other
1926      * for the same cluster.  For example, in copy-on-read it ensures that the
1927      * CoR read and write operations are atomic and guest writes cannot
1928      * interleave between them.
1929      */
1930     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
1931                            &cluster_sector_num, &cluster_nb_sectors);
1932 
1933     do {
1934         retry = false;
1935         QLIST_FOREACH(req, &bs->tracked_requests, list) {
1936             if (tracked_request_overlaps(req, cluster_sector_num,
1937                                          cluster_nb_sectors)) {
1938                 /* Hitting this means there was a reentrant request, for
1939                  * example, a block driver issuing nested requests.  This must
1940                  * never happen since it means deadlock.
1941                  */
1942                 assert(qemu_coroutine_self() != req->co);
1943 
1944                 qemu_co_queue_wait(&req->wait_queue);
1945                 retry = true;
1946                 break;
1947             }
1948         }
1949     } while (retry);
1950 }
1951 
1952 /*
1953  * Return values:
1954  * 0        - success
1955  * -EINVAL  - backing format specified, but no file
1956  * -ENOSPC  - can't update the backing file because no space is left in the
1957  *            image file header
1958  * -ENOTSUP - format driver doesn't support changing the backing file
1959  */
1960 int bdrv_change_backing_file(BlockDriverState *bs,
1961     const char *backing_file, const char *backing_fmt)
1962 {
1963     BlockDriver *drv = bs->drv;
1964     int ret;
1965 
1966     /* Backing file format doesn't make sense without a backing file */
1967     if (backing_fmt && !backing_file) {
1968         return -EINVAL;
1969     }
1970 
1971     if (drv->bdrv_change_backing_file != NULL) {
1972         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1973     } else {
1974         ret = -ENOTSUP;
1975     }
1976 
1977     if (ret == 0) {
1978         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1979         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1980     }
1981     return ret;
1982 }
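
/*
 * Example (illustrative): repoint an image at a new backing file, or detach
 * the backing file entirely by passing NULL for both arguments.
 *
 *     ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");
 *     ret = bdrv_change_backing_file(bs, NULL, NULL);  // standalone image
 *
 * On success only the image header and the cached bs->backing_file /
 * bs->backing_format strings change; the in-memory bs->backing_hd link is
 * not touched here.
 */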
1983 
1984 /*
1985  * Finds the image layer in the chain that has 'bs' as its backing file.
1986  *
1987  * active is the current topmost image.
1988  *
1989  * Returns NULL if bs is not found in active's image chain,
1990  * or if active == bs.
1991  */
1992 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
1993                                     BlockDriverState *bs)
1994 {
1995     BlockDriverState *overlay = NULL;
1996     BlockDriverState *intermediate;
1997 
1998     assert(active != NULL);
1999     assert(bs != NULL);
2000 
2001     /* if bs is the same as active, then by definition it has no overlay
2002      */
2003     if (active == bs) {
2004         return NULL;
2005     }
2006 
2007     intermediate = active;
2008     while (intermediate->backing_hd) {
2009         if (intermediate->backing_hd == bs) {
2010             overlay = intermediate;
2011             break;
2012         }
2013         intermediate = intermediate->backing_hd;
2014     }
2015 
2016     return overlay;
2017 }
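
/*
 * Example chain (illustrative): given base <- sn1 <- active,
 *
 *     bdrv_find_overlay(active, base);    // returns sn1
 *     bdrv_find_overlay(active, sn1);     // returns active
 *     bdrv_find_overlay(active, active);  // returns NULL (no overlay)
 */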
2018 
2019 typedef struct BlkIntermediateStates {
2020     BlockDriverState *bs;
2021     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2022 } BlkIntermediateStates;
2023 
2024 
2025 /*
2026  * Drops images above 'base' up to and including 'top', and sets the image
2027  * above 'top' to have base as its backing file.
2028  *
2029  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2030  * information in 'bs' can be properly updated.
2031  *
2032  * E.g., this will convert the following chain:
2033  * bottom <- base <- intermediate <- top <- active
2034  *
2035  * to
2036  *
2037  * bottom <- base <- active
2038  *
2039  * It is allowed for bottom==base, in which case it converts:
2040  *
2041  * base <- intermediate <- top <- active
2042  *
2043  * to
2044  *
2045  * base <- active
2046  *
2047  * Error conditions:
2048  *  if active == top, that is considered an error
2049  *
2050  */
2051 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2052                            BlockDriverState *base)
2053 {
2054     BlockDriverState *intermediate;
2055     BlockDriverState *base_bs = NULL;
2056     BlockDriverState *new_top_bs = NULL;
2057     BlkIntermediateStates *intermediate_state, *next;
2058     int ret = -EIO;
2059 
2060     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2061     QSIMPLEQ_INIT(&states_to_delete);
2062 
2063     if (!top->drv || !base->drv) {
2064         goto exit;
2065     }
2066 
2067     new_top_bs = bdrv_find_overlay(active, top);
2068 
2069     if (new_top_bs == NULL) {
2070         /* we could not find the image above 'top', this is an error */
2071         goto exit;
2072     }
2073 
2074     /* Special case: new_top_bs->backing_hd already points to base -- nothing
2075      * to do, there are no intermediate images */
2076     if (new_top_bs->backing_hd == base) {
2077         ret = 0;
2078         goto exit;
2079     }
2080 
2081     intermediate = top;
2082 
2083     /* now we will go down through the list, and add each BDS we find
2084      * into our deletion queue, until we hit the 'base'
2085      */
2086     while (intermediate) {
2087         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2088         intermediate_state->bs = intermediate;
2089         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2090 
2091         if (intermediate->backing_hd == base) {
2092             base_bs = intermediate->backing_hd;
2093             break;
2094         }
2095         intermediate = intermediate->backing_hd;
2096     }
2097     if (base_bs == NULL) {
2098         /* Something went wrong: we did not end at the base.  Safely
2099          * unravel everything, and exit with an error */
2100         goto exit;
2101     }
2102 
2103     /* success - we can delete the intermediate states, and link top->base */
2104     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2105                                    base_bs->drv ? base_bs->drv->format_name : "");
2106     if (ret) {
2107         goto exit;
2108     }
2109     new_top_bs->backing_hd = base_bs;
2110 
2111 
2112     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2113         /* so that bdrv_close() does not recursively close the chain */
2114         intermediate_state->bs->backing_hd = NULL;
2115         bdrv_delete(intermediate_state->bs);
2116     }
2117     ret = 0;
2118 
2119 exit:
2120     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2121         g_free(intermediate_state);
2122     }
2123     return ret;
2124 }
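
/*
 * Example (illustrative): collapsing the chain from the comment above,
 * base <- intermediate <- top <- active, down to base <- active:
 *
 *     ret = bdrv_drop_intermediate(active, top, base);
 *     // on success, active->backing_hd == base and the 'intermediate' and
 *     // 'top' BlockDriverStates have been deleted
 */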
2125 
2126 
2127 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2128                                    size_t size)
2129 {
2130     int64_t len;
2131 
2132     if (!bdrv_is_inserted(bs))
2133         return -ENOMEDIUM;
2134 
2135     if (bs->growable)
2136         return 0;
2137 
2138     len = bdrv_getlength(bs);
2139 
2140     if (offset < 0)
2141         return -EIO;
2142 
2143     if ((offset > len) || (len - offset < size))
2144         return -EIO;
2145 
2146     return 0;
2147 }
2148 
2149 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2150                               int nb_sectors)
2151 {
2152     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2153                                    nb_sectors * BDRV_SECTOR_SIZE);
2154 }
2155 
2156 typedef struct RwCo {
2157     BlockDriverState *bs;
2158     int64_t sector_num;
2159     int nb_sectors;
2160     QEMUIOVector *qiov;
2161     bool is_write;
2162     int ret;
2163 } RwCo;
2164 
2165 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2166 {
2167     RwCo *rwco = opaque;
2168 
2169     if (!rwco->is_write) {
2170         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
2171                                      rwco->nb_sectors, rwco->qiov, 0);
2172     } else {
2173         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
2174                                       rwco->nb_sectors, rwco->qiov, 0);
2175     }
2176 }
2177 
2178 /*
2179  * Process a vectored synchronous request using coroutines
2180  */
2181 static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
2182                        QEMUIOVector *qiov, bool is_write)
2183 {
2184     Coroutine *co;
2185     RwCo rwco = {
2186         .bs = bs,
2187         .sector_num = sector_num,
2188         .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
2189         .qiov = qiov,
2190         .is_write = is_write,
2191         .ret = NOT_DONE,
2192     };
2193     assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
2194 
2195     /**
2196      * In synchronous call context the vCPU is blocked, so the throttling
2197      * timer can never fire and a queued request would stall forever; I/O
2198      * throttling therefore has to be disabled here if it has been enabled.
2199      */
2200     if (bs->io_limits_enabled) {
2201         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2202                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2203         bdrv_io_limits_disable(bs);
2204     }
2205 
2206     if (qemu_in_coroutine()) {
2207         /* Fast-path if already in coroutine context */
2208         bdrv_rw_co_entry(&rwco);
2209     } else {
2210         co = qemu_coroutine_create(bdrv_rw_co_entry);
2211         qemu_coroutine_enter(co, &rwco);
2212         while (rwco.ret == NOT_DONE) {
2213             qemu_aio_wait();
2214         }
2215     }
2216     return rwco.ret;
2217 }
2218 
2219 /*
2220  * Process a synchronous request using coroutines
2221  */
2222 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2223                       int nb_sectors, bool is_write)
2224 {
2225     QEMUIOVector qiov;
2226     struct iovec iov = {
2227         .iov_base = (void *)buf,
2228         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2229     };
2230 
2231     qemu_iovec_init_external(&qiov, &iov, 1);
2232     return bdrv_rwv_co(bs, sector_num, &qiov, is_write);
2233 }
2234 
2235 /* return < 0 if error. See bdrv_write() for the return codes */
2236 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2237               uint8_t *buf, int nb_sectors)
2238 {
2239     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
2240 }
2241 
2242 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2243 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2244                           uint8_t *buf, int nb_sectors)
2245 {
2246     bool enabled;
2247     int ret;
2248 
2249     enabled = bs->io_limits_enabled;
2250     bs->io_limits_enabled = false;
2251     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2252     bs->io_limits_enabled = enabled;
2253     return ret;
2254 }
2255 
2256 /* Return < 0 if error. Important errors are:
2257   -EIO         generic I/O error (may happen for all errors)
2258   -ENOMEDIUM   No media inserted.
2259   -EINVAL      Invalid sector number or nb_sectors
2260   -EACCES      Trying to write a read-only device
2261 */
2262 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2263                const uint8_t *buf, int nb_sectors)
2264 {
2265     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
2266 }
2267 
2268 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
2269 {
2270     return bdrv_rwv_co(bs, sector_num, qiov, true);
2271 }
2272 
2273 int bdrv_pread(BlockDriverState *bs, int64_t offset,
2274                void *buf, int count1)
2275 {
2276     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2277     int len, nb_sectors, count;
2278     int64_t sector_num;
2279     int ret;
2280 
2281     count = count1;
2282     /* first read to align to sector start */
2283     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2284     if (len > count)
2285         len = count;
2286     sector_num = offset >> BDRV_SECTOR_BITS;
2287     if (len > 0) {
2288         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2289             return ret;
2290         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
2291         count -= len;
2292         if (count == 0)
2293             return count1;
2294         sector_num++;
2295         buf += len;
2296     }
2297 
2298     /* read the sectors "in place" */
2299     nb_sectors = count >> BDRV_SECTOR_BITS;
2300     if (nb_sectors > 0) {
2301         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
2302             return ret;
2303         sector_num += nb_sectors;
2304         len = nb_sectors << BDRV_SECTOR_BITS;
2305         buf += len;
2306         count -= len;
2307     }
2308 
2309     /* add data from the last sector */
2310     if (count > 0) {
2311         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2312             return ret;
2313         memcpy(buf, tmp_buf, count);
2314     }
2315     return count1;
2316 }
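
/*
 * Worked example of the alignment handling above, with 512-byte sectors,
 * offset == 700 and count1 == 1000:
 *
 *   - head: len = (512 - 700) & 511 = 324 bytes, read via a one-sector
 *     bounce buffer (bytes 700..1023, the tail of sector 1)
 *   - body: 676 bytes remain, so nb_sectors = 1 full sector (512 bytes)
 *     is read directly into the caller's buffer
 *   - tail: the final 164 bytes are copied out of one more bounce-buffered
 *     sector
 */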
2317 
2318 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2319 {
2320     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
2321     int len, nb_sectors, count;
2322     int64_t sector_num;
2323     int ret;
2324 
2325     count = qiov->size;
2326 
2327     /* first write to align to sector start */
2328     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
2329     if (len > count)
2330         len = count;
2331     sector_num = offset >> BDRV_SECTOR_BITS;
2332     if (len > 0) {
2333         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2334             return ret;
2335         qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
2336                           len);
2337         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2338             return ret;
2339         count -= len;
2340         if (count == 0)
2341             return qiov->size;
2342         sector_num++;
2343     }
2344 
2345     /* write the sectors "in place" */
2346     nb_sectors = count >> BDRV_SECTOR_BITS;
2347     if (nb_sectors > 0) {
2348         QEMUIOVector qiov_inplace;
2349 
2350         qemu_iovec_init(&qiov_inplace, qiov->niov);
2351         qemu_iovec_concat(&qiov_inplace, qiov, len,
2352                           nb_sectors << BDRV_SECTOR_BITS);
2353         ret = bdrv_writev(bs, sector_num, &qiov_inplace);
2354         qemu_iovec_destroy(&qiov_inplace);
2355         if (ret < 0) {
2356             return ret;
2357         }
2358 
2359         sector_num += nb_sectors;
2360         len = nb_sectors << BDRV_SECTOR_BITS;
2361         count -= len;
2362     }
2363 
2364     /* add data from the last sector */
2365     if (count > 0) {
2366         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2367             return ret;
2368         qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
2369         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2370             return ret;
2371     }
2372     return qiov->size;
2373 }
2374 
2375 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2376                 const void *buf, int count1)
2377 {
2378     QEMUIOVector qiov;
2379     struct iovec iov = {
2380         .iov_base   = (void *) buf,
2381         .iov_len    = count1,
2382     };
2383 
2384     qemu_iovec_init_external(&qiov, &iov, 1);
2385     return bdrv_pwritev(bs, offset, &qiov);
2386 }
2387 
2388 /*
2389  * Writes to the file and ensures that no writes are reordered across this
2390  * request (acts as a barrier)
2391  *
2392  * Returns 0 on success, -errno in error cases.
2393  */
2394 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2395     const void *buf, int count)
2396 {
2397     int ret;
2398 
2399     ret = bdrv_pwrite(bs, offset, buf, count);
2400     if (ret < 0) {
2401         return ret;
2402     }
2403 
2404     /* No flush needed for cache modes that already do it */
2405     if (bs->enable_write_cache) {
2406         bdrv_flush(bs);
2407     }
2408 
2409     return 0;
2410 }
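
/*
 * Typical use (sketch): persist metadata that later writes depend on, so
 * that a crash cannot leave the dependent writes on disk without it:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // the write or the flush failed
 *     }
 */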
2411 
2412 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2413         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2414 {
2415     /* Perform I/O through a temporary buffer so that users who scribble over
2416      * their read buffer while the operation is in progress do not end up
2417      * modifying the image file.  This is critical for zero-copy guest I/O
2418      * where anything might happen inside guest memory.
2419      */
2420     void *bounce_buffer;
2421 
2422     BlockDriver *drv = bs->drv;
2423     struct iovec iov;
2424     QEMUIOVector bounce_qiov;
2425     int64_t cluster_sector_num;
2426     int cluster_nb_sectors;
2427     size_t skip_bytes;
2428     int ret;
2429 
2430     /* Cover the entire cluster so that no additional backing file I/O is
2431      * required when allocating a cluster in the image file.
2432      */
2433     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2434                            &cluster_sector_num, &cluster_nb_sectors);
2435 
2436     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2437                                    cluster_sector_num, cluster_nb_sectors);
2438 
2439     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2440     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2441     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2442 
2443     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2444                              &bounce_qiov);
2445     if (ret < 0) {
2446         goto err;
2447     }
2448 
2449     if (drv->bdrv_co_write_zeroes &&
2450         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2451         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2452                                       cluster_nb_sectors);
2453     } else {
2454         /* This does not change the data on the disk, so it is not
2455          * necessary to flush even in cache=writethrough mode.
2456          */
2457         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2458                                   &bounce_qiov);
2459     }
2460 
2461     if (ret < 0) {
2462         /* It might be okay to ignore write errors for guest requests.  If this
2463          * is a deliberate copy-on-read then we don't want to ignore the error.
2464          * Simply report it in all cases.
2465          */
2466         goto err;
2467     }
2468 
2469     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2470     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2471                         nb_sectors * BDRV_SECTOR_SIZE);
2472 
2473 err:
2474     qemu_vfree(bounce_buffer);
2475     return ret;
2476 }
2477 
2478 /*
2479  * Handle a read request in coroutine context
2480  */
2481 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
2482     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2483     BdrvRequestFlags flags)
2484 {
2485     BlockDriver *drv = bs->drv;
2486     BdrvTrackedRequest req;
2487     int ret;
2488 
2489     if (!drv) {
2490         return -ENOMEDIUM;
2491     }
2492     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2493         return -EIO;
2494     }
2495 
2496     /* throttling disk read I/O */
2497     if (bs->io_limits_enabled) {
2498         bdrv_io_limits_intercept(bs, false, nb_sectors);
2499     }
2500 
2501     if (bs->copy_on_read) {
2502         flags |= BDRV_REQ_COPY_ON_READ;
2503     }
2504     if (flags & BDRV_REQ_COPY_ON_READ) {
2505         bs->copy_on_read_in_flight++;
2506     }
2507 
2508     if (bs->copy_on_read_in_flight) {
2509         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2510     }
2511 
2512     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
2513 
2514     if (flags & BDRV_REQ_COPY_ON_READ) {
2515         int pnum;
2516 
2517         ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
2518         if (ret < 0) {
2519             goto out;
2520         }
2521 
2522         if (!ret || pnum != nb_sectors) {
2523             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2524             goto out;
2525         }
2526     }
2527 
2528     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2529 
2530 out:
2531     tracked_request_end(&req);
2532 
2533     if (flags & BDRV_REQ_COPY_ON_READ) {
2534         bs->copy_on_read_in_flight--;
2535     }
2536 
2537     return ret;
2538 }
2539 
2540 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
2541     int nb_sectors, QEMUIOVector *qiov)
2542 {
2543     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
2544 
2545     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
2546 }
2547 
2548 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
2549     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2550 {
2551     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
2552 
2553     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
2554                             BDRV_REQ_COPY_ON_READ);
2555 }
2556 
2557 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
2558     int64_t sector_num, int nb_sectors)
2559 {
2560     BlockDriver *drv = bs->drv;
2561     QEMUIOVector qiov;
2562     struct iovec iov;
2563     int ret;
2564 
2565     /* TODO Emulate only part of misaligned requests instead of letting block
2566      * drivers return -ENOTSUP and emulate everything */
2567 
2568     /* First try the efficient write zeroes operation */
2569     if (drv->bdrv_co_write_zeroes) {
2570         ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2571         if (ret != -ENOTSUP) {
2572             return ret;
2573         }
2574     }
2575 
2576     /* Fall back to bounce buffer if write zeroes is unsupported */
2577     iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
2578     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
2579     memset(iov.iov_base, 0, iov.iov_len);
2580     qemu_iovec_init_external(&qiov, &iov, 1);
2581 
2582     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
2583 
2584     qemu_vfree(iov.iov_base);
2585     return ret;
2586 }
2587 
2588 /*
2589  * Handle a write request in coroutine context
2590  */
2591 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
2592     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2593     BdrvRequestFlags flags)
2594 {
2595     BlockDriver *drv = bs->drv;
2596     BdrvTrackedRequest req;
2597     int ret;
2598 
2599     if (!bs->drv) {
2600         return -ENOMEDIUM;
2601     }
2602     if (bs->read_only) {
2603         return -EACCES;
2604     }
2605     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2606         return -EIO;
2607     }
2608 
2609     /* throttling disk write I/O */
2610     if (bs->io_limits_enabled) {
2611         bdrv_io_limits_intercept(bs, true, nb_sectors);
2612     }
2613 
2614     if (bs->copy_on_read_in_flight) {
2615         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2616     }
2617 
2618     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2619 
2620     if (flags & BDRV_REQ_ZERO_WRITE) {
2621         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2622     } else {
2623         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2624     }
2625 
2626     if (ret == 0 && !bs->enable_write_cache) {
2627         ret = bdrv_co_flush(bs);
2628     }
2629 
2630     if (bs->dirty_bitmap) {
2631         bdrv_set_dirty(bs, sector_num, nb_sectors);
2632     }
2633 
2634     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2635         bs->wr_highest_sector = sector_num + nb_sectors - 1;
2636     }
2637 
2638     tracked_request_end(&req);
2639 
2640     return ret;
2641 }
2642 
2643 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2644     int nb_sectors, QEMUIOVector *qiov)
2645 {
2646     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2647 
2648     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2649 }
2650 
2651 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2652                                       int64_t sector_num, int nb_sectors)
2653 {
2654     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2655 
2656     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2657                              BDRV_REQ_ZERO_WRITE);
2658 }
2659 
2660 /**
2661  * Truncate file to 'offset' bytes (needed only for file protocols)
2662  */
2663 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2664 {
2665     BlockDriver *drv = bs->drv;
2666     int ret;
2667     if (!drv)
2668         return -ENOMEDIUM;
2669     if (!drv->bdrv_truncate)
2670         return -ENOTSUP;
2671     if (bs->read_only)
2672         return -EACCES;
2673     if (bdrv_in_use(bs))
2674         return -EBUSY;
2675     ret = drv->bdrv_truncate(bs, offset);
2676     if (ret == 0) {
2677         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2678         bdrv_dev_resize_cb(bs);
2679     }
2680     return ret;
2681 }
2682 
2683 /**
2684  * Length of an allocated file in bytes. Sparse files are counted by actual
2685  * allocated space. Return < 0 if error or unknown.
2686  */
2687 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2688 {
2689     BlockDriver *drv = bs->drv;
2690     if (!drv) {
2691         return -ENOMEDIUM;
2692     }
2693     if (drv->bdrv_get_allocated_file_size) {
2694         return drv->bdrv_get_allocated_file_size(bs);
2695     }
2696     if (bs->file) {
2697         return bdrv_get_allocated_file_size(bs->file);
2698     }
2699     return -ENOTSUP;
2700 }
2701 
2702 /**
2703  * Length of a file in bytes. Return < 0 if error or unknown.
2704  */
2705 int64_t bdrv_getlength(BlockDriverState *bs)
2706 {
2707     BlockDriver *drv = bs->drv;
2708     if (!drv)
2709         return -ENOMEDIUM;
2710 
2711     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2712         if (drv->bdrv_getlength) {
2713             return drv->bdrv_getlength(bs);
2714         }
2715     }
2716     return bs->total_sectors * BDRV_SECTOR_SIZE;
2717 }
2718 
2719 /* Return 0 as the number of sectors if no device is present or on error */
2720 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2721 {
2722     int64_t length;
2723     length = bdrv_getlength(bs);
2724     if (length < 0)
2725         length = 0;
2726     else
2727         length = length >> BDRV_SECTOR_BITS;
2728     *nb_sectors_ptr = length;
2729 }
2730 
2731 /* throttling disk io limits */
2732 void bdrv_set_io_limits(BlockDriverState *bs,
2733                         BlockIOLimit *io_limits)
2734 {
2735     bs->io_limits = *io_limits;
2736     bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2737 }
2738 
2739 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
2740                        BlockdevOnError on_write_error)
2741 {
2742     bs->on_read_error = on_read_error;
2743     bs->on_write_error = on_write_error;
2744 }
2745 
2746 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
2747 {
2748     return is_read ? bs->on_read_error : bs->on_write_error;
2749 }
2750 
2751 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
2752 {
2753     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
2754 
2755     switch (on_err) {
2756     case BLOCKDEV_ON_ERROR_ENOSPC:
2757         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
2758     case BLOCKDEV_ON_ERROR_STOP:
2759         return BDRV_ACTION_STOP;
2760     case BLOCKDEV_ON_ERROR_REPORT:
2761         return BDRV_ACTION_REPORT;
2762     case BLOCKDEV_ON_ERROR_IGNORE:
2763         return BDRV_ACTION_IGNORE;
2764     default:
2765         abort();
2766     }
2767 }
2768 
2769 /* This is done by device models because, while the block layer knows
2770  * about the error, it does not know whether an operation comes from
2771  * the device or the block layer (from a job, for example).
2772  */
2773 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2774                        bool is_read, int error)
2775 {
2776     assert(error >= 0);
2777     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
2778     if (action == BDRV_ACTION_STOP) {
2779         vm_stop(RUN_STATE_IO_ERROR);
2780         bdrv_iostatus_set_err(bs, error);
2781     }
2782 }
2783 
2784 int bdrv_is_read_only(BlockDriverState *bs)
2785 {
2786     return bs->read_only;
2787 }
2788 
2789 int bdrv_is_sg(BlockDriverState *bs)
2790 {
2791     return bs->sg;
2792 }
2793 
2794 int bdrv_enable_write_cache(BlockDriverState *bs)
2795 {
2796     return bs->enable_write_cache;
2797 }
2798 
2799 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
2800 {
2801     bs->enable_write_cache = wce;
2802 
2803     /* so a reopen() will preserve wce */
2804     if (wce) {
2805         bs->open_flags |= BDRV_O_CACHE_WB;
2806     } else {
2807         bs->open_flags &= ~BDRV_O_CACHE_WB;
2808     }
2809 }
2810 
2811 int bdrv_is_encrypted(BlockDriverState *bs)
2812 {
2813     if (bs->backing_hd && bs->backing_hd->encrypted)
2814         return 1;
2815     return bs->encrypted;
2816 }
2817 
2818 int bdrv_key_required(BlockDriverState *bs)
2819 {
2820     BlockDriverState *backing_hd = bs->backing_hd;
2821 
2822     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2823         return 1;
2824     return (bs->encrypted && !bs->valid_key);
2825 }
2826 
2827 int bdrv_set_key(BlockDriverState *bs, const char *key)
2828 {
2829     int ret;
2830     if (bs->backing_hd && bs->backing_hd->encrypted) {
2831         ret = bdrv_set_key(bs->backing_hd, key);
2832         if (ret < 0)
2833             return ret;
2834         if (!bs->encrypted)
2835             return 0;
2836     }
2837     if (!bs->encrypted) {
2838         return -EINVAL;
2839     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2840         return -ENOMEDIUM;
2841     }
2842     ret = bs->drv->bdrv_set_key(bs, key);
2843     if (ret < 0) {
2844         bs->valid_key = 0;
2845     } else if (!bs->valid_key) {
2846         bs->valid_key = 1;
2847         /* call the change callback now, we skipped it on open */
2848         bdrv_dev_change_media_cb(bs, true);
2849     }
2850     return ret;
2851 }
2852 
2853 const char *bdrv_get_format_name(BlockDriverState *bs)
2854 {
2855     return bs->drv ? bs->drv->format_name : NULL;
2856 }
2857 
2858 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2859                          void *opaque)
2860 {
2861     BlockDriver *drv;
2862 
2863     QLIST_FOREACH(drv, &bdrv_drivers, list) {
2864         it(opaque, drv->format_name);
2865     }
2866 }
2867 
2868 BlockDriverState *bdrv_find(const char *name)
2869 {
2870     BlockDriverState *bs;
2871 
2872     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2873         if (!strcmp(name, bs->device_name)) {
2874             return bs;
2875         }
2876     }
2877     return NULL;
2878 }
2879 
2880 BlockDriverState *bdrv_next(BlockDriverState *bs)
2881 {
2882     if (!bs) {
2883         return QTAILQ_FIRST(&bdrv_states);
2884     }
2885     return QTAILQ_NEXT(bs, list);
2886 }
2887 
2888 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2889 {
2890     BlockDriverState *bs;
2891 
2892     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2893         it(opaque, bs);
2894     }
2895 }
2896 
2897 const char *bdrv_get_device_name(BlockDriverState *bs)
2898 {
2899     return bs->device_name;
2900 }
2901 
2902 int bdrv_get_flags(BlockDriverState *bs)
2903 {
2904     return bs->open_flags;
2905 }
2906 
2907 void bdrv_flush_all(void)
2908 {
2909     BlockDriverState *bs;
2910 
2911     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2912         bdrv_flush(bs);
2913     }
2914 }
2915 
2916 int bdrv_has_zero_init(BlockDriverState *bs)
2917 {
2918     assert(bs->drv);
2919 
2920     if (bs->drv->bdrv_has_zero_init) {
2921         return bs->drv->bdrv_has_zero_init(bs);
2922     }
2923 
2924     return 1;
2925 }
2926 
2927 typedef struct BdrvCoIsAllocatedData {
2928     BlockDriverState *bs;
2929     BlockDriverState *base;
2930     int64_t sector_num;
2931     int nb_sectors;
2932     int *pnum;
2933     int ret;
2934     bool done;
2935 } BdrvCoIsAllocatedData;
2936 
2937 /*
2938  * Returns true iff the specified sector is present in the disk image. Drivers
2939  * not implementing the functionality are assumed to not support backing files,
2940  * hence all their sectors are reported as allocated.
2941  *
2942  * If 'sector_num' is beyond the end of the disk image the return value is 0
2943  * and 'pnum' is set to 0.
2944  *
2945  * 'pnum' is set to the number of sectors (including and immediately following
2946  * the specified sector) that are known to be in the same
2947  * allocated/unallocated state.
2948  *
2949  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2950  * beyond the end of the disk image it will be clamped.
2951  */
2952 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2953                                       int nb_sectors, int *pnum)
2954 {
2955     int64_t n;
2956 
2957     if (sector_num >= bs->total_sectors) {
2958         *pnum = 0;
2959         return 0;
2960     }
2961 
2962     n = bs->total_sectors - sector_num;
2963     if (n < nb_sectors) {
2964         nb_sectors = n;
2965     }
2966 
2967     if (!bs->drv->bdrv_co_is_allocated) {
2968         *pnum = nb_sectors;
2969         return 1;
2970     }
2971 
2972     return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2973 }
2974 
2975 /* Coroutine wrapper for bdrv_is_allocated() */
2976 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2977 {
2978     BdrvCoIsAllocatedData *data = opaque;
2979     BlockDriverState *bs = data->bs;
2980 
2981     data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2982                                      data->pnum);
2983     data->done = true;
2984 }
2985 
2986 /*
2987  * Synchronous wrapper around bdrv_co_is_allocated().
2988  *
2989  * See bdrv_co_is_allocated() for details.
2990  */
2991 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2992                       int *pnum)
2993 {
2994     Coroutine *co;
2995     BdrvCoIsAllocatedData data = {
2996         .bs = bs,
2997         .sector_num = sector_num,
2998         .nb_sectors = nb_sectors,
2999         .pnum = pnum,
3000         .done = false,
3001     };
3002 
3003     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
3004     qemu_coroutine_enter(co, &data);
3005     while (!data.done) {
3006         qemu_aio_wait();
3007     }
3008     return data.ret;
3009 }
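
/*
 * Typical scanning loop (illustrative): walk an image in runs that share
 * one allocation state, as image copying code would do.
 *
 *     int64_t sector_num = 0;
 *     int pnum;
 *
 *     while (sector_num < total_sectors) {
 *         int ret = bdrv_is_allocated(bs, sector_num, 2048, &pnum);
 *         if (ret < 0) {
 *             break;        // error reported by the driver
 *         }
 *         // sectors [sector_num, sector_num + pnum) are all (un)allocated
 *         sector_num += pnum;
 *     }
 */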
3010 
3011 /*
3012  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3013  *
3014  * Return true if the given sector is allocated in any image between
3015  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
3016  * sector is allocated in any image of the chain.  Return false otherwise.
3017  *
3018  * 'pnum' is set to the number of sectors (including and immediately following
3019  *  the specified sector) that are known to be in the same
3020  *  allocated/unallocated state.
3021  *
3022  */
3023 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
3024                                             BlockDriverState *base,
3025                                             int64_t sector_num,
3026                                             int nb_sectors, int *pnum)
3027 {
3028     BlockDriverState *intermediate;
3029     int ret, n = nb_sectors;
3030 
3031     intermediate = top;
3032     while (intermediate && intermediate != base) {
3033         int pnum_inter;
3034         ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
3035                                    &pnum_inter);
3036         if (ret < 0) {
3037             return ret;
3038         } else if (ret) {
3039             *pnum = pnum_inter;
3040             return 1;
3041         }
3042 
3043         /*
3044          * [sector_num, nb_sectors] is unallocated on top but intermediate
3045          * might have
3046          *
3047          * [sector_num+x, nb_sectors-x] allocated.
3048          */
3049         if (n > pnum_inter &&
3050             (intermediate == top ||
3051              sector_num + pnum_inter < intermediate->total_sectors)) {
3052             n = pnum_inter;
3053         }
3054 
3055         intermediate = intermediate->backing_hd;
3056     }
3057 
3058     *pnum = n;
3059     return 0;
3060 }
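
/*
 * Example (illustrative, coroutine context): with the chain
 * base <- sn1 <- top and a sector s written only in sn1,
 *
 *     bdrv_co_is_allocated_above(top, base, s, 1, &pnum);  // 1: found in sn1
 *     bdrv_co_is_allocated_above(top, NULL, s, 1, &pnum);  // 1: whole chain
 *     bdrv_co_is_allocated_above(top, sn1,  s, 1, &pnum);  // 0: top only
 */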
3061 
3062 /* Coroutine wrapper for bdrv_is_allocated_above() */
3063 static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
3064 {
3065     BdrvCoIsAllocatedData *data = opaque;
3066     BlockDriverState *top = data->bs;
3067     BlockDriverState *base = data->base;
3068 
3069     data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
3070                                            data->nb_sectors, data->pnum);
3071     data->done = true;
3072 }
3073 
3074 /*
3075  * Synchronous wrapper around bdrv_co_is_allocated_above().
3076  *
3077  * See bdrv_co_is_allocated_above() for details.
3078  */
3079 int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
3080                             int64_t sector_num, int nb_sectors, int *pnum)
3081 {
3082     Coroutine *co;
3083     BdrvCoIsAllocatedData data = {
3084         .bs = top,
3085         .base = base,
3086         .sector_num = sector_num,
3087         .nb_sectors = nb_sectors,
3088         .pnum = pnum,
3089         .done = false,
3090     };
3091 
3092     co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
3093     qemu_coroutine_enter(co, &data);
3094     while (!data.done) {
3095         qemu_aio_wait();
3096     }
3097     return data.ret;
3098 }
3099 
3100 BlockInfo *bdrv_query_info(BlockDriverState *bs)
3101 {
3102     BlockInfo *info = g_malloc0(sizeof(*info));
3103     info->device = g_strdup(bs->device_name);
3104     info->type = g_strdup("unknown");
3105     info->locked = bdrv_dev_is_medium_locked(bs);
3106     info->removable = bdrv_dev_has_removable_media(bs);
3107 
3108     if (bdrv_dev_has_removable_media(bs)) {
3109         info->has_tray_open = true;
3110         info->tray_open = bdrv_dev_is_tray_open(bs);
3111     }
3112 
3113     if (bdrv_iostatus_is_enabled(bs)) {
3114         info->has_io_status = true;
3115         info->io_status = bs->iostatus;
3116     }
3117 
3118     if (bs->dirty_bitmap) {
3119         info->has_dirty = true;
3120         info->dirty = g_malloc0(sizeof(*info->dirty));
3121         info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
3122         info->dirty->granularity =
3123             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
3124     }
3125 
3126     if (bs->drv) {
3127         info->has_inserted = true;
3128         info->inserted = g_malloc0(sizeof(*info->inserted));
3129         info->inserted->file = g_strdup(bs->filename);
3130         info->inserted->ro = bs->read_only;
3131         info->inserted->drv = g_strdup(bs->drv->format_name);
3132         info->inserted->encrypted = bs->encrypted;
3133         info->inserted->encryption_key_missing = bdrv_key_required(bs);
3134 
3135         if (bs->backing_file[0]) {
3136             info->inserted->has_backing_file = true;
3137             info->inserted->backing_file = g_strdup(bs->backing_file);
3138         }
3139 
3140         info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
3141 
3142         if (bs->io_limits_enabled) {
3143             info->inserted->bps =
3144                            bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3145             info->inserted->bps_rd =
3146                            bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
3147             info->inserted->bps_wr =
3148                            bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
3149             info->inserted->iops =
3150                            bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3151             info->inserted->iops_rd =
3152                            bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
3153             info->inserted->iops_wr =
3154                            bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
3155         }
3156     }
3157     return info;
3158 }
3159 
3160 BlockInfoList *qmp_query_block(Error **errp)
3161 {
3162     BlockInfoList *head = NULL, **p_next = &head;
3163     BlockDriverState *bs;
3164 
3165     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3166         BlockInfoList *info = g_malloc0(sizeof(*info));
3167         info->value = bdrv_query_info(bs);
3168 
3169         *p_next = info;
3170         p_next = &info->next;
3171     }
3172 
3173     return head;
3174 }
3175 
3176 BlockStats *bdrv_query_stats(const BlockDriverState *bs)
3177 {
3178     BlockStats *s;
3179 
3180     s = g_malloc0(sizeof(*s));
3181 
3182     if (bs->device_name[0]) {
3183         s->has_device = true;
3184         s->device = g_strdup(bs->device_name);
3185     }
3186 
3187     s->stats = g_malloc0(sizeof(*s->stats));
3188     s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
3189     s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
3190     s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
3191     s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
3192     s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
3193     s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
3194     s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
3195     s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
3196     s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
3197 
3198     if (bs->file) {
3199         s->has_parent = true;
3200         s->parent = bdrv_query_stats(bs->file);
3201     }
3202 
3203     return s;
3204 }
3205 
3206 BlockStatsList *qmp_query_blockstats(Error **errp)
3207 {
3208     BlockStatsList *head = NULL, **p_next = &head;
3209     BlockDriverState *bs;
3210 
3211     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3212         BlockStatsList *info = g_malloc0(sizeof(*info));
3213         info->value = bdrv_query_stats(bs);
3214 
3215         *p_next = info;
3216         p_next = &info->next;
3217     }
3218 
3219     return head;
3220 }
3221 
3222 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3223 {
3224     if (bs->backing_hd && bs->backing_hd->encrypted)
3225         return bs->backing_file;
3226     else if (bs->encrypted)
3227         return bs->filename;
3228     else
3229         return NULL;
3230 }
3231 
3232 void bdrv_get_backing_filename(BlockDriverState *bs,
3233                                char *filename, int filename_size)
3234 {
3235     pstrcpy(filename, filename_size, bs->backing_file);
3236 }
3237 
3238 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3239                           const uint8_t *buf, int nb_sectors)
3240 {
3241     BlockDriver *drv = bs->drv;
3242     if (!drv)
3243         return -ENOMEDIUM;
3244     if (!drv->bdrv_write_compressed)
3245         return -ENOTSUP;
3246     if (bdrv_check_request(bs, sector_num, nb_sectors))
3247         return -EIO;
3248 
3249     assert(!bs->dirty_bitmap);
3250 
3251     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3252 }
3253 
3254 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3255 {
3256     BlockDriver *drv = bs->drv;
3257     if (!drv)
3258         return -ENOMEDIUM;
3259     if (!drv->bdrv_get_info)
3260         return -ENOTSUP;
3261     memset(bdi, 0, sizeof(*bdi));
3262     return drv->bdrv_get_info(bs, bdi);
3263 }
3264 
3265 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3266                       int64_t pos, int size)
3267 {
3268     QEMUIOVector qiov;
3269     struct iovec iov = {
3270         .iov_base   = (void *) buf,
3271         .iov_len    = size,
3272     };
3273 
3274     qemu_iovec_init_external(&qiov, &iov, 1);
3275     return bdrv_writev_vmstate(bs, &qiov, pos);
3276 }
3277 
3278 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3279 {
3280     BlockDriver *drv = bs->drv;
3281 
3282     if (!drv) {
3283         return -ENOMEDIUM;
3284     } else if (drv->bdrv_save_vmstate) {
3285         return drv->bdrv_save_vmstate(bs, qiov, pos);
3286     } else if (bs->file) {
3287         return bdrv_writev_vmstate(bs->file, qiov, pos);
3288     }
3289 
3290     return -ENOTSUP;
3291 }
3292 
3293 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3294                       int64_t pos, int size)
3295 {
3296     BlockDriver *drv = bs->drv;
3297     if (!drv)
3298         return -ENOMEDIUM;
3299     if (drv->bdrv_load_vmstate)
3300         return drv->bdrv_load_vmstate(bs, buf, pos, size);
3301     if (bs->file)
3302         return bdrv_load_vmstate(bs->file, buf, pos, size);
3303     return -ENOTSUP;
3304 }
3305 
3306 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
3307 {
3308     BlockDriver *drv = bs->drv;
3309 
3310     if (!drv || !drv->bdrv_debug_event) {
3311         return;
3312     }
3313 
3314     drv->bdrv_debug_event(bs, event);
3315 }
3316 
3317 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
3318                           const char *tag)
3319 {
3320     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
3321         bs = bs->file;
3322     }
3323 
3324     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
3325         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
3326     }
3327 
3328     return -ENOTSUP;
3329 }
3330 
3331 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
3332 {
3333     while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
3334         bs = bs->file;
3335     }
3336 
3337     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
3338         return bs->drv->bdrv_debug_resume(bs, tag);
3339     }
3340 
3341     return -ENOTSUP;
3342 }
3343 
3344 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
3345 {
3346     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
3347         bs = bs->file;
3348     }
3349 
3350     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
3351         return bs->drv->bdrv_debug_is_suspended(bs, tag);
3352     }
3353 
3354     return false;
3355 }
3356 
3357 /**************************************************************/
3358 /* handling of snapshots */
3359 
3360 int bdrv_can_snapshot(BlockDriverState *bs)
3361 {
3362     BlockDriver *drv = bs->drv;
3363     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3364         return 0;
3365     }
3366 
3367     if (!drv->bdrv_snapshot_create) {
3368         if (bs->file != NULL) {
3369             return bdrv_can_snapshot(bs->file);
3370         }
3371         return 0;
3372     }
3373 
3374     return 1;
3375 }
3376 
3377 int bdrv_is_snapshot(BlockDriverState *bs)
3378 {
3379     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
3380 }
3381 
3382 BlockDriverState *bdrv_snapshots(void)
3383 {
3384     BlockDriverState *bs;
3385 
3386     if (bs_snapshots) {
3387         return bs_snapshots;
3388     }
3389 
3390     bs = NULL;
3391     while ((bs = bdrv_next(bs))) {
3392         if (bdrv_can_snapshot(bs)) {
3393             bs_snapshots = bs;
3394             return bs;
3395         }
3396     }
3397     return NULL;
3398 }
3399 
3400 int bdrv_snapshot_create(BlockDriverState *bs,
3401                          QEMUSnapshotInfo *sn_info)
3402 {
3403     BlockDriver *drv = bs->drv;
3404     if (!drv)
3405         return -ENOMEDIUM;
3406     if (drv->bdrv_snapshot_create)
3407         return drv->bdrv_snapshot_create(bs, sn_info);
3408     if (bs->file)
3409         return bdrv_snapshot_create(bs->file, sn_info);
3410     return -ENOTSUP;
3411 }
3412 
3413 int bdrv_snapshot_goto(BlockDriverState *bs,
3414                        const char *snapshot_id)
3415 {
3416     BlockDriver *drv = bs->drv;
3417     int ret, open_ret;
3418 
3419     if (!drv)
3420         return -ENOMEDIUM;
3421     if (drv->bdrv_snapshot_goto)
3422         return drv->bdrv_snapshot_goto(bs, snapshot_id);
3423 
3424     if (bs->file) {
3425         drv->bdrv_close(bs);
3426         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
3427         open_ret = drv->bdrv_open(bs, NULL, bs->open_flags);
3428         if (open_ret < 0) {
3429             bdrv_delete(bs->file);
3430             bs->drv = NULL;
3431             return open_ret;
3432         }
3433         return ret;
3434     }
3435 
3436     return -ENOTSUP;
3437 }
3438 
3439 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
3440 {
3441     BlockDriver *drv = bs->drv;
3442     if (!drv)
3443         return -ENOMEDIUM;
3444     if (drv->bdrv_snapshot_delete)
3445         return drv->bdrv_snapshot_delete(bs, snapshot_id);
3446     if (bs->file)
3447         return bdrv_snapshot_delete(bs->file, snapshot_id);
3448     return -ENOTSUP;
3449 }
3450 
3451 int bdrv_snapshot_list(BlockDriverState *bs,
3452                        QEMUSnapshotInfo **psn_info)
3453 {
3454     BlockDriver *drv = bs->drv;
3455     if (!drv)
3456         return -ENOMEDIUM;
3457     if (drv->bdrv_snapshot_list)
3458         return drv->bdrv_snapshot_list(bs, psn_info);
3459     if (bs->file)
3460         return bdrv_snapshot_list(bs->file, psn_info);
3461     return -ENOTSUP;
3462 }
3463 
3464 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
3465         const char *snapshot_name)
3466 {
3467     BlockDriver *drv = bs->drv;
3468     if (!drv) {
3469         return -ENOMEDIUM;
3470     }
3471     if (!bs->read_only) {
3472         return -EINVAL;
3473     }
3474     if (drv->bdrv_snapshot_load_tmp) {
3475         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
3476     }
3477     return -ENOTSUP;
3478 }
3479 
3480 /* backing_file can be relative, absolute, or a protocol.  If it is
3481  * relative, it must be relative to the chain.  So, passing in bs->filename
3482  * from a BDS as backing_file should not be done, as that may be relative to
3483  * the CWD rather than the chain. */
3484 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3485         const char *backing_file)
3486 {
3487     char *filename_full = NULL;
3488     char *backing_file_full = NULL;
3489     char *filename_tmp = NULL;
3490     int is_protocol = 0;
3491     BlockDriverState *curr_bs = NULL;
3492     BlockDriverState *retval = NULL;
3493 
3494     if (!bs || !bs->drv || !backing_file) {
3495         return NULL;
3496     }
3497 
3498     filename_full     = g_malloc(PATH_MAX);
3499     backing_file_full = g_malloc(PATH_MAX);
3500     filename_tmp      = g_malloc(PATH_MAX);
3501 
3502     is_protocol = path_has_protocol(backing_file);
3503 
3504     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
3505 
3506         /* If either of the filename paths is actually a protocol, then
3507          * compare unmodified paths; otherwise make paths relative */
3508         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
3509             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
3510                 retval = curr_bs->backing_hd;
3511                 break;
3512             }
3513         } else {
3514             /* If not an absolute filename path, make it relative to the current
3515              * image's filename path */
3516             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3517                          backing_file);
3518 
3519             /* We are going to compare absolute pathnames */
3520             if (!realpath(filename_tmp, filename_full)) {
3521                 continue;
3522             }
3523 
3524             /* We need to make sure the backing filename we are comparing against
3525              * is relative to the current image filename (or absolute) */
3526             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
3527                          curr_bs->backing_file);
3528 
3529             if (!realpath(filename_tmp, backing_file_full)) {
3530                 continue;
3531             }
3532 
3533             if (strcmp(backing_file_full, filename_full) == 0) {
3534                 retval = curr_bs->backing_hd;
3535                 break;
3536             }
3537         }
3538     }
3539 
3540     g_free(filename_full);
3541     g_free(backing_file_full);
3542     g_free(filename_tmp);
3543     return retval;
3544 }
3545 
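/* Illustrative sketch: for a chain base.img <- sn1.qcow2 <- sn2.qcow2
 * opened as bs (hypothetical file names), both calls below would return
 * the BlockDriverState of base.img, the second because relative backing
 * file names are resolved against the chain rather than the CWD:
 *
 *     BlockDriverState *base;
 *
 *     base = bdrv_find_backing_image(bs, "/images/base.img");
 *     base = bdrv_find_backing_image(bs, "base.img");
 */
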
3546 int bdrv_get_backing_file_depth(BlockDriverState *bs)
3547 {
3548     if (!bs->drv) {
3549         return 0;
3550     }
3551 
3552     if (!bs->backing_hd) {
3553         return 0;
3554     }
3555 
3556     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
3557 }
3558 
3559 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
3560 {
3561     BlockDriverState *curr_bs = NULL;
3562 
3563     if (!bs) {
3564         return NULL;
3565     }
3566 
3567     curr_bs = bs;
3568 
3569     while (curr_bs->backing_hd) {
3570         curr_bs = curr_bs->backing_hd;
3571     }
3572     return curr_bs;
3573 }
3574 
3575 #define NB_SUFFIXES 4
3576 
3577 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
3578 {
3579     static const char suffixes[NB_SUFFIXES] = "KMGT";
3580     int64_t base;
3581     int i;
3582 
3583     if (size <= 999) {
3584         snprintf(buf, buf_size, "%" PRId64, size);
3585     } else {
3586         base = 1024;
3587         for (i = 0; i < NB_SUFFIXES; i++) {
3588             if (size < (10 * base)) {
3589                 snprintf(buf, buf_size, "%0.1f%c",
3590                          (double)size / base,
3591                          suffixes[i]);
3592                 break;
3593             } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
3594                 snprintf(buf, buf_size, "%" PRId64 "%c",
3595                          ((size + (base >> 1)) / base),
3596                          suffixes[i]);
3597                 break;
3598             }
3599             base = base * 1024;
3600         }
3601     }
3602     return buf;
3603 }
3604 
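/* Examples of the formatting above: 999 -> "999", 1536 -> "1.5K",
 * 500000 -> "488K", 1048576 -> "1.0M".  Sizes below ten times the current
 * base keep one decimal place; larger ones are rounded to an integer in
 * that unit.
 */
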
3605 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3606 {
3607     char buf1[128], date_buf[128], clock_buf[128];
3608     struct tm tm;
3609     time_t ti;
3610     int64_t secs;
3611 
3612     if (!sn) {
3613         snprintf(buf, buf_size,
3614                  "%-10s%-20s%7s%20s%15s",
3615                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
3616     } else {
3617         ti = sn->date_sec;
3618         localtime_r(&ti, &tm);
3619         strftime(date_buf, sizeof(date_buf),
3620                  "%Y-%m-%d %H:%M:%S", &tm);
3621         secs = sn->vm_clock_nsec / 1000000000;
3622         snprintf(clock_buf, sizeof(clock_buf),
3623                  "%02d:%02d:%02d.%03d",
3624                  (int)(secs / 3600),
3625                  (int)((secs / 60) % 60),
3626                  (int)(secs % 60),
3627                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
3628         snprintf(buf, buf_size,
3629                  "%-10s%-20s%7s%20s%15s",
3630                  sn->id_str, sn->name,
3631                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3632                  date_buf,
3633                  clock_buf);
3634     }
3635     return buf;
3636 }
3637 
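/* Sample of the layout produced above (hypothetical snapshot data,
 * spacing approximate):
 *
 *     ID        TAG                 VM SIZE                DATE       VM CLOCK
 *     1         before-upgrade         1.5M 2013-01-01 12:00:00   00:00:42.000
 */
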
3638 /**************************************************************/
3639 /* async I/Os */
3640 
3641 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3642                                  QEMUIOVector *qiov, int nb_sectors,
3643                                  BlockDriverCompletionFunc *cb, void *opaque)
3644 {
3645     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3646 
3647     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3648                                  cb, opaque, false);
3649 }
3650 
3651 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3652                                   QEMUIOVector *qiov, int nb_sectors,
3653                                   BlockDriverCompletionFunc *cb, void *opaque)
3654 {
3655     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3656 
3657     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3658                                  cb, opaque, true);
3659 }
3660 
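/* Illustrative sketch of an asynchronous read (hypothetical caller, not
 * part of this file):
 *
 *     static void my_read_cb(void *opaque, int ret)
 *     {
 *         ... ret is 0 on success or a negative errno ...
 *     }
 *
 *     uint8_t *buf = qemu_blockalign(bs, BDRV_SECTOR_SIZE);
 *     struct iovec iov = { .iov_base = buf, .iov_len = BDRV_SECTOR_SIZE };
 *     QEMUIOVector qiov;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     bdrv_aio_readv(bs, 0, &qiov, 1, my_read_cb, NULL);
 */
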
3661 
3662 typedef struct MultiwriteCB {
3663     int error;
3664     int num_requests;
3665     int num_callbacks;
3666     struct {
3667         BlockDriverCompletionFunc *cb;
3668         void *opaque;
3669         QEMUIOVector *free_qiov;
3670     } callbacks[];
3671 } MultiwriteCB;
3672 
3673 static void multiwrite_user_cb(MultiwriteCB *mcb)
3674 {
3675     int i;
3676 
3677     for (i = 0; i < mcb->num_callbacks; i++) {
3678         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3679         if (mcb->callbacks[i].free_qiov) {
3680             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3681         }
3682         g_free(mcb->callbacks[i].free_qiov);
3683     }
3684 }
3685 
3686 static void multiwrite_cb(void *opaque, int ret)
3687 {
3688     MultiwriteCB *mcb = opaque;
3689 
3690     trace_multiwrite_cb(mcb, ret);
3691 
3692     if (ret < 0 && !mcb->error) {
3693         mcb->error = ret;
3694     }
3695 
3696     mcb->num_requests--;
3697     if (mcb->num_requests == 0) {
3698         multiwrite_user_cb(mcb);
3699         g_free(mcb);
3700     }
3701 }
3702 
3703 static int multiwrite_req_compare(const void *a, const void *b)
3704 {
3705     const BlockRequest *req1 = a, *req2 = b;
3706 
3707     /*
3708      * Note that we can't simply subtract req2->sector from req1->sector
3709      * here as that could overflow the return value.
3710      */
3711     if (req1->sector > req2->sector) {
3712         return 1;
3713     } else if (req1->sector < req2->sector) {
3714         return -1;
3715     } else {
3716         return 0;
3717     }
3718 }
3719 
3720 /*
3721  * Takes a bunch of requests and tries to merge them. Returns the number of
3722  * requests that remain after merging.
3723  */
3724 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3725     int num_reqs, MultiwriteCB *mcb)
3726 {
3727     int i, outidx;
3728 
3729     // Sort requests by start sector
3730     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3731 
3732     // Check if adjacent requests touch the same clusters. If so, combine
3733     // them; only sequential or overlapping requests are ever merged here.
3734     outidx = 0;
3735     for (i = 1; i < num_reqs; i++) {
3736         int merge = 0;
3737         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3738 
3739         // Handle exactly sequential writes and overlapping writes.
3740         if (reqs[i].sector <= oldreq_last) {
3741             merge = 1;
3742         }
3743 
3744         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3745             merge = 0;
3746         }
3747 
3748         if (merge) {
3749             size_t size;
3750             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3751             qemu_iovec_init(qiov,
3752                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3753 
3754             // Add the first request to the merged one. If the requests are
3755             // overlapping, drop the last sectors of the first request.
3756             size = (reqs[i].sector - reqs[outidx].sector) << 9;
3757             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
3758 
3759             // We shouldn't need to add any zeros between the two requests
3760             assert(reqs[i].sector <= oldreq_last);
3761 
3762             // Add the second request
3763             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
3764 
3765             reqs[outidx].nb_sectors = qiov->size >> 9;
3766             reqs[outidx].qiov = qiov;
3767 
3768             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3769         } else {
3770             outidx++;
3771             reqs[outidx].sector     = reqs[i].sector;
3772             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3773             reqs[outidx].qiov       = reqs[i].qiov;
3774         }
3775     }
3776 
3777     return outidx + 1;
3778 }
3779 
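/* Worked example for the merge above: requests A (sector 0, 8 sectors)
 * and B (sector 8, 8 sectors) satisfy reqs[i].sector <= oldreq_last
 * (8 <= 8) and become one 16-sector request.  If B instead started at
 * sector 4, only (4 - 0) << 9 = 2048 bytes of A's qiov would be kept
 * before appending all of B, dropping A's overlapping tail.
 */
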
3780 /*
3781  * Submit multiple AIO write requests at once.
3782  *
3783  * On success, the function returns 0 and all requests in the reqs array have
3784  * been submitted. In the error case this function returns -1, and any of the
3785  * requests may or may not have been submitted. In particular, this means that the
3786  * callback will be called for some of the requests, for others it won't. The
3787  * caller must check the error field of the BlockRequest to wait for the right
3788  * callbacks (if error != 0, no callback will be called).
3789  *
3790  * The implementation may modify the contents of the reqs array, e.g. to merge
3791  * requests. However, the fields opaque and error are left unmodified as they
3792  * are used to signal failure for a single request to the caller.
3793  */
3794 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3795 {
3796     MultiwriteCB *mcb;
3797     int i;
3798 
3799     /* don't submit writes if we don't have a medium */
3800     if (bs->drv == NULL) {
3801         for (i = 0; i < num_reqs; i++) {
3802             reqs[i].error = -ENOMEDIUM;
3803         }
3804         return -1;
3805     }
3806 
3807     if (num_reqs == 0) {
3808         return 0;
3809     }
3810 
3811     // Create MultiwriteCB structure
3812     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3813     mcb->num_requests = 0;
3814     mcb->num_callbacks = num_reqs;
3815 
3816     for (i = 0; i < num_reqs; i++) {
3817         mcb->callbacks[i].cb = reqs[i].cb;
3818         mcb->callbacks[i].opaque = reqs[i].opaque;
3819     }
3820 
3821     // Check for mergeable requests
3822     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3823 
3824     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3825 
3826     /* Run the aio requests. */
3827     mcb->num_requests = num_reqs;
3828     for (i = 0; i < num_reqs; i++) {
3829         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3830             reqs[i].nb_sectors, multiwrite_cb, mcb);
3831     }
3832 
3833     return 0;
3834 }
3835 
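/* Illustrative sketch of a caller (hypothetical identifiers), showing the
 * error-field contract documented above:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov_a,
 *           .cb = my_write_cb, .opaque = ctx_a },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov_b,
 *           .cb = my_write_cb, .opaque = ctx_b },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         ... only requests with reqs[i].error == 0 will still get their
 *         callback; the others must be completed by the caller ...
 *     }
 */
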
3836 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3837 {
3838     acb->aiocb_info->cancel(acb);
3839 }
3840 
3841 /* block I/O throttling */
3842 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3843                  bool is_write, double elapsed_time, uint64_t *wait)
3844 {
3845     uint64_t bps_limit = 0;
3846     uint64_t extension;
3847     double   bytes_limit, bytes_base, bytes_res;
3848     double   slice_time, wait_time;
3849 
3850     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3851         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3852     } else if (bs->io_limits.bps[is_write]) {
3853         bps_limit = bs->io_limits.bps[is_write];
3854     } else {
3855         if (wait) {
3856             *wait = 0;
3857         }
3858 
3859         return false;
3860     }
3861 
3862     slice_time = bs->slice_end - bs->slice_start;
3863     slice_time /= (NANOSECONDS_PER_SECOND);
3864     bytes_limit = bps_limit * slice_time;
3865     bytes_base  = bs->slice_submitted.bytes[is_write];
3866     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3867         bytes_base += bs->slice_submitted.bytes[!is_write];
3868     }
3869 
3870     /* bytes_base: the number of bytes already read/written in this slice,
3871      *             obtained from the accumulated statistics.
3872      * bytes_res: the remaining bytes that this request needs to read/write.
3873      * (bytes_base + bytes_res) / bps_limit: used to calculate
3874      *             the total time for completing reading/writing all data.
3875      */
3876     bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3877 
3878     if (bytes_base + bytes_res <= bytes_limit) {
3879         if (wait) {
3880             *wait = 0;
3881         }
3882 
3883         return false;
3884     }
3885 
3886     /* Calc approx time to dispatch, in seconds */
3887     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3888 
3889     /* When the I/O rate at runtime exceeds the limits,
3890      * bs->slice_end needs to be extended so that the current statistics
3891      * can be kept until the timer fires; the extension is rounded up to
3892      * a whole number of slices, a value tuned by experiment.
3893      */
3894     extension = wait_time * NANOSECONDS_PER_SECOND;
3895     extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
3896                 BLOCK_IO_SLICE_TIME;
3897     bs->slice_end += extension;
3898     if (wait) {
3899         *wait = wait_time * NANOSECONDS_PER_SECOND;
3900     }
3901 
3902     return true;
3903 }
3904 
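/* Worked example for the calculation above (illustrative numbers): with
 * bps_limit = 1048576 (1 MiB/s) and a 100 ms slice, bytes_limit is
 * 104857.6.  If 98304 bytes were already submitted in this slice and an
 * 8192-byte request arrives, 98304 + 8192 = 106496 exceeds the limit, so
 * wait_time = 106496 / 1048576 - elapsed_time, about 52 ms when 50 ms of
 * the slice have elapsed, and slice_end is extended accordingly.
 */
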
3905 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3906                              double elapsed_time, uint64_t *wait)
3907 {
3908     uint64_t iops_limit = 0;
3909     double   ios_limit, ios_base;
3910     double   slice_time, wait_time;
3911 
3912     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3913         iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3914     } else if (bs->io_limits.iops[is_write]) {
3915         iops_limit = bs->io_limits.iops[is_write];
3916     } else {
3917         if (wait) {
3918             *wait = 0;
3919         }
3920 
3921         return false;
3922     }
3923 
3924     slice_time = bs->slice_end - bs->slice_start;
3925     slice_time /= (NANOSECONDS_PER_SECOND);
3926     ios_limit  = iops_limit * slice_time;
3927     ios_base   = bs->slice_submitted.ios[is_write];
3928     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3929         ios_base += bs->slice_submitted.ios[!is_write];
3930     }
3931 
3932     if (ios_base + 1 <= ios_limit) {
3933         if (wait) {
3934             *wait = 0;
3935         }
3936 
3937         return false;
3938     }
3939 
3940     /* Calc approx time to dispatch, in seconds */
3941     wait_time = (ios_base + 1) / iops_limit;
3942     if (wait_time > elapsed_time) {
3943         wait_time = wait_time - elapsed_time;
3944     } else {
3945         wait_time = 0;
3946     }
3947 
3948     /* Exceeded current slice, extend it by another slice time */
3949     bs->slice_end += BLOCK_IO_SLICE_TIME;
3950     if (wait) {
3951         *wait = wait_time * NANOSECONDS_PER_SECOND;
3952     }
3953 
3954     return true;
3955 }
3956 
3957 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3958                            bool is_write, int64_t *wait)
3959 {
3960     int64_t  now, max_wait;
3961     uint64_t bps_wait = 0, iops_wait = 0;
3962     double   elapsed_time;
3963     int      bps_ret, iops_ret;
3964 
3965     now = qemu_get_clock_ns(vm_clock);
3966     if (now > bs->slice_end) {
3967         bs->slice_start = now;
3968         bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
3969         memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
3970     }
3971 
3972     elapsed_time  = now - bs->slice_start;
3973     elapsed_time  /= (NANOSECONDS_PER_SECOND);
3974 
3975     bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
3976                                       is_write, elapsed_time, &bps_wait);
3977     iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3978                                       elapsed_time, &iops_wait);
3979     if (bps_ret || iops_ret) {
3980         max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3981         if (wait) {
3982             *wait = max_wait;
3983         }
3984 
3985         now = qemu_get_clock_ns(vm_clock);
3986         if (bs->slice_end < now + max_wait) {
3987             bs->slice_end = now + max_wait;
3988         }
3989 
3990         return true;
3991     }
3992 
3993     if (wait) {
3994         *wait = 0;
3995     }
3996 
3997     bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
3998                                            BDRV_SECTOR_SIZE;
3999     bs->slice_submitted.ios[is_write]++;
4000 
4001     return false;
4002 }
4003 
4004 /**************************************************************/
4005 /* async block device emulation */
4006 
4007 typedef struct BlockDriverAIOCBSync {
4008     BlockDriverAIOCB common;
4009     QEMUBH *bh;
4010     int ret;
4011     /* vector translation state */
4012     QEMUIOVector *qiov;
4013     uint8_t *bounce;
4014     int is_write;
4015 } BlockDriverAIOCBSync;
4016 
4017 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4018 {
4019     BlockDriverAIOCBSync *acb =
4020         container_of(blockacb, BlockDriverAIOCBSync, common);
4021     qemu_bh_delete(acb->bh);
4022     acb->bh = NULL;
4023     qemu_aio_release(acb);
4024 }
4025 
4026 static const AIOCBInfo bdrv_em_aiocb_info = {
4027     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4028     .cancel             = bdrv_aio_cancel_em,
4029 };
4030 
4031 static void bdrv_aio_bh_cb(void *opaque)
4032 {
4033     BlockDriverAIOCBSync *acb = opaque;
4034 
4035     if (!acb->is_write)
4036         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4037     qemu_vfree(acb->bounce);
4038     acb->common.cb(acb->common.opaque, acb->ret);
4039     qemu_bh_delete(acb->bh);
4040     acb->bh = NULL;
4041     qemu_aio_release(acb);
4042 }
4043 
4044 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4045                                             int64_t sector_num,
4046                                             QEMUIOVector *qiov,
4047                                             int nb_sectors,
4048                                             BlockDriverCompletionFunc *cb,
4049                                             void *opaque,
4050                                             int is_write)
4051 
4052 {
4053     BlockDriverAIOCBSync *acb;
4054 
4055     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4056     acb->is_write = is_write;
4057     acb->qiov = qiov;
4058     acb->bounce = qemu_blockalign(bs, qiov->size);
4059     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4060 
4061     if (is_write) {
4062         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4063         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4064     } else {
4065         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4066     }
4067 
4068     qemu_bh_schedule(acb->bh);
4069 
4070     return &acb->common;
4071 }
4072 
4073 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4074         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4075         BlockDriverCompletionFunc *cb, void *opaque)
4076 {
4077     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4078 }
4079 
4080 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4081         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4082         BlockDriverCompletionFunc *cb, void *opaque)
4083 {
4084     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4085 }
4086 
4087 
4088 typedef struct BlockDriverAIOCBCoroutine {
4089     BlockDriverAIOCB common;
4090     BlockRequest req;
4091     bool is_write;
4092     bool *done;
4093     QEMUBH* bh;
4094 } BlockDriverAIOCBCoroutine;
4095 
4096 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4097 {
4098     BlockDriverAIOCBCoroutine *acb =
4099         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4100     bool done = false;
4101 
4102     acb->done = &done;
4103     while (!done) {
4104         qemu_aio_wait();
4105     }
4106 }
4107 
4108 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4109     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4110     .cancel             = bdrv_aio_co_cancel_em,
4111 };
4112 
4113 static void bdrv_co_em_bh(void *opaque)
4114 {
4115     BlockDriverAIOCBCoroutine *acb = opaque;
4116 
4117     acb->common.cb(acb->common.opaque, acb->req.error);
4118 
4119     if (acb->done) {
4120         *acb->done = true;
4121     }
4122 
4123     qemu_bh_delete(acb->bh);
4124     qemu_aio_release(acb);
4125 }
4126 
4127 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4128 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4129 {
4130     BlockDriverAIOCBCoroutine *acb = opaque;
4131     BlockDriverState *bs = acb->common.bs;
4132 
4133     if (!acb->is_write) {
4134         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4135             acb->req.nb_sectors, acb->req.qiov, 0);
4136     } else {
4137         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4138             acb->req.nb_sectors, acb->req.qiov, 0);
4139     }
4140 
4141     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4142     qemu_bh_schedule(acb->bh);
4143 }
4144 
4145 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4146                                                int64_t sector_num,
4147                                                QEMUIOVector *qiov,
4148                                                int nb_sectors,
4149                                                BlockDriverCompletionFunc *cb,
4150                                                void *opaque,
4151                                                bool is_write)
4152 {
4153     Coroutine *co;
4154     BlockDriverAIOCBCoroutine *acb;
4155 
4156     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4157     acb->req.sector = sector_num;
4158     acb->req.nb_sectors = nb_sectors;
4159     acb->req.qiov = qiov;
4160     acb->is_write = is_write;
4161     acb->done = NULL;
4162 
4163     co = qemu_coroutine_create(bdrv_co_do_rw);
4164     qemu_coroutine_enter(co, acb);
4165 
4166     return &acb->common;
4167 }
4168 
4169 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4170 {
4171     BlockDriverAIOCBCoroutine *acb = opaque;
4172     BlockDriverState *bs = acb->common.bs;
4173 
4174     acb->req.error = bdrv_co_flush(bs);
4175     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4176     qemu_bh_schedule(acb->bh);
4177 }
4178 
4179 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4180         BlockDriverCompletionFunc *cb, void *opaque)
4181 {
4182     trace_bdrv_aio_flush(bs, opaque);
4183 
4184     Coroutine *co;
4185     BlockDriverAIOCBCoroutine *acb;
4186 
4187     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4188     acb->done = NULL;
4189 
4190     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4191     qemu_coroutine_enter(co, acb);
4192 
4193     return &acb->common;
4194 }
4195 
4196 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4197 {
4198     BlockDriverAIOCBCoroutine *acb = opaque;
4199     BlockDriverState *bs = acb->common.bs;
4200 
4201     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4202     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4203     qemu_bh_schedule(acb->bh);
4204 }
4205 
4206 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4207         int64_t sector_num, int nb_sectors,
4208         BlockDriverCompletionFunc *cb, void *opaque)
4209 {
4210     Coroutine *co;
4211     BlockDriverAIOCBCoroutine *acb;
4212 
4213     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4214 
4215     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4216     acb->req.sector = sector_num;
4217     acb->req.nb_sectors = nb_sectors;
4218     acb->done = NULL;
4219     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4220     qemu_coroutine_enter(co, acb);
4221 
4222     return &acb->common;
4223 }
4224 
4225 void bdrv_init(void)
4226 {
4227     module_call_init(MODULE_INIT_BLOCK);
4228 }
4229 
4230 void bdrv_init_with_whitelist(void)
4231 {
4232     use_bdrv_whitelist = 1;
4233     bdrv_init();
4234 }
4235 
4236 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4237                    BlockDriverCompletionFunc *cb, void *opaque)
4238 {
4239     BlockDriverAIOCB *acb;
4240 
4241     acb = g_slice_alloc(aiocb_info->aiocb_size);
4242     acb->aiocb_info = aiocb_info;
4243     acb->bs = bs;
4244     acb->cb = cb;
4245     acb->opaque = opaque;
4246     return acb;
4247 }
4248 
4249 void qemu_aio_release(void *p)
4250 {
4251     BlockDriverAIOCB *acb = p;
4252     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4253 }
4254 
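/* Illustrative sketch of how a driver typically uses the two helpers
 * above (MyAIOCB and my_aio_cancel are hypothetical names; compare
 * bdrv_em_aiocb_info earlier in this file):
 *
 *     typedef struct MyAIOCB {
 *         BlockDriverAIOCB common;
 *         ... driver-private state ...
 *     } MyAIOCB;
 *
 *     static void my_aio_cancel(BlockDriverAIOCB *blockacb)
 *     {
 *         MyAIOCB *acb = container_of(blockacb, MyAIOCB, common);
 *         ... stop the request, invoke acb->common.cb if needed ...
 *         qemu_aio_release(acb);
 *     }
 *
 *     static const AIOCBInfo my_aiocb_info = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *         .cancel     = my_aio_cancel,
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
 */
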
4255 /**************************************************************/
4256 /* Coroutine block device emulation */
4257 
4258 typedef struct CoroutineIOCompletion {
4259     Coroutine *coroutine;
4260     int ret;
4261 } CoroutineIOCompletion;
4262 
4263 static void bdrv_co_io_em_complete(void *opaque, int ret)
4264 {
4265     CoroutineIOCompletion *co = opaque;
4266 
4267     co->ret = ret;
4268     qemu_coroutine_enter(co->coroutine, NULL);
4269 }
4270 
4271 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4272                                       int nb_sectors, QEMUIOVector *iov,
4273                                       bool is_write)
4274 {
4275     CoroutineIOCompletion co = {
4276         .coroutine = qemu_coroutine_self(),
4277     };
4278     BlockDriverAIOCB *acb;
4279 
4280     if (is_write) {
4281         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4282                                        bdrv_co_io_em_complete, &co);
4283     } else {
4284         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4285                                       bdrv_co_io_em_complete, &co);
4286     }
4287 
4288     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4289     if (!acb) {
4290         return -EIO;
4291     }
4292     qemu_coroutine_yield();
4293 
4294     return co.ret;
4295 }
4296 
4297 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4298                                          int64_t sector_num, int nb_sectors,
4299                                          QEMUIOVector *iov)
4300 {
4301     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4302 }
4303 
4304 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4305                                          int64_t sector_num, int nb_sectors,
4306                                          QEMUIOVector *iov)
4307 {
4308     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4309 }
4310 
4311 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4312 {
4313     RwCo *rwco = opaque;
4314 
4315     rwco->ret = bdrv_co_flush(rwco->bs);
4316 }
4317 
4318 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4319 {
4320     int ret;
4321 
4322     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4323         return 0;
4324     }
4325 
4326     /* Write back cached data to the OS even with cache=unsafe */
4327     if (bs->drv->bdrv_co_flush_to_os) {
4328         ret = bs->drv->bdrv_co_flush_to_os(bs);
4329         if (ret < 0) {
4330             return ret;
4331         }
4332     }
4333 
4334     /* But don't actually force it to the disk with cache=unsafe */
4335     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4336         goto flush_parent;
4337     }
4338 
4339     if (bs->drv->bdrv_co_flush_to_disk) {
4340         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4341     } else if (bs->drv->bdrv_aio_flush) {
4342         BlockDriverAIOCB *acb;
4343         CoroutineIOCompletion co = {
4344             .coroutine = qemu_coroutine_self(),
4345         };
4346 
4347         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4348         if (acb == NULL) {
4349             ret = -EIO;
4350         } else {
4351             qemu_coroutine_yield();
4352             ret = co.ret;
4353         }
4354     } else {
4355         /*
4356          * Some block drivers always operate in either writethrough or unsafe
4357          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4358          * know how the server works (because the behaviour is hardcoded or
4359          * depends on server-side configuration), so we can't ensure that
4360          * everything is safe on disk. Returning an error doesn't work because
4361          * that would break guests even if the server operates in writethrough
4362          * mode.
4363          *
4364          * Let's hope the user knows what he's doing.
4365          */
4366         ret = 0;
4367     }
4368     if (ret < 0) {
4369         return ret;
4370     }
4371 
4372     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4373      * in the case of cache=unsafe, so there are no useless flushes.
4374      */
4375 flush_parent:
4376     return bdrv_co_flush(bs->file);
4377 }
4378 
4379 void bdrv_invalidate_cache(BlockDriverState *bs)
4380 {
4381     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4382         bs->drv->bdrv_invalidate_cache(bs);
4383     }
4384 }
4385 
4386 void bdrv_invalidate_cache_all(void)
4387 {
4388     BlockDriverState *bs;
4389 
4390     QTAILQ_FOREACH(bs, &bdrv_states, list) {
4391         bdrv_invalidate_cache(bs);
4392     }
4393 }
4394 
4395 void bdrv_clear_incoming_migration_all(void)
4396 {
4397     BlockDriverState *bs;
4398 
4399     QTAILQ_FOREACH(bs, &bdrv_states, list) {
4400         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4401     }
4402 }
4403 
4404 int bdrv_flush(BlockDriverState *bs)
4405 {
4406     Coroutine *co;
4407     RwCo rwco = {
4408         .bs = bs,
4409         .ret = NOT_DONE,
4410     };
4411 
4412     if (qemu_in_coroutine()) {
4413         /* Fast-path if already in coroutine context */
4414         bdrv_flush_co_entry(&rwco);
4415     } else {
4416         co = qemu_coroutine_create(bdrv_flush_co_entry);
4417         qemu_coroutine_enter(co, &rwco);
4418         while (rwco.ret == NOT_DONE) {
4419             qemu_aio_wait();
4420         }
4421     }
4422 
4423     return rwco.ret;
4424 }
4425 
4426 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4427 {
4428     RwCo *rwco = opaque;
4429 
4430     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4431 }
4432 
4433 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4434                                  int nb_sectors)
4435 {
4436     if (!bs->drv) {
4437         return -ENOMEDIUM;
4438     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4439         return -EIO;
4440     } else if (bs->read_only) {
4441         return -EROFS;
4442     }
4443 
4444     if (bs->dirty_bitmap) {
4445         bdrv_reset_dirty(bs, sector_num, nb_sectors);
4446     }
4447 
4448     /* Do nothing if disabled.  */
4449     if (!(bs->open_flags & BDRV_O_UNMAP)) {
4450         return 0;
4451     }
4452 
4453     if (bs->drv->bdrv_co_discard) {
4454         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
4455     } else if (bs->drv->bdrv_aio_discard) {
4456         BlockDriverAIOCB *acb;
4457         CoroutineIOCompletion co = {
4458             .coroutine = qemu_coroutine_self(),
4459         };
4460 
4461         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
4462                                         bdrv_co_io_em_complete, &co);
4463         if (acb == NULL) {
4464             return -EIO;
4465         } else {
4466             qemu_coroutine_yield();
4467             return co.ret;
4468         }
4469     } else {
4470         return 0;
4471     }
4472 }
4473 
4474 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4475 {
4476     Coroutine *co;
4477     RwCo rwco = {
4478         .bs = bs,
4479         .sector_num = sector_num,
4480         .nb_sectors = nb_sectors,
4481         .ret = NOT_DONE,
4482     };
4483 
4484     if (qemu_in_coroutine()) {
4485         /* Fast-path if already in coroutine context */
4486         bdrv_discard_co_entry(&rwco);
4487     } else {
4488         co = qemu_coroutine_create(bdrv_discard_co_entry);
4489         qemu_coroutine_enter(co, &rwco);
4490         while (rwco.ret == NOT_DONE) {
4491             qemu_aio_wait();
4492         }
4493     }
4494 
4495     return rwco.ret;
4496 }
4497 
4498 /**************************************************************/
4499 /* removable device support */
4500 
4501 /**
4502  * Return TRUE if the media is present
4503  */
4504 int bdrv_is_inserted(BlockDriverState *bs)
4505 {
4506     BlockDriver *drv = bs->drv;
4507 
4508     if (!drv)
4509         return 0;
4510     if (!drv->bdrv_is_inserted)
4511         return 1;
4512     return drv->bdrv_is_inserted(bs);
4513 }
4514 
4515 /**
4516  * Return whether the media changed since the last call to this
4517  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
4518  */
4519 int bdrv_media_changed(BlockDriverState *bs)
4520 {
4521     BlockDriver *drv = bs->drv;
4522 
4523     if (drv && drv->bdrv_media_changed) {
4524         return drv->bdrv_media_changed(bs);
4525     }
4526     return -ENOTSUP;
4527 }
4528 
4529 /**
4530  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4531  */
4532 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4533 {
4534     BlockDriver *drv = bs->drv;
4535 
4536     if (drv && drv->bdrv_eject) {
4537         drv->bdrv_eject(bs, eject_flag);
4538     }
4539 
4540     if (bs->device_name[0] != '\0') {
4541         bdrv_emit_qmp_eject_event(bs, eject_flag);
4542     }
4543 }
4544 
4545 /**
4546  * Lock or unlock the media (if it is locked, the user won't be able
4547  * to eject it manually).
4548  */
4549 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4550 {
4551     BlockDriver *drv = bs->drv;
4552 
4553     trace_bdrv_lock_medium(bs, locked);
4554 
4555     if (drv && drv->bdrv_lock_medium) {
4556         drv->bdrv_lock_medium(bs, locked);
4557     }
4558 }
4559 
4560 /* needed for generic scsi interface */
4561 
4562 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4563 {
4564     BlockDriver *drv = bs->drv;
4565 
4566     if (drv && drv->bdrv_ioctl)
4567         return drv->bdrv_ioctl(bs, req, buf);
4568     return -ENOTSUP;
4569 }
4570 
4571 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4572         unsigned long int req, void *buf,
4573         BlockDriverCompletionFunc *cb, void *opaque)
4574 {
4575     BlockDriver *drv = bs->drv;
4576 
4577     if (drv && drv->bdrv_aio_ioctl)
4578         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4579     return NULL;
4580 }
4581 
4582 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
4583 {
4584     bs->buffer_alignment = align;
4585 }
4586 
4587 void *qemu_blockalign(BlockDriverState *bs, size_t size)
4588 {
4589     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
4590 }
4591 
4592 /*
4593  * Check if all memory in this vector meets the device's buffer alignment.
4594  */
4595 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
4596 {
4597     int i;
4598 
4599     for (i = 0; i < qiov->niov; i++) {
4600         if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
4601             return false;
4602         }
4603     }
4604 
4605     return true;
4606 }
4607 
4608 void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
4609 {
4610     int64_t bitmap_size;
4611 
4612     assert((granularity & (granularity - 1)) == 0);
4613 
4614     if (granularity) {
4615         granularity >>= BDRV_SECTOR_BITS;
4616         assert(!bs->dirty_bitmap);
4617         bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
4618         bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
4619     } else {
4620         if (bs->dirty_bitmap) {
4621             hbitmap_free(bs->dirty_bitmap);
4622             bs->dirty_bitmap = NULL;
4623         }
4624     }
4625 }
4626 
4627 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4628 {
4629     if (bs->dirty_bitmap) {
4630         return hbitmap_get(bs->dirty_bitmap, sector);
4631     } else {
4632         return 0;
4633     }
4634 }
4635 
4636 void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
4637 {
4638     hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
4639 }
4640 
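/* Illustrative sketch of a user of the dirty bitmap, e.g. a block job
 * (hypothetical caller): enable tracking at 64 KB granularity (one bit
 * per 128 sectors), then walk every dirty sector:
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_set_dirty_tracking(bs, 65536);
 *     ... guest writes happen ...
 *     bdrv_dirty_iter_init(bs, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         ... copy out the chunk that starts at sector ...
 *     }
 */
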
4641 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
4642                     int nr_sectors)
4643 {
4644     hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
4645 }
4646 
4647 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4648                       int nr_sectors)
4649 {
4650     hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
4651 }
4652 
4653 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4654 {
4655     if (bs->dirty_bitmap) {
4656         return hbitmap_count(bs->dirty_bitmap);
4657     } else {
4658         return 0;
4659     }
4660 }
4661 
4662 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4663 {
4664     assert(bs->in_use != in_use);
4665     bs->in_use = in_use;
4666 }
4667 
4668 int bdrv_in_use(BlockDriverState *bs)
4669 {
4670     return bs->in_use;
4671 }
4672 
4673 void bdrv_iostatus_enable(BlockDriverState *bs)
4674 {
4675     bs->iostatus_enabled = true;
4676     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4677 }
4678 
4679 /* The I/O status is only enabled if the drive explicitly
4680  * enables it _and_ the VM is configured to stop on errors */
4681 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4682 {
4683     return (bs->iostatus_enabled &&
4684            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4685             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
4686             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
4687 }
4688 
4689 void bdrv_iostatus_disable(BlockDriverState *bs)
4690 {
4691     bs->iostatus_enabled = false;
4692 }
4693 
4694 void bdrv_iostatus_reset(BlockDriverState *bs)
4695 {
4696     if (bdrv_iostatus_is_enabled(bs)) {
4697         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4698         if (bs->job) {
4699             block_job_iostatus_reset(bs->job);
4700         }
4701     }
4702 }
4703 
4704 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4705 {
4706     assert(bdrv_iostatus_is_enabled(bs));
4707     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4708         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4709                                          BLOCK_DEVICE_IO_STATUS_FAILED;
4710     }
4711 }
4712 
4713 void
4714 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4715         enum BlockAcctType type)
4716 {
4717     assert(type < BDRV_MAX_IOTYPE);
4718 
4719     cookie->bytes = bytes;
4720     cookie->start_time_ns = get_clock();
4721     cookie->type = type;
4722 }
4723 
4724 void
4725 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4726 {
4727     assert(cookie->type < BDRV_MAX_IOTYPE);
4728 
4729     bs->nr_bytes[cookie->type] += cookie->bytes;
4730     bs->nr_ops[cookie->type]++;
4731     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4732 }
4733 
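/* Illustrative sketch (hypothetical device emulation code) bracketing an
 * I/O operation with the accounting helpers above:
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ... submit the read and wait for its completion ...
 *     bdrv_acct_done(bs, &cookie);
 */
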
4734 void bdrv_img_create(const char *filename, const char *fmt,
4735                      const char *base_filename, const char *base_fmt,
4736                      char *options, uint64_t img_size, int flags,
4737                      Error **errp, bool quiet)
4738 {
4739     QEMUOptionParameter *param = NULL, *create_options = NULL;
4740     QEMUOptionParameter *backing_fmt, *backing_file, *size;
4741     BlockDriverState *bs = NULL;
4742     BlockDriver *drv, *proto_drv;
4743     BlockDriver *backing_drv = NULL;
4744     int ret = 0;
4745 
4746     /* Find driver and parse its options */
4747     drv = bdrv_find_format(fmt);
4748     if (!drv) {
4749         error_setg(errp, "Unknown file format '%s'", fmt);
4750         return;
4751     }
4752 
4753     proto_drv = bdrv_find_protocol(filename);
4754     if (!proto_drv) {
4755         error_setg(errp, "Unknown protocol '%s'", filename);
4756         return;
4757     }
4758 
4759     create_options = append_option_parameters(create_options,
4760                                               drv->create_options);
4761     create_options = append_option_parameters(create_options,
4762                                               proto_drv->create_options);
4763 
4764     /* Create parameter list with default values */
4765     param = parse_option_parameters("", create_options, param);
4766 
4767     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4768 
4769     /* Parse -o options */
4770     if (options) {
4771         param = parse_option_parameters(options, create_options, param);
4772         if (param == NULL) {
4773             error_setg(errp, "Invalid options for file format '%s'.", fmt);
4774             goto out;
4775         }
4776     }
4777 
4778     if (base_filename) {
4779         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4780                                  base_filename)) {
4781             error_setg(errp, "Backing file not supported for file format '%s'",
4782                        fmt);
4783             goto out;
4784         }
4785     }
4786 
4787     if (base_fmt) {
4788         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4789             error_setg(errp, "Backing file format not supported for file "
4790                              "format '%s'", fmt);
4791             goto out;
4792         }
4793     }
4794 
4795     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4796     if (backing_file && backing_file->value.s) {
4797         if (!strcmp(filename, backing_file->value.s)) {
4798             error_setg(errp, "Error: Trying to create an image with the "
4799                              "same filename as the backing file");
4800             goto out;
4801         }
4802     }
4803 
4804     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4805     if (backing_fmt && backing_fmt->value.s) {
4806         backing_drv = bdrv_find_format(backing_fmt->value.s);
4807         if (!backing_drv) {
4808             error_setg(errp, "Unknown backing file format '%s'",
4809                        backing_fmt->value.s);
4810             goto out;
4811         }
4812     }
4813 
4814     // The size for the image must always be specified, with one exception:
4815     // if we are using a backing file, we can obtain the size from there.
4816     size = get_option_parameter(param, BLOCK_OPT_SIZE);
4817     if (size && size->value.n == -1) {
4818         if (backing_file && backing_file->value.s) {
4819             uint64_t size;
4820             char buf[32];
4821             int back_flags;
4822 
4823             /* backing files always opened read-only */
4824             back_flags =
4825                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4826 
4827             bs = bdrv_new("");
4828 
4829             ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags,
4830                             backing_drv);
4831             if (ret < 0) {
4832                 error_setg_errno(errp, -ret, "Could not open '%s'",
4833                                  backing_file->value.s);
4834                 goto out;
4835             }
4836             bdrv_get_geometry(bs, &size);
4837             size *= 512;
4838 
4839             snprintf(buf, sizeof(buf), "%" PRId64, size);
4840             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4841         } else {
4842             error_setg(errp, "Image creation needs a size parameter");
4843             goto out;
4844         }
4845     }
4846 
4847     if (!quiet) {
4848         printf("Formatting '%s', fmt=%s ", filename, fmt);
4849         print_option_parameters(param);
4850         puts("");
4851     }
4852     ret = bdrv_create(drv, filename, param);
4853     if (ret < 0) {
4854         if (ret == -ENOTSUP) {
4855             error_setg(errp,"Formatting or formatting option not supported for "
4856                             "file format '%s'", fmt);
4857         } else if (ret == -EFBIG) {
4858             error_setg(errp, "The image size is too large for file format '%s'",
4859                        fmt);
4860         } else {
4861             error_setg(errp, "%s: error while creating %s: %s", filename, fmt,
4862                        strerror(-ret));
4863         }
4864     }
4865 
4866 out:
4867     free_option_parameters(create_options);
4868     free_option_parameters(param);
4869 
4870     if (bs) {
4871         bdrv_delete(bs);
4872     }
4873 }
4874 
4875 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
4876 {
4877     /* Currently BlockDriverState always uses the main loop AioContext */
4878     return qemu_get_aio_context();
4879 }
4880