/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
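
/*
 * Illustrative sketch of the detection rules above (the filenames are
 * made up, not taken from a real system):
 *
 *     is_windows_drive("c:");                    returns 1 (bare drive)
 *     is_windows_drive("\\\\.\\PhysicalDrive0"); returns 1 (device path)
 *     is_windows_drive("image.qcow2");           returns 0 (plain filename)
 */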

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}
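
/*
 * Illustrative sketch of how a caller might arm throttling (the 1 MB/s
 * figure is made up): the predicate above only reports whether any limit
 * is set; bdrv_io_limits_enable() actually starts the slice and timer.
 *
 *     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = 1024 * 1024;
 *     if (bdrv_io_limits_enabled(bs)) {
 *         bdrv_io_limits_enable(bs);
 *     }
 */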

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are serviced in FIFO order to preserve their relative
     * timing. The next throttled request is not dequeued until the current
     * request is allowed to be serviced, so if the current request still
     * exceeds the limits it is re-inserted at the head of the queue and
     * every request behind it stays in throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
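
/*
 * Worked example: combining a backing file name with the path of the
 * image that references it (the paths are hypothetical):
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "snap.qcow2");
 *
 * dest is now "/images/snap.qcow2". An absolute filename is copied
 * unchanged, and for "proto:..." base paths everything up to and
 * including the colon is preserved.
 */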

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
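
/*
 * A skeletal driver registration might look like this (sketch only;
 * "MyFormatState" and the myfmt_* callbacks are hypothetical). Format
 * drivers normally register themselves from a module init hook:
 *
 *     static BlockDriver bdrv_myfmt = {
 *         .format_name   = "myfmt",
 *         .instance_size = sizeof(MyFormatState),
 *         .bdrv_open     = myfmt_open,
 *         .bdrv_co_readv = myfmt_co_readv,
 *     };
 *
 *     static void bdrv_myfmt_init(void)
 *     {
 *         bdrv_register(&bdrv_myfmt);
 *     }
 *     block_init(bdrv_myfmt_init);
 */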

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    if (!drv->bdrv_create)
        return -ENOTSUP;

    return drv->bdrv_create(filename, options);
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
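
/*
 * Usage sketch for image creation (the filename and size are made up).
 * The option helpers used here are the same ones the snapshot code
 * further down relies on:
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     int ret = bdrv_create(drv, "disk.qcow2", opts);
 *     free_option_parameters(opts);
 */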

#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
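
/*
 * For example (filenames are hypothetical):
 *
 *     bdrv_find_protocol("nbd:localhost:10809");   selects the "nbd" driver
 *     bdrv_find_protocol("/var/images/disk.img");  no "proto:" prefix,
 *                                                  falls back to "file"
 *
 * while a host device such as "/dev/cdrom" is claimed up front by
 * find_hdev_driver() regardless of any protocol prefix.
 */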

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
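
/*
 * Usage sketch (the mode string would normally come from the command
 * line):
 *
 *     int flags = 0;
 *     int ret = bdrv_parse_cache_flags("none", &flags);
 *
 * On return ret == 0 and flags == (BDRV_O_NOCACHE | BDRV_O_CACHE_WB);
 * an unrecognized mode string yields ret == -1 instead.
 */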

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
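
/*
 * Because the flag is a reference count, nested users compose safely;
 * a sketch:
 *
 *     bdrv_enable_copy_on_read(bs);     count 0 -> 1, feature on
 *     bdrv_enable_copy_on_read(bs);     count 1 -> 2, still on
 *     bdrv_disable_copy_on_read(bs);    count 2 -> 1, still on
 *     bdrv_disable_copy_on_read(bs);    count 1 -> 0, feature off
 */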

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states
   list. Also, NUL-terminate device_name to prevent a double remove. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

/*
 * Add the new bs contents at the top of an image chain while the chain is
 * live, keeping the required fields on the top layer.
 *
 * This modifies the BlockDriverState fields and swaps the contents of
 * bs_new and bs_top, so both are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top.  bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
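
/*
 * Worked example, assuming a 64 KB cluster size (c = 65536 / 512 = 128
 * sectors): a request for sectors [130, 134) is widened to the cluster
 * aligned range [128, 256), i.e.
 *
 *     round_to_clusters(bs, 130, 4, &num, &nb);
 *
 * leaves num == 128 and nb == 128.
 */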

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
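
/*
 * The two early returns encode the only disjoint layouts; everything
 * else overlaps. For example, a tracked request covering sectors [0, 8)
 * overlaps a new request for [4, 12) but not one for [8, 16).
 */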

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In the synchronous call context the vcpu is blocked, so this
     * throttling timer can never fire; the I/O throttling function
     * therefore has to be disabled here if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
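
/*
 * Worked example of the head/body/tail split above (the offsets are made
 * up): bdrv_pread(bs, 300, buf, 1000) first reads sector 0 and copies the
 * 212 bytes from offset 300 up to the sector boundary, then reads sector 1
 * (512 bytes) "in place", and finally reads sector 2 to copy the remaining
 * 276 bytes; 212 + 512 + 276 == 1000.
 */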
1595 
1596 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1597                 const void *buf, int count1)
1598 {
1599     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1600     int len, nb_sectors, count;
1601     int64_t sector_num;
1602     int ret;
1603 
1604     count = count1;
1605     /* first write to align to sector start */
1606     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1607     if (len > count)
1608         len = count;
1609     sector_num = offset >> BDRV_SECTOR_BITS;
1610     if (len > 0) {
1611         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1612             return ret;
1613         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1614         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1615             return ret;
1616         count -= len;
1617         if (count == 0)
1618             return count1;
1619         sector_num++;
1620         buf += len;
1621     }
1622 
1623     /* write the sectors "in place" */
1624     nb_sectors = count >> BDRV_SECTOR_BITS;
1625     if (nb_sectors > 0) {
1626         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1627             return ret;
1628         sector_num += nb_sectors;
1629         len = nb_sectors << BDRV_SECTOR_BITS;
1630         buf += len;
1631         count -= len;
1632     }
1633 
1634     /* add data from the last sector */
1635     if (count > 0) {
1636         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1637             return ret;
1638         memcpy(tmp_buf, buf, count);
1639         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1640             return ret;
1641     }
1642     return count1;
1643 }
1644 
1645 /*
1646  * Writes to the file and ensures that no writes are reordered across this
1647  * request (acts as a barrier)
1648  *
1649  * Returns 0 on success, -errno in error cases.
1650  */
1651 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1652     const void *buf, int count)
1653 {
1654     int ret;
1655 
1656     ret = bdrv_pwrite(bs, offset, buf, count);
1657     if (ret < 0) {
1658         return ret;
1659     }
1660 
1661     /* No flush needed for cache modes that use O_DSYNC */
1662     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1663         bdrv_flush(bs);
1664     }
1665 
1666     return 0;
1667 }
1668 
1669 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1670         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1671 {
1672     /* Perform I/O through a temporary buffer so that users who scribble over
1673      * their read buffer while the operation is in progress do not end up
1674      * modifying the image file.  This is critical for zero-copy guest I/O
1675      * where anything might happen inside guest memory.
1676      */
1677     void *bounce_buffer;
1678 
1679     BlockDriver *drv = bs->drv;
1680     struct iovec iov;
1681     QEMUIOVector bounce_qiov;
1682     int64_t cluster_sector_num;
1683     int cluster_nb_sectors;
1684     size_t skip_bytes;
1685     int ret;
1686 
1687     /* Cover entire cluster so no additional backing file I/O is required when
1688      * allocating cluster in the image file.
1689      */
1690     round_to_clusters(bs, sector_num, nb_sectors,
1691                       &cluster_sector_num, &cluster_nb_sectors);
1692 
1693     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1694                                    cluster_sector_num, cluster_nb_sectors);
1695 
1696     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1697     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1698     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1699 
1700     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1701                              &bounce_qiov);
1702     if (ret < 0) {
1703         goto err;
1704     }
1705 
1706     if (drv->bdrv_co_write_zeroes &&
1707         buffer_is_zero(bounce_buffer, iov.iov_len)) {
1708         ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
1709                                         cluster_nb_sectors);
1710     } else {
1711         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1712                                   &bounce_qiov);
1713     }
1714 
1715     if (ret < 0) {
1716         /* It might be okay to ignore write errors for guest requests.  If this
1717          * is a deliberate copy-on-read then we don't want to ignore the error.
1718          * Simply report it in all cases.
1719          */
1720         goto err;
1721     }
1722 
1723     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1724     qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1725                            nb_sectors * BDRV_SECTOR_SIZE);
1726 
1727 err:
1728     qemu_vfree(bounce_buffer);
1729     return ret;
1730 }
1731 
1732 /*
1733  * Handle a read request in coroutine context
1734  */
1735 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1736     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1737     BdrvRequestFlags flags)
1738 {
1739     BlockDriver *drv = bs->drv;
1740     BdrvTrackedRequest req;
1741     int ret;
1742 
1743     if (!drv) {
1744         return -ENOMEDIUM;
1745     }
1746     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1747         return -EIO;
1748     }
1749 
1750     /* throttling disk read I/O */
1751     if (bs->io_limits_enabled) {
1752         bdrv_io_limits_intercept(bs, false, nb_sectors);
1753     }
1754 
1755     if (bs->copy_on_read) {
1756         flags |= BDRV_REQ_COPY_ON_READ;
1757     }
1758     if (flags & BDRV_REQ_COPY_ON_READ) {
1759         bs->copy_on_read_in_flight++;
1760     }
1761 
1762     if (bs->copy_on_read_in_flight) {
1763         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1764     }
1765 
1766     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1767 
1768     if (flags & BDRV_REQ_COPY_ON_READ) {
1769         int pnum;
1770 
1771         ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1772         if (ret < 0) {
1773             goto out;
1774         }
1775 
1776         if (!ret || pnum != nb_sectors) {
1777             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1778             goto out;
1779         }
1780     }
1781 
1782     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1783 
1784 out:
1785     tracked_request_end(&req);
1786 
1787     if (flags & BDRV_REQ_COPY_ON_READ) {
1788         bs->copy_on_read_in_flight--;
1789     }
1790 
1791     return ret;
1792 }
1793 
1794 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1795     int nb_sectors, QEMUIOVector *qiov)
1796 {
1797     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1798 
1799     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1800 }
1801 
1802 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1803     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1804 {
1805     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1806 
1807     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1808                             BDRV_REQ_COPY_ON_READ);
1809 }
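
/* Usage sketch (illustrative addition, not part of the original file; the
 * helper name is hypothetical): issuing a deliberate copy-on-read request
 * from coroutine context. Sectors pulled from the backing file are written
 * into the image on the way through, so re-reading them later is served
 * from the image itself.
 *
 * static int coroutine_fn example_stream_one_sector(BlockDriverState *bs,
 *                                                   int64_t sector_num,
 *                                                   uint8_t *buf)
 * {
 *     struct iovec iov = { .iov_base = buf, .iov_len = BDRV_SECTOR_SIZE };
 *     QEMUIOVector qiov;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     return bdrv_co_copy_on_readv(bs, sector_num, 1, &qiov);
 * }
 */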
1810 
1811 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1812     int64_t sector_num, int nb_sectors)
1813 {
1814     BlockDriver *drv = bs->drv;
1815     QEMUIOVector qiov;
1816     struct iovec iov;
1817     int ret;
1818 
1819     /* First try the efficient write zeroes operation */
1820     if (drv->bdrv_co_write_zeroes) {
1821         return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1822     }
1823 
1824     /* Fall back to bounce buffer if write zeroes is unsupported */
1825     iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1826     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1827     memset(iov.iov_base, 0, iov.iov_len);
1828     qemu_iovec_init_external(&qiov, &iov, 1);
1829 
1830     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1831 
1832     qemu_vfree(iov.iov_base);
1833     return ret;
1834 }
1835 
1836 /*
1837  * Handle a write request in coroutine context
1838  */
1839 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1840     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1841     BdrvRequestFlags flags)
1842 {
1843     BlockDriver *drv = bs->drv;
1844     BdrvTrackedRequest req;
1845     int ret;
1846 
1847     if (!bs->drv) {
1848         return -ENOMEDIUM;
1849     }
1850     if (bs->read_only) {
1851         return -EACCES;
1852     }
1853     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1854         return -EIO;
1855     }
1856 
1857     /* throttling disk write I/O */
1858     if (bs->io_limits_enabled) {
1859         bdrv_io_limits_intercept(bs, true, nb_sectors);
1860     }
1861 
1862     if (bs->copy_on_read_in_flight) {
1863         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1864     }
1865 
1866     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1867 
1868     if (flags & BDRV_REQ_ZERO_WRITE) {
1869         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1870     } else {
1871         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1872     }
1873 
1874     if (bs->dirty_bitmap) {
1875         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1876     }
1877 
1878     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1879         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1880     }
1881 
1882     tracked_request_end(&req);
1883 
1884     return ret;
1885 }
1886 
1887 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1888     int nb_sectors, QEMUIOVector *qiov)
1889 {
1890     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1891 
1892     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1893 }
1894 
1895 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1896                                       int64_t sector_num, int nb_sectors)
1897 {
1898     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1899 
1900     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1901                              BDRV_REQ_ZERO_WRITE);
1902 }
1903 
1904 /**
1905  * Truncate file to 'offset' bytes (needed only for file protocols)
1906  */
1907 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1908 {
1909     BlockDriver *drv = bs->drv;
1910     int ret;
1911     if (!drv)
1912         return -ENOMEDIUM;
1913     if (!drv->bdrv_truncate)
1914         return -ENOTSUP;
1915     if (bs->read_only)
1916         return -EACCES;
1917     if (bdrv_in_use(bs))
1918         return -EBUSY;
1919     ret = drv->bdrv_truncate(bs, offset);
1920     if (ret == 0) {
1921         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1922         bdrv_dev_resize_cb(bs);
1923     }
1924     return ret;
1925 }
1926 
1927 /**
1928  * Length of an allocated file in bytes. Sparse files are counted by actual
1929  * allocated space. Returns < 0 on error or if the size is unknown.
1930  */
1931 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1932 {
1933     BlockDriver *drv = bs->drv;
1934     if (!drv) {
1935         return -ENOMEDIUM;
1936     }
1937     if (drv->bdrv_get_allocated_file_size) {
1938         return drv->bdrv_get_allocated_file_size(bs);
1939     }
1940     if (bs->file) {
1941         return bdrv_get_allocated_file_size(bs->file);
1942     }
1943     return -ENOTSUP;
1944 }
1945 
1946 /**
1947  * Length of a file in bytes. Returns < 0 on error or if the length is unknown.
1948  */
1949 int64_t bdrv_getlength(BlockDriverState *bs)
1950 {
1951     BlockDriver *drv = bs->drv;
1952     if (!drv)
1953         return -ENOMEDIUM;
1954 
1955     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1956         if (drv->bdrv_getlength) {
1957             return drv->bdrv_getlength(bs);
1958         }
1959     }
1960     return bs->total_sectors * BDRV_SECTOR_SIZE;
1961 }
1962 
1963 /* Return 0 as the number of sectors if no device is present or on error */
1964 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1965 {
1966     int64_t length;
1967     length = bdrv_getlength(bs);
1968     if (length < 0)
1969         length = 0;
1970     else
1971         length = length >> BDRV_SECTOR_BITS;
1972     *nb_sectors_ptr = length;
1973 }
1974 
1975 struct partition {
1976         uint8_t boot_ind;           /* 0x80 - active */
1977         uint8_t head;               /* starting head */
1978         uint8_t sector;             /* starting sector */
1979         uint8_t cyl;                /* starting cylinder */
1980         uint8_t sys_ind;            /* What partition type */
1981         uint8_t end_head;           /* end head */
1982         uint8_t end_sector;         /* end sector */
1983         uint8_t end_cyl;            /* end cylinder */
1984         uint32_t start_sect;        /* starting sector counting from 0 */
1985         uint32_t nr_sects;          /* nr of sectors in partition */
1986 } QEMU_PACKED;
1987 
1988 /* Try to guess the disk logical geometry from the MS-DOS partition table. Return 0 if OK, -1 if it could not be guessed. */
1989 static int guess_disk_lchs(BlockDriverState *bs,
1990                            int *pcylinders, int *pheads, int *psectors)
1991 {
1992     uint8_t buf[BDRV_SECTOR_SIZE];
1993     int ret, i, heads, sectors, cylinders;
1994     struct partition *p;
1995     uint32_t nr_sects;
1996     uint64_t nb_sectors;
1997     bool enabled;
1998 
1999     bdrv_get_geometry(bs, &nb_sectors);
2000 
2001     /**
2002      * This function may be invoked during startup not only in sync I/O mode,
2003      * but also in async I/O mode, so the I/O throttling function has to be
2004      * disabled temporarily here, not permanently.
2005      */
2006     enabled = bs->io_limits_enabled;
2007     bs->io_limits_enabled = false;
2008     ret = bdrv_read(bs, 0, buf, 1);
2009     bs->io_limits_enabled = enabled;
2010     if (ret < 0)
2011         return -1;
2012     /* test msdos magic */
2013     if (buf[510] != 0x55 || buf[511] != 0xaa)
2014         return -1;
2015     for (i = 0; i < 4; i++) {
2016         p = ((struct partition *)(buf + 0x1be)) + i;
2017         nr_sects = le32_to_cpu(p->nr_sects);
2018         if (nr_sects && p->end_head) {
2019             /* We make the assumption that the partition terminates on
2020                a cylinder boundary */
2021             heads = p->end_head + 1;
2022             sectors = p->end_sector & 63;
2023             if (sectors == 0)
2024                 continue;
2025             cylinders = nb_sectors / (heads * sectors);
2026             if (cylinders < 1 || cylinders > 16383)
2027                 continue;
2028             *pheads = heads;
2029             *psectors = sectors;
2030             *pcylinders = cylinders;
2031 #if 0
2032             printf("guessed geometry: LCHS=%d %d %d\n",
2033                    cylinders, heads, sectors);
2034 #endif
2035             return 0;
2036         }
2037     }
2038     return -1;
2039 }
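
/* Worked example (illustrative): if the first primary partition entry
 * reports end_head == 15 and end_sector == 63, then heads = 16 and
 * sectors = 63; for a 1032192-sector disk this yields
 * cylinders = 1032192 / (16 * 63) = 1024, i.e. LCHS 1024/16/63.
 */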
2040 
2041 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2042 {
2043     int translation, lba_detected = 0;
2044     int cylinders, heads, secs;
2045     uint64_t nb_sectors;
2046 
2047     /* if a geometry hint is available, use it */
2048     bdrv_get_geometry(bs, &nb_sectors);
2049     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2050     translation = bdrv_get_translation_hint(bs);
2051     if (cylinders != 0) {
2052         *pcyls = cylinders;
2053         *pheads = heads;
2054         *psecs = secs;
2055     } else {
2056         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2057             if (heads > 16) {
2058                 /* if heads > 16, it means that a BIOS LBA
2059                    translation was active, so the default
2060                    hardware geometry is OK */
2061                 lba_detected = 1;
2062                 goto default_geometry;
2063             } else {
2064                 *pcyls = cylinders;
2065                 *pheads = heads;
2066                 *psecs = secs;
2067                 /* disable any translation to be in sync with
2068                    the logical geometry */
2069                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2070                     bdrv_set_translation_hint(bs,
2071                                               BIOS_ATA_TRANSLATION_NONE);
2072                 }
2073             }
2074         } else {
2075         default_geometry:
2076             /* if no geometry, use a standard physical disk geometry */
2077             cylinders = nb_sectors / (16 * 63);
2078 
2079             if (cylinders > 16383)
2080                 cylinders = 16383;
2081             else if (cylinders < 2)
2082                 cylinders = 2;
2083             *pcyls = cylinders;
2084             *pheads = 16;
2085             *psecs = 63;
2086             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2087                 if ((*pcyls * *pheads) <= 131072) {
2088                     bdrv_set_translation_hint(bs,
2089                                               BIOS_ATA_TRANSLATION_LARGE);
2090                 } else {
2091                     bdrv_set_translation_hint(bs,
2092                                               BIOS_ATA_TRANSLATION_LBA);
2093                 }
2094             }
2095         }
2096         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2097     }
2098 }
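
/* Worked example (illustrative): a 1 GiB image (2097152 sectors) with no
 * geometry hint and no usable partition table falls back to the standard
 * physical geometry: 2097152 / (16 * 63) = 2080 cylinders, 16 heads,
 * 63 sectors per track.
 */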
2099 
2100 void bdrv_set_geometry_hint(BlockDriverState *bs,
2101                             int cyls, int heads, int secs)
2102 {
2103     bs->cyls = cyls;
2104     bs->heads = heads;
2105     bs->secs = secs;
2106 }
2107 
2108 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2109 {
2110     bs->translation = translation;
2111 }
2112 
2113 void bdrv_get_geometry_hint(BlockDriverState *bs,
2114                             int *pcyls, int *pheads, int *psecs)
2115 {
2116     *pcyls = bs->cyls;
2117     *pheads = bs->heads;
2118     *psecs = bs->secs;
2119 }
2120 
2121 /* throttling disk I/O limits */
2122 void bdrv_set_io_limits(BlockDriverState *bs,
2123                         BlockIOLimit *io_limits)
2124 {
2125     bs->io_limits = *io_limits;
2126     bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2127 }
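
/* Usage sketch (illustrative; assumes the BlockIOLimit layout from
 * block_int.h, and the helper name is hypothetical): capping a drive at
 * 1 MiB/s and 100 IOPS total. Fields left at 0 impose no limit.
 *
 * static void example_cap_drive(BlockDriverState *bs)
 * {
 *     BlockIOLimit limits = {
 *         .bps[BLOCK_IO_LIMIT_TOTAL]  = 1 * 1024 * 1024,
 *         .iops[BLOCK_IO_LIMIT_TOTAL] = 100,
 *     };
 *
 *     bdrv_set_io_limits(bs, &limits);
 * }
 */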
2128 
2129 /* Recognize floppy formats */
2130 typedef struct FDFormat {
2131     FDriveType drive;
2132     uint8_t last_sect;
2133     uint8_t max_track;
2134     uint8_t max_head;
2135     FDriveRate rate;
2136 } FDFormat;
2137 
2138 static const FDFormat fd_formats[] = {
2139     /* First entry is default format */
2140     /* 1.44 MB 3"1/2 floppy disks */
2141     { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2142     { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2143     { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2144     { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2145     { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2146     { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2147     { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2148     { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2149     /* 2.88 MB 3"1/2 floppy disks */
2150     { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2151     { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2152     { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2153     { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2154     { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2155     /* 720 kB 3"1/2 floppy disks */
2156     { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
2157     { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2158     { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2159     { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2160     { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2161     { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2162     /* 1.2 MB 5"1/4 floppy disks */
2163     { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2164     { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2165     { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2166     { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2167     { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2168     /* 720 kB 5"1/4 floppy disks */
2169     { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
2170     { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2171     /* 360 kB 5"1/4 floppy disks */
2172     { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
2173     { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
2174     { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2175     { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2176     /* 320 kB 5"1/4 floppy disks */
2177     { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
2178     { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
2179     /* 360 kB must match 5"1/4 better than 3"1/2... */
2180     { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
2181     /* end */
2182     { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2183 };
2184 
2185 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2186                                    int *max_track, int *last_sect,
2187                                    FDriveType drive_in, FDriveType *drive,
2188                                    FDriveRate *rate)
2189 {
2190     const FDFormat *parse;
2191     uint64_t nb_sectors, size;
2192     int i, first_match, match;
2193 
2194     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2195     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2196         /* User defined disk */
2197         *rate = FDRIVE_RATE_500K;
2198     } else {
2199         bdrv_get_geometry(bs, &nb_sectors);
2200         match = -1;
2201         first_match = -1;
2202         for (i = 0; ; i++) {
2203             parse = &fd_formats[i];
2204             if (parse->drive == FDRIVE_DRV_NONE) {
2205                 break;
2206             }
2207             if (drive_in == parse->drive ||
2208                 drive_in == FDRIVE_DRV_NONE) {
2209                 size = (parse->max_head + 1) * parse->max_track *
2210                     parse->last_sect;
2211                 if (nb_sectors == size) {
2212                     match = i;
2213                     break;
2214                 }
2215                 if (first_match == -1) {
2216                     first_match = i;
2217                 }
2218             }
2219         }
2220         if (match == -1) {
2221             if (first_match == -1) {
2222                 match = 1;
2223             } else {
2224                 match = first_match;
2225             }
2226             parse = &fd_formats[match];
2227         }
2228         *nb_heads = parse->max_head + 1;
2229         *max_track = parse->max_track;
2230         *last_sect = parse->last_sect;
2231         *drive = parse->drive;
2232         *rate = parse->rate;
2233     }
2234 }
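
/* Worked example (illustrative): a raw 1.44 MB image is 2880 sectors, and
 * the first FDRIVE_DRV_144 entry gives (max_head + 1) * max_track *
 * last_sect = 2 * 80 * 18 = 2880, so it matches exactly: 2 heads,
 * 80 tracks, 18 sectors per track at a 500 kbit/s rate.
 */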
2235 
2236 int bdrv_get_translation_hint(BlockDriverState *bs)
2237 {
2238     return bs->translation;
2239 }
2240 
2241 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2242                        BlockErrorAction on_write_error)
2243 {
2244     bs->on_read_error = on_read_error;
2245     bs->on_write_error = on_write_error;
2246 }
2247 
2248 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2249 {
2250     return is_read ? bs->on_read_error : bs->on_write_error;
2251 }
2252 
2253 int bdrv_is_read_only(BlockDriverState *bs)
2254 {
2255     return bs->read_only;
2256 }
2257 
2258 int bdrv_is_sg(BlockDriverState *bs)
2259 {
2260     return bs->sg;
2261 }
2262 
2263 int bdrv_enable_write_cache(BlockDriverState *bs)
2264 {
2265     return bs->enable_write_cache;
2266 }
2267 
2268 int bdrv_is_encrypted(BlockDriverState *bs)
2269 {
2270     if (bs->backing_hd && bs->backing_hd->encrypted)
2271         return 1;
2272     return bs->encrypted;
2273 }
2274 
2275 int bdrv_key_required(BlockDriverState *bs)
2276 {
2277     BlockDriverState *backing_hd = bs->backing_hd;
2278 
2279     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2280         return 1;
2281     return (bs->encrypted && !bs->valid_key);
2282 }
2283 
2284 int bdrv_set_key(BlockDriverState *bs, const char *key)
2285 {
2286     int ret;
2287     if (bs->backing_hd && bs->backing_hd->encrypted) {
2288         ret = bdrv_set_key(bs->backing_hd, key);
2289         if (ret < 0)
2290             return ret;
2291         if (!bs->encrypted)
2292             return 0;
2293     }
2294     if (!bs->encrypted) {
2295         return -EINVAL;
2296     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2297         return -ENOMEDIUM;
2298     }
2299     ret = bs->drv->bdrv_set_key(bs, key);
2300     if (ret < 0) {
2301         bs->valid_key = 0;
2302     } else if (!bs->valid_key) {
2303         bs->valid_key = 1;
2304         /* call the change callback now, we skipped it on open */
2305         bdrv_dev_change_media_cb(bs, true);
2306     }
2307     return ret;
2308 }
2309 
2310 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2311 {
2312     if (!bs->drv) {
2313         buf[0] = '\0';
2314     } else {
2315         pstrcpy(buf, buf_size, bs->drv->format_name);
2316     }
2317 }
2318 
2319 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2320                          void *opaque)
2321 {
2322     BlockDriver *drv;
2323 
2324     QLIST_FOREACH(drv, &bdrv_drivers, list) {
2325         it(opaque, drv->format_name);
2326     }
2327 }
2328 
2329 BlockDriverState *bdrv_find(const char *name)
2330 {
2331     BlockDriverState *bs;
2332 
2333     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2334         if (!strcmp(name, bs->device_name)) {
2335             return bs;
2336         }
2337     }
2338     return NULL;
2339 }
2340 
2341 BlockDriverState *bdrv_next(BlockDriverState *bs)
2342 {
2343     if (!bs) {
2344         return QTAILQ_FIRST(&bdrv_states);
2345     }
2346     return QTAILQ_NEXT(bs, list);
2347 }
2348 
2349 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2350 {
2351     BlockDriverState *bs;
2352 
2353     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2354         it(opaque, bs);
2355     }
2356 }
2357 
2358 const char *bdrv_get_device_name(BlockDriverState *bs)
2359 {
2360     return bs->device_name;
2361 }
2362 
2363 void bdrv_flush_all(void)
2364 {
2365     BlockDriverState *bs;
2366 
2367     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2368         bdrv_flush(bs);
2369     }
2370 }
2371 
2372 int bdrv_has_zero_init(BlockDriverState *bs)
2373 {
2374     assert(bs->drv);
2375 
2376     if (bs->drv->bdrv_has_zero_init) {
2377         return bs->drv->bdrv_has_zero_init(bs);
2378     }
2379 
2380     return 1;
2381 }
2382 
2383 typedef struct BdrvCoIsAllocatedData {
2384     BlockDriverState *bs;
2385     int64_t sector_num;
2386     int nb_sectors;
2387     int *pnum;
2388     int ret;
2389     bool done;
2390 } BdrvCoIsAllocatedData;
2391 
2392 /*
2393  * Returns true iff the specified sector is present in the disk image. Drivers
2394  * not implementing the functionality are assumed to not support backing files,
2395  * hence all their sectors are reported as allocated.
2396  *
2397  * If 'sector_num' is beyond the end of the disk image the return value is 0
2398  * and 'pnum' is set to 0.
2399  *
2400  * 'pnum' is set to the number of sectors (including and immediately following
2401  * the specified sector) that are known to be in the same
2402  * allocated/unallocated state.
2403  *
2404  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2405  * beyond the end of the disk image it will be clamped.
2406  */
2407 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2408                                       int nb_sectors, int *pnum)
2409 {
2410     int64_t n;
2411 
2412     if (sector_num >= bs->total_sectors) {
2413         *pnum = 0;
2414         return 0;
2415     }
2416 
2417     n = bs->total_sectors - sector_num;
2418     if (n < nb_sectors) {
2419         nb_sectors = n;
2420     }
2421 
2422     if (!bs->drv->bdrv_co_is_allocated) {
2423         *pnum = nb_sectors;
2424         return 1;
2425     }
2426 
2427     return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2428 }
2429 
2430 /* Coroutine wrapper for bdrv_is_allocated() */
2431 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2432 {
2433     BdrvCoIsAllocatedData *data = opaque;
2434     BlockDriverState *bs = data->bs;
2435 
2436     data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2437                                      data->pnum);
2438     data->done = true;
2439 }
2440 
2441 /*
2442  * Synchronous wrapper around bdrv_co_is_allocated().
2443  *
2444  * See bdrv_co_is_allocated() for details.
2445  */
2446 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2447                       int *pnum)
2448 {
2449     Coroutine *co;
2450     BdrvCoIsAllocatedData data = {
2451         .bs = bs,
2452         .sector_num = sector_num,
2453         .nb_sectors = nb_sectors,
2454         .pnum = pnum,
2455         .done = false,
2456     };
2457 
2458     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2459     qemu_coroutine_enter(co, &data);
2460     while (!data.done) {
2461         qemu_aio_wait();
2462     }
2463     return data.ret;
2464 }
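
/* Usage sketch (illustrative; the helper name is hypothetical): walking an
 * image's allocation map with the synchronous wrapper. Each call reports
 * whether the run starting at 'sector' is allocated and stores the run
 * length in 'num'.
 *
 * static void example_dump_allocation(BlockDriverState *bs)
 * {
 *     int64_t sector = 0;
 *     int num;
 *
 *     while (sector < bs->total_sectors) {
 *         int allocated = bdrv_is_allocated(bs, sector, 65536, &num);
 *         if (allocated < 0) {
 *             break;
 *         }
 *         printf("%" PRId64 ": %d sectors %sallocated\n",
 *                sector, num, allocated ? "" : "un");
 *         sector += num;
 *     }
 * }
 */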
2465 
2466 BlockInfoList *qmp_query_block(Error **errp)
2467 {
2468     BlockInfoList *head = NULL, *cur_item = NULL;
2469     BlockDriverState *bs;
2470 
2471     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2472         BlockInfoList *info = g_malloc0(sizeof(*info));
2473 
2474         info->value = g_malloc0(sizeof(*info->value));
2475         info->value->device = g_strdup(bs->device_name);
2476         info->value->type = g_strdup("unknown");
2477         info->value->locked = bdrv_dev_is_medium_locked(bs);
2478         info->value->removable = bdrv_dev_has_removable_media(bs);
2479 
2480         if (bdrv_dev_has_removable_media(bs)) {
2481             info->value->has_tray_open = true;
2482             info->value->tray_open = bdrv_dev_is_tray_open(bs);
2483         }
2484 
2485         if (bdrv_iostatus_is_enabled(bs)) {
2486             info->value->has_io_status = true;
2487             info->value->io_status = bs->iostatus;
2488         }
2489 
2490         if (bs->drv) {
2491             info->value->has_inserted = true;
2492             info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2493             info->value->inserted->file = g_strdup(bs->filename);
2494             info->value->inserted->ro = bs->read_only;
2495             info->value->inserted->drv = g_strdup(bs->drv->format_name);
2496             info->value->inserted->encrypted = bs->encrypted;
2497             if (bs->backing_file[0]) {
2498                 info->value->inserted->has_backing_file = true;
2499                 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2500             }
2501 
2502             if (bs->io_limits_enabled) {
2503                 info->value->inserted->bps =
2504                                bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2505                 info->value->inserted->bps_rd =
2506                                bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2507                 info->value->inserted->bps_wr =
2508                                bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2509                 info->value->inserted->iops =
2510                                bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2511                 info->value->inserted->iops_rd =
2512                                bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2513                 info->value->inserted->iops_wr =
2514                                bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2515             }
2516         }
2517 
2518         /* XXX: waiting for the qapi to support GSList */
2519         if (!cur_item) {
2520             head = cur_item = info;
2521         } else {
2522             cur_item->next = info;
2523             cur_item = info;
2524         }
2525     }
2526 
2527     return head;
2528 }
2529 
2530 /* Consider exposing this as a full-fledged QMP command */
2531 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2532 {
2533     BlockStats *s;
2534 
2535     s = g_malloc0(sizeof(*s));
2536 
2537     if (bs->device_name[0]) {
2538         s->has_device = true;
2539         s->device = g_strdup(bs->device_name);
2540     }
2541 
2542     s->stats = g_malloc0(sizeof(*s->stats));
2543     s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2544     s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2545     s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2546     s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2547     s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2548     s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2549     s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2550     s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2551     s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2552 
2553     if (bs->file) {
2554         s->has_parent = true;
2555         s->parent = qmp_query_blockstat(bs->file, NULL);
2556     }
2557 
2558     return s;
2559 }
2560 
2561 BlockStatsList *qmp_query_blockstats(Error **errp)
2562 {
2563     BlockStatsList *head = NULL, *cur_item = NULL;
2564     BlockDriverState *bs;
2565 
2566     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2567         BlockStatsList *info = g_malloc0(sizeof(*info));
2568         info->value = qmp_query_blockstat(bs, NULL);
2569 
2570         /* XXX: waiting for the qapi to support GSList */
2571         if (!cur_item) {
2572             head = cur_item = info;
2573         } else {
2574             cur_item->next = info;
2575             cur_item = info;
2576         }
2577     }
2578 
2579     return head;
2580 }
2581 
2582 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2583 {
2584     if (bs->backing_hd && bs->backing_hd->encrypted)
2585         return bs->backing_file;
2586     else if (bs->encrypted)
2587         return bs->filename;
2588     else
2589         return NULL;
2590 }
2591 
2592 void bdrv_get_backing_filename(BlockDriverState *bs,
2593                                char *filename, int filename_size)
2594 {
2595     pstrcpy(filename, filename_size, bs->backing_file);
2596 }
2597 
2598 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2599                           const uint8_t *buf, int nb_sectors)
2600 {
2601     BlockDriver *drv = bs->drv;
2602     if (!drv)
2603         return -ENOMEDIUM;
2604     if (!drv->bdrv_write_compressed)
2605         return -ENOTSUP;
2606     if (bdrv_check_request(bs, sector_num, nb_sectors))
2607         return -EIO;
2608 
2609     if (bs->dirty_bitmap) {
2610         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2611     }
2612 
2613     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2614 }
2615 
2616 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2617 {
2618     BlockDriver *drv = bs->drv;
2619     if (!drv)
2620         return -ENOMEDIUM;
2621     if (!drv->bdrv_get_info)
2622         return -ENOTSUP;
2623     memset(bdi, 0, sizeof(*bdi));
2624     return drv->bdrv_get_info(bs, bdi);
2625 }
2626 
2627 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2628                       int64_t pos, int size)
2629 {
2630     BlockDriver *drv = bs->drv;
2631     if (!drv)
2632         return -ENOMEDIUM;
2633     if (drv->bdrv_save_vmstate)
2634         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2635     if (bs->file)
2636         return bdrv_save_vmstate(bs->file, buf, pos, size);
2637     return -ENOTSUP;
2638 }
2639 
2640 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2641                       int64_t pos, int size)
2642 {
2643     BlockDriver *drv = bs->drv;
2644     if (!drv)
2645         return -ENOMEDIUM;
2646     if (drv->bdrv_load_vmstate)
2647         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2648     if (bs->file)
2649         return bdrv_load_vmstate(bs->file, buf, pos, size);
2650     return -ENOTSUP;
2651 }
2652 
2653 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2654 {
2655     BlockDriver *drv = bs->drv;
2656 
2657     if (!drv || !drv->bdrv_debug_event) {
2658         return;
2659     }
2660 
2661     drv->bdrv_debug_event(bs, event);
2662 
2663 }
2664 
2665 /**************************************************************/
2666 /* handling of snapshots */
2667 
2668 int bdrv_can_snapshot(BlockDriverState *bs)
2669 {
2670     BlockDriver *drv = bs->drv;
2671     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2672         return 0;
2673     }
2674 
2675     if (!drv->bdrv_snapshot_create) {
2676         if (bs->file != NULL) {
2677             return bdrv_can_snapshot(bs->file);
2678         }
2679         return 0;
2680     }
2681 
2682     return 1;
2683 }
2684 
2685 int bdrv_is_snapshot(BlockDriverState *bs)
2686 {
2687     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2688 }
2689 
2690 BlockDriverState *bdrv_snapshots(void)
2691 {
2692     BlockDriverState *bs;
2693 
2694     if (bs_snapshots) {
2695         return bs_snapshots;
2696     }
2697 
2698     bs = NULL;
2699     while ((bs = bdrv_next(bs))) {
2700         if (bdrv_can_snapshot(bs)) {
2701             bs_snapshots = bs;
2702             return bs;
2703         }
2704     }
2705     return NULL;
2706 }
2707 
2708 int bdrv_snapshot_create(BlockDriverState *bs,
2709                          QEMUSnapshotInfo *sn_info)
2710 {
2711     BlockDriver *drv = bs->drv;
2712     if (!drv)
2713         return -ENOMEDIUM;
2714     if (drv->bdrv_snapshot_create)
2715         return drv->bdrv_snapshot_create(bs, sn_info);
2716     if (bs->file)
2717         return bdrv_snapshot_create(bs->file, sn_info);
2718     return -ENOTSUP;
2719 }
2720 
2721 int bdrv_snapshot_goto(BlockDriverState *bs,
2722                        const char *snapshot_id)
2723 {
2724     BlockDriver *drv = bs->drv;
2725     int ret, open_ret;
2726 
2727     if (!drv)
2728         return -ENOMEDIUM;
2729     if (drv->bdrv_snapshot_goto)
2730         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2731 
2732     if (bs->file) {
2733         drv->bdrv_close(bs);
2734         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2735         open_ret = drv->bdrv_open(bs, bs->open_flags);
2736         if (open_ret < 0) {
2737             bdrv_delete(bs->file);
2738             bs->drv = NULL;
2739             return open_ret;
2740         }
2741         return ret;
2742     }
2743 
2744     return -ENOTSUP;
2745 }
2746 
2747 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2748 {
2749     BlockDriver *drv = bs->drv;
2750     if (!drv)
2751         return -ENOMEDIUM;
2752     if (drv->bdrv_snapshot_delete)
2753         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2754     if (bs->file)
2755         return bdrv_snapshot_delete(bs->file, snapshot_id);
2756     return -ENOTSUP;
2757 }
2758 
2759 int bdrv_snapshot_list(BlockDriverState *bs,
2760                        QEMUSnapshotInfo **psn_info)
2761 {
2762     BlockDriver *drv = bs->drv;
2763     if (!drv)
2764         return -ENOMEDIUM;
2765     if (drv->bdrv_snapshot_list)
2766         return drv->bdrv_snapshot_list(bs, psn_info);
2767     if (bs->file)
2768         return bdrv_snapshot_list(bs->file, psn_info);
2769     return -ENOTSUP;
2770 }
2771 
2772 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2773         const char *snapshot_name)
2774 {
2775     BlockDriver *drv = bs->drv;
2776     if (!drv) {
2777         return -ENOMEDIUM;
2778     }
2779     if (!bs->read_only) {
2780         return -EINVAL;
2781     }
2782     if (drv->bdrv_snapshot_load_tmp) {
2783         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2784     }
2785     return -ENOTSUP;
2786 }
2787 
2788 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2789         const char *backing_file)
2790 {
2791     if (!bs->drv) {
2792         return NULL;
2793     }
2794 
2795     if (bs->backing_hd) {
2796         if (strcmp(bs->backing_file, backing_file) == 0) {
2797             return bs->backing_hd;
2798         } else {
2799             return bdrv_find_backing_image(bs->backing_hd, backing_file);
2800         }
2801     }
2802 
2803     return NULL;
2804 }
2805 
2806 #define NB_SUFFIXES 4
2807 
2808 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2809 {
2810     static const char suffixes[NB_SUFFIXES] = "KMGT";
2811     int64_t base;
2812     int i;
2813 
2814     if (size <= 999) {
2815         snprintf(buf, buf_size, "%" PRId64, size);
2816     } else {
2817         base = 1024;
2818         for (i = 0; i < NB_SUFFIXES; i++) {
2819             if (size < (10 * base)) {
2820                 snprintf(buf, buf_size, "%0.1f%c",
2821                          (double)size / base,
2822                          suffixes[i]);
2823                 break;
2824             } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2825                 snprintf(buf, buf_size, "%" PRId64 "%c",
2826                          ((size + (base >> 1)) / base),
2827                          suffixes[i]);
2828                 break;
2829             }
2830             base = base * 1024;
2831         }
2832     }
2833     return buf;
2834 }
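
/* Examples (illustrative): values up to 999 are printed as-is; larger
 * values are scaled to the next unit, with one decimal place while the
 * scaled value stays below 10:
 *
 * char buf[16];
 * get_human_readable_size(buf, sizeof(buf), 999);      // "999"
 * get_human_readable_size(buf, sizeof(buf), 1536);     // "1.5K"
 * get_human_readable_size(buf, sizeof(buf), 1 << 30);  // "1.0G"
 */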
2835 
2836 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2837 {
2838     char buf1[128], date_buf[128], clock_buf[128];
2839 #ifdef _WIN32
2840     struct tm *ptm;
2841 #else
2842     struct tm tm;
2843 #endif
2844     time_t ti;
2845     int64_t secs;
2846 
2847     if (!sn) {
2848         snprintf(buf, buf_size,
2849                  "%-10s%-20s%7s%20s%15s",
2850                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2851     } else {
2852         ti = sn->date_sec;
2853 #ifdef _WIN32
2854         ptm = localtime(&ti);
2855         strftime(date_buf, sizeof(date_buf),
2856                  "%Y-%m-%d %H:%M:%S", ptm);
2857 #else
2858         localtime_r(&ti, &tm);
2859         strftime(date_buf, sizeof(date_buf),
2860                  "%Y-%m-%d %H:%M:%S", &tm);
2861 #endif
2862         secs = sn->vm_clock_nsec / 1000000000;
2863         snprintf(clock_buf, sizeof(clock_buf),
2864                  "%02d:%02d:%02d.%03d",
2865                  (int)(secs / 3600),
2866                  (int)((secs / 60) % 60),
2867                  (int)(secs % 60),
2868                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2869         snprintf(buf, buf_size,
2870                  "%-10s%-20s%7s%20s%15s",
2871                  sn->id_str, sn->name,
2872                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2873                  date_buf,
2874                  clock_buf);
2875     }
2876     return buf;
2877 }
2878 
2879 /**************************************************************/
2880 /* async I/Os */
2881 
2882 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2883                                  QEMUIOVector *qiov, int nb_sectors,
2884                                  BlockDriverCompletionFunc *cb, void *opaque)
2885 {
2886     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2887 
2888     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2889                                  cb, opaque, false);
2890 }
2891 
2892 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2893                                   QEMUIOVector *qiov, int nb_sectors,
2894                                   BlockDriverCompletionFunc *cb, void *opaque)
2895 {
2896     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2897 
2898     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2899                                  cb, opaque, true);
2900 }
2901 
2902 
2903 typedef struct MultiwriteCB {
2904     int error;
2905     int num_requests;
2906     int num_callbacks;
2907     struct {
2908         BlockDriverCompletionFunc *cb;
2909         void *opaque;
2910         QEMUIOVector *free_qiov;
2911     } callbacks[];
2912 } MultiwriteCB;
2913 
2914 static void multiwrite_user_cb(MultiwriteCB *mcb)
2915 {
2916     int i;
2917 
2918     for (i = 0; i < mcb->num_callbacks; i++) {
2919         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2920         if (mcb->callbacks[i].free_qiov) {
2921             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2922         }
2923         g_free(mcb->callbacks[i].free_qiov);
2924     }
2925 }
2926 
2927 static void multiwrite_cb(void *opaque, int ret)
2928 {
2929     MultiwriteCB *mcb = opaque;
2930 
2931     trace_multiwrite_cb(mcb, ret);
2932 
2933     if (ret < 0 && !mcb->error) {
2934         mcb->error = ret;
2935     }
2936 
2937     mcb->num_requests--;
2938     if (mcb->num_requests == 0) {
2939         multiwrite_user_cb(mcb);
2940         g_free(mcb);
2941     }
2942 }
2943 
2944 static int multiwrite_req_compare(const void *a, const void *b)
2945 {
2946     const BlockRequest *req1 = a, *req2 = b;
2947 
2948     /*
2949      * Note that we can't simply subtract req2->sector from req1->sector
2950      * here as that could overflow the return value.
2951      */
2952     if (req1->sector > req2->sector) {
2953         return 1;
2954     } else if (req1->sector < req2->sector) {
2955         return -1;
2956     } else {
2957         return 0;
2958     }
2959 }
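
/* Illustration: with 64-bit sector numbers, req1->sector - req2->sector
 * can be e.g. 0x100000000, which truncates to 0 when narrowed to the int
 * return value and would wrongly make the requests compare as equal.
 */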
2960 
2961 /*
2962  * Takes a bunch of requests and tries to merge them. Returns the number of
2963  * requests that remain after merging.
2964  */
2965 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2966     int num_reqs, MultiwriteCB *mcb)
2967 {
2968     int i, outidx;
2969 
2970     // Sort requests by start sector
2971     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2972 
2973     // Check if adjacent requests touch the same clusters. If so, combine them;
2974     // only sequential or overlapping requests are merged, so gaps never occur.
2975     outidx = 0;
2976     for (i = 1; i < num_reqs; i++) {
2977         int merge = 0;
2978         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2979 
2980         // Handle exactly sequential writes and overlapping writes.
2981         if (reqs[i].sector <= oldreq_last) {
2982             merge = 1;
2983         }
2984 
2985         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2986             merge = 0;
2987         }
2988 
2989         if (merge) {
2990             size_t size;
2991             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2992             qemu_iovec_init(qiov,
2993                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2994 
2995             // Add the first request to the merged one. If the requests are
2996             // overlapping, drop the last sectors of the first request.
2997             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2998             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2999 
3000             // We shouldn't need to add any zeros between the two requests
3001             assert(reqs[i].sector <= oldreq_last);
3002 
3003             // Add the second request
3004             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3005 
3006             reqs[outidx].nb_sectors = qiov->size >> 9;
3007             reqs[outidx].qiov = qiov;
3008 
3009             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3010         } else {
3011             outidx++;
3012             reqs[outidx].sector     = reqs[i].sector;
3013             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3014             reqs[outidx].qiov       = reqs[i].qiov;
3015         }
3016     }
3017 
3018     return outidx + 1;
3019 }
3020 
3021 /*
3022  * Submit multiple AIO write requests at once.
3023  *
3024  * On success, the function returns 0 and all requests in the reqs array have
3025  * been submitted. On error, this function returns -1 and the requests may or
3026  * may not have been submitted yet; the callback will be invoked for some of
3027  * the requests but not for others. The caller must check the error field of
3028  * each BlockRequest to know which callbacks to wait for (if error != 0, no
3029  * callback will be invoked for that request).
3030  *
3031  * The implementation may modify the contents of the reqs array, e.g. to merge
3032  * requests. However, the fields opaque and error are left unmodified as they
3033  * are used to signal failure for a single request to the caller.
3034  */
3035 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3036 {
3037     MultiwriteCB *mcb;
3038     int i;
3039 
3040     /* don't submit writes if we don't have a medium */
3041     if (bs->drv == NULL) {
3042         for (i = 0; i < num_reqs; i++) {
3043             reqs[i].error = -ENOMEDIUM;
3044         }
3045         return -1;
3046     }
3047 
3048     if (num_reqs == 0) {
3049         return 0;
3050     }
3051 
3052     // Create MultiwriteCB structure
3053     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3054     mcb->num_requests = 0;
3055     mcb->num_callbacks = num_reqs;
3056 
3057     for (i = 0; i < num_reqs; i++) {
3058         mcb->callbacks[i].cb = reqs[i].cb;
3059         mcb->callbacks[i].opaque = reqs[i].opaque;
3060     }
3061 
3062     // Check for mergeable requests
3063     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3064 
3065     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3066 
3067     /* Run the aio requests. */
3068     mcb->num_requests = num_reqs;
3069     for (i = 0; i < num_reqs; i++) {
3070         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3071             reqs[i].nb_sectors, multiwrite_cb, mcb);
3072     }
3073 
3074     return 0;
3075 }
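
/* Usage sketch (illustrative; the callback and the two initialized
 * QEMUIOVectors are hypothetical): batching two sequential writes.
 * Adjacent requests may be merged by multiwrite_merge() before they are
 * submitted.
 *
 * BlockRequest reqs[2] = {
 *     { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *       .cb = example_cb, .opaque = NULL },
 *     { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *       .cb = example_cb, .opaque = NULL },
 * };
 *
 * if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *     // inspect reqs[i].error; callbacks only fire for requests with
 *     // error == 0
 * }
 */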
3076 
3077 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3078 {
3079     acb->pool->cancel(acb);
3080 }
3081 
3082 /* block I/O throttling */
3083 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3084                  bool is_write, double elapsed_time, uint64_t *wait)
3085 {
3086     uint64_t bps_limit = 0;
3087     double   bytes_limit, bytes_base, bytes_res;
3088     double   slice_time, wait_time;
3089 
3090     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3091         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3092     } else if (bs->io_limits.bps[is_write]) {
3093         bps_limit = bs->io_limits.bps[is_write];
3094     } else {
3095         if (wait) {
3096             *wait = 0;
3097         }
3098 
3099         return false;
3100     }
3101 
3102     slice_time = bs->slice_end - bs->slice_start;
3103     slice_time /= (NANOSECONDS_PER_SECOND);
3104     bytes_limit = bps_limit * slice_time;
3105     bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3106     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3107         bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3108     }
3109 
3110     /* bytes_base: the number of bytes already read/written in this slice,
3111      *             obtained from the accumulated statistics.
3112      * bytes_res: the number of bytes that still need to be read/written.
3113      * (bytes_base + bytes_res) / bps_limit: used to calculate the total
3114      *             time needed to complete reading/writing all the data.
3115      */
3116     bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3117 
3118     if (bytes_base + bytes_res <= bytes_limit) {
3119         if (wait) {
3120             *wait = 0;
3121         }
3122 
3123         return false;
3124     }
3125 
3126     /* Calc approx time to dispatch */
3127     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3128 
3129     /* When the I/O rate at runtime exceeds the limits, bs->slice_end needs
3130      * to be extended so that the current statistics are kept until the timer
3131      * fires; it is therefore increased by an amount tuned based on
3132      * experimental results.
3133      */
3134     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3135     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3136     if (wait) {
3137         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3138     }
3139 
3140     return true;
3141 }
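
/* Worked example (illustrative): with bps_limit = 1 MiB/s, 900 KiB
 * already accounted in this slice and a 200 KiB request arriving at
 * elapsed_time = 0.9 s, the limit is exceeded and
 * wait_time = (900 + 200) KiB / 1024 KiB/s - 0.9 s ~= 0.17 s must pass
 * before the request may be dispatched.
 */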
3142 
3143 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3144                              double elapsed_time, uint64_t *wait)
3145 {
3146     uint64_t iops_limit = 0;
3147     double   ios_limit, ios_base;
3148     double   slice_time, wait_time;
3149 
3150     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3151         iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3152     } else if (bs->io_limits.iops[is_write]) {
3153         iops_limit = bs->io_limits.iops[is_write];
3154     } else {
3155         if (wait) {
3156             *wait = 0;
3157         }
3158 
3159         return false;
3160     }
3161 
3162     slice_time = bs->slice_end - bs->slice_start;
3163     slice_time /= (NANOSECONDS_PER_SECOND);
3164     ios_limit  = iops_limit * slice_time;
3165     ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3166     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3167         ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3168     }
3169 
3170     if (ios_base + 1 <= ios_limit) {
3171         if (wait) {
3172             *wait = 0;
3173         }
3174 
3175         return false;
3176     }
3177 
3178     /* Calc approx time to dispatch */
3179     wait_time = (ios_base + 1) / iops_limit;
3180     if (wait_time > elapsed_time) {
3181         wait_time = wait_time - elapsed_time;
3182     } else {
3183         wait_time = 0;
3184     }
3185 
3186     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3187     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3188     if (wait) {
3189         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3190     }
3191 
3192     return true;
3193 }
3194 
3195 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3196                            bool is_write, int64_t *wait)
3197 {
3198     int64_t  now, max_wait;
3199     uint64_t bps_wait = 0, iops_wait = 0;
3200     double   elapsed_time;
3201     int      bps_ret, iops_ret;
3202 
3203     now = qemu_get_clock_ns(vm_clock);
3204     if ((bs->slice_start < now)
3205         && (bs->slice_end > now)) {
3206         bs->slice_end = now + bs->slice_time;
3207     } else {
3208         bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
3209         bs->slice_start = now;
3210         bs->slice_end   = now + bs->slice_time;
3211 
3212         bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
3213         bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3214 
3215         bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
3216         bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
3217     }
3218 
3219     elapsed_time  = now - bs->slice_start;
3220     elapsed_time  /= (NANOSECONDS_PER_SECOND);
3221 
3222     bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
3223                                       is_write, elapsed_time, &bps_wait);
3224     iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3225                                       elapsed_time, &iops_wait);
3226     if (bps_ret || iops_ret) {
3227         max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3228         if (wait) {
3229             *wait = max_wait;
3230         }
3231 
3232         now = qemu_get_clock_ns(vm_clock);
3233         if (bs->slice_end < now + max_wait) {
3234             bs->slice_end = now + max_wait;
3235         }
3236 
3237         return true;
3238     }
3239 
3240     if (wait) {
3241         *wait = 0;
3242     }
3243 
3244     return false;
3245 }
3246 
3247 /**************************************************************/
3248 /* async block device emulation */
3249 
3250 typedef struct BlockDriverAIOCBSync {
3251     BlockDriverAIOCB common;
3252     QEMUBH *bh;
3253     int ret;
3254     /* vector translation state */
3255     QEMUIOVector *qiov;
3256     uint8_t *bounce;
3257     int is_write;
3258 } BlockDriverAIOCBSync;
3259 
3260 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3261 {
3262     BlockDriverAIOCBSync *acb =
3263         container_of(blockacb, BlockDriverAIOCBSync, common);
3264     qemu_bh_delete(acb->bh);
3265     acb->bh = NULL;
3266     qemu_aio_release(acb);
3267 }
3268 
3269 static AIOPool bdrv_em_aio_pool = {
3270     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3271     .cancel             = bdrv_aio_cancel_em,
3272 };
3273 
3274 static void bdrv_aio_bh_cb(void *opaque)
3275 {
3276     BlockDriverAIOCBSync *acb = opaque;
3277 
3278     if (!acb->is_write)
3279         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3280     qemu_vfree(acb->bounce);
3281     acb->common.cb(acb->common.opaque, acb->ret);
3282     qemu_bh_delete(acb->bh);
3283     acb->bh = NULL;
3284     qemu_aio_release(acb);
3285 }
3286 
3287 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3288                                             int64_t sector_num,
3289                                             QEMUIOVector *qiov,
3290                                             int nb_sectors,
3291                                             BlockDriverCompletionFunc *cb,
3292                                             void *opaque,
3293                                             int is_write)
3294 
3295 {
3296     BlockDriverAIOCBSync *acb;
3297 
3298     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3299     acb->is_write = is_write;
3300     acb->qiov = qiov;
3301     acb->bounce = qemu_blockalign(bs, qiov->size);
3302     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3303 
3304     if (is_write) {
3305         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3306         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3307     } else {
3308         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3309     }
3310 
3311     qemu_bh_schedule(acb->bh);
3312 
3313     return &acb->common;
3314 }
3315 
3316 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3317         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3318         BlockDriverCompletionFunc *cb, void *opaque)
3319 {
3320     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3321 }
3322 
3323 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3324         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3325         BlockDriverCompletionFunc *cb, void *opaque)
3326 {
3327     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3328 }
3329 
3330 
3331 typedef struct BlockDriverAIOCBCoroutine {
3332     BlockDriverAIOCB common;
3333     BlockRequest req;
3334     bool is_write;
3335     QEMUBH* bh;
3336 } BlockDriverAIOCBCoroutine;
3337 
3338 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3339 {
3340     qemu_aio_flush();
3341 }
3342 
3343 static AIOPool bdrv_em_co_aio_pool = {
3344     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3345     .cancel             = bdrv_aio_co_cancel_em,
3346 };
3347 
3348 static void bdrv_co_em_bh(void *opaque)
3349 {
3350     BlockDriverAIOCBCoroutine *acb = opaque;
3351 
3352     acb->common.cb(acb->common.opaque, acb->req.error);
3353     qemu_bh_delete(acb->bh);
3354     qemu_aio_release(acb);
3355 }
3356 
3357 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3358 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3359 {
3360     BlockDriverAIOCBCoroutine *acb = opaque;
3361     BlockDriverState *bs = acb->common.bs;
3362 
3363     if (!acb->is_write) {
3364         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3365             acb->req.nb_sectors, acb->req.qiov, 0);
3366     } else {
3367         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3368             acb->req.nb_sectors, acb->req.qiov, 0);
3369     }
3370 
3371     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3372     qemu_bh_schedule(acb->bh);
3373 }
3374 
3375 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3376                                                int64_t sector_num,
3377                                                QEMUIOVector *qiov,
3378                                                int nb_sectors,
3379                                                BlockDriverCompletionFunc *cb,
3380                                                void *opaque,
3381                                                bool is_write)
3382 {
3383     Coroutine *co;
3384     BlockDriverAIOCBCoroutine *acb;
3385 
3386     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3387     acb->req.sector = sector_num;
3388     acb->req.nb_sectors = nb_sectors;
3389     acb->req.qiov = qiov;
3390     acb->is_write = is_write;
3391 
3392     co = qemu_coroutine_create(bdrv_co_do_rw);
3393     qemu_coroutine_enter(co, acb);
3394 
3395     return &acb->common;
3396 }
3397 
3398 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3399 {
3400     BlockDriverAIOCBCoroutine *acb = opaque;
3401     BlockDriverState *bs = acb->common.bs;
3402 
3403     acb->req.error = bdrv_co_flush(bs);
3404     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3405     qemu_bh_schedule(acb->bh);
3406 }
3407 
3408 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3409         BlockDriverCompletionFunc *cb, void *opaque)
3410 {
3411     trace_bdrv_aio_flush(bs, opaque);
3412 
3413     Coroutine *co;
3414     BlockDriverAIOCBCoroutine *acb;
3415 
3416     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3417     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3418     qemu_coroutine_enter(co, acb);
3419 
3420     return &acb->common;
3421 }
3422 
3423 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3424 {
3425     BlockDriverAIOCBCoroutine *acb = opaque;
3426     BlockDriverState *bs = acb->common.bs;
3427 
3428     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3429     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3430     qemu_bh_schedule(acb->bh);
3431 }
3432 
3433 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3434         int64_t sector_num, int nb_sectors,
3435         BlockDriverCompletionFunc *cb, void *opaque)
3436 {
3437     Coroutine *co;
3438     BlockDriverAIOCBCoroutine *acb;
3439 
3440     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3441 
3442     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3443     acb->req.sector = sector_num;
3444     acb->req.nb_sectors = nb_sectors;
3445     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3446     qemu_coroutine_enter(co, acb);
3447 
3448     return &acb->common;
3449 }
3450 
3451 void bdrv_init(void)
3452 {
3453     module_call_init(MODULE_INIT_BLOCK);
3454 }
3455 
3456 void bdrv_init_with_whitelist(void)
3457 {
3458     use_bdrv_whitelist = 1;
3459     bdrv_init();
3460 }
3461 
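/* AIOCBs are recycled through a per-pool free list: qemu_aio_release()
 * pushes a finished AIOCB onto pool->free_aiocb, and qemu_aio_get() pops
 * from there before falling back to g_malloc0(), so steady-state I/O does
 * not allocate memory. */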
3462 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3463                    BlockDriverCompletionFunc *cb, void *opaque)
3464 {
3465     BlockDriverAIOCB *acb;
3466 
3467     if (pool->free_aiocb) {
3468         acb = pool->free_aiocb;
3469         pool->free_aiocb = acb->next;
3470     } else {
3471         acb = g_malloc0(pool->aiocb_size);
3472         acb->pool = pool;
3473     }
3474     acb->bs = bs;
3475     acb->cb = cb;
3476     acb->opaque = opaque;
3477     return acb;
3478 }
3479 
3480 void qemu_aio_release(void *p)
3481 {
3482     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3483     AIOPool *pool = acb->pool;
3484     acb->next = pool->free_aiocb;
3485     pool->free_aiocb = acb;
3486 }
3487 
3488 /**************************************************************/
3489 /* Coroutine block device emulation */
3490 
3491 typedef struct CoroutineIOCompletion {
3492     Coroutine *coroutine;
3493     int ret;
3494 } CoroutineIOCompletion;
3495 
3496 static void bdrv_co_io_em_complete(void *opaque, int ret)
3497 {
3498     CoroutineIOCompletion *co = opaque;
3499 
3500     co->ret = ret;
3501     qemu_coroutine_enter(co->coroutine, NULL);
3502 }
3503 
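/* Emulate a coroutine read/write on top of a driver that only provides the
 * callback-based bdrv_aio_readv/bdrv_aio_writev interface: submit the AIO
 * request with bdrv_co_io_em_complete() as its callback, yield, and pick up
 * the result when the callback re-enters this coroutine. */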
3504 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3505                                       int nb_sectors, QEMUIOVector *iov,
3506                                       bool is_write)
3507 {
3508     CoroutineIOCompletion co = {
3509         .coroutine = qemu_coroutine_self(),
3510     };
3511     BlockDriverAIOCB *acb;
3512 
3513     if (is_write) {
3514         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3515                                        bdrv_co_io_em_complete, &co);
3516     } else {
3517         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3518                                       bdrv_co_io_em_complete, &co);
3519     }
3520 
3521     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3522     if (!acb) {
3523         return -EIO;
3524     }
3525     qemu_coroutine_yield();
3526 
3527     return co.ret;
3528 }
3529 
3530 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3531                                          int64_t sector_num, int nb_sectors,
3532                                          QEMUIOVector *iov)
3533 {
3534     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3535 }
3536 
3537 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3538                                          int64_t sector_num, int nb_sectors,
3539                                          QEMUIOVector *iov)
3540 {
3541     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3542 }
3543 
3544 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3545 {
3546     RwCo *rwco = opaque;
3547 
3548     rwco->ret = bdrv_co_flush(rwco->bs);
3549 }
3550 
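/* Flush is a cascade: first ask the format driver to write its cached data
 * out to the OS (this happens even with cache=unsafe), then, unless
 * BDRV_O_NO_FLUSH is set, force the data to stable storage through whatever
 * interface the driver offers, and finally recurse into bs->file so that
 * the protocol layer gets flushed as well. */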
3551 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3552 {
3553     int ret;
3554 
3555     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3556         return 0;
3557     }
3558 
3559     /* Write back cached data to the OS even with cache=unsafe */
3560     if (bs->drv->bdrv_co_flush_to_os) {
3561         ret = bs->drv->bdrv_co_flush_to_os(bs);
3562         if (ret < 0) {
3563             return ret;
3564         }
3565     }
3566 
3567     /* But don't actually force it to the disk with cache=unsafe */
3568     if (bs->open_flags & BDRV_O_NO_FLUSH) {
3569         return 0;
3570     }
3571 
3572     if (bs->drv->bdrv_co_flush_to_disk) {
3573         ret = bs->drv->bdrv_co_flush_to_disk(bs);
3574     } else if (bs->drv->bdrv_aio_flush) {
3575         BlockDriverAIOCB *acb;
3576         CoroutineIOCompletion co = {
3577             .coroutine = qemu_coroutine_self(),
3578         };
3579 
3580         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3581         if (acb == NULL) {
3582             ret = -EIO;
3583         } else {
3584             qemu_coroutine_yield();
3585             ret = co.ret;
3586         }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the backend behaves (the behaviour may be hardcoded or
         * depend on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work,
         * because that would break guests even if the backend operates in
         * writethrough mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
3601     if (ret < 0) {
3602         return ret;
3603     }
3604 
3605     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
3606      * in the case of cache=unsafe, so there are no useless flushes.
3607      */
3608     return bdrv_co_flush(bs->file);
3609 }
3610 
3611 void bdrv_invalidate_cache(BlockDriverState *bs)
3612 {
3613     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3614         bs->drv->bdrv_invalidate_cache(bs);
3615     }
3616 }
3617 
3618 void bdrv_invalidate_cache_all(void)
3619 {
3620     BlockDriverState *bs;
3621 
3622     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3623         bdrv_invalidate_cache(bs);
3624     }
3625 }
3626 
3627 void bdrv_clear_incoming_migration_all(void)
3628 {
3629     BlockDriverState *bs;
3630 
3631     QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags &= ~BDRV_O_INCOMING;
3633     }
3634 }
3635 
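/* Synchronous wrapper around bdrv_co_flush().  In coroutine context the
 * entry function can be called directly and completes in place; otherwise
 * we spawn a coroutine and service AIO with qemu_aio_wait() until the
 * result changes from NOT_DONE. */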
3636 int bdrv_flush(BlockDriverState *bs)
3637 {
3638     Coroutine *co;
3639     RwCo rwco = {
3640         .bs = bs,
3641         .ret = NOT_DONE,
3642     };
3643 
3644     if (qemu_in_coroutine()) {
3645         /* Fast-path if already in coroutine context */
3646         bdrv_flush_co_entry(&rwco);
3647     } else {
3648         co = qemu_coroutine_create(bdrv_flush_co_entry);
3649         qemu_coroutine_enter(co, &rwco);
3650         while (rwco.ret == NOT_DONE) {
3651             qemu_aio_wait();
3652         }
3653     }
3654 
3655     return rwco.ret;
3656 }
3657 
3658 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3659 {
3660     RwCo *rwco = opaque;
3661 
3662     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3663 }
3664 
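/* Discard uses the same capability ladder as flush: prefer the driver's
 * native coroutine implementation, emulate on top of the AIO interface if
 * that is all the driver has, and silently succeed when neither exists,
 * because discard is advisory and ignoring it is always correct. */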
3665 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3666                                  int nb_sectors)
3667 {
3668     if (!bs->drv) {
3669         return -ENOMEDIUM;
3670     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3671         return -EIO;
3672     } else if (bs->read_only) {
3673         return -EROFS;
3674     } else if (bs->drv->bdrv_co_discard) {
3675         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3676     } else if (bs->drv->bdrv_aio_discard) {
3677         BlockDriverAIOCB *acb;
3678         CoroutineIOCompletion co = {
3679             .coroutine = qemu_coroutine_self(),
3680         };
3681 
3682         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3683                                         bdrv_co_io_em_complete, &co);
3684         if (acb == NULL) {
3685             return -EIO;
3686         } else {
3687             qemu_coroutine_yield();
3688             return co.ret;
3689         }
3690     } else {
3691         return 0;
3692     }
3693 }
3694 
3695 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3696 {
3697     Coroutine *co;
3698     RwCo rwco = {
3699         .bs = bs,
3700         .sector_num = sector_num,
3701         .nb_sectors = nb_sectors,
3702         .ret = NOT_DONE,
3703     };
3704 
3705     if (qemu_in_coroutine()) {
3706         /* Fast-path if already in coroutine context */
3707         bdrv_discard_co_entry(&rwco);
3708     } else {
3709         co = qemu_coroutine_create(bdrv_discard_co_entry);
3710         qemu_coroutine_enter(co, &rwco);
3711         while (rwco.ret == NOT_DONE) {
3712             qemu_aio_wait();
3713         }
3714     }
3715 
3716     return rwco.ret;
3717 }
3718 
3719 /**************************************************************/
3720 /* removable device support */
3721 
3722 /**
3723  * Return TRUE if the media is present
3724  */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}
3735 
3736 /**
3737  * Return whether the media changed since the last call to this
3738  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3739  */
3740 int bdrv_media_changed(BlockDriverState *bs)
3741 {
3742     BlockDriver *drv = bs->drv;
3743 
3744     if (drv && drv->bdrv_media_changed) {
3745         return drv->bdrv_media_changed(bs);
3746     }
3747     return -ENOTSUP;
3748 }
3749 
3750 /**
3751  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3752  */
3753 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3754 {
3755     BlockDriver *drv = bs->drv;
3756 
3757     if (drv && drv->bdrv_eject) {
3758         drv->bdrv_eject(bs, eject_flag);
3759     }
3760 
3761     if (bs->device_name[0] != '\0') {
3762         bdrv_emit_qmp_eject_event(bs, eject_flag);
3763     }
3764 }
3765 
3766 /**
3767  * Lock or unlock the media (if it is locked, the user won't be able
3768  * to eject it manually).
3769  */
3770 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3771 {
3772     BlockDriver *drv = bs->drv;
3773 
3774     trace_bdrv_lock_medium(bs, locked);
3775 
3776     if (drv && drv->bdrv_lock_medium) {
3777         drv->bdrv_lock_medium(bs, locked);
3778     }
3779 }
3780 
3781 /* needed for generic scsi interface */
3782 
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}
3791 
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}
3802 
3803 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3804 {
3805     bs->buffer_alignment = align;
3806 }
3807 
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}
3812 
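/* The dirty bitmap stores one bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors.
 * The calculation below converts the device length into a byte count for
 * the bitmap: number of sectors, divided by (sectors per chunk * 8 chunks
 * per byte), rounded up. */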
3813 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3814 {
3815     int64_t bitmap_size;
3816 
3817     bs->dirty_count = 0;
3818     if (enable) {
3819         if (!bs->dirty_bitmap) {
3820             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3821                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3822             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3823 
3824             bs->dirty_bitmap = g_malloc0(bitmap_size);
3825         }
3826     } else {
3827         if (bs->dirty_bitmap) {
3828             g_free(bs->dirty_bitmap);
3829             bs->dirty_bitmap = NULL;
3830         }
3831     }
3832 }
3833 
3834 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3835 {
3836     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3837 
3838     if (bs->dirty_bitmap &&
3839         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3840         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3841             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3842     } else {
3843         return 0;
3844     }
3845 }
3846 
3847 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3848                       int nr_sectors)
3849 {
3850     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3851 }
3852 
3853 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3854 {
3855     return bs->dirty_count;
3856 }
3857 
3858 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3859 {
3860     assert(bs->in_use != in_use);
3861     bs->in_use = in_use;
3862 }
3863 
3864 int bdrv_in_use(BlockDriverState *bs)
3865 {
3866     return bs->in_use;
3867 }
3868 
3869 void bdrv_iostatus_enable(BlockDriverState *bs)
3870 {
3871     bs->iostatus_enabled = true;
3872     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3873 }
3874 
3875 /* The I/O status is only enabled if the drive explicitly
3876  * enables it _and_ the VM is configured to stop on errors */
3877 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3878 {
3879     return (bs->iostatus_enabled &&
3880            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3881             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3882             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3883 }
3884 
3885 void bdrv_iostatus_disable(BlockDriverState *bs)
3886 {
3887     bs->iostatus_enabled = false;
3888 }
3889 
3890 void bdrv_iostatus_reset(BlockDriverState *bs)
3891 {
3892     if (bdrv_iostatus_is_enabled(bs)) {
3893         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3894     }
3895 }
3896 
3897 /* XXX: Today this is set by device models because it makes the implementation
3898    quite simple. However, the block layer knows about the error, so it's
3899    possible to implement this without device models being involved */
3900 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3901 {
3902     if (bdrv_iostatus_is_enabled(bs) &&
3903         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3904         assert(error >= 0);
3905         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3906                                          BLOCK_DEVICE_IO_STATUS_FAILED;
3907     }
3908 }
3909 
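/* I/O accounting: a device model brackets each request with a cookie, e.g.
 * (sketch only; the exact call site depends on the device model):
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ...issue and complete the read...
 *     bdrv_acct_done(bs, &cookie);
 *
 * bdrv_acct_done() accumulates bytes, operation count and latency into the
 * per-device, per-type totals reported by query-blockstats. */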
void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
                     int64_t bytes, enum BlockAcctType type)
{
3914     assert(type < BDRV_MAX_IOTYPE);
3915 
3916     cookie->bytes = bytes;
3917     cookie->start_time_ns = get_clock();
3918     cookie->type = type;
3919 }
3920 
void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
3924     assert(cookie->type < BDRV_MAX_IOTYPE);
3925 
3926     bs->nr_bytes[cookie->type] += cookie->bytes;
3927     bs->nr_ops[cookie->type]++;
3928     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3929 }
3930 
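/* Create a new image: look up the format and protocol drivers, merge their
 * creation options, apply the -o option string and any explicit backing
 * file/format/size arguments, fall back to the backing file's size when no
 * size was given, and finally hand everything to bdrv_create(). */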
3931 int bdrv_img_create(const char *filename, const char *fmt,
3932                     const char *base_filename, const char *base_fmt,
3933                     char *options, uint64_t img_size, int flags)
3934 {
3935     QEMUOptionParameter *param = NULL, *create_options = NULL;
3936     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3937     BlockDriverState *bs = NULL;
3938     BlockDriver *drv, *proto_drv;
3939     BlockDriver *backing_drv = NULL;
3940     int ret = 0;
3941 
3942     /* Find driver and parse its options */
3943     drv = bdrv_find_format(fmt);
3944     if (!drv) {
3945         error_report("Unknown file format '%s'", fmt);
3946         ret = -EINVAL;
3947         goto out;
3948     }
3949 
3950     proto_drv = bdrv_find_protocol(filename);
3951     if (!proto_drv) {
3952         error_report("Unknown protocol '%s'", filename);
3953         ret = -EINVAL;
3954         goto out;
3955     }
3956 
3957     create_options = append_option_parameters(create_options,
3958                                               drv->create_options);
3959     create_options = append_option_parameters(create_options,
3960                                               proto_drv->create_options);
3961 
3962     /* Create parameter list with default values */
3963     param = parse_option_parameters("", create_options, param);
3964 
3965     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3966 
3967     /* Parse -o options */
3968     if (options) {
3969         param = parse_option_parameters(options, create_options, param);
3970         if (param == NULL) {
3971             error_report("Invalid options for file format '%s'.", fmt);
3972             ret = -EINVAL;
3973             goto out;
3974         }
3975     }
3976 
3977     if (base_filename) {
3978         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3979                                  base_filename)) {
3980             error_report("Backing file not supported for file format '%s'",
3981                          fmt);
3982             ret = -EINVAL;
3983             goto out;
3984         }
3985     }
3986 
3987     if (base_fmt) {
3988         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3989             error_report("Backing file format not supported for file "
3990                          "format '%s'", fmt);
3991             ret = -EINVAL;
3992             goto out;
3993         }
3994     }
3995 
3996     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3997     if (backing_file && backing_file->value.s) {
3998         if (!strcmp(filename, backing_file->value.s)) {
3999             error_report("Error: Trying to create an image with the "
4000                          "same filename as the backing file");
4001             ret = -EINVAL;
4002             goto out;
4003         }
4004     }
4005 
4006     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4007     if (backing_fmt && backing_fmt->value.s) {
4008         backing_drv = bdrv_find_format(backing_fmt->value.s);
4009         if (!backing_drv) {
4010             error_report("Unknown backing file format '%s'",
4011                          backing_fmt->value.s);
4012             ret = -EINVAL;
4013             goto out;
4014         }
4015     }
4016 
    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
4019     size = get_option_parameter(param, BLOCK_OPT_SIZE);
4020     if (size && size->value.n == -1) {
4021         if (backing_file && backing_file->value.s) {
            uint64_t backing_size;
            char buf[32];

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            /* bdrv_get_geometry() reports the size in 512-byte sectors */
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= 512;

            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4037         } else {
4038             error_report("Image creation needs a size parameter");
4039             ret = -EINVAL;
4040             goto out;
4041         }
4042     }
4043 
4044     printf("Formatting '%s', fmt=%s ", filename, fmt);
4045     print_option_parameters(param);
4046     puts("");
4047 
4048     ret = bdrv_create(drv, filename, param);
4049 
4050     if (ret < 0) {
4051         if (ret == -ENOTSUP) {
4052             error_report("Formatting or formatting option not supported for "
4053                          "file format '%s'", fmt);
4054         } else if (ret == -EFBIG) {
4055             error_report("The image size is too large for file format '%s'",
4056                          fmt);
4057         } else {
4058             error_report("%s: error while creating %s: %s", filename, fmt,
4059                          strerror(-ret));
4060         }
4061     }
4062 
4063 out:
4064     free_option_parameters(create_options);
4065     free_option_parameters(param);
4066 
4067     if (bs) {
4068         bdrv_delete(bs);
4069     }
4070 
4071     return ret;
4072 }
4073 
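/* A BlockDriverState can host at most one long-running job.  Both bs->job
 * and the in_use flag are checked so that a new job can neither attach to a
 * device that already has a job nor to one that some other user has marked
 * busy with bdrv_set_in_use(). */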
4074 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4075                        BlockDriverCompletionFunc *cb, void *opaque)
4076 {
4077     BlockJob *job;
4078 
4079     if (bs->job || bdrv_in_use(bs)) {
4080         return NULL;
4081     }
4082     bdrv_set_in_use(bs, 1);
4083 
4084     job = g_malloc0(job_type->instance_size);
4085     job->job_type      = job_type;
4086     job->bs            = bs;
4087     job->cb            = cb;
4088     job->opaque        = opaque;
4089     bs->job = job;
4090     return job;
4091 }
4092 
4093 void block_job_complete(BlockJob *job, int ret)
4094 {
4095     BlockDriverState *bs = job->bs;
4096 
4097     assert(bs->job == job);
4098     job->cb(job->opaque, ret);
4099     bs->job = NULL;
4100     g_free(job);
4101     bdrv_set_in_use(bs, 0);
4102 }
4103 
4104 int block_job_set_speed(BlockJob *job, int64_t value)
4105 {
4106     int rc;
4107 
4108     if (!job->job_type->set_speed) {
4109         return -ENOTSUP;
4110     }
4111     rc = job->job_type->set_speed(job, value);
4112     if (rc == 0) {
4113         job->speed = value;
4114     }
4115     return rc;
4116 }
4117 
4118 void block_job_cancel(BlockJob *job)
4119 {
4120     job->cancelled = true;
4121 }
4122 
4123 bool block_job_is_cancelled(BlockJob *job)
4124 {
4125     return job->cancelled;
4126 }
4127 
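/* Synchronous cancellation: request the cancel, then service AIO until the
 * job has actually stopped running (it either completed, which clears
 * bs->job, or parked itself in a non-busy state). */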
4128 void block_job_cancel_sync(BlockJob *job)
4129 {
4130     BlockDriverState *bs = job->bs;
4131 
4132     assert(bs->job == job);
4133     block_job_cancel(job);
4134     while (bs->job != NULL && bs->job->busy) {
4135         qemu_aio_wait();
4136     }
4137 }
4138