xref: /openbmc/qemu/block.c (revision 5f3777945d22248d805fb7c134e206c2d943b77b)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34 
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44 
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48 
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Internal flags that modify how a single request is carried out.
 * NOTE(review): semantics inferred from the flag names and the forward
 * declarations below (bdrv_co_do_readv/writev take these) — confirm in
 * the request-path implementations. */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;
55 
/* Forward declarations: AIO/coroutine emulation shims and the common
 * read/write request paths implemented later in this file. */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

/* I/O throttling predicates used by bdrv_io_limits_intercept() */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

/* All BlockDriverStates created with a non-empty device name
 * (see bdrv_new()), in creation order */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* All block drivers registered via bdrv_register() */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
104 
105 #ifdef _WIN32
/* Return 1 if @filename begins with a drive-letter prefix like "c:". */
static int is_windows_drive_prefix(const char *filename)
{
    char first = filename[0];
    int is_drive_letter = (first >= 'a' && first <= 'z') ||
                          (first >= 'A' && first <= 'Z');

    return is_drive_letter && filename[1] == ':';
}
112 
113 int is_windows_drive(const char *filename)
114 {
115     if (is_windows_drive_prefix(filename) &&
116         filename[2] == '\0')
117         return 1;
118     if (strstart(filename, "\\\\.\\", NULL) ||
119         strstart(filename, "//./", NULL))
120         return 1;
121     return 0;
122 }
123 #endif
124 
/* throttling disk I/O limits */

/*
 * Turn throttling off for @bs: release every queued request, tear down
 * the wake-up timer and reset the slice accounting state.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    /* keep waking queued requests until the throttle queue is empty */
    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}
143 
/* Throttle timer callback: wake one request waiting in the queue. */
static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}
150 
/*
 * Turn throttling on for @bs: initialise the throttle queue, create the
 * wake-up timer and start a fresh accounting slice.
 */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}
161 
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
163 {
164     BlockIOLimit *io_limits = &bs->io_limits;
165     return io_limits->bps[BLOCK_IO_LIMIT_READ]
166          || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167          || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168          || io_limits->iops[BLOCK_IO_LIMIT_READ]
169          || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
171 }
172 
/*
 * Coroutine context: block the calling request until it fits within the
 * configured I/O limits, preserving FIFO order between requests.
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* join the back of the queue if others are already waiting */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted to the head. All requests followed it will
     * be still in throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        /* arm the timer to wake us after the computed wait, then sleep at
         * the head of the queue so FIFO order is kept */
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* give the next queued request a chance to run */
    qemu_co_queue_next(&bs->throttled_reqs);
}
197 
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    /* Fix: only a colon that appears before any directory separator marks
     * a protocol; previously a path like "a/b:c" was misdetected because
     * strchr() found the colon anywhere in the string. */
    return *p == ':';
}
210 
/*
 * Return 1 if @path is absolute.  A "protocol:" prefix, if present, is
 * skipped first so that e.g. "file:/x" counts as absolute.
 */
int path_is_absolute(const char *path)
{
    const char *rest;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif
    rest = strchr(path, ':');
    rest = rest ? rest + 1 : path;
#ifdef _WIN32
    return *rest == '/' || *rest == '\\';
#else
    return *rest == '/';
#endif
}
230 
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* p: start of the path part of base_path (past "protocol:") */
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        /* p1: one past the last directory separator in base_path */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        /* copy the directory prefix of base_path (truncated to fit),
         * then append the relative filename */
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
274 
/*
 * Register @bdrv in the global driver list, installing emulation shims
 * for whichever coroutine/AIO entry points the driver does not provide.
 */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
294 
295 /* create a new block device (by default it is empty) */
296 BlockDriverState *bdrv_new(const char *device_name)
297 {
298     BlockDriverState *bs;
299 
300     bs = g_malloc0(sizeof(BlockDriverState));
301     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
302     if (device_name[0] != '\0') {
303         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
304     }
305     bdrv_iostatus_disable(bs);
306     return bs;
307 }
308 
309 BlockDriver *bdrv_find_format(const char *format_name)
310 {
311     BlockDriver *drv1;
312     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
313         if (!strcmp(drv1->format_name, format_name)) {
314             return drv1;
315         }
316     }
317     return NULL;
318 }
319 
/*
 * Return 1 if @drv may be used under -bdrv-whitelist builds: an empty
 * whitelist allows everything, otherwise only listed format names pass.
 */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    /* CONFIG_BDRV_WHITELIST expands to a (possibly empty) list of string
     * literals; the loop below relies on its trailing NULL sentinel */
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}
337 
338 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
339 {
340     BlockDriver *drv = bdrv_find_format(format_name);
341     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
342 }
343 
/* State shared between bdrv_create() and its coroutine entry point */
typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;               /* owned copy; freed by bdrv_create() */
    QEMUOptionParameter *options;
    int ret;                      /* NOT_DONE until the coroutine finishes */
} CreateCo;
350 
/* Coroutine entry point for bdrv_create(): invoke the driver's create
 * hook and publish its result through cco->ret. */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}
358 
359 int bdrv_create(BlockDriver *drv, const char* filename,
360     QEMUOptionParameter *options)
361 {
362     int ret;
363 
364     Coroutine *co;
365     CreateCo cco = {
366         .drv = drv,
367         .filename = g_strdup(filename),
368         .options = options,
369         .ret = NOT_DONE,
370     };
371 
372     if (!drv->bdrv_create) {
373         return -ENOTSUP;
374     }
375 
376     if (qemu_in_coroutine()) {
377         /* Fast-path if already in coroutine context */
378         bdrv_create_co_entry(&cco);
379     } else {
380         co = qemu_coroutine_create(bdrv_create_co_entry);
381         qemu_coroutine_enter(co, &cco);
382         while (cco.ret == NOT_DONE) {
383             qemu_aio_wait();
384         }
385     }
386 
387     ret = cco.ret;
388     g_free(cco.filename);
389 
390     return ret;
391 }
392 
393 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
394 {
395     BlockDriver *drv;
396 
397     drv = bdrv_find_protocol(filename);
398     if (drv == NULL) {
399         return -ENOENT;
400     }
401 
402     return bdrv_create(drv, filename, options);
403 }
404 
#ifdef _WIN32
/* Fill @filename (a buffer of @size bytes) with a fresh temp file name. */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
/* Fill @filename (a buffer of @size bytes) with a fresh temp file name.
 * The file itself is created (mkstemp) to reserve the name. */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* Fix: mkstemp() can fail (e.g. unwritable tmpdir); previously its
     * return value was passed unchecked to close(-1). */
    if (fd >= 0) {
        close(fd);
    }
}
#endif
427 
428 /*
429  * Detect host devices. By convention, /dev/cdrom[N] is always
430  * recognized as a host CDROM.
431  */
432 static BlockDriver *find_hdev_driver(const char *filename)
433 {
434     int score_max = 0, score;
435     BlockDriver *drv = NULL, *d;
436 
437     QLIST_FOREACH(d, &bdrv_drivers, list) {
438         if (d->bdrv_probe_device) {
439             score = d->bdrv_probe_device(filename);
440             if (score > score_max) {
441                 score_max = score;
442                 drv = d;
443             }
444         }
445     }
446 
447     return drv;
448 }
449 
/*
 * Find the driver responsible for @filename's "<protocol>:" prefix.
 * Host-device probing takes precedence (see XXX below); plain paths
 * without a protocol fall back to the "file" driver.  Returns NULL if
 * no driver matches.
 */
BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    /* extract the protocol name, clamped to sizeof(protocol) - 1 chars */
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
489 
/*
 * Probe the image format of @filename and store the winning driver in
 * *pdrv (NULL on failure).  The first 2048 bytes are read and scored by
 * every driver's bdrv_probe(); scsi-generic devices and empty drives
 * always get the "raw" driver.  Returns 0 on success, negative errno
 * otherwise.
 */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    /* ret is the number of header bytes actually read, passed to probes */
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* highest probe score wins */
    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
538 
539 /**
540  * Set the current 'total_sectors' value
541  */
542 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
543 {
544     BlockDriver *drv = bs->drv;
545 
546     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
547     if (bs->sg)
548         return 0;
549 
550     /* query actual device if possible, otherwise just trust the hint */
551     if (drv->bdrv_getlength) {
552         int64_t length = drv->bdrv_getlength(bs);
553         if (length < 0) {
554             return length;
555         }
556         hint = length >> BDRV_SECTOR_BITS;
557     }
558 
559     bs->total_sectors = hint;
560     return 0;
561 }
562 
563 /**
564  * Set open flags for a given cache mode
565  *
566  * Return 0 on success, -1 if the cache mode was invalid.
567  */
568 int bdrv_parse_cache_flags(const char *mode, int *flags)
569 {
570     *flags &= ~BDRV_O_CACHE_MASK;
571 
572     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
573         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
574     } else if (!strcmp(mode, "directsync")) {
575         *flags |= BDRV_O_NOCACHE;
576     } else if (!strcmp(mode, "writeback")) {
577         *flags |= BDRV_O_CACHE_WB;
578     } else if (!strcmp(mode, "unsafe")) {
579         *flags |= BDRV_O_CACHE_WB;
580         *flags |= BDRV_O_NO_FLUSH;
581     } else if (!strcmp(mode, "writethrough")) {
582         /* this is the default */
583     } else {
584         return -1;
585     }
586 
587     return 0;
588 }
589 
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    /* take one reference on the copy-on-read feature */
    bs->copy_on_read++;
}
599 
/* Drop one copy-on-read reference; the feature stays enabled until the
 * count reaches zero (see bdrv_enable_copy_on_read()). */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
605 
/*
 * Common part for opening disk images and files
 *
 * Resets @bs to a pristine state, then opens it with driver @drv:
 * protocol drivers (those providing bdrv_file_open) open the file
 * directly, format drivers first get an underlying bs->file opened via
 * bdrv_file_open().  On failure all acquired state is rolled back and a
 * negative errno is returned; @bs is left with no driver attached.
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    /* reset per-open state left over from any previous use of @bs */
    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* temporary images are unlinked immediately so they vanish on close */
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
695 
696 /*
697  * Opens a file using a protocol (file, host_device, nbd, ...)
698  */
699 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
700 {
701     BlockDriverState *bs;
702     BlockDriver *drv;
703     int ret;
704 
705     drv = bdrv_find_protocol(filename);
706     if (!drv) {
707         return -ENOENT;
708     }
709 
710     bs = bdrv_new("");
711     ret = bdrv_open_common(bs, filename, flags, drv);
712     if (ret < 0) {
713         bdrv_delete(bs);
714         return ret;
715     }
716     bs->growable = 1;
717     *pbs = bs;
718     return 0;
719 }
720 
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * With BDRV_O_SNAPSHOT a temporary qcow2 overlay backed by @filename is
 * created and opened in its place.  When @drv is NULL the format is
 * probed via find_image_format().  Returns 0 on success or a negative
 * errno.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* open the base image once just to learn its length and whether
           it is a protocol-level image */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        /* NOTE(review): assumes the qcow2 driver is always registered;
         * bdrv_qcow2 is not NULL-checked before use — confirm */
        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        /* from here on we open the temporary overlay instead */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        /* the backing file name is either a protocol spec or relative to
           the image's own location */
        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
854 
/*
 * Close @bs: cancel any running block job, drain in-flight I/O, delete
 * the backing BDS, release driver state and recursively close the
 * protocol-level bs->file.  Safe to call on an already-closed BDS
 * (bs->drv == NULL).
 */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* on Windows the temp file could not be unlinked while open */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
894 
895 void bdrv_close_all(void)
896 {
897     BlockDriverState *bs;
898 
899     QTAILQ_FOREACH(bs, &bdrv_states, list) {
900         bdrv_close(bs);
901     }
902 }
903 
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            /* kick throttled requests too, so they can run to completion */
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}
941 
942 /* make a BlockDriverState anonymous by removing from bdrv_state list.
943    Also, NULL terminate the device_name to prevent double remove */
944 void bdrv_make_anon(BlockDriverState *bs)
945 {
946     if (bs->device_name[0] != '\0') {
947         QTAILQ_REMOVE(&bdrv_states, bs, list);
948     }
949     bs->device_name[0] = '\0';
950 }
951 
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    /* tmp starts as a copy of bs_new, then the device-visible fields
     * below are patched in from bs_top so they stay on the top layer
     * after the swap */
    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top.  bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;
}
1041 
1042 void bdrv_delete(BlockDriverState *bs)
1043 {
1044     assert(!bs->dev);
1045     assert(!bs->job);
1046     assert(!bs->in_use);
1047 
1048     /* remove from list, if necessary */
1049     bdrv_make_anon(bs);
1050 
1051     bdrv_close(bs);
1052     if (bs->file != NULL) {
1053         bdrv_delete(bs->file);
1054     }
1055 
1056     assert(bs != bs_snapshots);
1057     g_free(bs);
1058 }
1059 
1060 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1061 /* TODO change to DeviceState *dev when all users are qdevified */
1062 {
1063     if (bs->dev) {
1064         return -EBUSY;
1065     }
1066     bs->dev = dev;
1067     bdrv_iostatus_reset(bs);
1068     return 0;
1069 }
1070 
1071 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1072 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1073 {
1074     if (bdrv_attach_dev(bs, dev) < 0) {
1075         abort();
1076     }
1077 }
1078 
1079 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1080 /* TODO change to DeviceState *dev when all users are qdevified */
1081 {
1082     assert(bs->dev == dev);
1083     bs->dev = NULL;
1084     bs->dev_ops = NULL;
1085     bs->dev_opaque = NULL;
1086     bs->buffer_alignment = 512;
1087 }
1088 
/* TODO change to return DeviceState * when all users are qdevified */
/* Return the guest device attached to @bs, or NULL if none is attached. */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}
1094 
1095 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1096                       void *opaque)
1097 {
1098     bs->dev_ops = ops;
1099     bs->dev_opaque = opaque;
1100     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1101         bs_snapshots = NULL;
1102     }
1103 }
1104 
1105 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1106                                BlockQMPEventAction action, int is_read)
1107 {
1108     QObject *data;
1109     const char *action_str;
1110 
1111     switch (action) {
1112     case BDRV_ACTION_REPORT:
1113         action_str = "report";
1114         break;
1115     case BDRV_ACTION_IGNORE:
1116         action_str = "ignore";
1117         break;
1118     case BDRV_ACTION_STOP:
1119         action_str = "stop";
1120         break;
1121     default:
1122         abort();
1123     }
1124 
1125     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1126                               bdrv->device_name,
1127                               action_str,
1128                               is_read ? "read" : "write");
1129     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1130 
1131     qobject_decref(data);
1132 }
1133 
1134 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1135 {
1136     QObject *data;
1137 
1138     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1139                               bdrv_get_device_name(bs), ejected);
1140     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1141 
1142     qobject_decref(data);
1143 }
1144 
1145 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1146 {
1147     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1148         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1149         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1150         if (tray_was_closed) {
1151             /* tray open */
1152             bdrv_emit_qmp_eject_event(bs, true);
1153         }
1154         if (load) {
1155             /* tray close */
1156             bdrv_emit_qmp_eject_event(bs, false);
1157         }
1158     }
1159 }
1160 
1161 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1162 {
1163     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1164 }
1165 
1166 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1167 {
1168     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1169         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1170     }
1171 }
1172 
1173 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1174 {
1175     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1176         return bs->dev_ops->is_tray_open(bs->dev_opaque);
1177     }
1178     return false;
1179 }
1180 
1181 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1182 {
1183     if (bs->dev_ops && bs->dev_ops->resize_cb) {
1184         bs->dev_ops->resize_cb(bs->dev_opaque);
1185     }
1186 }
1187 
1188 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1189 {
1190     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1191         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1192     }
1193     return false;
1194 }
1195 
1196 /*
1197  * Run consistency checks on an image
1198  *
1199  * Returns 0 if the check could be completed (it doesn't mean that the image is
1200  * free of errors) or -errno when an internal error occurred. The results of the
1201  * check are stored in res.
1202  */
1203 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1204 {
1205     if (bs->drv->bdrv_check == NULL) {
1206         return -ENOTSUP;
1207     }
1208 
1209     memset(res, 0, sizeof(*res));
1210     return bs->drv->bdrv_check(bs, res);
1211 }
1212 
1213 #define COMMIT_BUF_SECTORS 2048
1214 
1215 /* commit COW file into the raw image */
1216 int bdrv_commit(BlockDriverState *bs)
1217 {
1218     BlockDriver *drv = bs->drv;
1219     BlockDriver *backing_drv;
1220     int64_t sector, total_sectors;
1221     int n, ro, open_flags;
1222     int ret = 0, rw_ret = 0;
1223     uint8_t *buf;
1224     char filename[1024];
1225     BlockDriverState *bs_rw, *bs_ro;
1226 
1227     if (!drv)
1228         return -ENOMEDIUM;
1229 
1230     if (!bs->backing_hd) {
1231         return -ENOTSUP;
1232     }
1233 
1234     if (bs->backing_hd->keep_read_only) {
1235         return -EACCES;
1236     }
1237 
1238     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1239         return -EBUSY;
1240     }
1241 
1242     backing_drv = bs->backing_hd->drv;
1243     ro = bs->backing_hd->read_only;
1244     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1245     open_flags =  bs->backing_hd->open_flags;
1246 
1247     if (ro) {
1248         /* re-open as RW */
1249         bdrv_delete(bs->backing_hd);
1250         bs->backing_hd = NULL;
1251         bs_rw = bdrv_new("");
1252         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1253             backing_drv);
1254         if (rw_ret < 0) {
1255             bdrv_delete(bs_rw);
1256             /* try to re-open read-only */
1257             bs_ro = bdrv_new("");
1258             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1259                 backing_drv);
1260             if (ret < 0) {
1261                 bdrv_delete(bs_ro);
1262                 /* drive not functional anymore */
1263                 bs->drv = NULL;
1264                 return ret;
1265             }
1266             bs->backing_hd = bs_ro;
1267             return rw_ret;
1268         }
1269         bs->backing_hd = bs_rw;
1270     }
1271 
1272     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1273     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1274 
1275     for (sector = 0; sector < total_sectors; sector += n) {
1276         if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1277 
1278             if (bdrv_read(bs, sector, buf, n) != 0) {
1279                 ret = -EIO;
1280                 goto ro_cleanup;
1281             }
1282 
1283             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1284                 ret = -EIO;
1285                 goto ro_cleanup;
1286             }
1287         }
1288     }
1289 
1290     if (drv->bdrv_make_empty) {
1291         ret = drv->bdrv_make_empty(bs);
1292         bdrv_flush(bs);
1293     }
1294 
1295     /*
1296      * Make sure all data we wrote to the backing device is actually
1297      * stable on disk.
1298      */
1299     if (bs->backing_hd)
1300         bdrv_flush(bs->backing_hd);
1301 
1302 ro_cleanup:
1303     g_free(buf);
1304 
1305     if (ro) {
1306         /* re-open as RO */
1307         bdrv_delete(bs->backing_hd);
1308         bs->backing_hd = NULL;
1309         bs_ro = bdrv_new("");
1310         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1311             backing_drv);
1312         if (ret < 0) {
1313             bdrv_delete(bs_ro);
1314             /* drive not functional anymore */
1315             bs->drv = NULL;
1316             return ret;
1317         }
1318         bs->backing_hd = bs_ro;
1319         bs->backing_hd->keep_read_only = 0;
1320     }
1321 
1322     return ret;
1323 }
1324 
1325 int bdrv_commit_all(void)
1326 {
1327     BlockDriverState *bs;
1328 
1329     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1330         int ret = bdrv_commit(bs);
1331         if (ret < 0) {
1332             return ret;
1333         }
1334     }
1335     return 0;
1336 }
1337 
/* An in-flight request, tracked so that overlapping requests can be
 * serialized (see wait_for_overlapping_requests()). */
struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;   /* first sector covered by the request */
    int nb_sectors;       /* request length in sectors */
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};
1347 
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    /* wake every coroutine that blocked waiting for this request */
    qemu_co_queue_restart_all(&req->wait_queue);
}
1358 
/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    /* The compound literal zero-initializes the fields not listed here
     * (list, wait_queue) before they are set up below. */
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
1379 
1380 /**
1381  * Round a region to cluster boundaries
1382  */
1383 static void round_to_clusters(BlockDriverState *bs,
1384                               int64_t sector_num, int nb_sectors,
1385                               int64_t *cluster_sector_num,
1386                               int *cluster_nb_sectors)
1387 {
1388     BlockDriverInfo bdi;
1389 
1390     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1391         *cluster_sector_num = sector_num;
1392         *cluster_nb_sectors = nb_sectors;
1393     } else {
1394         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1395         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1396         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1397                                             nb_sectors, c);
1398     }
1399 }
1400 
1401 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1402                                      int64_t sector_num, int nb_sectors) {
1403     /*        aaaa   bbbb */
1404     if (sector_num >= req->sector_num + req->nb_sectors) {
1405         return false;
1406     }
1407     /* bbbb   aaaa        */
1408     if (req->sector_num >= sector_num + nb_sectors) {
1409         return false;
1410     }
1411     return true;
1412 }
1413 
/**
 * Block the calling coroutine until no tracked request overlaps the given
 * region.  Overlap is evaluated at cluster granularity (see comment in the
 * body).  After each wakeup the whole list is re-scanned, because it may
 * have changed while this coroutine slept.
 */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}
1449 
1450 /*
1451  * Return values:
1452  * 0        - success
1453  * -EINVAL  - backing format specified, but no file
1454  * -ENOSPC  - can't update the backing file because no space is left in the
1455  *            image file header
1456  * -ENOTSUP - format driver doesn't support changing the backing file
1457  */
1458 int bdrv_change_backing_file(BlockDriverState *bs,
1459     const char *backing_file, const char *backing_fmt)
1460 {
1461     BlockDriver *drv = bs->drv;
1462 
1463     /* Backing file format doesn't make sense without a backing file */
1464     if (backing_fmt && !backing_file) {
1465         return -EINVAL;
1466     }
1467 
1468     if (drv->bdrv_change_backing_file != NULL) {
1469         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1470     } else {
1471         return -ENOTSUP;
1472     }
1473 }
1474 
1475 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1476                                    size_t size)
1477 {
1478     int64_t len;
1479 
1480     if (!bdrv_is_inserted(bs))
1481         return -ENOMEDIUM;
1482 
1483     if (bs->growable)
1484         return 0;
1485 
1486     len = bdrv_getlength(bs);
1487 
1488     if (offset < 0)
1489         return -EIO;
1490 
1491     if ((offset > len) || (len - offset < size))
1492         return -EIO;
1493 
1494     return 0;
1495 }
1496 
1497 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1498                               int nb_sectors)
1499 {
1500     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1501                                    nb_sectors * BDRV_SECTOR_SIZE);
1502 }
1503 
/* Argument bundle for a synchronous read/write emulated via a coroutine
 * (see bdrv_rw_co() / bdrv_rw_co_entry()). */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;  /* false: read, true: write */
    int ret;        /* completion status; NOT_DONE while still in flight */
} RwCo;
1512 
1513 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1514 {
1515     RwCo *rwco = opaque;
1516 
1517     if (!rwco->is_write) {
1518         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1519                                      rwco->nb_sectors, rwco->qiov, 0);
1520     } else {
1521         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1522                                       rwco->nb_sectors, rwco->qiov, 0);
1523     }
1524 }
1525 
/*
 * Process a synchronous request using coroutines
 *
 * Wraps @buf in a single-element QEMUIOVector, then runs the request in a
 * coroutine — directly when already in coroutine context, otherwise by
 * spawning one and pumping the AIO event loop until it finishes.
 * Returns the request's result (NOT_DONE is only an in-flight marker).
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* pump the event loop until the coroutine has completed */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
1572 
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    /* synchronous read, emulated on top of the coroutine path */
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
1579 
/* Set (@dirty != 0) or clear the dirty bit of every dirty-bitmap chunk
 * touched by [sector_num, sector_num + nb_sectors), keeping
 * bs->dirty_count (number of dirty chunks) in sync. */
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    /* inclusive range of chunk indices covered by the sector range */
    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        /* word index and bit position of this chunk in the bitmap */
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            /* only count chunks that actually change state */
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
1607 
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    /* synchronous write, emulated on top of the coroutine path; the cast
     * only drops const for the shared RwCo plumbing — the buffer is not
     * modified on the write path */
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}
1619 
/*
 * Byte-granularity synchronous read, emulated on top of sector reads.
 * An unaligned head and tail are read through a one-sector bounce buffer;
 * the aligned middle is read directly into @buf.
 * Returns @count1 on success or a negative errno from bdrv_read().
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
1664 
/*
 * Byte-granularity synchronous write, emulated on top of sector I/O.
 * Partial head and tail sectors use read-modify-write through a one-sector
 * bounce buffer; the aligned middle is written directly from @buf.
 * Returns @count1 on success or a negative errno from bdrv_read()/
 * bdrv_write().
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        /* read-modify-write of the partially covered first sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        /* read-modify-write of the partially covered last sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}
1713 
1714 /*
1715  * Writes to the file and ensures that no writes are reordered across this
1716  * request (acts as a barrier)
1717  *
1718  * Returns 0 on success, -errno in error cases.
1719  */
1720 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1721     const void *buf, int count)
1722 {
1723     int ret;
1724 
1725     ret = bdrv_pwrite(bs, offset, buf, count);
1726     if (ret < 0) {
1727         return ret;
1728     }
1729 
1730     /* No flush needed for cache modes that use O_DSYNC */
1731     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1732         bdrv_flush(bs);
1733     }
1734 
1735     return 0;
1736 }
1737 
/*
 * Copy-on-read helper: read the whole cluster(s) containing the request
 * into a bounce buffer, write the data back into the image (so it becomes
 * locally allocated), then copy the requested subrange into @qiov.
 * Returns 0 on success or a negative errno.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* all-zero data can be written back more cheaply as a zero write,
     * when the driver supports it */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* hand the caller only the subrange it actually asked for */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
1800 
/*
 * Handle a read request in coroutine context
 *
 * Applies read throttling, tracks the request, and — when copy-on-read is
 * in effect (via bs->copy_on_read or BDRV_REQ_COPY_ON_READ) — routes
 * partially-unallocated regions through bdrv_co_do_copy_on_readv().
 * Returns 0 on success or a negative errno.
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    /* serialize against overlapping requests only while CoR is active,
     * so CoR read+write pairs stay atomic w.r.t. other requests */
    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* any unallocated part forces the copy-on-read slow path */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}
1862 
/* Coroutine entry point for a plain guest read (no request flags). */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}
1870 
/* Coroutine read that explicitly requests copy-on-read behavior,
 * regardless of the bs->copy_on_read setting. */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}
1879 
1880 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1881     int64_t sector_num, int nb_sectors)
1882 {
1883     BlockDriver *drv = bs->drv;
1884     QEMUIOVector qiov;
1885     struct iovec iov;
1886     int ret;
1887 
1888     /* TODO Emulate only part of misaligned requests instead of letting block
1889      * drivers return -ENOTSUP and emulate everything */
1890 
1891     /* First try the efficient write zeroes operation */
1892     if (drv->bdrv_co_write_zeroes) {
1893         ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1894         if (ret != -ENOTSUP) {
1895             return ret;
1896         }
1897     }
1898 
1899     /* Fall back to bounce buffer if write zeroes is unsupported */
1900     iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1901     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1902     memset(iov.iov_base, 0, iov.iov_len);
1903     qemu_iovec_init_external(&qiov, &iov, 1);
1904 
1905     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1906 
1907     qemu_vfree(iov.iov_base);
1908     return ret;
1909 }
1910 
/*
 * Handle a write request in coroutine context
 *
 * Applies write throttling, serializes against in-flight copy-on-read
 * requests, tracks the request, dispatches either a zero write
 * (BDRV_REQ_ZERO_WRITE, @qiov may be NULL) or a normal vectored write,
 * and updates the dirty bitmap and wr_highest_sector statistics.
 * Returns 0 on success or a negative errno.
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    /* NOTE(review): the dirty bitmap is updated even when the write failed
     * (ret < 0) — presumably erring on the side of re-copying; confirm
     * before relying on this */
    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    /* track the highest sector ever written, for statistics */
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}
1961 
/* Coroutine entry point for a plain guest write (no request flags). */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}
1969 
/* Coroutine entry point for a zero write: no data qiov is passed (NULL),
 * the zeroes are produced by bdrv_co_do_write_zeroes(). */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}
1978 
1979 /**
1980  * Truncate file to 'offset' bytes (needed only for file protocols)
1981  */
1982 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1983 {
1984     BlockDriver *drv = bs->drv;
1985     int ret;
1986     if (!drv)
1987         return -ENOMEDIUM;
1988     if (!drv->bdrv_truncate)
1989         return -ENOTSUP;
1990     if (bs->read_only)
1991         return -EACCES;
1992     if (bdrv_in_use(bs))
1993         return -EBUSY;
1994     ret = drv->bdrv_truncate(bs, offset);
1995     if (ret == 0) {
1996         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1997         bdrv_dev_resize_cb(bs);
1998     }
1999     return ret;
2000 }
2001 
2002 /**
2003  * Length of a allocated file in bytes. Sparse files are counted by actual
2004  * allocated space. Return < 0 if error or unknown.
2005  */
2006 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2007 {
2008     BlockDriver *drv = bs->drv;
2009     if (!drv) {
2010         return -ENOMEDIUM;
2011     }
2012     if (drv->bdrv_get_allocated_file_size) {
2013         return drv->bdrv_get_allocated_file_size(bs);
2014     }
2015     if (bs->file) {
2016         return bdrv_get_allocated_file_size(bs->file);
2017     }
2018     return -ENOTSUP;
2019 }
2020 
2021 /**
2022  * Length of a file in bytes. Return < 0 if error or unknown.
2023  */
2024 int64_t bdrv_getlength(BlockDriverState *bs)
2025 {
2026     BlockDriver *drv = bs->drv;
2027     if (!drv)
2028         return -ENOMEDIUM;
2029 
2030     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2031         if (drv->bdrv_getlength) {
2032             return drv->bdrv_getlength(bs);
2033         }
2034     }
2035     return bs->total_sectors * BDRV_SECTOR_SIZE;
2036 }
2037 
2038 /* return 0 as number of sectors if no device present or error */
2039 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2040 {
2041     int64_t length;
2042     length = bdrv_getlength(bs);
2043     if (length < 0)
2044         length = 0;
2045     else
2046         length = length >> BDRV_SECTOR_BITS;
2047     *nb_sectors_ptr = length;
2048 }
2049 
/* On-disk layout of one MBR (MS-DOS) partition table entry.  Multi-byte
 * fields are little-endian; the struct is packed to match the disk format
 * exactly (16 bytes per entry, four entries starting at offset 0x1be). */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
2062 
2063 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
2064 static int guess_disk_lchs(BlockDriverState *bs,
2065                            int *pcylinders, int *pheads, int *psectors)
2066 {
2067     uint8_t buf[BDRV_SECTOR_SIZE];
2068     int ret, i, heads, sectors, cylinders;
2069     struct partition *p;
2070     uint32_t nr_sects;
2071     uint64_t nb_sectors;
2072     bool enabled;
2073 
2074     bdrv_get_geometry(bs, &nb_sectors);
2075 
2076     /**
2077      * The function will be invoked during startup not only in sync I/O mode,
2078      * but also in async I/O mode. So the I/O throttling function has to
2079      * be disabled temporarily here, not permanently.
2080      */
2081     enabled = bs->io_limits_enabled;
2082     bs->io_limits_enabled = false;
2083     ret = bdrv_read(bs, 0, buf, 1);
2084     bs->io_limits_enabled = enabled;
2085     if (ret < 0)
2086         return -1;
2087     /* test msdos magic */
2088     if (buf[510] != 0x55 || buf[511] != 0xaa)
2089         return -1;
2090     for(i = 0; i < 4; i++) {
2091         p = ((struct partition *)(buf + 0x1be)) + i;
2092         nr_sects = le32_to_cpu(p->nr_sects);
2093         if (nr_sects && p->end_head) {
2094             /* We make the assumption that the partition terminates on
2095                a cylinder boundary */
2096             heads = p->end_head + 1;
2097             sectors = p->end_sector & 63;
2098             if (sectors == 0)
2099                 continue;
2100             cylinders = nb_sectors / (heads * sectors);
2101             if (cylinders < 1 || cylinders > 16383)
2102                 continue;
2103             *pheads = heads;
2104             *psectors = sectors;
2105             *pcylinders = cylinders;
2106 #if 0
2107             printf("guessed geometry: LCHS=%d %d %d\n",
2108                    cylinders, heads, sectors);
2109 #endif
2110             return 0;
2111         }
2112     }
2113     return -1;
2114 }
2115 
/*
 * Pick a CHS geometry for the drive, in priority order:
 *   1. an explicit geometry hint previously set on bs;
 *   2. the logical geometry guessed from the MBR partition table;
 *   3. a standard 16-head/63-sector physical geometry.
 * May also update the BIOS translation hint when it is AUTO.  The chosen
 * geometry is stored back as the hint for subsequent calls.
 */
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry.
             * NB: also reached via goto from the heads > 16 branch above. */
            cylinders = nb_sectors / (16 * 63);

            /* Clamp to the 2..16383 cylinder range. */
            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        /* remember the result so later calls take the fast path above */
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
2174 
/* Store a CHS geometry hint on the drive (read back by
 * bdrv_get_geometry_hint() and bdrv_guess_geometry()). */
void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}
2182 
/* Store the BIOS ATA translation hint (BIOS_ATA_TRANSLATION_* value). */
void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}
2187 
/* Read back the CHS geometry hint; all-zero means "no hint set". */
void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}
2195 
/* throttling disk io limits */
/* Install new I/O limits and recompute whether throttling is active
 * (enabled iff any limit in io_limits is non-zero). */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}
2203 
/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;     /* physical drive type this format belongs to */
    uint8_t last_sect;    /* sectors per track */
    uint8_t max_track;    /* number of tracks */
    uint8_t max_head;     /* highest head number (0 = single-sided) */
    FDriveRate rate;      /* data transfer rate */
} FDFormat;

/* Table scanned by bdrv_get_floppy_geometry_hint(); total sector count of
 * a format is (max_head + 1) * max_track * last_sect.  Terminated by the
 * FDRIVE_DRV_NONE entry (the -1 initializers wrap to 255 in the uint8_t
 * fields, but only 'drive' is tested as the sentinel). */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};
2259 
/*
 * Determine floppy geometry: use an explicit geometry hint if one is set,
 * otherwise match the image's sector count against the fd_formats table
 * (restricted to 'drive_in' unless it is FDRIVE_DRV_NONE).  Outputs heads,
 * tracks, sectors-per-track, detected drive type and data rate.
 */
void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive,
                                   FDriveRate *rate)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
        *rate = FDRIVE_RATE_500K;
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                /* total sectors of this format: heads * tracks * spt */
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                /* remember the first drive-type-compatible entry as a
                 * fallback if no exact size match is found */
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                /* NOTE(review): with no match of any kind this falls back to
                 * fd_formats[1], but the table comment says the FIRST entry
                 * (index 0) is the default format -- looks like an off-by-one;
                 * confirm intended behavior before changing. */
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
        *rate = parse->rate;
    }
}
2310 
/* Read back the BIOS ATA translation hint set by
 * bdrv_set_translation_hint(). */
int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}
2315 
/* Configure the actions to take on read and write errors. */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}
2322 
/* Return the configured error action for reads (is_read != 0) or writes. */
BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}
2327 
/* Nonzero iff the device was opened read-only. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}
2332 
/* Nonzero iff this is a SCSI generic (sg) passthrough device. */
int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}
2337 
/* Nonzero iff the write cache is enabled for this device. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}
2342 
2343 int bdrv_is_encrypted(BlockDriverState *bs)
2344 {
2345     if (bs->backing_hd && bs->backing_hd->encrypted)
2346         return 1;
2347     return bs->encrypted;
2348 }
2349 
2350 int bdrv_key_required(BlockDriverState *bs)
2351 {
2352     BlockDriverState *backing_hd = bs->backing_hd;
2353 
2354     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2355         return 1;
2356     return (bs->encrypted && !bs->valid_key);
2357 }
2358 
2359 int bdrv_set_key(BlockDriverState *bs, const char *key)
2360 {
2361     int ret;
2362     if (bs->backing_hd && bs->backing_hd->encrypted) {
2363         ret = bdrv_set_key(bs->backing_hd, key);
2364         if (ret < 0)
2365             return ret;
2366         if (!bs->encrypted)
2367             return 0;
2368     }
2369     if (!bs->encrypted) {
2370         return -EINVAL;
2371     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2372         return -ENOMEDIUM;
2373     }
2374     ret = bs->drv->bdrv_set_key(bs, key);
2375     if (ret < 0) {
2376         bs->valid_key = 0;
2377     } else if (!bs->valid_key) {
2378         bs->valid_key = 1;
2379         /* call the change callback now, we skipped it on open */
2380         bdrv_dev_change_media_cb(bs, true);
2381     }
2382     return ret;
2383 }
2384 
2385 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2386 {
2387     if (!bs->drv) {
2388         buf[0] = '\0';
2389     } else {
2390         pstrcpy(buf, buf_size, bs->drv->format_name);
2391     }
2392 }
2393 
/* Invoke 'it' once for every registered block driver's format name. */
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}
2403 
2404 BlockDriverState *bdrv_find(const char *name)
2405 {
2406     BlockDriverState *bs;
2407 
2408     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2409         if (!strcmp(name, bs->device_name)) {
2410             return bs;
2411         }
2412     }
2413     return NULL;
2414 }
2415 
2416 BlockDriverState *bdrv_next(BlockDriverState *bs)
2417 {
2418     if (!bs) {
2419         return QTAILQ_FIRST(&bdrv_states);
2420     }
2421     return QTAILQ_NEXT(bs, list);
2422 }
2423 
/* Invoke 'it' once for every registered drive. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}
2432 
/* Device name of the drive (owned by bs; do not free). */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}
2437 
/* Flush every registered drive; per-drive errors are not reported. */
void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}
2446 
2447 int bdrv_has_zero_init(BlockDriverState *bs)
2448 {
2449     assert(bs->drv);
2450 
2451     if (bs->drv->bdrv_has_zero_init) {
2452         return bs->drv->bdrv_has_zero_init(bs);
2453     }
2454 
2455     return 1;
2456 }
2457 
/* Argument/result bundle passed from bdrv_is_allocated() to the coroutine
 * entry bdrv_is_allocated_co_entry(). */
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;   /* device to query */
    int64_t sector_num;     /* first sector of the range */
    int nb_sectors;         /* length of the range in sectors */
    int *pnum;              /* out: sectors in same allocation state */
    int ret;                /* out: result of bdrv_co_is_allocated() */
    bool done;              /* set by the coroutine when finished */
} BdrvCoIsAllocatedData;
2466 
2467 /*
2468  * Returns true iff the specified sector is present in the disk image. Drivers
2469  * not implementing the functionality are assumed to not support backing files,
2470  * hence all their sectors are reported as allocated.
2471  *
2472  * If 'sector_num' is beyond the end of the disk image the return value is 0
2473  * and 'pnum' is set to 0.
2474  *
2475  * 'pnum' is set to the number of sectors (including and immediately following
2476  * the specified sector) that are known to be in the same
2477  * allocated/unallocated state.
2478  *
2479  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2480  * beyond the end of the disk image it will be clamped.
2481  */
2482 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2483                                       int nb_sectors, int *pnum)
2484 {
2485     int64_t n;
2486 
2487     if (sector_num >= bs->total_sectors) {
2488         *pnum = 0;
2489         return 0;
2490     }
2491 
2492     n = bs->total_sectors - sector_num;
2493     if (n < nb_sectors) {
2494         nb_sectors = n;
2495     }
2496 
2497     if (!bs->drv->bdrv_co_is_allocated) {
2498         *pnum = nb_sectors;
2499         return 1;
2500     }
2501 
2502     return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2503 }
2504 
/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    /* data->done signals completion to the busy-wait loop in
     * bdrv_is_allocated(). */
    data->done = true;
}
2515 
2516 /*
2517  * Synchronous wrapper around bdrv_co_is_allocated().
2518  *
2519  * See bdrv_co_is_allocated() for details.
2520  */
2521 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2522                       int *pnum)
2523 {
2524     Coroutine *co;
2525     BdrvCoIsAllocatedData data = {
2526         .bs = bs,
2527         .sector_num = sector_num,
2528         .nb_sectors = nb_sectors,
2529         .pnum = pnum,
2530         .done = false,
2531     };
2532 
2533     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2534     qemu_coroutine_enter(co, &data);
2535     while (!data.done) {
2536         qemu_aio_wait();
2537     }
2538     return data.ret;
2539 }
2540 
/*
 * Implement the QMP 'query-block' command: build one BlockInfo element per
 * registered drive, reporting medium/tray state, I/O status and -- when a
 * medium is inserted -- image details and configured I/O limits.  Returns a
 * newly allocated list.
 */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        /* Tray state is only meaningful for removable media. */
        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        /* bs->drv != NULL means a medium is inserted. */
        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
2604 
/* Consider exposing this as a full fledged QMP command */
/* Build a BlockStats element for one drive; recurses into bs->file so the
 * protocol layer's counters appear as 'parent'.  Returns a newly allocated
 * structure. */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    /* Anonymous (internal) BlockDriverStates have an empty device name. */
    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}
2635 
2636 BlockStatsList *qmp_query_blockstats(Error **errp)
2637 {
2638     BlockStatsList *head = NULL, *cur_item = NULL;
2639     BlockDriverState *bs;
2640 
2641     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2642         BlockStatsList *info = g_malloc0(sizeof(*info));
2643         info->value = qmp_query_blockstat(bs, NULL);
2644 
2645         /* XXX: waiting for the qapi to support GSList */
2646         if (!cur_item) {
2647             head = cur_item = info;
2648         } else {
2649             cur_item->next = info;
2650             cur_item = info;
2651         }
2652     }
2653 
2654     return head;
2655 }
2656 
2657 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2658 {
2659     if (bs->backing_hd && bs->backing_hd->encrypted)
2660         return bs->backing_file;
2661     else if (bs->encrypted)
2662         return bs->filename;
2663     else
2664         return NULL;
2665 }
2666 
/* Copy the backing file name into 'filename' (empty string if none). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
2672 
2673 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2674                           const uint8_t *buf, int nb_sectors)
2675 {
2676     BlockDriver *drv = bs->drv;
2677     if (!drv)
2678         return -ENOMEDIUM;
2679     if (!drv->bdrv_write_compressed)
2680         return -ENOTSUP;
2681     if (bdrv_check_request(bs, sector_num, nb_sectors))
2682         return -EIO;
2683 
2684     if (bs->dirty_bitmap) {
2685         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2686     }
2687 
2688     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2689 }
2690 
2691 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2692 {
2693     BlockDriver *drv = bs->drv;
2694     if (!drv)
2695         return -ENOMEDIUM;
2696     if (!drv->bdrv_get_info)
2697         return -ENOTSUP;
2698     memset(bdi, 0, sizeof(*bdi));
2699     return drv->bdrv_get_info(bs, bdi);
2700 }
2701 
2702 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2703                       int64_t pos, int size)
2704 {
2705     BlockDriver *drv = bs->drv;
2706     if (!drv)
2707         return -ENOMEDIUM;
2708     if (drv->bdrv_save_vmstate)
2709         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2710     if (bs->file)
2711         return bdrv_save_vmstate(bs->file, buf, pos, size);
2712     return -ENOTSUP;
2713 }
2714 
2715 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2716                       int64_t pos, int size)
2717 {
2718     BlockDriver *drv = bs->drv;
2719     if (!drv)
2720         return -ENOMEDIUM;
2721     if (drv->bdrv_load_vmstate)
2722         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2723     if (bs->file)
2724         return bdrv_load_vmstate(bs->file, buf, pos, size);
2725     return -ENOTSUP;
2726 }
2727 
2728 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2729 {
2730     BlockDriver *drv = bs->drv;
2731 
2732     if (!drv || !drv->bdrv_debug_event) {
2733         return;
2734     }
2735 
2736     return drv->bdrv_debug_event(bs, event);
2737 
2738 }
2739 
2740 /**************************************************************/
2741 /* handling of snapshots */
2742 
2743 int bdrv_can_snapshot(BlockDriverState *bs)
2744 {
2745     BlockDriver *drv = bs->drv;
2746     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2747         return 0;
2748     }
2749 
2750     if (!drv->bdrv_snapshot_create) {
2751         if (bs->file != NULL) {
2752             return bdrv_can_snapshot(bs->file);
2753         }
2754         return 0;
2755     }
2756 
2757     return 1;
2758 }
2759 
/* Nonzero iff the device was opened with BDRV_O_SNAPSHOT (temporary
 * copy-on-write overlay). */
int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}
2764 
2765 BlockDriverState *bdrv_snapshots(void)
2766 {
2767     BlockDriverState *bs;
2768 
2769     if (bs_snapshots) {
2770         return bs_snapshots;
2771     }
2772 
2773     bs = NULL;
2774     while ((bs = bdrv_next(bs))) {
2775         if (bdrv_can_snapshot(bs)) {
2776             bs_snapshots = bs;
2777             return bs;
2778         }
2779     }
2780     return NULL;
2781 }
2782 
2783 int bdrv_snapshot_create(BlockDriverState *bs,
2784                          QEMUSnapshotInfo *sn_info)
2785 {
2786     BlockDriver *drv = bs->drv;
2787     if (!drv)
2788         return -ENOMEDIUM;
2789     if (drv->bdrv_snapshot_create)
2790         return drv->bdrv_snapshot_create(bs, sn_info);
2791     if (bs->file)
2792         return bdrv_snapshot_create(bs->file, sn_info);
2793     return -ENOTSUP;
2794 }
2795 
2796 int bdrv_snapshot_goto(BlockDriverState *bs,
2797                        const char *snapshot_id)
2798 {
2799     BlockDriver *drv = bs->drv;
2800     int ret, open_ret;
2801 
2802     if (!drv)
2803         return -ENOMEDIUM;
2804     if (drv->bdrv_snapshot_goto)
2805         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2806 
2807     if (bs->file) {
2808         drv->bdrv_close(bs);
2809         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2810         open_ret = drv->bdrv_open(bs, bs->open_flags);
2811         if (open_ret < 0) {
2812             bdrv_delete(bs->file);
2813             bs->drv = NULL;
2814             return open_ret;
2815         }
2816         return ret;
2817     }
2818 
2819     return -ENOTSUP;
2820 }
2821 
2822 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2823 {
2824     BlockDriver *drv = bs->drv;
2825     if (!drv)
2826         return -ENOMEDIUM;
2827     if (drv->bdrv_snapshot_delete)
2828         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2829     if (bs->file)
2830         return bdrv_snapshot_delete(bs->file, snapshot_id);
2831     return -ENOTSUP;
2832 }
2833 
2834 int bdrv_snapshot_list(BlockDriverState *bs,
2835                        QEMUSnapshotInfo **psn_info)
2836 {
2837     BlockDriver *drv = bs->drv;
2838     if (!drv)
2839         return -ENOMEDIUM;
2840     if (drv->bdrv_snapshot_list)
2841         return drv->bdrv_snapshot_list(bs, psn_info);
2842     if (bs->file)
2843         return bdrv_snapshot_list(bs->file, psn_info);
2844     return -ENOTSUP;
2845 }
2846 
2847 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2848         const char *snapshot_name)
2849 {
2850     BlockDriver *drv = bs->drv;
2851     if (!drv) {
2852         return -ENOMEDIUM;
2853     }
2854     if (!bs->read_only) {
2855         return -EINVAL;
2856     }
2857     if (drv->bdrv_snapshot_load_tmp) {
2858         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2859     }
2860     return -ENOTSUP;
2861 }
2862 
2863 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2864         const char *backing_file)
2865 {
2866     if (!bs->drv) {
2867         return NULL;
2868     }
2869 
2870     if (bs->backing_hd) {
2871         if (strcmp(bs->backing_file, backing_file) == 0) {
2872             return bs->backing_hd;
2873         } else {
2874             return bdrv_find_backing_image(bs->backing_hd, backing_file);
2875         }
2876     }
2877 
2878     return NULL;
2879 }
2880 
#define NB_SUFFIXES 4

/*
 * Format 'size' (in bytes) into 'buf' as a short human-readable string:
 * values up to 999 are printed verbatim; larger values are scaled to
 * K/M/G/T, with one fractional digit below ten units and rounded whole
 * numbers above.  Returns 'buf'.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t divisor = 1024;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (i = 0; i < NB_SUFFIXES; i++, divisor *= 1024) {
        if (size < 10 * divisor) {
            /* Below ten units: keep one fractional digit. */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / divisor, suffixes[i]);
            return buf;
        }
        if (size < 1000 * divisor || i == NB_SUFFIXES - 1) {
            /* Round to the nearest whole unit. */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (divisor >> 1)) / divisor), suffixes[i]);
            return buf;
        }
    }
    return buf;
}
2910 
/*
 * Format one snapshot as a fixed-width table row into 'buf'.  With
 * sn == NULL the column header line is produced instead.  Returns 'buf'.
 */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        /* Header row matching the field widths used below. */
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
        /* localtime_r is unavailable on Win32; fall back to localtime. */
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        /* Render the VM clock (nanoseconds) as HH:MM:SS.mmm. */
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}
2953 
2954 /**************************************************************/
2955 /* async I/Os */
2956 
2957 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2958                                  QEMUIOVector *qiov, int nb_sectors,
2959                                  BlockDriverCompletionFunc *cb, void *opaque)
2960 {
2961     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2962 
2963     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2964                                  cb, opaque, false);
2965 }
2966 
2967 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2968                                   QEMUIOVector *qiov, int nb_sectors,
2969                                   BlockDriverCompletionFunc *cb, void *opaque)
2970 {
2971     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2972 
2973     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2974                                  cb, opaque, true);
2975 }
2976 
2977 
/* Bookkeeping for one batch of writes submitted via bdrv_aio_multiwrite().
 * The original callers' callbacks fire together once every merged
 * sub-request has completed (see multiwrite_cb/multiwrite_user_cb). */
typedef struct MultiwriteCB {
    int error;          /* first error seen, 0 if none so far */
    int num_requests;   /* merged requests still in flight */
    int num_callbacks;  /* original (pre-merge) request count */
    struct {
        BlockDriverCompletionFunc *cb;  /* caller's completion callback */
        void *opaque;                   /* caller's callback argument */
        QEMUIOVector *free_qiov;        /* merged qiov to destroy, or NULL */
    } callbacks[];      /* flexible array, one slot per original request */
} MultiwriteCB;
2988 
2989 static void multiwrite_user_cb(MultiwriteCB *mcb)
2990 {
2991     int i;
2992 
2993     for (i = 0; i < mcb->num_callbacks; i++) {
2994         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2995         if (mcb->callbacks[i].free_qiov) {
2996             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2997         }
2998         g_free(mcb->callbacks[i].free_qiov);
2999     }
3000 }
3001 
3002 static void multiwrite_cb(void *opaque, int ret)
3003 {
3004     MultiwriteCB *mcb = opaque;
3005 
3006     trace_multiwrite_cb(mcb, ret);
3007 
3008     if (ret < 0 && !mcb->error) {
3009         mcb->error = ret;
3010     }
3011 
3012     mcb->num_requests--;
3013     if (mcb->num_requests == 0) {
3014         multiwrite_user_cb(mcb);
3015         g_free(mcb);
3016     }
3017 }
3018 
3019 static int multiwrite_req_compare(const void *a, const void *b)
3020 {
3021     const BlockRequest *req1 = a, *req2 = b;
3022 
3023     /*
3024      * Note that we can't simply subtract req2->sector from req1->sector
3025      * here as that could overflow the return value.
3026      */
3027     if (req1->sector > req2->sector) {
3028         return 1;
3029     } else if (req1->sector < req2->sector) {
3030         return -1;
3031     } else {
3032         return 0;
3033     }
3034 }
3035 
3036 /*
3037  * Takes a bunch of requests and tries to merge them. Returns the number of
3038  * requests that remain after merging.
3039  */
3040 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3041     int num_reqs, MultiwriteCB *mcb)
3042 {
3043     int i, outidx;
3044 
3045     // Sort requests by start sector
3046     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3047 
3048     // Check if adjacent requests touch the same clusters. If so, combine them,
3049     // filling up gaps with zero sectors.
3050     outidx = 0;
3051     for (i = 1; i < num_reqs; i++) {
3052         int merge = 0;
3053         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3054 
3055         // Handle exactly sequential writes and overlapping writes.
3056         if (reqs[i].sector <= oldreq_last) {
3057             merge = 1;
3058         }
3059 
3060         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3061             merge = 0;
3062         }
3063 
3064         if (merge) {
3065             size_t size;
3066             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3067             qemu_iovec_init(qiov,
3068                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3069 
3070             // Add the first request to the merged one. If the requests are
3071             // overlapping, drop the last sectors of the first request.
3072             size = (reqs[i].sector - reqs[outidx].sector) << 9;
3073             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3074 
3075             // We should need to add any zeros between the two requests
3076             assert (reqs[i].sector <= oldreq_last);
3077 
3078             // Add the second request
3079             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3080 
3081             reqs[outidx].nb_sectors = qiov->size >> 9;
3082             reqs[outidx].qiov = qiov;
3083 
3084             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3085         } else {
3086             outidx++;
3087             reqs[outidx].sector     = reqs[i].sector;
3088             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3089             reqs[outidx].qiov       = reqs[i].qiov;
3090         }
3091     }
3092 
3093     return outidx + 1;
3094 }
3095 
3096 /*
3097  * Submit multiple AIO write requests at once.
3098  *
3099  * On success, the function returns 0 and all requests in the reqs array have
3100  * been submitted. In error case this function returns -1, and any of the
3101  * requests may or may not be submitted yet. In particular, this means that the
3102  * callback will be called for some of the requests, for others it won't. The
3103  * caller must check the error field of the BlockRequest to wait for the right
3104  * callbacks (if error != 0, no callback will be called).
3105  *
3106  * The implementation may modify the contents of the reqs array, e.g. to merge
3107  * requests. However, the fields opaque and error are left unmodified as they
3108  * are used to signal failure for a single request to the caller.
3109  */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure with one callback slot per request
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    // Stash the original callbacks; reqs[] may be rewritten by the merge
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests; num_reqs shrinks to the merged count
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests.  num_requests is set to the full count before
     * the first submission, so a request that completes immediately cannot
     * drop the counter to zero and free mcb while later submissions are
     * still pending. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
3151 
/* Cancel an in-flight AIO request via its pool's cancel hook. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}
3156 
3157 /* block I/O throttling */
/* Check whether dispatching nb_sectors more bytes would exceed the
 * configured bytes-per-second limit within the current slice.  Returns
 * true when the request must be throttled, storing a suggested delay in
 * *wait (if non-NULL); returns false (and *wait = 0) otherwise. */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    /* A total (read+write) limit takes precedence over the per-direction
     * limit; with neither configured there is nothing to throttle. */
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Byte budget for the current slice. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        /* With a total limit, both directions count against the budget. */
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; and
     *             it is obtained from the history statistic info.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calculate
     *             the total time for completing reading/writing all data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits,
     * bs->slice_end need to be extended in order that the current statistic
     * info can be kept until the timer fire, so it is increased and tuned
     * based on the result of experiment.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3217 
/* Check whether one more I/O operation would exceed the configured IOPS
 * limit within the current slice.  Returns true when the request must be
 * throttled, storing a suggested delay in *wait (if non-NULL). */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    /* A total (read+write) limit takes precedence over the per-direction
     * limit; with neither configured there is nothing to throttle. */
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Operations allowed in this slice vs. operations already issued. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        /* With a total limit, both directions count against the budget. */
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch, counting the request at hand. */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Extend the slice so the statistics stay valid until the throttled
     * request is finally dispatched (factor tuned experimentally, matching
     * bdrv_exceed_bps_limits). */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3269 
/* Top-level throttling check combining the bps and iops limits.
 * Returns true when the request must wait, storing the suggested delay
 * in *wait (if non-NULL).  Also starts a new accounting slice when the
 * previous one has expired. */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* Still inside the current slice: just push its end out. */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Open a new slice and snapshot the I/O counters as its base. */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        /* Throttled: honour whichever limit demands the longer delay. */
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        /* Make sure the slice outlives the delay we just imposed. */
        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}
3321 
3322 /**************************************************************/
3323 /* async block device emulation */
3324 
/* ACB used when emulating AIO on top of a driver's synchronous
 * bdrv_read/bdrv_write handlers (see bdrv_aio_rw_vector). */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;     /* bottom half that delivers the completion */
    int ret;        /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov; /* caller's scatter/gather list */
    uint8_t *bounce;    /* flat bounce buffer of qiov->size bytes */
    int is_write;       /* non-zero for writes */
} BlockDriverAIOCBSync;
3334 
/* Cancel hook for sync-emulated AIO: the I/O itself already ran in
 * bdrv_aio_rw_vector(), so cancelling only drops the pending completion
 * BH and releases the ACB without invoking the user callback.
 * NOTE(review): acb->bounce does not appear to be freed on this path
 * (only bdrv_aio_bh_cb frees it) -- verify whether cancel leaks it. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
3343 
/* ACB pool for the synchronous-emulation requests above. */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
3348 
3349 static void bdrv_aio_bh_cb(void *opaque)
3350 {
3351     BlockDriverAIOCBSync *acb = opaque;
3352 
3353     if (!acb->is_write)
3354         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3355     qemu_vfree(acb->bounce);
3356     acb->common.cb(acb->common.opaque, acb->ret);
3357     qemu_bh_delete(acb->bh);
3358     acb->bh = NULL;
3359     qemu_aio_release(acb);
3360 }
3361 
3362 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3363                                             int64_t sector_num,
3364                                             QEMUIOVector *qiov,
3365                                             int nb_sectors,
3366                                             BlockDriverCompletionFunc *cb,
3367                                             void *opaque,
3368                                             int is_write)
3369 
3370 {
3371     BlockDriverAIOCBSync *acb;
3372 
3373     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3374     acb->is_write = is_write;
3375     acb->qiov = qiov;
3376     acb->bounce = qemu_blockalign(bs, qiov->size);
3377     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3378 
3379     if (is_write) {
3380         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3381         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3382     } else {
3383         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3384     }
3385 
3386     qemu_bh_schedule(acb->bh);
3387 
3388     return &acb->common;
3389 }
3390 
/* .bdrv_aio_readv fallback built on the synchronous bdrv_read handler. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
3397 
/* .bdrv_aio_writev fallback built on the synchronous bdrv_write handler. */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
3404 
3405 
/* ACB for AIO requests serviced by a coroutine (bdrv_co_do_rw etc.). */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* request parameters and completion status */
    bool is_write;      /* read/write selector for bdrv_co_do_rw() */
    QEMUBH* bh;         /* bottom half that invokes the user callback */
} BlockDriverAIOCBCoroutine;
3412 
/* Coroutine requests cannot be cancelled individually; waiting for all
 * outstanding AIO to drain guarantees this one has completed too. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
3417 
/* ACB pool for coroutine-backed AIO requests. */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
3422 
/* Bottom half finishing a coroutine-backed AIO request: deliver the
 * stored status to the user callback, then free the BH and the ACB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
3431 
3432 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3433 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3434 {
3435     BlockDriverAIOCBCoroutine *acb = opaque;
3436     BlockDriverState *bs = acb->common.bs;
3437 
3438     if (!acb->is_write) {
3439         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3440             acb->req.nb_sectors, acb->req.qiov, 0);
3441     } else {
3442         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3443             acb->req.nb_sectors, acb->req.qiov, 0);
3444     }
3445 
3446     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3447     qemu_bh_schedule(acb->bh);
3448 }
3449 
3450 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3451                                                int64_t sector_num,
3452                                                QEMUIOVector *qiov,
3453                                                int nb_sectors,
3454                                                BlockDriverCompletionFunc *cb,
3455                                                void *opaque,
3456                                                bool is_write)
3457 {
3458     Coroutine *co;
3459     BlockDriverAIOCBCoroutine *acb;
3460 
3461     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3462     acb->req.sector = sector_num;
3463     acb->req.nb_sectors = nb_sectors;
3464     acb->req.qiov = qiov;
3465     acb->is_write = is_write;
3466 
3467     co = qemu_coroutine_create(bdrv_co_do_rw);
3468     qemu_coroutine_enter(co, acb);
3469 
3470     return &acb->common;
3471 }
3472 
/* Coroutine entry point for bdrv_aio_flush(): run the flush, then hand
 * completion off to a bottom half. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3482 
3483 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3484         BlockDriverCompletionFunc *cb, void *opaque)
3485 {
3486     trace_bdrv_aio_flush(bs, opaque);
3487 
3488     Coroutine *co;
3489     BlockDriverAIOCBCoroutine *acb;
3490 
3491     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3492     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3493     qemu_coroutine_enter(co, acb);
3494 
3495     return &acb->common;
3496 }
3497 
/* Coroutine entry point for bdrv_aio_discard(): run the discard, then
 * hand completion off to a bottom half. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3507 
3508 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3509         int64_t sector_num, int nb_sectors,
3510         BlockDriverCompletionFunc *cb, void *opaque)
3511 {
3512     Coroutine *co;
3513     BlockDriverAIOCBCoroutine *acb;
3514 
3515     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3516 
3517     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3518     acb->req.sector = sector_num;
3519     acb->req.nb_sectors = nb_sectors;
3520     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3521     qemu_coroutine_enter(co, acb);
3522 
3523     return &acb->common;
3524 }
3525 
/* Register all statically linked block drivers. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
3530 
/* Like bdrv_init(), but restrict usable formats to the configured
 * driver whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
3536 
3537 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3538                    BlockDriverCompletionFunc *cb, void *opaque)
3539 {
3540     BlockDriverAIOCB *acb;
3541 
3542     if (pool->free_aiocb) {
3543         acb = pool->free_aiocb;
3544         pool->free_aiocb = acb->next;
3545     } else {
3546         acb = g_malloc0(pool->aiocb_size);
3547         acb->pool = pool;
3548     }
3549     acb->bs = bs;
3550     acb->cb = cb;
3551     acb->opaque = opaque;
3552     return acb;
3553 }
3554 
3555 void qemu_aio_release(void *p)
3556 {
3557     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3558     AIOPool *pool = acb->pool;
3559     acb->next = pool->free_aiocb;
3560     pool->free_aiocb = acb;
3561 }
3562 
3563 /**************************************************************/
3564 /* Coroutine block device emulation */
3565 
/* Rendezvous state for driving a callback-style AIO API from a
 * coroutine: the callback stores the result and re-enters the waiter. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine blocked waiting for completion */
    int ret;                /* status delivered by the AIO callback */
} CoroutineIOCompletion;
3570 
/* AIO completion callback: record the status and wake the coroutine
 * parked in qemu_coroutine_yield(). */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
3578 
3579 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3580                                       int nb_sectors, QEMUIOVector *iov,
3581                                       bool is_write)
3582 {
3583     CoroutineIOCompletion co = {
3584         .coroutine = qemu_coroutine_self(),
3585     };
3586     BlockDriverAIOCB *acb;
3587 
3588     if (is_write) {
3589         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3590                                        bdrv_co_io_em_complete, &co);
3591     } else {
3592         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3593                                       bdrv_co_io_em_complete, &co);
3594     }
3595 
3596     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3597     if (!acb) {
3598         return -EIO;
3599     }
3600     qemu_coroutine_yield();
3601 
3602     return co.ret;
3603 }
3604 
/* .bdrv_co_readv fallback built on the driver's AIO read handler. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
3611 
/* .bdrv_co_writev fallback built on the driver's AIO write handler. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
3618 
/* Coroutine entry point for the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
3625 
/* Flush cached data for bs and, recursively, for its underlying
 * protocol layer.  Returns 0 on success (including when flushing is a
 * no-op) or a negative errno.  Media-less and read-only devices succeed
 * trivially as they cannot hold dirty data. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    /* Prefer the coroutine handler; fall back to the AIO handler by
     * yielding until its completion callback re-enters us. */
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
3685 
/* Ask the driver to drop any cached image state so it is re-read from
 * the file (used e.g. after incoming migration). */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}
3692 
/* Invalidate caches of every registered block device. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}
3701 
3702 void bdrv_clear_incoming_migration_all(void)
3703 {
3704     BlockDriverState *bs;
3705 
3706     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3707         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3708     }
3709 }
3710 
3711 int bdrv_flush(BlockDriverState *bs)
3712 {
3713     Coroutine *co;
3714     RwCo rwco = {
3715         .bs = bs,
3716         .ret = NOT_DONE,
3717     };
3718 
3719     if (qemu_in_coroutine()) {
3720         /* Fast-path if already in coroutine context */
3721         bdrv_flush_co_entry(&rwco);
3722     } else {
3723         co = qemu_coroutine_create(bdrv_flush_co_entry);
3724         qemu_coroutine_enter(co, &rwco);
3725         while (rwco.ret == NOT_DONE) {
3726             qemu_aio_wait();
3727         }
3728     }
3729 
3730     return rwco.ret;
3731 }
3732 
/* Coroutine entry point for the synchronous bdrv_discard() wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
3739 
3740 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3741                                  int nb_sectors)
3742 {
3743     if (!bs->drv) {
3744         return -ENOMEDIUM;
3745     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3746         return -EIO;
3747     } else if (bs->read_only) {
3748         return -EROFS;
3749     } else if (bs->drv->bdrv_co_discard) {
3750         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3751     } else if (bs->drv->bdrv_aio_discard) {
3752         BlockDriverAIOCB *acb;
3753         CoroutineIOCompletion co = {
3754             .coroutine = qemu_coroutine_self(),
3755         };
3756 
3757         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3758                                         bdrv_co_io_em_complete, &co);
3759         if (acb == NULL) {
3760             return -EIO;
3761         } else {
3762             qemu_coroutine_yield();
3763             return co.ret;
3764         }
3765     } else {
3766         return 0;
3767     }
3768 }
3769 
3770 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3771 {
3772     Coroutine *co;
3773     RwCo rwco = {
3774         .bs = bs,
3775         .sector_num = sector_num,
3776         .nb_sectors = nb_sectors,
3777         .ret = NOT_DONE,
3778     };
3779 
3780     if (qemu_in_coroutine()) {
3781         /* Fast-path if already in coroutine context */
3782         bdrv_discard_co_entry(&rwco);
3783     } else {
3784         co = qemu_coroutine_create(bdrv_discard_co_entry);
3785         qemu_coroutine_enter(co, &rwco);
3786         while (rwco.ret == NOT_DONE) {
3787             qemu_aio_wait();
3788         }
3789     }
3790 
3791     return rwco.ret;
3792 }
3793 
3794 /**************************************************************/
3795 /* removable device support */
3796 
3797 /**
3798  * Return TRUE if the media is present
3799  */
3800 int bdrv_is_inserted(BlockDriverState *bs)
3801 {
3802     BlockDriver *drv = bs->drv;
3803 
3804     if (!drv)
3805         return 0;
3806     if (!drv->bdrv_is_inserted)
3807         return 1;
3808     return drv->bdrv_is_inserted(bs);
3809 }
3810 
3811 /**
3812  * Return whether the media changed since the last call to this
3813  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3814  */
3815 int bdrv_media_changed(BlockDriverState *bs)
3816 {
3817     BlockDriver *drv = bs->drv;
3818 
3819     if (drv && drv->bdrv_media_changed) {
3820         return drv->bdrv_media_changed(bs);
3821     }
3822     return -ENOTSUP;
3823 }
3824 
3825 /**
3826  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3827  */
3828 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3829 {
3830     BlockDriver *drv = bs->drv;
3831 
3832     if (drv && drv->bdrv_eject) {
3833         drv->bdrv_eject(bs, eject_flag);
3834     }
3835 
3836     if (bs->device_name[0] != '\0') {
3837         bdrv_emit_qmp_eject_event(bs, eject_flag);
3838     }
3839 }
3840 
3841 /**
3842  * Lock or unlock the media (if it is locked, the user won't be able
3843  * to eject it manually).
3844  */
3845 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3846 {
3847     BlockDriver *drv = bs->drv;
3848 
3849     trace_bdrv_lock_medium(bs, locked);
3850 
3851     if (drv && drv->bdrv_lock_medium) {
3852         drv->bdrv_lock_medium(bs, locked);
3853     }
3854 }
3855 
3856 /* needed for generic scsi interface */
3857 
3858 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3859 {
3860     BlockDriver *drv = bs->drv;
3861 
3862     if (drv && drv->bdrv_ioctl)
3863         return drv->bdrv_ioctl(bs, req, buf);
3864     return -ENOTSUP;
3865 }
3866 
3867 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3868         unsigned long int req, void *buf,
3869         BlockDriverCompletionFunc *cb, void *opaque)
3870 {
3871     BlockDriver *drv = bs->drv;
3872 
3873     if (drv && drv->bdrv_aio_ioctl)
3874         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3875     return NULL;
3876 }
3877 
/* Record the buffer alignment (in bytes) the device requires; consumed
 * by qemu_blockalign() when allocating I/O buffers. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
3882 
3883 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3884 {
3885     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3886 }
3887 
3888 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3889 {
3890     int64_t bitmap_size;
3891 
3892     bs->dirty_count = 0;
3893     if (enable) {
3894         if (!bs->dirty_bitmap) {
3895             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3896                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3897             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3898 
3899             bs->dirty_bitmap = g_malloc0(bitmap_size);
3900         }
3901     } else {
3902         if (bs->dirty_bitmap) {
3903             g_free(bs->dirty_bitmap);
3904             bs->dirty_bitmap = NULL;
3905         }
3906     }
3907 }
3908 
3909 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3910 {
3911     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3912 
3913     if (bs->dirty_bitmap &&
3914         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3915         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3916             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3917     } else {
3918         return 0;
3919     }
3920 }
3921 
/* Clear the dirty bits covering [cur_sector, cur_sector + nr_sectors);
 * thin wrapper over the file-local set_dirty_bitmap() helper. */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
3927 
/* Number of dirty chunks currently set, maintained alongside the
 * bitmap by the dirty-tracking code. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
3932 
/* Mark @bs as owned (1) or released (0), e.g. by a block job.
 * The assert catches double-acquire and double-release. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
3938 
/* Non-zero while some user (see bdrv_set_in_use) owns @bs. */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
3943 
/* Turn on I/O status reporting for @bs and start from the OK state. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
3949 
3950 /* The I/O status is only enabled if the drive explicitly
3951  * enables it _and_ the VM is configured to stop on errors */
3952 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3953 {
3954     return (bs->iostatus_enabled &&
3955            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3956             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3957             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3958 }
3959 
/* Turn off I/O status reporting; the last status value is left as-is. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
3964 
/* Reset the status back to OK, but only when reporting is effectively
 * enabled (see bdrv_iostatus_is_enabled for the exact condition). */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}
3971 
3972 /* XXX: Today this is set by device models because it makes the implementation
3973    quite simple. However, the block layer knows about the error, so it's
3974    possible to implement this without device models being involved */
3975 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3976 {
3977     if (bdrv_iostatus_is_enabled(bs) &&
3978         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3979         assert(error >= 0);
3980         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3981                                          BLOCK_DEVICE_IO_STATUS_FAILED;
3982     }
3983 }
3984 
3985 void
3986 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3987         enum BlockAcctType type)
3988 {
3989     assert(type < BDRV_MAX_IOTYPE);
3990 
3991     cookie->bytes = bytes;
3992     cookie->start_time_ns = get_clock();
3993     cookie->type = type;
3994 }
3995 
/*
 * Finish accounting the operation started with bdrv_acct_start():
 * accumulate bytes, operation count, and elapsed time into the
 * per-type statistics on @bs.
 */
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
4005 
4006 int bdrv_img_create(const char *filename, const char *fmt,
4007                     const char *base_filename, const char *base_fmt,
4008                     char *options, uint64_t img_size, int flags)
4009 {
4010     QEMUOptionParameter *param = NULL, *create_options = NULL;
4011     QEMUOptionParameter *backing_fmt, *backing_file, *size;
4012     BlockDriverState *bs = NULL;
4013     BlockDriver *drv, *proto_drv;
4014     BlockDriver *backing_drv = NULL;
4015     int ret = 0;
4016 
4017     /* Find driver and parse its options */
4018     drv = bdrv_find_format(fmt);
4019     if (!drv) {
4020         error_report("Unknown file format '%s'", fmt);
4021         ret = -EINVAL;
4022         goto out;
4023     }
4024 
4025     proto_drv = bdrv_find_protocol(filename);
4026     if (!proto_drv) {
4027         error_report("Unknown protocol '%s'", filename);
4028         ret = -EINVAL;
4029         goto out;
4030     }
4031 
4032     create_options = append_option_parameters(create_options,
4033                                               drv->create_options);
4034     create_options = append_option_parameters(create_options,
4035                                               proto_drv->create_options);
4036 
4037     /* Create parameter list with default values */
4038     param = parse_option_parameters("", create_options, param);
4039 
4040     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4041 
4042     /* Parse -o options */
4043     if (options) {
4044         param = parse_option_parameters(options, create_options, param);
4045         if (param == NULL) {
4046             error_report("Invalid options for file format '%s'.", fmt);
4047             ret = -EINVAL;
4048             goto out;
4049         }
4050     }
4051 
4052     if (base_filename) {
4053         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4054                                  base_filename)) {
4055             error_report("Backing file not supported for file format '%s'",
4056                          fmt);
4057             ret = -EINVAL;
4058             goto out;
4059         }
4060     }
4061 
4062     if (base_fmt) {
4063         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4064             error_report("Backing file format not supported for file "
4065                          "format '%s'", fmt);
4066             ret = -EINVAL;
4067             goto out;
4068         }
4069     }
4070 
4071     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4072     if (backing_file && backing_file->value.s) {
4073         if (!strcmp(filename, backing_file->value.s)) {
4074             error_report("Error: Trying to create an image with the "
4075                          "same filename as the backing file");
4076             ret = -EINVAL;
4077             goto out;
4078         }
4079     }
4080 
4081     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4082     if (backing_fmt && backing_fmt->value.s) {
4083         backing_drv = bdrv_find_format(backing_fmt->value.s);
4084         if (!backing_drv) {
4085             error_report("Unknown backing file format '%s'",
4086                          backing_fmt->value.s);
4087             ret = -EINVAL;
4088             goto out;
4089         }
4090     }
4091 
4092     // The size for the image must always be specified, with one exception:
4093     // If we are using a backing file, we can obtain the size from there
4094     size = get_option_parameter(param, BLOCK_OPT_SIZE);
4095     if (size && size->value.n == -1) {
4096         if (backing_file && backing_file->value.s) {
4097             uint64_t size;
4098             char buf[32];
4099 
4100             bs = bdrv_new("");
4101 
4102             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
4103             if (ret < 0) {
4104                 error_report("Could not open '%s'", backing_file->value.s);
4105                 goto out;
4106             }
4107             bdrv_get_geometry(bs, &size);
4108             size *= 512;
4109 
4110             snprintf(buf, sizeof(buf), "%" PRId64, size);
4111             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4112         } else {
4113             error_report("Image creation needs a size parameter");
4114             ret = -EINVAL;
4115             goto out;
4116         }
4117     }
4118 
4119     printf("Formatting '%s', fmt=%s ", filename, fmt);
4120     print_option_parameters(param);
4121     puts("");
4122 
4123     ret = bdrv_create(drv, filename, param);
4124 
4125     if (ret < 0) {
4126         if (ret == -ENOTSUP) {
4127             error_report("Formatting or formatting option not supported for "
4128                          "file format '%s'", fmt);
4129         } else if (ret == -EFBIG) {
4130             error_report("The image size is too large for file format '%s'",
4131                          fmt);
4132         } else {
4133             error_report("%s: error while creating %s: %s", filename, fmt,
4134                          strerror(-ret));
4135         }
4136     }
4137 
4138 out:
4139     free_option_parameters(create_options);
4140     free_option_parameters(param);
4141 
4142     if (bs) {
4143         bdrv_delete(bs);
4144     }
4145 
4146     return ret;
4147 }
4148 
/*
 * Allocate and initialize a block job of @job_type on @bs.
 *
 * Fails with QERR_DEVICE_IN_USE when @bs already runs a job or is
 * otherwise owned.  On success the job owns @bs (bdrv_set_in_use) and
 * is stored in bs->job.  Returns the job, or NULL on error (with
 * @errp set).
 */
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    /* instance_size lets each job type embed BlockJob as its header */
    job = g_malloc0(job_type->instance_size);
    job->job_type      = job_type;
    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;

        block_job_set_speed(job, speed, &local_err);
        if (error_is_set(&local_err)) {
            /* Roll back everything done above before reporting failure */
            bs->job = NULL;
            g_free(job);
            bdrv_set_in_use(bs, 0);
            error_propagate(errp, local_err);
            return NULL;
        }
    }
    return job;
}
4183 
/*
 * Finish @job with result @ret: invoke the completion callback, then
 * detach the job from its BlockDriverState, free it, and release the
 * in-use flag.  The callback runs before the job is freed, so it may
 * still inspect the job via @opaque if it captured it.
 */
void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}
4194 
/*
 * Ask @job's type-specific hook to adopt @speed as its rate limit.
 *
 * Sets QERR_NOT_SUPPORTED when the job type has no set_speed hook;
 * job->speed is only updated after the hook succeeds.
 */
void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }
    job->job_type->set_speed(job, speed, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    job->speed = speed;
}
4211 
/* Request cancellation; the job's coroutine observes the flag via
 * block_job_is_cancelled() and terminates itself asynchronously. */
void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
}
4216 
/* True once block_job_cancel() has been called on @job. */
bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}
4221 
/*
 * Cancel @job and busy-wait (processing AIO events) until it has
 * actually finished.  qemu_aio_wait() may run the completion that
 * frees the job and clears bs->job, hence the NULL re-check on every
 * loop iteration rather than reusing @job.
 */
void block_job_cancel_sync(BlockJob *job)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    block_job_cancel(job);
    while (bs->job != NULL && bs->job->busy) {
        qemu_aio_wait();
    }
}
4232