xref: /openbmc/qemu/block.c (revision 329c0a48a92664eb48b70993c0f2473b37aa7429)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34 
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44 
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48 
/*
 * Placeholder value; per the trailing note it marks an emulated synchronous
 * operation that has not finished yet.
 */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/*
 * Per-request flags accepted by bdrv_co_do_readv()/bdrv_co_do_writev()
 * (see their prototypes in this file). Each value is a distinct bit.
 * NOTE(review): the exact effect of each flag is implemented outside the
 * part of the file visible here - confirm before relying on the names.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;
55 
/*
 * Forward declarations of file-local helpers that are referenced before
 * their definitions.
 */

/* Media change notification towards the attached device model. */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);

/* AIO emulation entry points installed by bdrv_register() for drivers that
 * provide no bdrv_aio_readv. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);

/* Coroutine emulation entry points installed by bdrv_register() for drivers
 * that provide no bdrv_co_readv. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);

/* Coroutine read/write paths taking BdrvRequestFlags. */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

/*
 * I/O throttling predicates used by bdrv_io_limits_intercept().
 * NOTE(review): the bps/iops helpers take 'uint64_t *wait' while
 * bdrv_exceed_io_limits() takes 'int64_t *wait' - confirm this is intended.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);
90 
/* All named BlockDriverStates; bdrv_new() appends to it, bdrv_make_anon()
 * removes from it. */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* All registered block drivers; bdrv_register() inserts at the head. */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
102 
103 #ifdef _WIN32
/*
 * Return 1 when filename begins with an ASCII letter immediately followed
 * by ':' (e.g. "c:"), 0 otherwise. The second character is only inspected
 * when the first one is a letter.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char first = filename[0];
    const int is_letter = (first >= 'a' && first <= 'z') ||
                          (first >= 'A' && first <= 'Z');

    if (!is_letter) {
        return 0;
    }
    return filename[1] == ':';
}
110 
/*
 * Return 1 when filename is exactly a drive specification ("d:") or when
 * it starts with one of the device prefixes matched below; 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* bare drive letter with nothing after the colon */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    /* device namespace, backslash flavour */
    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    /* device namespace, forward-slash flavour */
    return strstart(filename, "//./", NULL) ? 1 : 0;
}
121 #endif
122 
123 /* throttling disk I/O limits */
124 void bdrv_io_limits_disable(BlockDriverState *bs)
125 {
126     bs->io_limits_enabled = false;
127 
128     while (qemu_co_queue_next(&bs->throttled_reqs));
129 
130     if (bs->block_timer) {
131         qemu_del_timer(bs->block_timer);
132         qemu_free_timer(bs->block_timer);
133         bs->block_timer = NULL;
134     }
135 
136     bs->slice_start = 0;
137     bs->slice_end   = 0;
138     bs->slice_time  = 0;
139     memset(&bs->io_base, 0, sizeof(bs->io_base));
140 }
141 
142 static void bdrv_block_timer(void *opaque)
143 {
144     BlockDriverState *bs = opaque;
145 
146     qemu_co_queue_next(&bs->throttled_reqs);
147 }
148 
149 void bdrv_io_limits_enable(BlockDriverState *bs)
150 {
151     qemu_co_queue_init(&bs->throttled_reqs);
152     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
153     bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
154     bs->slice_start = qemu_get_clock_ns(vm_clock);
155     bs->slice_end   = bs->slice_start + bs->slice_time;
156     memset(&bs->io_base, 0, sizeof(bs->io_base));
157     bs->io_limits_enabled = true;
158 }
159 
160 bool bdrv_io_limits_enabled(BlockDriverState *bs)
161 {
162     BlockIOLimit *io_limits = &bs->io_limits;
163     return io_limits->bps[BLOCK_IO_LIMIT_READ]
164          || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
165          || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
166          || io_limits->iops[BLOCK_IO_LIMIT_READ]
167          || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
168          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
169 }
170 
171 static void bdrv_io_limits_intercept(BlockDriverState *bs,
172                                      bool is_write, int nb_sectors)
173 {
174     int64_t wait_time = -1;
175 
176     if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
177         qemu_co_queue_wait(&bs->throttled_reqs);
178     }
179 
180     /* In fact, we hope to keep each request's timing, in FIFO mode. The next
181      * throttled requests will not be dequeued until the current request is
182      * allowed to be serviced. So if the current request still exceeds the
183      * limits, it will be inserted to the head. All requests followed it will
184      * be still in throttled_reqs queue.
185      */
186 
187     while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
188         qemu_mod_timer(bs->block_timer,
189                        wait_time + qemu_get_clock_ns(vm_clock));
190         qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
191     }
192 
193     qemu_co_queue_next(&bs->throttled_reqs);
194 }
195 
/*
 * Check whether path carries a "<protocol>:" specification.
 *
 * Returns 1 when path contains a ':' character, 0 otherwise. On Windows,
 * drive specifications and device paths are never treated as protocols.
 */
static int path_has_protocol(const char *path)
{
    const char *colon;

#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    colon = strchr(path, ':');
    return colon != NULL ? 1 : 0;
}
208 
/*
 * Return non-zero when path is absolute.
 *
 * Everything up to and including the first ':' (if any) is skipped; the
 * path is absolute when what follows starts with '/' (or '\\' on Windows).
 * On Windows a path that itself starts with '/' or '\\' is always absolute.
 */
int path_is_absolute(const char *path)
{
    const char *start = path;
    const char *colon;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (path[0] == '/' || path[0] == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    if (colon != NULL) {
        start = colon + 1;
    }

#ifdef _WIN32
    return (start[0] == '/' || start[0] == '\\');
#else
    return (start[0] == '/');
#endif
}
228 
/*
 * Build the path of filename relative to base_path into dest.
 *
 * An absolute filename is copied unchanged. Otherwise the "directory part"
 * of base_path - everything up to the later of the first ':' and the last
 * path separator - is copied and filename is appended. URLs are supported.
 * Nothing is written when dest_size <= 0; output is truncated to fit.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto, *after_sep, *cut;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* position just past "<protocol>:" if there is one */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* position just past the last directory separator */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever prefix reaches further into base_path */
    cut = (after_sep > after_proto) ? after_sep : after_proto;

    prefix_len = cut - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
272 
273 void bdrv_register(BlockDriver *bdrv)
274 {
275     /* Block drivers without coroutine functions need emulation */
276     if (!bdrv->bdrv_co_readv) {
277         bdrv->bdrv_co_readv = bdrv_co_readv_em;
278         bdrv->bdrv_co_writev = bdrv_co_writev_em;
279 
280         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
281          * the block driver lacks aio we need to emulate that too.
282          */
283         if (!bdrv->bdrv_aio_readv) {
284             /* add AIO emulation layer */
285             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
286             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
287         }
288     }
289 
290     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
291 }
292 
293 /* create a new block device (by default it is empty) */
294 BlockDriverState *bdrv_new(const char *device_name)
295 {
296     BlockDriverState *bs;
297 
298     bs = g_malloc0(sizeof(BlockDriverState));
299     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
300     if (device_name[0] != '\0') {
301         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
302     }
303     bdrv_iostatus_disable(bs);
304     return bs;
305 }
306 
307 BlockDriver *bdrv_find_format(const char *format_name)
308 {
309     BlockDriver *drv1;
310     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
311         if (!strcmp(drv1->format_name, format_name)) {
312             return drv1;
313         }
314     }
315     return NULL;
316 }
317 
318 static int bdrv_is_whitelisted(BlockDriver *drv)
319 {
320     static const char *whitelist[] = {
321         CONFIG_BDRV_WHITELIST
322     };
323     const char **p;
324 
325     if (!whitelist[0])
326         return 1;               /* no whitelist, anything goes */
327 
328     for (p = whitelist; *p; p++) {
329         if (!strcmp(drv->format_name, *p)) {
330             return 1;
331         }
332     }
333     return 0;
334 }
335 
336 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
337 {
338     BlockDriver *drv = bdrv_find_format(format_name);
339     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
340 }
341 
342 int bdrv_create(BlockDriver *drv, const char* filename,
343     QEMUOptionParameter *options)
344 {
345     if (!drv->bdrv_create)
346         return -ENOTSUP;
347 
348     return drv->bdrv_create(filename, options);
349 }
350 
351 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
352 {
353     BlockDriver *drv;
354 
355     drv = bdrv_find_protocol(filename);
356     if (drv == NULL) {
357         return -ENOENT;
358     }
359 
360     return bdrv_create(drv, filename, options);
361 }
362 
363 #ifdef _WIN32
364 void get_tmp_filename(char *filename, int size)
365 {
366     char temp_dir[MAX_PATH];
367 
368     GetTempPath(MAX_PATH, temp_dir);
369     GetTempFileName(temp_dir, "qem", 0, filename);
370 }
371 #else
/*
 * Create a unique temporary file and store its name in filename.
 *
 * @filename: output buffer
 * @size:     size of the output buffer in bytes
 *
 * The file is created with mkstemp() in $TMPDIR (or /tmp when TMPDIR is
 * unset) and immediately closed, so only the name is handed back.
 * If mkstemp() fails (including the case where the template was truncated
 * because the buffer is too small), filename is set to the empty string so
 * the caller does not go on to use a predictable, unreserved path.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);

    fd = mkstemp(filename);
    if (fd < 0) {
        /* no file was reserved; never close(-1), and invalidate the name */
        if (size > 0) {
            filename[0] = '\0';
        }
        return;
    }
    close(fd);
}
384 #endif
385 
386 /*
387  * Detect host devices. By convention, /dev/cdrom[N] is always
388  * recognized as a host CDROM.
389  */
390 static BlockDriver *find_hdev_driver(const char *filename)
391 {
392     int score_max = 0, score;
393     BlockDriver *drv = NULL, *d;
394 
395     QLIST_FOREACH(d, &bdrv_drivers, list) {
396         if (d->bdrv_probe_device) {
397             score = d->bdrv_probe_device(filename);
398             if (score > score_max) {
399                 score_max = score;
400                 drv = d;
401             }
402         }
403     }
404 
405     return drv;
406 }
407 
408 BlockDriver *bdrv_find_protocol(const char *filename)
409 {
410     BlockDriver *drv1;
411     char protocol[128];
412     int len;
413     const char *p;
414 
415     /* TODO Drivers without bdrv_file_open must be specified explicitly */
416 
417     /*
418      * XXX(hch): we really should not let host device detection
419      * override an explicit protocol specification, but moving this
420      * later breaks access to device names with colons in them.
421      * Thanks to the brain-dead persistent naming schemes on udev-
422      * based Linux systems those actually are quite common.
423      */
424     drv1 = find_hdev_driver(filename);
425     if (drv1) {
426         return drv1;
427     }
428 
429     if (!path_has_protocol(filename)) {
430         return bdrv_find_format("file");
431     }
432     p = strchr(filename, ':');
433     assert(p != NULL);
434     len = p - filename;
435     if (len > sizeof(protocol) - 1)
436         len = sizeof(protocol) - 1;
437     memcpy(protocol, filename, len);
438     protocol[len] = '\0';
439     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
440         if (drv1->protocol_name &&
441             !strcmp(drv1->protocol_name, protocol)) {
442             return drv1;
443         }
444     }
445     return NULL;
446 }
447 
448 static int find_image_format(const char *filename, BlockDriver **pdrv)
449 {
450     int ret, score, score_max;
451     BlockDriver *drv1, *drv;
452     uint8_t buf[2048];
453     BlockDriverState *bs;
454 
455     ret = bdrv_file_open(&bs, filename, 0);
456     if (ret < 0) {
457         *pdrv = NULL;
458         return ret;
459     }
460 
461     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
462     if (bs->sg || !bdrv_is_inserted(bs)) {
463         bdrv_delete(bs);
464         drv = bdrv_find_format("raw");
465         if (!drv) {
466             ret = -ENOENT;
467         }
468         *pdrv = drv;
469         return ret;
470     }
471 
472     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
473     bdrv_delete(bs);
474     if (ret < 0) {
475         *pdrv = NULL;
476         return ret;
477     }
478 
479     score_max = 0;
480     drv = NULL;
481     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
482         if (drv1->bdrv_probe) {
483             score = drv1->bdrv_probe(buf, ret, filename);
484             if (score > score_max) {
485                 score_max = score;
486                 drv = drv1;
487             }
488         }
489     }
490     if (!drv) {
491         ret = -ENOENT;
492     }
493     *pdrv = drv;
494     return ret;
495 }
496 
497 /**
498  * Set the current 'total_sectors' value
499  */
500 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
501 {
502     BlockDriver *drv = bs->drv;
503 
504     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
505     if (bs->sg)
506         return 0;
507 
508     /* query actual device if possible, otherwise just trust the hint */
509     if (drv->bdrv_getlength) {
510         int64_t length = drv->bdrv_getlength(bs);
511         if (length < 0) {
512             return length;
513         }
514         hint = length >> BDRV_SECTOR_BITS;
515     }
516 
517     bs->total_sectors = hint;
518     return 0;
519 }
520 
521 /**
522  * Set open flags for a given cache mode
523  *
524  * Return 0 on success, -1 if the cache mode was invalid.
525  */
526 int bdrv_parse_cache_flags(const char *mode, int *flags)
527 {
528     *flags &= ~BDRV_O_CACHE_MASK;
529 
530     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
531         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
532     } else if (!strcmp(mode, "directsync")) {
533         *flags |= BDRV_O_NOCACHE;
534     } else if (!strcmp(mode, "writeback")) {
535         *flags |= BDRV_O_CACHE_WB;
536     } else if (!strcmp(mode, "unsafe")) {
537         *flags |= BDRV_O_CACHE_WB;
538         *flags |= BDRV_O_NO_FLUSH;
539     } else if (!strcmp(mode, "writethrough")) {
540         /* this is the default */
541     } else {
542         return -1;
543     }
544 
545     return 0;
546 }
547 
548 /**
549  * The copy-on-read flag is actually a reference count so multiple users may
550  * use the feature without worrying about clobbering its previous state.
551  * Copy-on-read stays enabled until all users have called to disable it.
552  */
553 void bdrv_enable_copy_on_read(BlockDriverState *bs)
554 {
555     bs->copy_on_read++;
556 }
557 
558 void bdrv_disable_copy_on_read(BlockDriverState *bs)
559 {
560     assert(bs->copy_on_read > 0);
561     bs->copy_on_read--;
562 }
563 
564 /*
565  * Common part for opening disk images and files
566  */
567 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
568     int flags, BlockDriver *drv)
569 {
570     int ret, open_flags;
571 
572     assert(drv != NULL);
573 
574     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
575 
576     bs->file = NULL;
577     bs->total_sectors = 0;
578     bs->encrypted = 0;
579     bs->valid_key = 0;
580     bs->sg = 0;
581     bs->open_flags = flags;
582     bs->growable = 0;
583     bs->buffer_alignment = 512;
584 
585     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
586     if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
587         bdrv_enable_copy_on_read(bs);
588     }
589 
590     pstrcpy(bs->filename, sizeof(bs->filename), filename);
591     bs->backing_file[0] = '\0';
592 
593     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
594         return -ENOTSUP;
595     }
596 
597     bs->drv = drv;
598     bs->opaque = g_malloc0(drv->instance_size);
599 
600     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
601 
602     /*
603      * Clear flags that are internal to the block layer before opening the
604      * image.
605      */
606     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
607 
608     /*
609      * Snapshots should be writable.
610      */
611     if (bs->is_temporary) {
612         open_flags |= BDRV_O_RDWR;
613     }
614 
615     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
616 
617     /* Open the image, either directly or using a protocol */
618     if (drv->bdrv_file_open) {
619         ret = drv->bdrv_file_open(bs, filename, open_flags);
620     } else {
621         ret = bdrv_file_open(&bs->file, filename, open_flags);
622         if (ret >= 0) {
623             ret = drv->bdrv_open(bs, open_flags);
624         }
625     }
626 
627     if (ret < 0) {
628         goto free_and_fail;
629     }
630 
631     ret = refresh_total_sectors(bs, bs->total_sectors);
632     if (ret < 0) {
633         goto free_and_fail;
634     }
635 
636 #ifndef _WIN32
637     if (bs->is_temporary) {
638         unlink(filename);
639     }
640 #endif
641     return 0;
642 
643 free_and_fail:
644     if (bs->file) {
645         bdrv_delete(bs->file);
646         bs->file = NULL;
647     }
648     g_free(bs->opaque);
649     bs->opaque = NULL;
650     bs->drv = NULL;
651     return ret;
652 }
653 
654 /*
655  * Opens a file using a protocol (file, host_device, nbd, ...)
656  */
657 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
658 {
659     BlockDriverState *bs;
660     BlockDriver *drv;
661     int ret;
662 
663     drv = bdrv_find_protocol(filename);
664     if (!drv) {
665         return -ENOENT;
666     }
667 
668     bs = bdrv_new("");
669     ret = bdrv_open_common(bs, filename, flags, drv);
670     if (ret < 0) {
671         bdrv_delete(bs);
672         return ret;
673     }
674     bs->growable = 1;
675     *pbs = bs;
676     return 0;
677 }
678 
679 /*
680  * Opens a disk image (raw, qcow2, vmdk, ...)
681  */
682 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
683               BlockDriver *drv)
684 {
685     int ret;
686     char tmp_filename[PATH_MAX];
687 
688     if (flags & BDRV_O_SNAPSHOT) {
689         BlockDriverState *bs1;
690         int64_t total_size;
691         int is_protocol = 0;
692         BlockDriver *bdrv_qcow2;
693         QEMUOptionParameter *options;
694         char backing_filename[PATH_MAX];
695 
696         /* if snapshot, we create a temporary backing file and open it
697            instead of opening 'filename' directly */
698 
699         /* if there is a backing file, use it */
700         bs1 = bdrv_new("");
701         ret = bdrv_open(bs1, filename, 0, drv);
702         if (ret < 0) {
703             bdrv_delete(bs1);
704             return ret;
705         }
706         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
707 
708         if (bs1->drv && bs1->drv->protocol_name)
709             is_protocol = 1;
710 
711         bdrv_delete(bs1);
712 
713         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
714 
715         /* Real path is meaningless for protocols */
716         if (is_protocol)
717             snprintf(backing_filename, sizeof(backing_filename),
718                      "%s", filename);
719         else if (!realpath(filename, backing_filename))
720             return -errno;
721 
722         bdrv_qcow2 = bdrv_find_format("qcow2");
723         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
724 
725         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
726         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
727         if (drv) {
728             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
729                 drv->format_name);
730         }
731 
732         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
733         free_option_parameters(options);
734         if (ret < 0) {
735             return ret;
736         }
737 
738         filename = tmp_filename;
739         drv = bdrv_qcow2;
740         bs->is_temporary = 1;
741     }
742 
743     /* Find the right image format driver */
744     if (!drv) {
745         ret = find_image_format(filename, &drv);
746     }
747 
748     if (!drv) {
749         goto unlink_and_fail;
750     }
751 
752     /* Open the image */
753     ret = bdrv_open_common(bs, filename, flags, drv);
754     if (ret < 0) {
755         goto unlink_and_fail;
756     }
757 
758     /* If there is a backing file, use it */
759     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
760         char backing_filename[PATH_MAX];
761         int back_flags;
762         BlockDriver *back_drv = NULL;
763 
764         bs->backing_hd = bdrv_new("");
765 
766         if (path_has_protocol(bs->backing_file)) {
767             pstrcpy(backing_filename, sizeof(backing_filename),
768                     bs->backing_file);
769         } else {
770             path_combine(backing_filename, sizeof(backing_filename),
771                          filename, bs->backing_file);
772         }
773 
774         if (bs->backing_format[0] != '\0') {
775             back_drv = bdrv_find_format(bs->backing_format);
776         }
777 
778         /* backing files always opened read-only */
779         back_flags =
780             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
781 
782         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
783         if (ret < 0) {
784             bdrv_close(bs);
785             return ret;
786         }
787         if (bs->is_temporary) {
788             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
789         } else {
790             /* base image inherits from "parent" */
791             bs->backing_hd->keep_read_only = bs->keep_read_only;
792         }
793     }
794 
795     if (!bdrv_key_required(bs)) {
796         bdrv_dev_change_media_cb(bs, true);
797     }
798 
799     /* throttling disk I/O limits */
800     if (bs->io_limits_enabled) {
801         bdrv_io_limits_enable(bs);
802     }
803 
804     return 0;
805 
806 unlink_and_fail:
807     if (bs->is_temporary) {
808         unlink(filename);
809     }
810     return ret;
811 }
812 
813 void bdrv_close(BlockDriverState *bs)
814 {
815     if (bs->drv) {
816         if (bs == bs_snapshots) {
817             bs_snapshots = NULL;
818         }
819         if (bs->backing_hd) {
820             bdrv_delete(bs->backing_hd);
821             bs->backing_hd = NULL;
822         }
823         bs->drv->bdrv_close(bs);
824         g_free(bs->opaque);
825 #ifdef _WIN32
826         if (bs->is_temporary) {
827             unlink(bs->filename);
828         }
829 #endif
830         bs->opaque = NULL;
831         bs->drv = NULL;
832         bs->copy_on_read = 0;
833 
834         if (bs->file != NULL) {
835             bdrv_close(bs->file);
836         }
837 
838         bdrv_dev_change_media_cb(bs, false);
839     }
840 
841     /*throttling disk I/O limits*/
842     if (bs->io_limits_enabled) {
843         bdrv_io_limits_disable(bs);
844     }
845 }
846 
847 void bdrv_close_all(void)
848 {
849     BlockDriverState *bs;
850 
851     QTAILQ_FOREACH(bs, &bdrv_states, list) {
852         bdrv_close(bs);
853     }
854 }
855 
856 /*
857  * Wait for pending requests to complete across all BlockDriverStates
858  *
859  * This function does not flush data to disk, use bdrv_flush_all() for that
860  * after calling this function.
861  */
862 void bdrv_drain_all(void)
863 {
864     BlockDriverState *bs;
865 
866     qemu_aio_flush();
867 
868     /* If requests are still pending there is a bug somewhere */
869     QTAILQ_FOREACH(bs, &bdrv_states, list) {
870         assert(QLIST_EMPTY(&bs->tracked_requests));
871         assert(qemu_co_queue_empty(&bs->throttled_reqs));
872     }
873 }
874 
875 /* make a BlockDriverState anonymous by removing from bdrv_state list.
876    Also, NULL terminate the device_name to prevent double remove */
877 void bdrv_make_anon(BlockDriverState *bs)
878 {
879     if (bs->device_name[0] != '\0') {
880         QTAILQ_REMOVE(&bdrv_states, bs, list);
881     }
882     bs->device_name[0] = '\0';
883 }
884 
885 void bdrv_delete(BlockDriverState *bs)
886 {
887     assert(!bs->dev);
888 
889     /* remove from list, if necessary */
890     bdrv_make_anon(bs);
891 
892     bdrv_close(bs);
893     if (bs->file != NULL) {
894         bdrv_delete(bs->file);
895     }
896 
897     assert(bs != bs_snapshots);
898     g_free(bs);
899 }
900 
901 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
902 /* TODO change to DeviceState *dev when all users are qdevified */
903 {
904     if (bs->dev) {
905         return -EBUSY;
906     }
907     bs->dev = dev;
908     bdrv_iostatus_reset(bs);
909     return 0;
910 }
911 
912 /* TODO qdevified devices don't use this, remove when devices are qdevified */
913 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
914 {
915     if (bdrv_attach_dev(bs, dev) < 0) {
916         abort();
917     }
918 }
919 
920 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
921 /* TODO change to DeviceState *dev when all users are qdevified */
922 {
923     assert(bs->dev == dev);
924     bs->dev = NULL;
925     bs->dev_ops = NULL;
926     bs->dev_opaque = NULL;
927     bs->buffer_alignment = 512;
928 }
929 
930 /* TODO change to return DeviceState * when all users are qdevified */
931 void *bdrv_get_attached_dev(BlockDriverState *bs)
932 {
933     return bs->dev;
934 }
935 
936 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
937                       void *opaque)
938 {
939     bs->dev_ops = ops;
940     bs->dev_opaque = opaque;
941     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
942         bs_snapshots = NULL;
943     }
944 }
945 
946 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
947                                BlockQMPEventAction action, int is_read)
948 {
949     QObject *data;
950     const char *action_str;
951 
952     switch (action) {
953     case BDRV_ACTION_REPORT:
954         action_str = "report";
955         break;
956     case BDRV_ACTION_IGNORE:
957         action_str = "ignore";
958         break;
959     case BDRV_ACTION_STOP:
960         action_str = "stop";
961         break;
962     default:
963         abort();
964     }
965 
966     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
967                               bdrv->device_name,
968                               action_str,
969                               is_read ? "read" : "write");
970     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
971 
972     qobject_decref(data);
973 }
974 
975 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
976 {
977     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
978         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
979     }
980 }
981 
982 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
983 {
984     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
985 }
986 
987 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
988 {
989     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
990         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
991     }
992 }
993 
994 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
995 {
996     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
997         return bs->dev_ops->is_tray_open(bs->dev_opaque);
998     }
999     return false;
1000 }
1001 
1002 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1003 {
1004     if (bs->dev_ops && bs->dev_ops->resize_cb) {
1005         bs->dev_ops->resize_cb(bs->dev_opaque);
1006     }
1007 }
1008 
1009 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1010 {
1011     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1012         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1013     }
1014     return false;
1015 }
1016 
1017 /*
1018  * Run consistency checks on an image
1019  *
1020  * Returns 0 if the check could be completed (it doesn't mean that the image is
1021  * free of errors) or -errno when an internal error occurred. The results of the
1022  * check are stored in res.
1023  */
1024 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1025 {
1026     if (bs->drv->bdrv_check == NULL) {
1027         return -ENOTSUP;
1028     }
1029 
1030     memset(res, 0, sizeof(*res));
1031     return bs->drv->bdrv_check(bs, res);
1032 }
1033 
1034 #define COMMIT_BUF_SECTORS 2048
1035 
1036 /* commit COW file into the raw image */
1037 int bdrv_commit(BlockDriverState *bs)
1038 {
1039     BlockDriver *drv = bs->drv;
1040     BlockDriver *backing_drv;
1041     int64_t sector, total_sectors;
1042     int n, ro, open_flags;
1043     int ret = 0, rw_ret = 0;
1044     uint8_t *buf;
1045     char filename[1024];
1046     BlockDriverState *bs_rw, *bs_ro;
1047 
1048     if (!drv)
1049         return -ENOMEDIUM;
1050 
1051     if (!bs->backing_hd) {
1052         return -ENOTSUP;
1053     }
1054 
1055     if (bs->backing_hd->keep_read_only) {
1056         return -EACCES;
1057     }
1058 
1059     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1060         return -EBUSY;
1061     }
1062 
1063     backing_drv = bs->backing_hd->drv;
1064     ro = bs->backing_hd->read_only;
1065     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1066     open_flags =  bs->backing_hd->open_flags;
1067 
1068     if (ro) {
1069         /* re-open as RW */
1070         bdrv_delete(bs->backing_hd);
1071         bs->backing_hd = NULL;
1072         bs_rw = bdrv_new("");
1073         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1074             backing_drv);
1075         if (rw_ret < 0) {
1076             bdrv_delete(bs_rw);
1077             /* try to re-open read-only */
1078             bs_ro = bdrv_new("");
1079             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1080                 backing_drv);
1081             if (ret < 0) {
1082                 bdrv_delete(bs_ro);
1083                 /* drive not functional anymore */
1084                 bs->drv = NULL;
1085                 return ret;
1086             }
1087             bs->backing_hd = bs_ro;
1088             return rw_ret;
1089         }
1090         bs->backing_hd = bs_rw;
1091     }
1092 
1093     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1094     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1095 
1096     for (sector = 0; sector < total_sectors; sector += n) {
1097         if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1098 
1099             if (bdrv_read(bs, sector, buf, n) != 0) {
1100                 ret = -EIO;
1101                 goto ro_cleanup;
1102             }
1103 
1104             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1105                 ret = -EIO;
1106                 goto ro_cleanup;
1107             }
1108         }
1109     }
1110 
1111     if (drv->bdrv_make_empty) {
1112         ret = drv->bdrv_make_empty(bs);
1113         bdrv_flush(bs);
1114     }
1115 
1116     /*
1117      * Make sure all data we wrote to the backing device is actually
1118      * stable on disk.
1119      */
1120     if (bs->backing_hd)
1121         bdrv_flush(bs->backing_hd);
1122 
1123 ro_cleanup:
1124     g_free(buf);
1125 
1126     if (ro) {
1127         /* re-open as RO */
1128         bdrv_delete(bs->backing_hd);
1129         bs->backing_hd = NULL;
1130         bs_ro = bdrv_new("");
1131         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1132             backing_drv);
1133         if (ret < 0) {
1134             bdrv_delete(bs_ro);
1135             /* drive not functional anymore */
1136             bs->drv = NULL;
1137             return ret;
1138         }
1139         bs->backing_hd = bs_ro;
1140         bs->backing_hd->keep_read_only = 0;
1141     }
1142 
1143     return ret;
1144 }
1145 
1146 void bdrv_commit_all(void)
1147 {
1148     BlockDriverState *bs;
1149 
1150     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1151         bdrv_commit(bs);
1152     }
1153 }
1154 
1155 struct BdrvTrackedRequest {
1156     BlockDriverState *bs;
1157     int64_t sector_num;
1158     int nb_sectors;
1159     bool is_write;
1160     QLIST_ENTRY(BdrvTrackedRequest) list;
1161     Coroutine *co; /* owner, used for deadlock detection */
1162     CoQueue wait_queue; /* coroutines blocked on this request */
1163 };
1164 
1165 /**
1166  * Remove an active request from the tracked requests list
1167  *
1168  * This function should be called when a tracked request is completing.
1169  */
1170 static void tracked_request_end(BdrvTrackedRequest *req)
1171 {
1172     QLIST_REMOVE(req, list);
1173     qemu_co_queue_restart_all(&req->wait_queue);
1174 }
1175 
1176 /**
1177  * Add an active request to the tracked requests list
1178  */
1179 static void tracked_request_begin(BdrvTrackedRequest *req,
1180                                   BlockDriverState *bs,
1181                                   int64_t sector_num,
1182                                   int nb_sectors, bool is_write)
1183 {
1184     *req = (BdrvTrackedRequest){
1185         .bs = bs,
1186         .sector_num = sector_num,
1187         .nb_sectors = nb_sectors,
1188         .is_write = is_write,
1189         .co = qemu_coroutine_self(),
1190     };
1191 
1192     qemu_co_queue_init(&req->wait_queue);
1193 
1194     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1195 }
1196 
1197 /**
1198  * Round a region to cluster boundaries
1199  */
1200 static void round_to_clusters(BlockDriverState *bs,
1201                               int64_t sector_num, int nb_sectors,
1202                               int64_t *cluster_sector_num,
1203                               int *cluster_nb_sectors)
1204 {
1205     BlockDriverInfo bdi;
1206 
1207     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1208         *cluster_sector_num = sector_num;
1209         *cluster_nb_sectors = nb_sectors;
1210     } else {
1211         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1212         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1213         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1214                                             nb_sectors, c);
1215     }
1216 }
1217 
1218 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1219                                      int64_t sector_num, int nb_sectors) {
1220     /*        aaaa   bbbb */
1221     if (sector_num >= req->sector_num + req->nb_sectors) {
1222         return false;
1223     }
1224     /* bbbb   aaaa        */
1225     if (req->sector_num >= sector_num + nb_sectors) {
1226         return false;
1227     }
1228     return true;
1229 }
1230 
1231 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1232         int64_t sector_num, int nb_sectors)
1233 {
1234     BdrvTrackedRequest *req;
1235     int64_t cluster_sector_num;
1236     int cluster_nb_sectors;
1237     bool retry;
1238 
1239     /* If we touch the same cluster it counts as an overlap.  This guarantees
1240      * that allocating writes will be serialized and not race with each other
1241      * for the same cluster.  For example, in copy-on-read it ensures that the
1242      * CoR read and write operations are atomic and guest writes cannot
1243      * interleave between them.
1244      */
1245     round_to_clusters(bs, sector_num, nb_sectors,
1246                       &cluster_sector_num, &cluster_nb_sectors);
1247 
1248     do {
1249         retry = false;
1250         QLIST_FOREACH(req, &bs->tracked_requests, list) {
1251             if (tracked_request_overlaps(req, cluster_sector_num,
1252                                          cluster_nb_sectors)) {
1253                 /* Hitting this means there was a reentrant request, for
1254                  * example, a block driver issuing nested requests.  This must
1255                  * never happen since it means deadlock.
1256                  */
1257                 assert(qemu_coroutine_self() != req->co);
1258 
1259                 qemu_co_queue_wait(&req->wait_queue);
1260                 retry = true;
1261                 break;
1262             }
1263         }
1264     } while (retry);
1265 }
1266 
1267 /*
1268  * Return values:
1269  * 0        - success
1270  * -EINVAL  - backing format specified, but no file
1271  * -ENOSPC  - can't update the backing file because no space is left in the
1272  *            image file header
1273  * -ENOTSUP - format driver doesn't support changing the backing file
1274  */
1275 int bdrv_change_backing_file(BlockDriverState *bs,
1276     const char *backing_file, const char *backing_fmt)
1277 {
1278     BlockDriver *drv = bs->drv;
1279 
1280     if (drv->bdrv_change_backing_file != NULL) {
1281         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1282     } else {
1283         return -ENOTSUP;
1284     }
1285 }
1286 
1287 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1288                                    size_t size)
1289 {
1290     int64_t len;
1291 
1292     if (!bdrv_is_inserted(bs))
1293         return -ENOMEDIUM;
1294 
1295     if (bs->growable)
1296         return 0;
1297 
1298     len = bdrv_getlength(bs);
1299 
1300     if (offset < 0)
1301         return -EIO;
1302 
1303     if ((offset > len) || (len - offset < size))
1304         return -EIO;
1305 
1306     return 0;
1307 }
1308 
1309 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1310                               int nb_sectors)
1311 {
1312     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1313                                    nb_sectors * BDRV_SECTOR_SIZE);
1314 }
1315 
1316 typedef struct RwCo {
1317     BlockDriverState *bs;
1318     int64_t sector_num;
1319     int nb_sectors;
1320     QEMUIOVector *qiov;
1321     bool is_write;
1322     int ret;
1323 } RwCo;
1324 
1325 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1326 {
1327     RwCo *rwco = opaque;
1328 
1329     if (!rwco->is_write) {
1330         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1331                                      rwco->nb_sectors, rwco->qiov, 0);
1332     } else {
1333         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1334                                       rwco->nb_sectors, rwco->qiov, 0);
1335     }
1336 }
1337 
1338 /*
1339  * Process a synchronous request using coroutines
1340  */
1341 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1342                       int nb_sectors, bool is_write)
1343 {
1344     QEMUIOVector qiov;
1345     struct iovec iov = {
1346         .iov_base = (void *)buf,
1347         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1348     };
1349     Coroutine *co;
1350     RwCo rwco = {
1351         .bs = bs,
1352         .sector_num = sector_num,
1353         .nb_sectors = nb_sectors,
1354         .qiov = &qiov,
1355         .is_write = is_write,
1356         .ret = NOT_DONE,
1357     };
1358 
1359     qemu_iovec_init_external(&qiov, &iov, 1);
1360 
1361     if (qemu_in_coroutine()) {
1362         /* Fast-path if already in coroutine context */
1363         bdrv_rw_co_entry(&rwco);
1364     } else {
1365         co = qemu_coroutine_create(bdrv_rw_co_entry);
1366         qemu_coroutine_enter(co, &rwco);
1367         while (rwco.ret == NOT_DONE) {
1368             qemu_aio_wait();
1369         }
1370     }
1371     return rwco.ret;
1372 }
1373 
1374 /* return < 0 if error. See bdrv_write() for the return codes */
1375 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1376               uint8_t *buf, int nb_sectors)
1377 {
1378     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1379 }
1380 
1381 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1382                              int nb_sectors, int dirty)
1383 {
1384     int64_t start, end;
1385     unsigned long val, idx, bit;
1386 
1387     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1388     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1389 
1390     for (; start <= end; start++) {
1391         idx = start / (sizeof(unsigned long) * 8);
1392         bit = start % (sizeof(unsigned long) * 8);
1393         val = bs->dirty_bitmap[idx];
1394         if (dirty) {
1395             if (!(val & (1UL << bit))) {
1396                 bs->dirty_count++;
1397                 val |= 1UL << bit;
1398             }
1399         } else {
1400             if (val & (1UL << bit)) {
1401                 bs->dirty_count--;
1402                 val &= ~(1UL << bit);
1403             }
1404         }
1405         bs->dirty_bitmap[idx] = val;
1406     }
1407 }
1408 
1409 /* Return < 0 if error. Important errors are:
1410   -EIO         generic I/O error (may happen for all errors)
1411   -ENOMEDIUM   No media inserted.
1412   -EINVAL      Invalid sector number or nb_sectors
1413   -EACCES      Trying to write a read-only device
1414 */
1415 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1416                const uint8_t *buf, int nb_sectors)
1417 {
1418     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1419 }
1420 
1421 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1422                void *buf, int count1)
1423 {
1424     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1425     int len, nb_sectors, count;
1426     int64_t sector_num;
1427     int ret;
1428 
1429     count = count1;
1430     /* first read to align to sector start */
1431     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1432     if (len > count)
1433         len = count;
1434     sector_num = offset >> BDRV_SECTOR_BITS;
1435     if (len > 0) {
1436         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1437             return ret;
1438         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1439         count -= len;
1440         if (count == 0)
1441             return count1;
1442         sector_num++;
1443         buf += len;
1444     }
1445 
1446     /* read the sectors "in place" */
1447     nb_sectors = count >> BDRV_SECTOR_BITS;
1448     if (nb_sectors > 0) {
1449         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1450             return ret;
1451         sector_num += nb_sectors;
1452         len = nb_sectors << BDRV_SECTOR_BITS;
1453         buf += len;
1454         count -= len;
1455     }
1456 
1457     /* add data from the last sector */
1458     if (count > 0) {
1459         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1460             return ret;
1461         memcpy(buf, tmp_buf, count);
1462     }
1463     return count1;
1464 }
1465 
1466 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1467                 const void *buf, int count1)
1468 {
1469     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1470     int len, nb_sectors, count;
1471     int64_t sector_num;
1472     int ret;
1473 
1474     count = count1;
1475     /* first write to align to sector start */
1476     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1477     if (len > count)
1478         len = count;
1479     sector_num = offset >> BDRV_SECTOR_BITS;
1480     if (len > 0) {
1481         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1482             return ret;
1483         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1484         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1485             return ret;
1486         count -= len;
1487         if (count == 0)
1488             return count1;
1489         sector_num++;
1490         buf += len;
1491     }
1492 
1493     /* write the sectors "in place" */
1494     nb_sectors = count >> BDRV_SECTOR_BITS;
1495     if (nb_sectors > 0) {
1496         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1497             return ret;
1498         sector_num += nb_sectors;
1499         len = nb_sectors << BDRV_SECTOR_BITS;
1500         buf += len;
1501         count -= len;
1502     }
1503 
1504     /* add data from the last sector */
1505     if (count > 0) {
1506         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1507             return ret;
1508         memcpy(tmp_buf, buf, count);
1509         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1510             return ret;
1511     }
1512     return count1;
1513 }
1514 
1515 /*
1516  * Writes to the file and ensures that no writes are reordered across this
1517  * request (acts as a barrier)
1518  *
1519  * Returns 0 on success, -errno in error cases.
1520  */
1521 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1522     const void *buf, int count)
1523 {
1524     int ret;
1525 
1526     ret = bdrv_pwrite(bs, offset, buf, count);
1527     if (ret < 0) {
1528         return ret;
1529     }
1530 
1531     /* No flush needed for cache modes that use O_DSYNC */
1532     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1533         bdrv_flush(bs);
1534     }
1535 
1536     return 0;
1537 }
1538 
1539 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1540         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1541 {
1542     /* Perform I/O through a temporary buffer so that users who scribble over
1543      * their read buffer while the operation is in progress do not end up
1544      * modifying the image file.  This is critical for zero-copy guest I/O
1545      * where anything might happen inside guest memory.
1546      */
1547     void *bounce_buffer;
1548 
1549     BlockDriver *drv = bs->drv;
1550     struct iovec iov;
1551     QEMUIOVector bounce_qiov;
1552     int64_t cluster_sector_num;
1553     int cluster_nb_sectors;
1554     size_t skip_bytes;
1555     int ret;
1556 
1557     /* Cover entire cluster so no additional backing file I/O is required when
1558      * allocating cluster in the image file.
1559      */
1560     round_to_clusters(bs, sector_num, nb_sectors,
1561                       &cluster_sector_num, &cluster_nb_sectors);
1562 
1563     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1564                                    cluster_sector_num, cluster_nb_sectors);
1565 
1566     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1567     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1568     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1569 
1570     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1571                              &bounce_qiov);
1572     if (ret < 0) {
1573         goto err;
1574     }
1575 
1576     if (drv->bdrv_co_write_zeroes &&
1577         buffer_is_zero(bounce_buffer, iov.iov_len)) {
1578         ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
1579                                         cluster_nb_sectors);
1580     } else {
1581         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1582                                   &bounce_qiov);
1583     }
1584 
1585     if (ret < 0) {
1586         /* It might be okay to ignore write errors for guest requests.  If this
1587          * is a deliberate copy-on-read then we don't want to ignore the error.
1588          * Simply report it in all cases.
1589          */
1590         goto err;
1591     }
1592 
1593     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1594     qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1595                            nb_sectors * BDRV_SECTOR_SIZE);
1596 
1597 err:
1598     qemu_vfree(bounce_buffer);
1599     return ret;
1600 }
1601 
1602 /*
1603  * Handle a read request in coroutine context
1604  */
1605 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1606     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1607     BdrvRequestFlags flags)
1608 {
1609     BlockDriver *drv = bs->drv;
1610     BdrvTrackedRequest req;
1611     int ret;
1612 
1613     if (!drv) {
1614         return -ENOMEDIUM;
1615     }
1616     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1617         return -EIO;
1618     }
1619 
1620     /* throttling disk read I/O */
1621     if (bs->io_limits_enabled) {
1622         bdrv_io_limits_intercept(bs, false, nb_sectors);
1623     }
1624 
1625     if (bs->copy_on_read) {
1626         flags |= BDRV_REQ_COPY_ON_READ;
1627     }
1628     if (flags & BDRV_REQ_COPY_ON_READ) {
1629         bs->copy_on_read_in_flight++;
1630     }
1631 
1632     if (bs->copy_on_read_in_flight) {
1633         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1634     }
1635 
1636     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1637 
1638     if (flags & BDRV_REQ_COPY_ON_READ) {
1639         int pnum;
1640 
1641         ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1642         if (ret < 0) {
1643             goto out;
1644         }
1645 
1646         if (!ret || pnum != nb_sectors) {
1647             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1648             goto out;
1649         }
1650     }
1651 
1652     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1653 
1654 out:
1655     tracked_request_end(&req);
1656 
1657     if (flags & BDRV_REQ_COPY_ON_READ) {
1658         bs->copy_on_read_in_flight--;
1659     }
1660 
1661     return ret;
1662 }
1663 
1664 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1665     int nb_sectors, QEMUIOVector *qiov)
1666 {
1667     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1668 
1669     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1670 }
1671 
1672 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1673     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1674 {
1675     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1676 
1677     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1678                             BDRV_REQ_COPY_ON_READ);
1679 }
1680 
1681 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1682     int64_t sector_num, int nb_sectors)
1683 {
1684     BlockDriver *drv = bs->drv;
1685     QEMUIOVector qiov;
1686     struct iovec iov;
1687     int ret;
1688 
1689     /* First try the efficient write zeroes operation */
1690     if (drv->bdrv_co_write_zeroes) {
1691         return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1692     }
1693 
1694     /* Fall back to bounce buffer if write zeroes is unsupported */
1695     iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1696     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1697     memset(iov.iov_base, 0, iov.iov_len);
1698     qemu_iovec_init_external(&qiov, &iov, 1);
1699 
1700     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1701 
1702     qemu_vfree(iov.iov_base);
1703     return ret;
1704 }
1705 
1706 /*
1707  * Handle a write request in coroutine context
1708  */
1709 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1710     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1711     BdrvRequestFlags flags)
1712 {
1713     BlockDriver *drv = bs->drv;
1714     BdrvTrackedRequest req;
1715     int ret;
1716 
1717     if (!bs->drv) {
1718         return -ENOMEDIUM;
1719     }
1720     if (bs->read_only) {
1721         return -EACCES;
1722     }
1723     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1724         return -EIO;
1725     }
1726 
1727     /* throttling disk write I/O */
1728     if (bs->io_limits_enabled) {
1729         bdrv_io_limits_intercept(bs, true, nb_sectors);
1730     }
1731 
1732     if (bs->copy_on_read_in_flight) {
1733         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1734     }
1735 
1736     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1737 
1738     if (flags & BDRV_REQ_ZERO_WRITE) {
1739         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1740     } else {
1741         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1742     }
1743 
1744     if (bs->dirty_bitmap) {
1745         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1746     }
1747 
1748     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1749         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1750     }
1751 
1752     tracked_request_end(&req);
1753 
1754     return ret;
1755 }
1756 
1757 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1758     int nb_sectors, QEMUIOVector *qiov)
1759 {
1760     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1761 
1762     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1763 }
1764 
1765 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1766                                       int64_t sector_num, int nb_sectors)
1767 {
1768     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1769 
1770     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1771                              BDRV_REQ_ZERO_WRITE);
1772 }
1773 
1774 /**
1775  * Truncate file to 'offset' bytes (needed only for file protocols)
1776  */
1777 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1778 {
1779     BlockDriver *drv = bs->drv;
1780     int ret;
1781     if (!drv)
1782         return -ENOMEDIUM;
1783     if (!drv->bdrv_truncate)
1784         return -ENOTSUP;
1785     if (bs->read_only)
1786         return -EACCES;
1787     if (bdrv_in_use(bs))
1788         return -EBUSY;
1789     ret = drv->bdrv_truncate(bs, offset);
1790     if (ret == 0) {
1791         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1792         bdrv_dev_resize_cb(bs);
1793     }
1794     return ret;
1795 }
1796 
1797 /**
1798  * Length of a allocated file in bytes. Sparse files are counted by actual
1799  * allocated space. Return < 0 if error or unknown.
1800  */
1801 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1802 {
1803     BlockDriver *drv = bs->drv;
1804     if (!drv) {
1805         return -ENOMEDIUM;
1806     }
1807     if (drv->bdrv_get_allocated_file_size) {
1808         return drv->bdrv_get_allocated_file_size(bs);
1809     }
1810     if (bs->file) {
1811         return bdrv_get_allocated_file_size(bs->file);
1812     }
1813     return -ENOTSUP;
1814 }
1815 
1816 /**
1817  * Length of a file in bytes. Return < 0 if error or unknown.
1818  */
1819 int64_t bdrv_getlength(BlockDriverState *bs)
1820 {
1821     BlockDriver *drv = bs->drv;
1822     if (!drv)
1823         return -ENOMEDIUM;
1824 
1825     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1826         if (drv->bdrv_getlength) {
1827             return drv->bdrv_getlength(bs);
1828         }
1829     }
1830     return bs->total_sectors * BDRV_SECTOR_SIZE;
1831 }
1832 
1833 /* return 0 as number of sectors if no device present or error */
1834 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1835 {
1836     int64_t length;
1837     length = bdrv_getlength(bs);
1838     if (length < 0)
1839         length = 0;
1840     else
1841         length = length >> BDRV_SECTOR_BITS;
1842     *nb_sectors_ptr = length;
1843 }
1844 
/*
 * One entry of the MSDOS partition table, as overlaid by guess_disk_lchs()
 * on the bytes at offset 0x1be of the first sector.  The structure is
 * packed; field order and sizes must not change.
 */
struct partition {
    /* 0x80 - active */
    uint8_t boot_ind;
    /* starting head */
    uint8_t head;
    /* starting sector */
    uint8_t sector;
    /* starting cylinder */
    uint8_t cyl;
    /* What partition type */
    uint8_t sys_ind;
    /* end head */
    uint8_t end_head;
    /* end sector */
    uint8_t end_sector;
    /* end cylinder */
    uint8_t end_cyl;
    /* starting sector counting from 0 */
    uint32_t start_sect;
    /* nr of sectors in partition */
    uint32_t nr_sects;
} QEMU_PACKED;
1857 
1858 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1859 static int guess_disk_lchs(BlockDriverState *bs,
1860                            int *pcylinders, int *pheads, int *psectors)
1861 {
1862     uint8_t buf[BDRV_SECTOR_SIZE];
1863     int ret, i, heads, sectors, cylinders;
1864     struct partition *p;
1865     uint32_t nr_sects;
1866     uint64_t nb_sectors;
1867 
1868     bdrv_get_geometry(bs, &nb_sectors);
1869 
1870     ret = bdrv_read(bs, 0, buf, 1);
1871     if (ret < 0)
1872         return -1;
1873     /* test msdos magic */
1874     if (buf[510] != 0x55 || buf[511] != 0xaa)
1875         return -1;
1876     for(i = 0; i < 4; i++) {
1877         p = ((struct partition *)(buf + 0x1be)) + i;
1878         nr_sects = le32_to_cpu(p->nr_sects);
1879         if (nr_sects && p->end_head) {
1880             /* We make the assumption that the partition terminates on
1881                a cylinder boundary */
1882             heads = p->end_head + 1;
1883             sectors = p->end_sector & 63;
1884             if (sectors == 0)
1885                 continue;
1886             cylinders = nb_sectors / (heads * sectors);
1887             if (cylinders < 1 || cylinders > 16383)
1888                 continue;
1889             *pheads = heads;
1890             *psectors = sectors;
1891             *pcylinders = cylinders;
1892 #if 0
1893             printf("guessed geometry: LCHS=%d %d %d\n",
1894                    cylinders, heads, sectors);
1895 #endif
1896             return 0;
1897         }
1898     }
1899     return -1;
1900 }
1901 
1902 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1903 {
1904     int translation, lba_detected = 0;
1905     int cylinders, heads, secs;
1906     uint64_t nb_sectors;
1907 
1908     /* if a geometry hint is available, use it */
1909     bdrv_get_geometry(bs, &nb_sectors);
1910     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1911     translation = bdrv_get_translation_hint(bs);
1912     if (cylinders != 0) {
1913         *pcyls = cylinders;
1914         *pheads = heads;
1915         *psecs = secs;
1916     } else {
1917         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1918             if (heads > 16) {
1919                 /* if heads > 16, it means that a BIOS LBA
1920                    translation was active, so the default
1921                    hardware geometry is OK */
1922                 lba_detected = 1;
1923                 goto default_geometry;
1924             } else {
1925                 *pcyls = cylinders;
1926                 *pheads = heads;
1927                 *psecs = secs;
1928                 /* disable any translation to be in sync with
1929                    the logical geometry */
1930                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1931                     bdrv_set_translation_hint(bs,
1932                                               BIOS_ATA_TRANSLATION_NONE);
1933                 }
1934             }
1935         } else {
1936         default_geometry:
1937             /* if no geometry, use a standard physical disk geometry */
1938             cylinders = nb_sectors / (16 * 63);
1939 
1940             if (cylinders > 16383)
1941                 cylinders = 16383;
1942             else if (cylinders < 2)
1943                 cylinders = 2;
1944             *pcyls = cylinders;
1945             *pheads = 16;
1946             *psecs = 63;
1947             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1948                 if ((*pcyls * *pheads) <= 131072) {
1949                     bdrv_set_translation_hint(bs,
1950                                               BIOS_ATA_TRANSLATION_LARGE);
1951                 } else {
1952                     bdrv_set_translation_hint(bs,
1953                                               BIOS_ATA_TRANSLATION_LBA);
1954                 }
1955             }
1956         }
1957         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1958     }
1959 }
1960 
1961 void bdrv_set_geometry_hint(BlockDriverState *bs,
1962                             int cyls, int heads, int secs)
1963 {
1964     bs->cyls = cyls;
1965     bs->heads = heads;
1966     bs->secs = secs;
1967 }
1968 
1969 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1970 {
1971     bs->translation = translation;
1972 }
1973 
1974 void bdrv_get_geometry_hint(BlockDriverState *bs,
1975                             int *pcyls, int *pheads, int *psecs)
1976 {
1977     *pcyls = bs->cyls;
1978     *pheads = bs->heads;
1979     *psecs = bs->secs;
1980 }
1981 
1982 /* throttling disk io limits */
1983 void bdrv_set_io_limits(BlockDriverState *bs,
1984                         BlockIOLimit *io_limits)
1985 {
1986     bs->io_limits = *io_limits;
1987     bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
1988 }
1989 
/*
 * Recognize floppy formats.
 *
 * One row of the fd_formats[] table. The rows are written with positional
 * initializers, so the field order below must not change.
 */
typedef struct FDFormat {
    FDriveType drive;   /* drive type this format belongs to */
    uint8_t last_sect;  /* used as the sectors-per-track factor of the size */
    uint8_t max_track;  /* used as the track-count factor of the size */
    uint8_t max_head;   /* highest head index; head count is max_head + 1 */
} FDFormat;
1997 
/*
 * Table of known floppy formats, scanned in order by
 * bdrv_get_floppy_geometry_hint(). Each row is
 * { drive, last_sect, max_track, max_head }.
 *
 * The table is terminated by a row whose drive is FDRIVE_DRV_NONE; the
 * scanner stops on that field alone (the -1 values in the uint8_t fields of
 * the terminator are never used for matching).
 */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
2044 
2045 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2046                                    int *max_track, int *last_sect,
2047                                    FDriveType drive_in, FDriveType *drive)
2048 {
2049     const FDFormat *parse;
2050     uint64_t nb_sectors, size;
2051     int i, first_match, match;
2052 
2053     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2054     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2055         /* User defined disk */
2056     } else {
2057         bdrv_get_geometry(bs, &nb_sectors);
2058         match = -1;
2059         first_match = -1;
2060         for (i = 0; ; i++) {
2061             parse = &fd_formats[i];
2062             if (parse->drive == FDRIVE_DRV_NONE) {
2063                 break;
2064             }
2065             if (drive_in == parse->drive ||
2066                 drive_in == FDRIVE_DRV_NONE) {
2067                 size = (parse->max_head + 1) * parse->max_track *
2068                     parse->last_sect;
2069                 if (nb_sectors == size) {
2070                     match = i;
2071                     break;
2072                 }
2073                 if (first_match == -1) {
2074                     first_match = i;
2075                 }
2076             }
2077         }
2078         if (match == -1) {
2079             if (first_match == -1) {
2080                 match = 1;
2081             } else {
2082                 match = first_match;
2083             }
2084             parse = &fd_formats[match];
2085         }
2086         *nb_heads = parse->max_head + 1;
2087         *max_track = parse->max_track;
2088         *last_sect = parse->last_sect;
2089         *drive = parse->drive;
2090     }
2091 }
2092 
2093 int bdrv_get_translation_hint(BlockDriverState *bs)
2094 {
2095     return bs->translation;
2096 }
2097 
2098 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2099                        BlockErrorAction on_write_error)
2100 {
2101     bs->on_read_error = on_read_error;
2102     bs->on_write_error = on_write_error;
2103 }
2104 
2105 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2106 {
2107     return is_read ? bs->on_read_error : bs->on_write_error;
2108 }
2109 
2110 int bdrv_is_read_only(BlockDriverState *bs)
2111 {
2112     return bs->read_only;
2113 }
2114 
2115 int bdrv_is_sg(BlockDriverState *bs)
2116 {
2117     return bs->sg;
2118 }
2119 
2120 int bdrv_enable_write_cache(BlockDriverState *bs)
2121 {
2122     return bs->enable_write_cache;
2123 }
2124 
2125 int bdrv_is_encrypted(BlockDriverState *bs)
2126 {
2127     if (bs->backing_hd && bs->backing_hd->encrypted)
2128         return 1;
2129     return bs->encrypted;
2130 }
2131 
2132 int bdrv_key_required(BlockDriverState *bs)
2133 {
2134     BlockDriverState *backing_hd = bs->backing_hd;
2135 
2136     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2137         return 1;
2138     return (bs->encrypted && !bs->valid_key);
2139 }
2140 
2141 int bdrv_set_key(BlockDriverState *bs, const char *key)
2142 {
2143     int ret;
2144     if (bs->backing_hd && bs->backing_hd->encrypted) {
2145         ret = bdrv_set_key(bs->backing_hd, key);
2146         if (ret < 0)
2147             return ret;
2148         if (!bs->encrypted)
2149             return 0;
2150     }
2151     if (!bs->encrypted) {
2152         return -EINVAL;
2153     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2154         return -ENOMEDIUM;
2155     }
2156     ret = bs->drv->bdrv_set_key(bs, key);
2157     if (ret < 0) {
2158         bs->valid_key = 0;
2159     } else if (!bs->valid_key) {
2160         bs->valid_key = 1;
2161         /* call the change callback now, we skipped it on open */
2162         bdrv_dev_change_media_cb(bs, true);
2163     }
2164     return ret;
2165 }
2166 
2167 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2168 {
2169     if (!bs->drv) {
2170         buf[0] = '\0';
2171     } else {
2172         pstrcpy(buf, buf_size, bs->drv->format_name);
2173     }
2174 }
2175 
2176 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2177                          void *opaque)
2178 {
2179     BlockDriver *drv;
2180 
2181     QLIST_FOREACH(drv, &bdrv_drivers, list) {
2182         it(opaque, drv->format_name);
2183     }
2184 }
2185 
2186 BlockDriverState *bdrv_find(const char *name)
2187 {
2188     BlockDriverState *bs;
2189 
2190     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2191         if (!strcmp(name, bs->device_name)) {
2192             return bs;
2193         }
2194     }
2195     return NULL;
2196 }
2197 
2198 BlockDriverState *bdrv_next(BlockDriverState *bs)
2199 {
2200     if (!bs) {
2201         return QTAILQ_FIRST(&bdrv_states);
2202     }
2203     return QTAILQ_NEXT(bs, list);
2204 }
2205 
2206 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2207 {
2208     BlockDriverState *bs;
2209 
2210     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2211         it(opaque, bs);
2212     }
2213 }
2214 
2215 const char *bdrv_get_device_name(BlockDriverState *bs)
2216 {
2217     return bs->device_name;
2218 }
2219 
2220 void bdrv_flush_all(void)
2221 {
2222     BlockDriverState *bs;
2223 
2224     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2225         if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
2226             bdrv_flush(bs);
2227         }
2228     }
2229 }
2230 
2231 int bdrv_has_zero_init(BlockDriverState *bs)
2232 {
2233     assert(bs->drv);
2234 
2235     if (bs->drv->bdrv_has_zero_init) {
2236         return bs->drv->bdrv_has_zero_init(bs);
2237     }
2238 
2239     return 1;
2240 }
2241 
/*
 * Argument/result bundle handed from bdrv_is_allocated() to its coroutine
 * entry point bdrv_is_allocated_co_entry().
 */
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;   /* in: passed to bdrv_co_is_allocated() */
    int64_t sector_num;     /* in: passed to bdrv_co_is_allocated() */
    int nb_sectors;         /* in: passed to bdrv_co_is_allocated() */
    int *pnum;              /* in: caller's output pointer, passed through */
    int ret;                /* out: return value of bdrv_co_is_allocated() */
    bool done;              /* out: set to true once 'ret' has been stored */
} BdrvCoIsAllocatedData;
2250 
2251 /*
2252  * Returns true iff the specified sector is present in the disk image. Drivers
2253  * not implementing the functionality are assumed to not support backing files,
2254  * hence all their sectors are reported as allocated.
2255  *
2256  * If 'sector_num' is beyond the end of the disk image the return value is 0
2257  * and 'pnum' is set to 0.
2258  *
2259  * 'pnum' is set to the number of sectors (including and immediately following
2260  * the specified sector) that are known to be in the same
2261  * allocated/unallocated state.
2262  *
2263  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2264  * beyond the end of the disk image it will be clamped.
2265  */
2266 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2267                                       int nb_sectors, int *pnum)
2268 {
2269     int64_t n;
2270 
2271     if (sector_num >= bs->total_sectors) {
2272         *pnum = 0;
2273         return 0;
2274     }
2275 
2276     n = bs->total_sectors - sector_num;
2277     if (n < nb_sectors) {
2278         nb_sectors = n;
2279     }
2280 
2281     if (!bs->drv->bdrv_co_is_allocated) {
2282         *pnum = nb_sectors;
2283         return 1;
2284     }
2285 
2286     return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2287 }
2288 
2289 /* Coroutine wrapper for bdrv_is_allocated() */
2290 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2291 {
2292     BdrvCoIsAllocatedData *data = opaque;
2293     BlockDriverState *bs = data->bs;
2294 
2295     data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2296                                      data->pnum);
2297     data->done = true;
2298 }
2299 
2300 /*
2301  * Synchronous wrapper around bdrv_co_is_allocated().
2302  *
2303  * See bdrv_co_is_allocated() for details.
2304  */
2305 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2306                       int *pnum)
2307 {
2308     Coroutine *co;
2309     BdrvCoIsAllocatedData data = {
2310         .bs = bs,
2311         .sector_num = sector_num,
2312         .nb_sectors = nb_sectors,
2313         .pnum = pnum,
2314         .done = false,
2315     };
2316 
2317     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2318     qemu_coroutine_enter(co, &data);
2319     while (!data.done) {
2320         qemu_aio_wait();
2321     }
2322     return data.ret;
2323 }
2324 
2325 BlockInfoList *qmp_query_block(Error **errp)
2326 {
2327     BlockInfoList *head = NULL, *cur_item = NULL;
2328     BlockDriverState *bs;
2329 
2330     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2331         BlockInfoList *info = g_malloc0(sizeof(*info));
2332 
2333         info->value = g_malloc0(sizeof(*info->value));
2334         info->value->device = g_strdup(bs->device_name);
2335         info->value->type = g_strdup("unknown");
2336         info->value->locked = bdrv_dev_is_medium_locked(bs);
2337         info->value->removable = bdrv_dev_has_removable_media(bs);
2338 
2339         if (bdrv_dev_has_removable_media(bs)) {
2340             info->value->has_tray_open = true;
2341             info->value->tray_open = bdrv_dev_is_tray_open(bs);
2342         }
2343 
2344         if (bdrv_iostatus_is_enabled(bs)) {
2345             info->value->has_io_status = true;
2346             info->value->io_status = bs->iostatus;
2347         }
2348 
2349         if (bs->drv) {
2350             info->value->has_inserted = true;
2351             info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2352             info->value->inserted->file = g_strdup(bs->filename);
2353             info->value->inserted->ro = bs->read_only;
2354             info->value->inserted->drv = g_strdup(bs->drv->format_name);
2355             info->value->inserted->encrypted = bs->encrypted;
2356             if (bs->backing_file[0]) {
2357                 info->value->inserted->has_backing_file = true;
2358                 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2359             }
2360 
2361             if (bs->io_limits_enabled) {
2362                 info->value->inserted->bps =
2363                                bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2364                 info->value->inserted->bps_rd =
2365                                bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2366                 info->value->inserted->bps_wr =
2367                                bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2368                 info->value->inserted->iops =
2369                                bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2370                 info->value->inserted->iops_rd =
2371                                bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2372                 info->value->inserted->iops_wr =
2373                                bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2374             }
2375         }
2376 
2377         /* XXX: waiting for the qapi to support GSList */
2378         if (!cur_item) {
2379             head = cur_item = info;
2380         } else {
2381             cur_item->next = info;
2382             cur_item = info;
2383         }
2384     }
2385 
2386     return head;
2387 }
2388 
2389 /* Consider exposing this as a full fledged QMP command */
2390 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2391 {
2392     BlockStats *s;
2393 
2394     s = g_malloc0(sizeof(*s));
2395 
2396     if (bs->device_name[0]) {
2397         s->has_device = true;
2398         s->device = g_strdup(bs->device_name);
2399     }
2400 
2401     s->stats = g_malloc0(sizeof(*s->stats));
2402     s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2403     s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2404     s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2405     s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2406     s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2407     s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2408     s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2409     s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2410     s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2411 
2412     if (bs->file) {
2413         s->has_parent = true;
2414         s->parent = qmp_query_blockstat(bs->file, NULL);
2415     }
2416 
2417     return s;
2418 }
2419 
2420 BlockStatsList *qmp_query_blockstats(Error **errp)
2421 {
2422     BlockStatsList *head = NULL, *cur_item = NULL;
2423     BlockDriverState *bs;
2424 
2425     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2426         BlockStatsList *info = g_malloc0(sizeof(*info));
2427         info->value = qmp_query_blockstat(bs, NULL);
2428 
2429         /* XXX: waiting for the qapi to support GSList */
2430         if (!cur_item) {
2431             head = cur_item = info;
2432         } else {
2433             cur_item->next = info;
2434             cur_item = info;
2435         }
2436     }
2437 
2438     return head;
2439 }
2440 
2441 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2442 {
2443     if (bs->backing_hd && bs->backing_hd->encrypted)
2444         return bs->backing_file;
2445     else if (bs->encrypted)
2446         return bs->filename;
2447     else
2448         return NULL;
2449 }
2450 
2451 void bdrv_get_backing_filename(BlockDriverState *bs,
2452                                char *filename, int filename_size)
2453 {
2454     pstrcpy(filename, filename_size, bs->backing_file);
2455 }
2456 
2457 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2458                           const uint8_t *buf, int nb_sectors)
2459 {
2460     BlockDriver *drv = bs->drv;
2461     if (!drv)
2462         return -ENOMEDIUM;
2463     if (!drv->bdrv_write_compressed)
2464         return -ENOTSUP;
2465     if (bdrv_check_request(bs, sector_num, nb_sectors))
2466         return -EIO;
2467 
2468     if (bs->dirty_bitmap) {
2469         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2470     }
2471 
2472     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2473 }
2474 
2475 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2476 {
2477     BlockDriver *drv = bs->drv;
2478     if (!drv)
2479         return -ENOMEDIUM;
2480     if (!drv->bdrv_get_info)
2481         return -ENOTSUP;
2482     memset(bdi, 0, sizeof(*bdi));
2483     return drv->bdrv_get_info(bs, bdi);
2484 }
2485 
2486 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2487                       int64_t pos, int size)
2488 {
2489     BlockDriver *drv = bs->drv;
2490     if (!drv)
2491         return -ENOMEDIUM;
2492     if (drv->bdrv_save_vmstate)
2493         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2494     if (bs->file)
2495         return bdrv_save_vmstate(bs->file, buf, pos, size);
2496     return -ENOTSUP;
2497 }
2498 
2499 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2500                       int64_t pos, int size)
2501 {
2502     BlockDriver *drv = bs->drv;
2503     if (!drv)
2504         return -ENOMEDIUM;
2505     if (drv->bdrv_load_vmstate)
2506         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2507     if (bs->file)
2508         return bdrv_load_vmstate(bs->file, buf, pos, size);
2509     return -ENOTSUP;
2510 }
2511 
2512 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2513 {
2514     BlockDriver *drv = bs->drv;
2515 
2516     if (!drv || !drv->bdrv_debug_event) {
2517         return;
2518     }
2519 
2520     return drv->bdrv_debug_event(bs, event);
2521 
2522 }
2523 
2524 /**************************************************************/
2525 /* handling of snapshots */
2526 
2527 int bdrv_can_snapshot(BlockDriverState *bs)
2528 {
2529     BlockDriver *drv = bs->drv;
2530     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2531         return 0;
2532     }
2533 
2534     if (!drv->bdrv_snapshot_create) {
2535         if (bs->file != NULL) {
2536             return bdrv_can_snapshot(bs->file);
2537         }
2538         return 0;
2539     }
2540 
2541     return 1;
2542 }
2543 
2544 int bdrv_is_snapshot(BlockDriverState *bs)
2545 {
2546     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2547 }
2548 
2549 BlockDriverState *bdrv_snapshots(void)
2550 {
2551     BlockDriverState *bs;
2552 
2553     if (bs_snapshots) {
2554         return bs_snapshots;
2555     }
2556 
2557     bs = NULL;
2558     while ((bs = bdrv_next(bs))) {
2559         if (bdrv_can_snapshot(bs)) {
2560             bs_snapshots = bs;
2561             return bs;
2562         }
2563     }
2564     return NULL;
2565 }
2566 
2567 int bdrv_snapshot_create(BlockDriverState *bs,
2568                          QEMUSnapshotInfo *sn_info)
2569 {
2570     BlockDriver *drv = bs->drv;
2571     if (!drv)
2572         return -ENOMEDIUM;
2573     if (drv->bdrv_snapshot_create)
2574         return drv->bdrv_snapshot_create(bs, sn_info);
2575     if (bs->file)
2576         return bdrv_snapshot_create(bs->file, sn_info);
2577     return -ENOTSUP;
2578 }
2579 
2580 int bdrv_snapshot_goto(BlockDriverState *bs,
2581                        const char *snapshot_id)
2582 {
2583     BlockDriver *drv = bs->drv;
2584     int ret, open_ret;
2585 
2586     if (!drv)
2587         return -ENOMEDIUM;
2588     if (drv->bdrv_snapshot_goto)
2589         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2590 
2591     if (bs->file) {
2592         drv->bdrv_close(bs);
2593         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2594         open_ret = drv->bdrv_open(bs, bs->open_flags);
2595         if (open_ret < 0) {
2596             bdrv_delete(bs->file);
2597             bs->drv = NULL;
2598             return open_ret;
2599         }
2600         return ret;
2601     }
2602 
2603     return -ENOTSUP;
2604 }
2605 
2606 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2607 {
2608     BlockDriver *drv = bs->drv;
2609     if (!drv)
2610         return -ENOMEDIUM;
2611     if (drv->bdrv_snapshot_delete)
2612         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2613     if (bs->file)
2614         return bdrv_snapshot_delete(bs->file, snapshot_id);
2615     return -ENOTSUP;
2616 }
2617 
2618 int bdrv_snapshot_list(BlockDriverState *bs,
2619                        QEMUSnapshotInfo **psn_info)
2620 {
2621     BlockDriver *drv = bs->drv;
2622     if (!drv)
2623         return -ENOMEDIUM;
2624     if (drv->bdrv_snapshot_list)
2625         return drv->bdrv_snapshot_list(bs, psn_info);
2626     if (bs->file)
2627         return bdrv_snapshot_list(bs->file, psn_info);
2628     return -ENOTSUP;
2629 }
2630 
2631 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2632         const char *snapshot_name)
2633 {
2634     BlockDriver *drv = bs->drv;
2635     if (!drv) {
2636         return -ENOMEDIUM;
2637     }
2638     if (!bs->read_only) {
2639         return -EINVAL;
2640     }
2641     if (drv->bdrv_snapshot_load_tmp) {
2642         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2643     }
2644     return -ENOTSUP;
2645 }
2646 
2647 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2648         const char *backing_file)
2649 {
2650     if (!bs->drv) {
2651         return NULL;
2652     }
2653 
2654     if (bs->backing_hd) {
2655         if (strcmp(bs->backing_file, backing_file) == 0) {
2656             return bs->backing_hd;
2657         } else {
2658             return bdrv_find_backing_image(bs->backing_hd, backing_file);
2659         }
2660     }
2661 
2662     return NULL;
2663 }
2664 
#define NB_SUFFIXES 4

/*
 * Format 'size' as a short human readable string in 'buf' (capacity
 * 'buf_size') and return 'buf'.
 *
 * Values up to 999 are printed as plain integers. Larger values are scaled
 * by powers of 1024 and tagged K, M, G or T: one decimal is shown below ten
 * units, a rounded integer below a thousand units, and anything beyond the
 * T range is printed as a rounded integer number of T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    /* exactly NB_SUFFIXES characters, deliberately without a terminator */
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < 10 * unit) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            return buf;
        }
        /* the last suffix takes everything that is left */
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            return buf;
        }
    }

    return buf;
}
2694 
2695 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2696 {
2697     char buf1[128], date_buf[128], clock_buf[128];
2698 #ifdef _WIN32
2699     struct tm *ptm;
2700 #else
2701     struct tm tm;
2702 #endif
2703     time_t ti;
2704     int64_t secs;
2705 
2706     if (!sn) {
2707         snprintf(buf, buf_size,
2708                  "%-10s%-20s%7s%20s%15s",
2709                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2710     } else {
2711         ti = sn->date_sec;
2712 #ifdef _WIN32
2713         ptm = localtime(&ti);
2714         strftime(date_buf, sizeof(date_buf),
2715                  "%Y-%m-%d %H:%M:%S", ptm);
2716 #else
2717         localtime_r(&ti, &tm);
2718         strftime(date_buf, sizeof(date_buf),
2719                  "%Y-%m-%d %H:%M:%S", &tm);
2720 #endif
2721         secs = sn->vm_clock_nsec / 1000000000;
2722         snprintf(clock_buf, sizeof(clock_buf),
2723                  "%02d:%02d:%02d.%03d",
2724                  (int)(secs / 3600),
2725                  (int)((secs / 60) % 60),
2726                  (int)(secs % 60),
2727                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2728         snprintf(buf, buf_size,
2729                  "%-10s%-20s%7s%20s%15s",
2730                  sn->id_str, sn->name,
2731                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2732                  date_buf,
2733                  clock_buf);
2734     }
2735     return buf;
2736 }
2737 
2738 /**************************************************************/
2739 /* async I/Os */
2740 
2741 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2742                                  QEMUIOVector *qiov, int nb_sectors,
2743                                  BlockDriverCompletionFunc *cb, void *opaque)
2744 {
2745     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2746 
2747     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2748                                  cb, opaque, false);
2749 }
2750 
2751 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2752                                   QEMUIOVector *qiov, int nb_sectors,
2753                                   BlockDriverCompletionFunc *cb, void *opaque)
2754 {
2755     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2756 
2757     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2758                                  cb, opaque, true);
2759 }
2760 
2761 
/*
 * Shared completion state of one bdrv_aio_multiwrite() call.
 * Allocated with room for one callbacks[] entry per original request.
 */
typedef struct MultiwriteCB {
    int error;          /* first negative result seen by multiwrite_cb(), else 0 */
    int num_requests;   /* submitted (post-merge) requests not yet completed */
    int num_callbacks;  /* number of callbacks[] entries (original requests) */
    struct {
        BlockDriverCompletionFunc *cb;  /* original request's callback */
        void *opaque;                   /* argument passed to 'cb' */
        QEMUIOVector *free_qiov;        /* merged qiov to destroy and free, or NULL */
        void *free_buf;                 /* zero-fill buffer to free, or NULL */
    } callbacks[];
} MultiwriteCB;
2773 
2774 static void multiwrite_user_cb(MultiwriteCB *mcb)
2775 {
2776     int i;
2777 
2778     for (i = 0; i < mcb->num_callbacks; i++) {
2779         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2780         if (mcb->callbacks[i].free_qiov) {
2781             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2782         }
2783         g_free(mcb->callbacks[i].free_qiov);
2784         qemu_vfree(mcb->callbacks[i].free_buf);
2785     }
2786 }
2787 
2788 static void multiwrite_cb(void *opaque, int ret)
2789 {
2790     MultiwriteCB *mcb = opaque;
2791 
2792     trace_multiwrite_cb(mcb, ret);
2793 
2794     if (ret < 0 && !mcb->error) {
2795         mcb->error = ret;
2796     }
2797 
2798     mcb->num_requests--;
2799     if (mcb->num_requests == 0) {
2800         multiwrite_user_cb(mcb);
2801         g_free(mcb);
2802     }
2803 }
2804 
2805 static int multiwrite_req_compare(const void *a, const void *b)
2806 {
2807     const BlockRequest *req1 = a, *req2 = b;
2808 
2809     /*
2810      * Note that we can't simply subtract req2->sector from req1->sector
2811      * here as that could overflow the return value.
2812      */
2813     if (req1->sector > req2->sector) {
2814         return 1;
2815     } else if (req1->sector < req2->sector) {
2816         return -1;
2817     } else {
2818         return 0;
2819     }
2820 }
2821 
2822 /*
2823  * Takes a bunch of requests and tries to merge them. Returns the number of
2824  * requests that remain after merging.
2825  */
2826 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2827     int num_reqs, MultiwriteCB *mcb)
2828 {
2829     int i, outidx;
2830 
2831     // Sort requests by start sector
2832     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2833 
2834     // Check if adjacent requests touch the same clusters. If so, combine them,
2835     // filling up gaps with zero sectors.
2836     outidx = 0;
2837     for (i = 1; i < num_reqs; i++) {
2838         int merge = 0;
2839         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2840 
2841         // This handles the cases that are valid for all block drivers, namely
2842         // exactly sequential writes and overlapping writes.
2843         if (reqs[i].sector <= oldreq_last) {
2844             merge = 1;
2845         }
2846 
2847         // The block driver may decide that it makes sense to combine requests
2848         // even if there is a gap of some sectors between them. In this case,
2849         // the gap is filled with zeros (therefore only applicable for yet
2850         // unused space in format like qcow2).
2851         if (!merge && bs->drv->bdrv_merge_requests) {
2852             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2853         }
2854 
2855         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2856             merge = 0;
2857         }
2858 
2859         if (merge) {
2860             size_t size;
2861             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2862             qemu_iovec_init(qiov,
2863                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2864 
2865             // Add the first request to the merged one. If the requests are
2866             // overlapping, drop the last sectors of the first request.
2867             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2868             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2869 
2870             // We might need to add some zeros between the two requests
2871             if (reqs[i].sector > oldreq_last) {
2872                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2873                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2874                 memset(buf, 0, zero_bytes);
2875                 qemu_iovec_add(qiov, buf, zero_bytes);
2876                 mcb->callbacks[i].free_buf = buf;
2877             }
2878 
2879             // Add the second request
2880             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2881 
2882             reqs[outidx].nb_sectors = qiov->size >> 9;
2883             reqs[outidx].qiov = qiov;
2884 
2885             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2886         } else {
2887             outidx++;
2888             reqs[outidx].sector     = reqs[i].sector;
2889             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2890             reqs[outidx].qiov       = reqs[i].qiov;
2891         }
2892     }
2893 
2894     return outidx + 1;
2895 }
2896 
2897 /*
2898  * Submit multiple AIO write requests at once.
2899  *
2900  * On success, the function returns 0 and all requests in the reqs array have
2901  * been submitted. In error case this function returns -1, and any of the
2902  * requests may or may not be submitted yet. In particular, this means that the
2903  * callback will be called for some of the requests, for others it won't. The
2904  * caller must check the error field of the BlockRequest to wait for the right
2905  * callbacks (if error != 0, no callback will be called).
2906  *
2907  * The implementation may modify the contents of the reqs array, e.g. to merge
2908  * requests. However, the fields opaque and error are left unmodified as they
2909  * are used to signal failure for a single request to the caller.
2910  */
2911 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2912 {
2913     MultiwriteCB *mcb;
2914     int i;
2915 
2916     /* don't submit writes if we don't have a medium */
2917     if (bs->drv == NULL) {
2918         for (i = 0; i < num_reqs; i++) {
2919             reqs[i].error = -ENOMEDIUM;
2920         }
2921         return -1;
2922     }
2923 
2924     if (num_reqs == 0) {
2925         return 0;
2926     }
2927 
2928     // Create MultiwriteCB structure
2929     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2930     mcb->num_requests = 0;
2931     mcb->num_callbacks = num_reqs;
2932 
2933     for (i = 0; i < num_reqs; i++) {
2934         mcb->callbacks[i].cb = reqs[i].cb;
2935         mcb->callbacks[i].opaque = reqs[i].opaque;
2936     }
2937 
2938     // Check for mergable requests
2939     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2940 
2941     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2942 
2943     /* Run the aio requests. */
2944     mcb->num_requests = num_reqs;
2945     for (i = 0; i < num_reqs; i++) {
2946         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2947             reqs[i].nb_sectors, multiwrite_cb, mcb);
2948     }
2949 
2950     return 0;
2951 }
2952 
2953 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2954 {
2955     acb->pool->cancel(acb);
2956 }
2957 
2958 /* block I/O throttling */
2959 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
2960                  bool is_write, double elapsed_time, uint64_t *wait)
2961 {
2962     uint64_t bps_limit = 0;
2963     double   bytes_limit, bytes_base, bytes_res;
2964     double   slice_time, wait_time;
2965 
2966     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2967         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2968     } else if (bs->io_limits.bps[is_write]) {
2969         bps_limit = bs->io_limits.bps[is_write];
2970     } else {
2971         if (wait) {
2972             *wait = 0;
2973         }
2974 
2975         return false;
2976     }
2977 
2978     slice_time = bs->slice_end - bs->slice_start;
2979     slice_time /= (NANOSECONDS_PER_SECOND);
2980     bytes_limit = bps_limit * slice_time;
2981     bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
2982     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2983         bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
2984     }
2985 
2986     /* bytes_base: the bytes of data which have been read/written; and
2987      *             it is obtained from the history statistic info.
2988      * bytes_res: the remaining bytes of data which need to be read/written.
2989      * (bytes_base + bytes_res) / bps_limit: used to calcuate
2990      *             the total time for completing reading/writting all data.
2991      */
2992     bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2993 
2994     if (bytes_base + bytes_res <= bytes_limit) {
2995         if (wait) {
2996             *wait = 0;
2997         }
2998 
2999         return false;
3000     }
3001 
3002     /* Calc approx time to dispatch */
3003     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3004 
3005     /* When the I/O rate at runtime exceeds the limits,
3006      * bs->slice_end need to be extended in order that the current statistic
3007      * info can be kept until the timer fire, so it is increased and tuned
3008      * based on the result of experiment.
3009      */
3010     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3011     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3012     if (wait) {
3013         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3014     }
3015 
3016     return true;
3017 }
3018 
3019 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3020                              double elapsed_time, uint64_t *wait)
3021 {
3022     uint64_t iops_limit = 0;
3023     double   ios_limit, ios_base;
3024     double   slice_time, wait_time;
3025 
3026     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3027         iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3028     } else if (bs->io_limits.iops[is_write]) {
3029         iops_limit = bs->io_limits.iops[is_write];
3030     } else {
3031         if (wait) {
3032             *wait = 0;
3033         }
3034 
3035         return false;
3036     }
3037 
3038     slice_time = bs->slice_end - bs->slice_start;
3039     slice_time /= (NANOSECONDS_PER_SECOND);
3040     ios_limit  = iops_limit * slice_time;
3041     ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3042     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3043         ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3044     }
3045 
3046     if (ios_base + 1 <= ios_limit) {
3047         if (wait) {
3048             *wait = 0;
3049         }
3050 
3051         return false;
3052     }
3053 
3054     /* Calc approx time to dispatch */
3055     wait_time = (ios_base + 1) / iops_limit;
3056     if (wait_time > elapsed_time) {
3057         wait_time = wait_time - elapsed_time;
3058     } else {
3059         wait_time = 0;
3060     }
3061 
3062     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3063     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3064     if (wait) {
3065         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3066     }
3067 
3068     return true;
3069 }
3070 
3071 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3072                            bool is_write, int64_t *wait)
3073 {
3074     int64_t  now, max_wait;
3075     uint64_t bps_wait = 0, iops_wait = 0;
3076     double   elapsed_time;
3077     int      bps_ret, iops_ret;
3078 
3079     now = qemu_get_clock_ns(vm_clock);
3080     if ((bs->slice_start < now)
3081         && (bs->slice_end > now)) {
3082         bs->slice_end = now + bs->slice_time;
3083     } else {
3084         bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
3085         bs->slice_start = now;
3086         bs->slice_end   = now + bs->slice_time;
3087 
3088         bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
3089         bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3090 
3091         bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
3092         bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
3093     }
3094 
3095     elapsed_time  = now - bs->slice_start;
3096     elapsed_time  /= (NANOSECONDS_PER_SECOND);
3097 
3098     bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
3099                                       is_write, elapsed_time, &bps_wait);
3100     iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3101                                       elapsed_time, &iops_wait);
3102     if (bps_ret || iops_ret) {
3103         max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3104         if (wait) {
3105             *wait = max_wait;
3106         }
3107 
3108         now = qemu_get_clock_ns(vm_clock);
3109         if (bs->slice_end < now + max_wait) {
3110             bs->slice_end = now + max_wait;
3111         }
3112 
3113         return true;
3114     }
3115 
3116     if (wait) {
3117         *wait = 0;
3118     }
3119 
3120     return false;
3121 }
3122 
3123 /**************************************************************/
3124 /* async block device emulation */
3125 
3126 typedef struct BlockDriverAIOCBSync {
3127     BlockDriverAIOCB common;
3128     QEMUBH *bh;
3129     int ret;
3130     /* vector translation state */
3131     QEMUIOVector *qiov;
3132     uint8_t *bounce;
3133     int is_write;
3134 } BlockDriverAIOCBSync;
3135 
3136 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3137 {
3138     BlockDriverAIOCBSync *acb =
3139         container_of(blockacb, BlockDriverAIOCBSync, common);
3140     qemu_bh_delete(acb->bh);
3141     acb->bh = NULL;
3142     qemu_aio_release(acb);
3143 }
3144 
3145 static AIOPool bdrv_em_aio_pool = {
3146     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3147     .cancel             = bdrv_aio_cancel_em,
3148 };
3149 
3150 static void bdrv_aio_bh_cb(void *opaque)
3151 {
3152     BlockDriverAIOCBSync *acb = opaque;
3153 
3154     if (!acb->is_write)
3155         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3156     qemu_vfree(acb->bounce);
3157     acb->common.cb(acb->common.opaque, acb->ret);
3158     qemu_bh_delete(acb->bh);
3159     acb->bh = NULL;
3160     qemu_aio_release(acb);
3161 }
3162 
3163 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3164                                             int64_t sector_num,
3165                                             QEMUIOVector *qiov,
3166                                             int nb_sectors,
3167                                             BlockDriverCompletionFunc *cb,
3168                                             void *opaque,
3169                                             int is_write)
3170 
3171 {
3172     BlockDriverAIOCBSync *acb;
3173 
3174     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3175     acb->is_write = is_write;
3176     acb->qiov = qiov;
3177     acb->bounce = qemu_blockalign(bs, qiov->size);
3178     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3179 
3180     if (is_write) {
3181         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3182         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3183     } else {
3184         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3185     }
3186 
3187     qemu_bh_schedule(acb->bh);
3188 
3189     return &acb->common;
3190 }
3191 
3192 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3193         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3194         BlockDriverCompletionFunc *cb, void *opaque)
3195 {
3196     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3197 }
3198 
3199 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3200         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3201         BlockDriverCompletionFunc *cb, void *opaque)
3202 {
3203     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3204 }
3205 
3206 
3207 typedef struct BlockDriverAIOCBCoroutine {
3208     BlockDriverAIOCB common;
3209     BlockRequest req;
3210     bool is_write;
3211     QEMUBH* bh;
3212 } BlockDriverAIOCBCoroutine;
3213 
3214 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3215 {
3216     qemu_aio_flush();
3217 }
3218 
3219 static AIOPool bdrv_em_co_aio_pool = {
3220     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3221     .cancel             = bdrv_aio_co_cancel_em,
3222 };
3223 
3224 static void bdrv_co_em_bh(void *opaque)
3225 {
3226     BlockDriverAIOCBCoroutine *acb = opaque;
3227 
3228     acb->common.cb(acb->common.opaque, acb->req.error);
3229     qemu_bh_delete(acb->bh);
3230     qemu_aio_release(acb);
3231 }
3232 
3233 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3234 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3235 {
3236     BlockDriverAIOCBCoroutine *acb = opaque;
3237     BlockDriverState *bs = acb->common.bs;
3238 
3239     if (!acb->is_write) {
3240         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3241             acb->req.nb_sectors, acb->req.qiov, 0);
3242     } else {
3243         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3244             acb->req.nb_sectors, acb->req.qiov, 0);
3245     }
3246 
3247     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3248     qemu_bh_schedule(acb->bh);
3249 }
3250 
3251 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3252                                                int64_t sector_num,
3253                                                QEMUIOVector *qiov,
3254                                                int nb_sectors,
3255                                                BlockDriverCompletionFunc *cb,
3256                                                void *opaque,
3257                                                bool is_write)
3258 {
3259     Coroutine *co;
3260     BlockDriverAIOCBCoroutine *acb;
3261 
3262     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3263     acb->req.sector = sector_num;
3264     acb->req.nb_sectors = nb_sectors;
3265     acb->req.qiov = qiov;
3266     acb->is_write = is_write;
3267 
3268     co = qemu_coroutine_create(bdrv_co_do_rw);
3269     qemu_coroutine_enter(co, acb);
3270 
3271     return &acb->common;
3272 }
3273 
3274 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3275 {
3276     BlockDriverAIOCBCoroutine *acb = opaque;
3277     BlockDriverState *bs = acb->common.bs;
3278 
3279     acb->req.error = bdrv_co_flush(bs);
3280     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3281     qemu_bh_schedule(acb->bh);
3282 }
3283 
3284 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3285         BlockDriverCompletionFunc *cb, void *opaque)
3286 {
3287     trace_bdrv_aio_flush(bs, opaque);
3288 
3289     Coroutine *co;
3290     BlockDriverAIOCBCoroutine *acb;
3291 
3292     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3293     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3294     qemu_coroutine_enter(co, acb);
3295 
3296     return &acb->common;
3297 }
3298 
3299 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3300 {
3301     BlockDriverAIOCBCoroutine *acb = opaque;
3302     BlockDriverState *bs = acb->common.bs;
3303 
3304     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3305     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3306     qemu_bh_schedule(acb->bh);
3307 }
3308 
3309 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3310         int64_t sector_num, int nb_sectors,
3311         BlockDriverCompletionFunc *cb, void *opaque)
3312 {
3313     Coroutine *co;
3314     BlockDriverAIOCBCoroutine *acb;
3315 
3316     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3317 
3318     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3319     acb->req.sector = sector_num;
3320     acb->req.nb_sectors = nb_sectors;
3321     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3322     qemu_coroutine_enter(co, acb);
3323 
3324     return &acb->common;
3325 }
3326 
3327 void bdrv_init(void)
3328 {
3329     module_call_init(MODULE_INIT_BLOCK);
3330 }
3331 
3332 void bdrv_init_with_whitelist(void)
3333 {
3334     use_bdrv_whitelist = 1;
3335     bdrv_init();
3336 }
3337 
3338 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3339                    BlockDriverCompletionFunc *cb, void *opaque)
3340 {
3341     BlockDriverAIOCB *acb;
3342 
3343     if (pool->free_aiocb) {
3344         acb = pool->free_aiocb;
3345         pool->free_aiocb = acb->next;
3346     } else {
3347         acb = g_malloc0(pool->aiocb_size);
3348         acb->pool = pool;
3349     }
3350     acb->bs = bs;
3351     acb->cb = cb;
3352     acb->opaque = opaque;
3353     return acb;
3354 }
3355 
3356 void qemu_aio_release(void *p)
3357 {
3358     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3359     AIOPool *pool = acb->pool;
3360     acb->next = pool->free_aiocb;
3361     pool->free_aiocb = acb;
3362 }
3363 
3364 /**************************************************************/
3365 /* Coroutine block device emulation */
3366 
3367 typedef struct CoroutineIOCompletion {
3368     Coroutine *coroutine;
3369     int ret;
3370 } CoroutineIOCompletion;
3371 
3372 static void bdrv_co_io_em_complete(void *opaque, int ret)
3373 {
3374     CoroutineIOCompletion *co = opaque;
3375 
3376     co->ret = ret;
3377     qemu_coroutine_enter(co->coroutine, NULL);
3378 }
3379 
3380 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3381                                       int nb_sectors, QEMUIOVector *iov,
3382                                       bool is_write)
3383 {
3384     CoroutineIOCompletion co = {
3385         .coroutine = qemu_coroutine_self(),
3386     };
3387     BlockDriverAIOCB *acb;
3388 
3389     if (is_write) {
3390         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3391                                        bdrv_co_io_em_complete, &co);
3392     } else {
3393         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3394                                       bdrv_co_io_em_complete, &co);
3395     }
3396 
3397     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3398     if (!acb) {
3399         return -EIO;
3400     }
3401     qemu_coroutine_yield();
3402 
3403     return co.ret;
3404 }
3405 
3406 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3407                                          int64_t sector_num, int nb_sectors,
3408                                          QEMUIOVector *iov)
3409 {
3410     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3411 }
3412 
3413 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3414                                          int64_t sector_num, int nb_sectors,
3415                                          QEMUIOVector *iov)
3416 {
3417     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3418 }
3419 
3420 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3421 {
3422     RwCo *rwco = opaque;
3423 
3424     rwco->ret = bdrv_co_flush(rwco->bs);
3425 }
3426 
3427 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3428 {
3429     int ret;
3430 
3431     if (!bs->drv) {
3432         return 0;
3433     }
3434 
3435     /* Write back cached data to the OS even with cache=unsafe */
3436     if (bs->drv->bdrv_co_flush_to_os) {
3437         ret = bs->drv->bdrv_co_flush_to_os(bs);
3438         if (ret < 0) {
3439             return ret;
3440         }
3441     }
3442 
3443     /* But don't actually force it to the disk with cache=unsafe */
3444     if (bs->open_flags & BDRV_O_NO_FLUSH) {
3445         return 0;
3446     }
3447 
3448     if (bs->drv->bdrv_co_flush_to_disk) {
3449         return bs->drv->bdrv_co_flush_to_disk(bs);
3450     } else if (bs->drv->bdrv_aio_flush) {
3451         BlockDriverAIOCB *acb;
3452         CoroutineIOCompletion co = {
3453             .coroutine = qemu_coroutine_self(),
3454         };
3455 
3456         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3457         if (acb == NULL) {
3458             return -EIO;
3459         } else {
3460             qemu_coroutine_yield();
3461             return co.ret;
3462         }
3463     } else {
3464         /*
3465          * Some block drivers always operate in either writethrough or unsafe
3466          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3467          * know how the server works (because the behaviour is hardcoded or
3468          * depends on server-side configuration), so we can't ensure that
3469          * everything is safe on disk. Returning an error doesn't work because
3470          * that would break guests even if the server operates in writethrough
3471          * mode.
3472          *
3473          * Let's hope the user knows what he's doing.
3474          */
3475         return 0;
3476     }
3477 }
3478 
3479 void bdrv_invalidate_cache(BlockDriverState *bs)
3480 {
3481     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3482         bs->drv->bdrv_invalidate_cache(bs);
3483     }
3484 }
3485 
3486 void bdrv_invalidate_cache_all(void)
3487 {
3488     BlockDriverState *bs;
3489 
3490     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3491         bdrv_invalidate_cache(bs);
3492     }
3493 }
3494 
3495 int bdrv_flush(BlockDriverState *bs)
3496 {
3497     Coroutine *co;
3498     RwCo rwco = {
3499         .bs = bs,
3500         .ret = NOT_DONE,
3501     };
3502 
3503     if (qemu_in_coroutine()) {
3504         /* Fast-path if already in coroutine context */
3505         bdrv_flush_co_entry(&rwco);
3506     } else {
3507         co = qemu_coroutine_create(bdrv_flush_co_entry);
3508         qemu_coroutine_enter(co, &rwco);
3509         while (rwco.ret == NOT_DONE) {
3510             qemu_aio_wait();
3511         }
3512     }
3513 
3514     return rwco.ret;
3515 }
3516 
3517 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3518 {
3519     RwCo *rwco = opaque;
3520 
3521     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3522 }
3523 
3524 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3525                                  int nb_sectors)
3526 {
3527     if (!bs->drv) {
3528         return -ENOMEDIUM;
3529     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3530         return -EIO;
3531     } else if (bs->read_only) {
3532         return -EROFS;
3533     } else if (bs->drv->bdrv_co_discard) {
3534         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3535     } else if (bs->drv->bdrv_aio_discard) {
3536         BlockDriverAIOCB *acb;
3537         CoroutineIOCompletion co = {
3538             .coroutine = qemu_coroutine_self(),
3539         };
3540 
3541         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3542                                         bdrv_co_io_em_complete, &co);
3543         if (acb == NULL) {
3544             return -EIO;
3545         } else {
3546             qemu_coroutine_yield();
3547             return co.ret;
3548         }
3549     } else {
3550         return 0;
3551     }
3552 }
3553 
3554 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3555 {
3556     Coroutine *co;
3557     RwCo rwco = {
3558         .bs = bs,
3559         .sector_num = sector_num,
3560         .nb_sectors = nb_sectors,
3561         .ret = NOT_DONE,
3562     };
3563 
3564     if (qemu_in_coroutine()) {
3565         /* Fast-path if already in coroutine context */
3566         bdrv_discard_co_entry(&rwco);
3567     } else {
3568         co = qemu_coroutine_create(bdrv_discard_co_entry);
3569         qemu_coroutine_enter(co, &rwco);
3570         while (rwco.ret == NOT_DONE) {
3571             qemu_aio_wait();
3572         }
3573     }
3574 
3575     return rwco.ret;
3576 }
3577 
3578 /**************************************************************/
3579 /* removable device support */
3580 
3581 /**
3582  * Return TRUE if the media is present
3583  */
3584 int bdrv_is_inserted(BlockDriverState *bs)
3585 {
3586     BlockDriver *drv = bs->drv;
3587 
3588     if (!drv)
3589         return 0;
3590     if (!drv->bdrv_is_inserted)
3591         return 1;
3592     return drv->bdrv_is_inserted(bs);
3593 }
3594 
3595 /**
3596  * Return whether the media changed since the last call to this
3597  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3598  */
3599 int bdrv_media_changed(BlockDriverState *bs)
3600 {
3601     BlockDriver *drv = bs->drv;
3602 
3603     if (drv && drv->bdrv_media_changed) {
3604         return drv->bdrv_media_changed(bs);
3605     }
3606     return -ENOTSUP;
3607 }
3608 
3609 /**
3610  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3611  */
3612 void bdrv_eject(BlockDriverState *bs, int eject_flag)
3613 {
3614     BlockDriver *drv = bs->drv;
3615 
3616     if (drv && drv->bdrv_eject) {
3617         drv->bdrv_eject(bs, eject_flag);
3618     }
3619 }
3620 
3621 /**
3622  * Lock or unlock the media (if it is locked, the user won't be able
3623  * to eject it manually).
3624  */
3625 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3626 {
3627     BlockDriver *drv = bs->drv;
3628 
3629     trace_bdrv_lock_medium(bs, locked);
3630 
3631     if (drv && drv->bdrv_lock_medium) {
3632         drv->bdrv_lock_medium(bs, locked);
3633     }
3634 }
3635 
3636 /* needed for generic scsi interface */
3637 
3638 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3639 {
3640     BlockDriver *drv = bs->drv;
3641 
3642     if (drv && drv->bdrv_ioctl)
3643         return drv->bdrv_ioctl(bs, req, buf);
3644     return -ENOTSUP;
3645 }
3646 
3647 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3648         unsigned long int req, void *buf,
3649         BlockDriverCompletionFunc *cb, void *opaque)
3650 {
3651     BlockDriver *drv = bs->drv;
3652 
3653     if (drv && drv->bdrv_aio_ioctl)
3654         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3655     return NULL;
3656 }
3657 
3658 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3659 {
3660     bs->buffer_alignment = align;
3661 }
3662 
3663 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3664 {
3665     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3666 }
3667 
3668 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3669 {
3670     int64_t bitmap_size;
3671 
3672     bs->dirty_count = 0;
3673     if (enable) {
3674         if (!bs->dirty_bitmap) {
3675             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3676                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3677             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3678 
3679             bs->dirty_bitmap = g_malloc0(bitmap_size);
3680         }
3681     } else {
3682         if (bs->dirty_bitmap) {
3683             g_free(bs->dirty_bitmap);
3684             bs->dirty_bitmap = NULL;
3685         }
3686     }
3687 }
3688 
3689 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3690 {
3691     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3692 
3693     if (bs->dirty_bitmap &&
3694         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3695         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3696             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3697     } else {
3698         return 0;
3699     }
3700 }
3701 
3702 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3703                       int nr_sectors)
3704 {
3705     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3706 }
3707 
3708 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3709 {
3710     return bs->dirty_count;
3711 }
3712 
3713 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3714 {
3715     assert(bs->in_use != in_use);
3716     bs->in_use = in_use;
3717 }
3718 
3719 int bdrv_in_use(BlockDriverState *bs)
3720 {
3721     return bs->in_use;
3722 }
3723 
3724 void bdrv_iostatus_enable(BlockDriverState *bs)
3725 {
3726     bs->iostatus_enabled = true;
3727     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3728 }
3729 
3730 /* The I/O status is only enabled if the drive explicitly
3731  * enables it _and_ the VM is configured to stop on errors */
3732 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3733 {
3734     return (bs->iostatus_enabled &&
3735            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3736             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3737             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3738 }
3739 
3740 void bdrv_iostatus_disable(BlockDriverState *bs)
3741 {
3742     bs->iostatus_enabled = false;
3743 }
3744 
3745 void bdrv_iostatus_reset(BlockDriverState *bs)
3746 {
3747     if (bdrv_iostatus_is_enabled(bs)) {
3748         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3749     }
3750 }
3751 
3752 /* XXX: Today this is set by device models because it makes the implementation
3753    quite simple. However, the block layer knows about the error, so it's
3754    possible to implement this without device models being involved */
3755 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3756 {
3757     if (bdrv_iostatus_is_enabled(bs) &&
3758         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3759         assert(error >= 0);
3760         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3761                                          BLOCK_DEVICE_IO_STATUS_FAILED;
3762     }
3763 }
3764 
3765 void
3766 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3767         enum BlockAcctType type)
3768 {
3769     assert(type < BDRV_MAX_IOTYPE);
3770 
3771     cookie->bytes = bytes;
3772     cookie->start_time_ns = get_clock();
3773     cookie->type = type;
3774 }
3775 
3776 void
3777 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3778 {
3779     assert(cookie->type < BDRV_MAX_IOTYPE);
3780 
3781     bs->nr_bytes[cookie->type] += cookie->bytes;
3782     bs->nr_ops[cookie->type]++;
3783     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3784 }
3785 
3786 int bdrv_img_create(const char *filename, const char *fmt,
3787                     const char *base_filename, const char *base_fmt,
3788                     char *options, uint64_t img_size, int flags)
3789 {
3790     QEMUOptionParameter *param = NULL, *create_options = NULL;
3791     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3792     BlockDriverState *bs = NULL;
3793     BlockDriver *drv, *proto_drv;
3794     BlockDriver *backing_drv = NULL;
3795     int ret = 0;
3796 
3797     /* Find driver and parse its options */
3798     drv = bdrv_find_format(fmt);
3799     if (!drv) {
3800         error_report("Unknown file format '%s'", fmt);
3801         ret = -EINVAL;
3802         goto out;
3803     }
3804 
3805     proto_drv = bdrv_find_protocol(filename);
3806     if (!proto_drv) {
3807         error_report("Unknown protocol '%s'", filename);
3808         ret = -EINVAL;
3809         goto out;
3810     }
3811 
3812     create_options = append_option_parameters(create_options,
3813                                               drv->create_options);
3814     create_options = append_option_parameters(create_options,
3815                                               proto_drv->create_options);
3816 
3817     /* Create parameter list with default values */
3818     param = parse_option_parameters("", create_options, param);
3819 
3820     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3821 
3822     /* Parse -o options */
3823     if (options) {
3824         param = parse_option_parameters(options, create_options, param);
3825         if (param == NULL) {
3826             error_report("Invalid options for file format '%s'.", fmt);
3827             ret = -EINVAL;
3828             goto out;
3829         }
3830     }
3831 
3832     if (base_filename) {
3833         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3834                                  base_filename)) {
3835             error_report("Backing file not supported for file format '%s'",
3836                          fmt);
3837             ret = -EINVAL;
3838             goto out;
3839         }
3840     }
3841 
3842     if (base_fmt) {
3843         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3844             error_report("Backing file format not supported for file "
3845                          "format '%s'", fmt);
3846             ret = -EINVAL;
3847             goto out;
3848         }
3849     }
3850 
3851     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3852     if (backing_file && backing_file->value.s) {
3853         if (!strcmp(filename, backing_file->value.s)) {
3854             error_report("Error: Trying to create an image with the "
3855                          "same filename as the backing file");
3856             ret = -EINVAL;
3857             goto out;
3858         }
3859     }
3860 
3861     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3862     if (backing_fmt && backing_fmt->value.s) {
3863         backing_drv = bdrv_find_format(backing_fmt->value.s);
3864         if (!backing_drv) {
3865             error_report("Unknown backing file format '%s'",
3866                          backing_fmt->value.s);
3867             ret = -EINVAL;
3868             goto out;
3869         }
3870     }
3871 
3872     // The size for the image must always be specified, with one exception:
3873     // If we are using a backing file, we can obtain the size from there
3874     size = get_option_parameter(param, BLOCK_OPT_SIZE);
3875     if (size && size->value.n == -1) {
3876         if (backing_file && backing_file->value.s) {
3877             uint64_t size;
3878             char buf[32];
3879 
3880             bs = bdrv_new("");
3881 
3882             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3883             if (ret < 0) {
3884                 error_report("Could not open '%s'", backing_file->value.s);
3885                 goto out;
3886             }
3887             bdrv_get_geometry(bs, &size);
3888             size *= 512;
3889 
3890             snprintf(buf, sizeof(buf), "%" PRId64, size);
3891             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3892         } else {
3893             error_report("Image creation needs a size parameter");
3894             ret = -EINVAL;
3895             goto out;
3896         }
3897     }
3898 
3899     printf("Formatting '%s', fmt=%s ", filename, fmt);
3900     print_option_parameters(param);
3901     puts("");
3902 
3903     ret = bdrv_create(drv, filename, param);
3904 
3905     if (ret < 0) {
3906         if (ret == -ENOTSUP) {
3907             error_report("Formatting or formatting option not supported for "
3908                          "file format '%s'", fmt);
3909         } else if (ret == -EFBIG) {
3910             error_report("The image size is too large for file format '%s'",
3911                          fmt);
3912         } else {
3913             error_report("%s: error while creating %s: %s", filename, fmt,
3914                          strerror(-ret));
3915         }
3916     }
3917 
3918 out:
3919     free_option_parameters(create_options);
3920     free_option_parameters(param);
3921 
3922     if (bs) {
3923         bdrv_delete(bs);
3924     }
3925 
3926     return ret;
3927 }
3928 
3929 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
3930                        BlockDriverCompletionFunc *cb, void *opaque)
3931 {
3932     BlockJob *job;
3933 
3934     if (bs->job || bdrv_in_use(bs)) {
3935         return NULL;
3936     }
3937     bdrv_set_in_use(bs, 1);
3938 
3939     job = g_malloc0(job_type->instance_size);
3940     job->job_type      = job_type;
3941     job->bs            = bs;
3942     job->cb            = cb;
3943     job->opaque        = opaque;
3944     bs->job = job;
3945     return job;
3946 }
3947 
3948 void block_job_complete(BlockJob *job, int ret)
3949 {
3950     BlockDriverState *bs = job->bs;
3951 
3952     assert(bs->job == job);
3953     job->cb(job->opaque, ret);
3954     bs->job = NULL;
3955     g_free(job);
3956     bdrv_set_in_use(bs, 0);
3957 }
3958 
3959 int block_job_set_speed(BlockJob *job, int64_t value)
3960 {
3961     if (!job->job_type->set_speed) {
3962         return -ENOTSUP;
3963     }
3964     return job->job_type->set_speed(job, value);
3965 }
3966 
3967 void block_job_cancel(BlockJob *job)
3968 {
3969     job->cancelled = true;
3970 }
3971 
3972 bool block_job_is_cancelled(BlockJob *job)
3973 {
3974     return job->cancelled;
3975 }
3976