xref: /openbmc/qemu/block.c (revision 05c4af54c670f5143bd4ac5d79aa1ef53a9f31ca)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34 
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44 
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48 
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50 
51 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
52 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
53         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
54         BlockDriverCompletionFunc *cb, void *opaque);
55 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
56         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
57         BlockDriverCompletionFunc *cb, void *opaque);
58 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
59                                          int64_t sector_num, int nb_sectors,
60                                          QEMUIOVector *iov);
61 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
62                                          int64_t sector_num, int nb_sectors,
63                                          QEMUIOVector *iov);
64 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
65     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
66 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
67     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
68 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
69                                                int64_t sector_num,
70                                                QEMUIOVector *qiov,
71                                                int nb_sectors,
72                                                BlockDriverCompletionFunc *cb,
73                                                void *opaque,
74                                                bool is_write);
75 static void coroutine_fn bdrv_co_do_rw(void *opaque);
76 
77 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
78         bool is_write, double elapsed_time, uint64_t *wait);
79 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
80         double elapsed_time, uint64_t *wait);
81 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
82         bool is_write, int64_t *wait);
83 
84 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
85     QTAILQ_HEAD_INITIALIZER(bdrv_states);
86 
87 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
88     QLIST_HEAD_INITIALIZER(bdrv_drivers);
89 
90 /* The device to use for VM snapshots */
91 static BlockDriverState *bs_snapshots;
92 
93 /* If non-zero, use only whitelisted block drivers */
94 static int use_bdrv_whitelist;
95 
96 #ifdef _WIN32
/*
 * Return 1 when @filename begins with an ASCII letter immediately followed
 * by ':' (for example "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int lower = (letter >= 'a' && letter <= 'z');
    const int upper = (letter >= 'A' && letter <= 'Z');

    if (!lower && !upper) {
        return 0;
    }
    return filename[1] == ':';
}
103 
/*
 * Return 1 when @filename is either a bare drive specification ("x:" and
 * nothing else) or starts with the "\\.\" / "//./" prefix, 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    int bare_drive = is_windows_drive_prefix(filename) && filename[2] == '\0';

    if (bare_drive) {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    return strstart(filename, "//./", NULL) ? 1 : 0;
}
114 #endif
115 
116 /* throttling disk I/O limits */
117 void bdrv_io_limits_disable(BlockDriverState *bs)
118 {
119     bs->io_limits_enabled = false;
120 
121     while (qemu_co_queue_next(&bs->throttled_reqs));
122 
123     if (bs->block_timer) {
124         qemu_del_timer(bs->block_timer);
125         qemu_free_timer(bs->block_timer);
126         bs->block_timer = NULL;
127     }
128 
129     bs->slice_start = 0;
130     bs->slice_end   = 0;
131     bs->slice_time  = 0;
132     memset(&bs->io_base, 0, sizeof(bs->io_base));
133 }
134 
135 static void bdrv_block_timer(void *opaque)
136 {
137     BlockDriverState *bs = opaque;
138 
139     qemu_co_queue_next(&bs->throttled_reqs);
140 }
141 
142 void bdrv_io_limits_enable(BlockDriverState *bs)
143 {
144     qemu_co_queue_init(&bs->throttled_reqs);
145     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
146     bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
147     bs->slice_start = qemu_get_clock_ns(vm_clock);
148     bs->slice_end   = bs->slice_start + bs->slice_time;
149     memset(&bs->io_base, 0, sizeof(bs->io_base));
150     bs->io_limits_enabled = true;
151 }
152 
153 bool bdrv_io_limits_enabled(BlockDriverState *bs)
154 {
155     BlockIOLimit *io_limits = &bs->io_limits;
156     return io_limits->bps[BLOCK_IO_LIMIT_READ]
157          || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
158          || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
159          || io_limits->iops[BLOCK_IO_LIMIT_READ]
160          || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
161          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
162 }
163 
164 static void bdrv_io_limits_intercept(BlockDriverState *bs,
165                                      bool is_write, int nb_sectors)
166 {
167     int64_t wait_time = -1;
168 
169     if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
170         qemu_co_queue_wait(&bs->throttled_reqs);
171     }
172 
173     /* In fact, we hope to keep each request's timing, in FIFO mode. The next
174      * throttled requests will not be dequeued until the current request is
175      * allowed to be serviced. So if the current request still exceeds the
176      * limits, it will be inserted to the head. All requests followed it will
177      * be still in throttled_reqs queue.
178      */
179 
180     while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
181         qemu_mod_timer(bs->block_timer,
182                        wait_time + qemu_get_clock_ns(vm_clock));
183         qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
184     }
185 
186     qemu_co_queue_next(&bs->throttled_reqs);
187 }
188 
/*
 * Return 1 if @path starts with "<protocol>:", i.e. contains a ':' anywhere,
 * 0 otherwise. On Windows, drive specifications are never treated as a
 * protocol.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path)) {
        return 0;
    }
    if (is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') ? 1 : 0;
}
201 
/*
 * Return non-zero if @path is absolute. Everything up to and including the
 * first ':' (if any) is skipped before looking for a leading '/' (or, on
 * Windows, a leading '\').
 */
int path_is_absolute(const char *path)
{
    const char *colon;
    const char *start;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    start = colon ? colon + 1 : path;

#ifdef _WIN32
    return (*start == '/' || *start == '\\');
#else
    return (*start == '/');
#endif
}
221 
/*
 * Build in @dest (at most @dest_size bytes, always NUL-terminated when
 * @dest_size > 0) the location of @filename relative to @base_path.
 *
 * An absolute @filename is copied as-is. Otherwise the directory part of
 * @base_path (everything up to the last path separator, or up to the
 * "<protocol>:" prefix if that ends later) is kept and @filename is
 * appended to it. URLs are supported.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto, *after_sep, *dir_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* First character following an optional "<protocol>:" prefix */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* First character following the last path separator */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');

        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* Keep whichever prefix reaches further into base_path */
    dir_end = (after_sep > after_proto) ? after_sep : after_proto;

    prefix_len = dir_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
265 
266 void bdrv_register(BlockDriver *bdrv)
267 {
268     /* Block drivers without coroutine functions need emulation */
269     if (!bdrv->bdrv_co_readv) {
270         bdrv->bdrv_co_readv = bdrv_co_readv_em;
271         bdrv->bdrv_co_writev = bdrv_co_writev_em;
272 
273         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
274          * the block driver lacks aio we need to emulate that too.
275          */
276         if (!bdrv->bdrv_aio_readv) {
277             /* add AIO emulation layer */
278             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
279             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
280         }
281     }
282 
283     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
284 }
285 
286 /* create a new block device (by default it is empty) */
287 BlockDriverState *bdrv_new(const char *device_name)
288 {
289     BlockDriverState *bs;
290 
291     bs = g_malloc0(sizeof(BlockDriverState));
292     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
293     if (device_name[0] != '\0') {
294         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
295     }
296     bdrv_iostatus_disable(bs);
297     return bs;
298 }
299 
300 BlockDriver *bdrv_find_format(const char *format_name)
301 {
302     BlockDriver *drv1;
303     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
304         if (!strcmp(drv1->format_name, format_name)) {
305             return drv1;
306         }
307     }
308     return NULL;
309 }
310 
311 static int bdrv_is_whitelisted(BlockDriver *drv)
312 {
313     static const char *whitelist[] = {
314         CONFIG_BDRV_WHITELIST
315     };
316     const char **p;
317 
318     if (!whitelist[0])
319         return 1;               /* no whitelist, anything goes */
320 
321     for (p = whitelist; *p; p++) {
322         if (!strcmp(drv->format_name, *p)) {
323             return 1;
324         }
325     }
326     return 0;
327 }
328 
329 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
330 {
331     BlockDriver *drv = bdrv_find_format(format_name);
332     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
333 }
334 
335 int bdrv_create(BlockDriver *drv, const char* filename,
336     QEMUOptionParameter *options)
337 {
338     if (!drv->bdrv_create)
339         return -ENOTSUP;
340 
341     return drv->bdrv_create(filename, options);
342 }
343 
344 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
345 {
346     BlockDriver *drv;
347 
348     drv = bdrv_find_protocol(filename);
349     if (drv == NULL) {
350         return -ENOENT;
351     }
352 
353     return bdrv_create(drv, filename, options);
354 }
355 
356 #ifdef _WIN32
357 void get_tmp_filename(char *filename, int size)
358 {
359     char temp_dir[MAX_PATH];
360 
361     GetTempPath(MAX_PATH, temp_dir);
362     GetTempFileName(temp_dir, "qem", 0, filename);
363 }
364 #else
/*
 * Create a uniquely named temporary file below $TMPDIR (default "/tmp") and
 * store its name in @filename.
 *
 * @filename: output buffer
 * @size:     size of @filename in bytes
 *
 * The file is created by mkstemp() and closed again right away; only the
 * name is handed back. If the template does not fit into @filename or
 * mkstemp() fails, @filename is set to the empty string so that callers do
 * not fall back to the predictable, unexpanded template name.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd, len;
    const char *tmpdir;

    if (size <= 0) {
        return;
    }

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }

    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        /* a truncated template would not end in "XXXXXX" any more */
        filename[0] = '\0';
        return;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        filename[0] = '\0';
        return;
    }
    close(fd);
}
377 #endif
378 
379 /*
380  * Detect host devices. By convention, /dev/cdrom[N] is always
381  * recognized as a host CDROM.
382  */
383 static BlockDriver *find_hdev_driver(const char *filename)
384 {
385     int score_max = 0, score;
386     BlockDriver *drv = NULL, *d;
387 
388     QLIST_FOREACH(d, &bdrv_drivers, list) {
389         if (d->bdrv_probe_device) {
390             score = d->bdrv_probe_device(filename);
391             if (score > score_max) {
392                 score_max = score;
393                 drv = d;
394             }
395         }
396     }
397 
398     return drv;
399 }
400 
401 BlockDriver *bdrv_find_protocol(const char *filename)
402 {
403     BlockDriver *drv1;
404     char protocol[128];
405     int len;
406     const char *p;
407 
408     /* TODO Drivers without bdrv_file_open must be specified explicitly */
409 
410     /*
411      * XXX(hch): we really should not let host device detection
412      * override an explicit protocol specification, but moving this
413      * later breaks access to device names with colons in them.
414      * Thanks to the brain-dead persistent naming schemes on udev-
415      * based Linux systems those actually are quite common.
416      */
417     drv1 = find_hdev_driver(filename);
418     if (drv1) {
419         return drv1;
420     }
421 
422     if (!path_has_protocol(filename)) {
423         return bdrv_find_format("file");
424     }
425     p = strchr(filename, ':');
426     assert(p != NULL);
427     len = p - filename;
428     if (len > sizeof(protocol) - 1)
429         len = sizeof(protocol) - 1;
430     memcpy(protocol, filename, len);
431     protocol[len] = '\0';
432     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
433         if (drv1->protocol_name &&
434             !strcmp(drv1->protocol_name, protocol)) {
435             return drv1;
436         }
437     }
438     return NULL;
439 }
440 
441 static int find_image_format(const char *filename, BlockDriver **pdrv)
442 {
443     int ret, score, score_max;
444     BlockDriver *drv1, *drv;
445     uint8_t buf[2048];
446     BlockDriverState *bs;
447 
448     ret = bdrv_file_open(&bs, filename, 0);
449     if (ret < 0) {
450         *pdrv = NULL;
451         return ret;
452     }
453 
454     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
455     if (bs->sg || !bdrv_is_inserted(bs)) {
456         bdrv_delete(bs);
457         drv = bdrv_find_format("raw");
458         if (!drv) {
459             ret = -ENOENT;
460         }
461         *pdrv = drv;
462         return ret;
463     }
464 
465     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
466     bdrv_delete(bs);
467     if (ret < 0) {
468         *pdrv = NULL;
469         return ret;
470     }
471 
472     score_max = 0;
473     drv = NULL;
474     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
475         if (drv1->bdrv_probe) {
476             score = drv1->bdrv_probe(buf, ret, filename);
477             if (score > score_max) {
478                 score_max = score;
479                 drv = drv1;
480             }
481         }
482     }
483     if (!drv) {
484         ret = -ENOENT;
485     }
486     *pdrv = drv;
487     return ret;
488 }
489 
490 /**
491  * Set the current 'total_sectors' value
492  */
493 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
494 {
495     BlockDriver *drv = bs->drv;
496 
497     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
498     if (bs->sg)
499         return 0;
500 
501     /* query actual device if possible, otherwise just trust the hint */
502     if (drv->bdrv_getlength) {
503         int64_t length = drv->bdrv_getlength(bs);
504         if (length < 0) {
505             return length;
506         }
507         hint = length >> BDRV_SECTOR_BITS;
508     }
509 
510     bs->total_sectors = hint;
511     return 0;
512 }
513 
514 /**
515  * Set open flags for a given cache mode
516  *
517  * Return 0 on success, -1 if the cache mode was invalid.
518  */
519 int bdrv_parse_cache_flags(const char *mode, int *flags)
520 {
521     *flags &= ~BDRV_O_CACHE_MASK;
522 
523     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
524         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
525     } else if (!strcmp(mode, "directsync")) {
526         *flags |= BDRV_O_NOCACHE;
527     } else if (!strcmp(mode, "writeback")) {
528         *flags |= BDRV_O_CACHE_WB;
529     } else if (!strcmp(mode, "unsafe")) {
530         *flags |= BDRV_O_CACHE_WB;
531         *flags |= BDRV_O_NO_FLUSH;
532     } else if (!strcmp(mode, "writethrough")) {
533         /* this is the default */
534     } else {
535         return -1;
536     }
537 
538     return 0;
539 }
540 
541 /*
542  * Common part for opening disk images and files
543  */
544 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
545     int flags, BlockDriver *drv)
546 {
547     int ret, open_flags;
548 
549     assert(drv != NULL);
550 
551     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
552 
553     bs->file = NULL;
554     bs->total_sectors = 0;
555     bs->encrypted = 0;
556     bs->valid_key = 0;
557     bs->sg = 0;
558     bs->open_flags = flags;
559     bs->growable = 0;
560     bs->buffer_alignment = 512;
561 
562     pstrcpy(bs->filename, sizeof(bs->filename), filename);
563     bs->backing_file[0] = '\0';
564 
565     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
566         return -ENOTSUP;
567     }
568 
569     bs->drv = drv;
570     bs->opaque = g_malloc0(drv->instance_size);
571 
572     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
573 
574     /*
575      * Clear flags that are internal to the block layer before opening the
576      * image.
577      */
578     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
579 
580     /*
581      * Snapshots should be writable.
582      */
583     if (bs->is_temporary) {
584         open_flags |= BDRV_O_RDWR;
585     }
586 
587     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
588 
589     /* Open the image, either directly or using a protocol */
590     if (drv->bdrv_file_open) {
591         ret = drv->bdrv_file_open(bs, filename, open_flags);
592     } else {
593         ret = bdrv_file_open(&bs->file, filename, open_flags);
594         if (ret >= 0) {
595             ret = drv->bdrv_open(bs, open_flags);
596         }
597     }
598 
599     if (ret < 0) {
600         goto free_and_fail;
601     }
602 
603     ret = refresh_total_sectors(bs, bs->total_sectors);
604     if (ret < 0) {
605         goto free_and_fail;
606     }
607 
608 #ifndef _WIN32
609     if (bs->is_temporary) {
610         unlink(filename);
611     }
612 #endif
613     return 0;
614 
615 free_and_fail:
616     if (bs->file) {
617         bdrv_delete(bs->file);
618         bs->file = NULL;
619     }
620     g_free(bs->opaque);
621     bs->opaque = NULL;
622     bs->drv = NULL;
623     return ret;
624 }
625 
626 /*
627  * Opens a file using a protocol (file, host_device, nbd, ...)
628  */
629 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
630 {
631     BlockDriverState *bs;
632     BlockDriver *drv;
633     int ret;
634 
635     drv = bdrv_find_protocol(filename);
636     if (!drv) {
637         return -ENOENT;
638     }
639 
640     bs = bdrv_new("");
641     ret = bdrv_open_common(bs, filename, flags, drv);
642     if (ret < 0) {
643         bdrv_delete(bs);
644         return ret;
645     }
646     bs->growable = 1;
647     *pbs = bs;
648     return 0;
649 }
650 
651 /*
652  * Opens a disk image (raw, qcow2, vmdk, ...)
653  */
654 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
655               BlockDriver *drv)
656 {
657     int ret;
658     char tmp_filename[PATH_MAX];
659 
660     if (flags & BDRV_O_SNAPSHOT) {
661         BlockDriverState *bs1;
662         int64_t total_size;
663         int is_protocol = 0;
664         BlockDriver *bdrv_qcow2;
665         QEMUOptionParameter *options;
666         char backing_filename[PATH_MAX];
667 
668         /* if snapshot, we create a temporary backing file and open it
669            instead of opening 'filename' directly */
670 
671         /* if there is a backing file, use it */
672         bs1 = bdrv_new("");
673         ret = bdrv_open(bs1, filename, 0, drv);
674         if (ret < 0) {
675             bdrv_delete(bs1);
676             return ret;
677         }
678         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
679 
680         if (bs1->drv && bs1->drv->protocol_name)
681             is_protocol = 1;
682 
683         bdrv_delete(bs1);
684 
685         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
686 
687         /* Real path is meaningless for protocols */
688         if (is_protocol)
689             snprintf(backing_filename, sizeof(backing_filename),
690                      "%s", filename);
691         else if (!realpath(filename, backing_filename))
692             return -errno;
693 
694         bdrv_qcow2 = bdrv_find_format("qcow2");
695         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
696 
697         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
698         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
699         if (drv) {
700             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
701                 drv->format_name);
702         }
703 
704         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
705         free_option_parameters(options);
706         if (ret < 0) {
707             return ret;
708         }
709 
710         filename = tmp_filename;
711         drv = bdrv_qcow2;
712         bs->is_temporary = 1;
713     }
714 
715     /* Find the right image format driver */
716     if (!drv) {
717         ret = find_image_format(filename, &drv);
718     }
719 
720     if (!drv) {
721         goto unlink_and_fail;
722     }
723 
724     /* Open the image */
725     ret = bdrv_open_common(bs, filename, flags, drv);
726     if (ret < 0) {
727         goto unlink_and_fail;
728     }
729 
730     /* If there is a backing file, use it */
731     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
732         char backing_filename[PATH_MAX];
733         int back_flags;
734         BlockDriver *back_drv = NULL;
735 
736         bs->backing_hd = bdrv_new("");
737 
738         if (path_has_protocol(bs->backing_file)) {
739             pstrcpy(backing_filename, sizeof(backing_filename),
740                     bs->backing_file);
741         } else {
742             path_combine(backing_filename, sizeof(backing_filename),
743                          filename, bs->backing_file);
744         }
745 
746         if (bs->backing_format[0] != '\0') {
747             back_drv = bdrv_find_format(bs->backing_format);
748         }
749 
750         /* backing files always opened read-only */
751         back_flags =
752             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
753 
754         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
755         if (ret < 0) {
756             bdrv_close(bs);
757             return ret;
758         }
759         if (bs->is_temporary) {
760             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
761         } else {
762             /* base image inherits from "parent" */
763             bs->backing_hd->keep_read_only = bs->keep_read_only;
764         }
765     }
766 
767     if (!bdrv_key_required(bs)) {
768         bdrv_dev_change_media_cb(bs, true);
769     }
770 
771     /* throttling disk I/O limits */
772     if (bs->io_limits_enabled) {
773         bdrv_io_limits_enable(bs);
774     }
775 
776     return 0;
777 
778 unlink_and_fail:
779     if (bs->is_temporary) {
780         unlink(filename);
781     }
782     return ret;
783 }
784 
785 void bdrv_close(BlockDriverState *bs)
786 {
787     if (bs->drv) {
788         if (bs == bs_snapshots) {
789             bs_snapshots = NULL;
790         }
791         if (bs->backing_hd) {
792             bdrv_delete(bs->backing_hd);
793             bs->backing_hd = NULL;
794         }
795         bs->drv->bdrv_close(bs);
796         g_free(bs->opaque);
797 #ifdef _WIN32
798         if (bs->is_temporary) {
799             unlink(bs->filename);
800         }
801 #endif
802         bs->opaque = NULL;
803         bs->drv = NULL;
804 
805         if (bs->file != NULL) {
806             bdrv_close(bs->file);
807         }
808 
809         bdrv_dev_change_media_cb(bs, false);
810     }
811 
812     /*throttling disk I/O limits*/
813     if (bs->io_limits_enabled) {
814         bdrv_io_limits_disable(bs);
815     }
816 }
817 
818 void bdrv_close_all(void)
819 {
820     BlockDriverState *bs;
821 
822     QTAILQ_FOREACH(bs, &bdrv_states, list) {
823         bdrv_close(bs);
824     }
825 }
826 
827 /* make a BlockDriverState anonymous by removing from bdrv_state list.
828    Also, NULL terminate the device_name to prevent double remove */
829 void bdrv_make_anon(BlockDriverState *bs)
830 {
831     if (bs->device_name[0] != '\0') {
832         QTAILQ_REMOVE(&bdrv_states, bs, list);
833     }
834     bs->device_name[0] = '\0';
835 }
836 
837 void bdrv_delete(BlockDriverState *bs)
838 {
839     assert(!bs->dev);
840 
841     /* remove from list, if necessary */
842     bdrv_make_anon(bs);
843 
844     bdrv_close(bs);
845     if (bs->file != NULL) {
846         bdrv_delete(bs->file);
847     }
848 
849     assert(bs != bs_snapshots);
850     g_free(bs);
851 }
852 
853 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
854 /* TODO change to DeviceState *dev when all users are qdevified */
855 {
856     if (bs->dev) {
857         return -EBUSY;
858     }
859     bs->dev = dev;
860     bdrv_iostatus_reset(bs);
861     return 0;
862 }
863 
864 /* TODO qdevified devices don't use this, remove when devices are qdevified */
865 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
866 {
867     if (bdrv_attach_dev(bs, dev) < 0) {
868         abort();
869     }
870 }
871 
872 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
873 /* TODO change to DeviceState *dev when all users are qdevified */
874 {
875     assert(bs->dev == dev);
876     bs->dev = NULL;
877     bs->dev_ops = NULL;
878     bs->dev_opaque = NULL;
879     bs->buffer_alignment = 512;
880 }
881 
882 /* TODO change to return DeviceState * when all users are qdevified */
883 void *bdrv_get_attached_dev(BlockDriverState *bs)
884 {
885     return bs->dev;
886 }
887 
888 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
889                       void *opaque)
890 {
891     bs->dev_ops = ops;
892     bs->dev_opaque = opaque;
893     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
894         bs_snapshots = NULL;
895     }
896 }
897 
898 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
899 {
900     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
901         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
902     }
903 }
904 
905 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
906 {
907     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
908 }
909 
910 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
911 {
912     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
913         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
914     }
915 }
916 
917 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
918 {
919     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
920         return bs->dev_ops->is_tray_open(bs->dev_opaque);
921     }
922     return false;
923 }
924 
925 static void bdrv_dev_resize_cb(BlockDriverState *bs)
926 {
927     if (bs->dev_ops && bs->dev_ops->resize_cb) {
928         bs->dev_ops->resize_cb(bs->dev_opaque);
929     }
930 }
931 
932 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
933 {
934     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
935         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
936     }
937     return false;
938 }
939 
940 /*
941  * Run consistency checks on an image
942  *
943  * Returns 0 if the check could be completed (it doesn't mean that the image is
944  * free of errors) or -errno when an internal error occurred. The results of the
945  * check are stored in res.
946  */
947 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
948 {
949     if (bs->drv->bdrv_check == NULL) {
950         return -ENOTSUP;
951     }
952 
953     memset(res, 0, sizeof(*res));
954     return bs->drv->bdrv_check(bs, res);
955 }
956 
957 #define COMMIT_BUF_SECTORS 2048
958 
959 /* commit COW file into the raw image */
960 int bdrv_commit(BlockDriverState *bs)
961 {
962     BlockDriver *drv = bs->drv;
963     BlockDriver *backing_drv;
964     int64_t sector, total_sectors;
965     int n, ro, open_flags;
966     int ret = 0, rw_ret = 0;
967     uint8_t *buf;
968     char filename[1024];
969     BlockDriverState *bs_rw, *bs_ro;
970 
971     if (!drv)
972         return -ENOMEDIUM;
973 
974     if (!bs->backing_hd) {
975         return -ENOTSUP;
976     }
977 
978     if (bs->backing_hd->keep_read_only) {
979         return -EACCES;
980     }
981 
982     backing_drv = bs->backing_hd->drv;
983     ro = bs->backing_hd->read_only;
984     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
985     open_flags =  bs->backing_hd->open_flags;
986 
987     if (ro) {
988         /* re-open as RW */
989         bdrv_delete(bs->backing_hd);
990         bs->backing_hd = NULL;
991         bs_rw = bdrv_new("");
992         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
993             backing_drv);
994         if (rw_ret < 0) {
995             bdrv_delete(bs_rw);
996             /* try to re-open read-only */
997             bs_ro = bdrv_new("");
998             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
999                 backing_drv);
1000             if (ret < 0) {
1001                 bdrv_delete(bs_ro);
1002                 /* drive not functional anymore */
1003                 bs->drv = NULL;
1004                 return ret;
1005             }
1006             bs->backing_hd = bs_ro;
1007             return rw_ret;
1008         }
1009         bs->backing_hd = bs_rw;
1010     }
1011 
1012     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1013     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1014 
1015     for (sector = 0; sector < total_sectors; sector += n) {
1016         if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1017 
1018             if (bdrv_read(bs, sector, buf, n) != 0) {
1019                 ret = -EIO;
1020                 goto ro_cleanup;
1021             }
1022 
1023             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1024                 ret = -EIO;
1025                 goto ro_cleanup;
1026             }
1027         }
1028     }
1029 
1030     if (drv->bdrv_make_empty) {
1031         ret = drv->bdrv_make_empty(bs);
1032         bdrv_flush(bs);
1033     }
1034 
1035     /*
1036      * Make sure all data we wrote to the backing device is actually
1037      * stable on disk.
1038      */
1039     if (bs->backing_hd)
1040         bdrv_flush(bs->backing_hd);
1041 
1042 ro_cleanup:
1043     g_free(buf);
1044 
1045     if (ro) {
1046         /* re-open as RO */
1047         bdrv_delete(bs->backing_hd);
1048         bs->backing_hd = NULL;
1049         bs_ro = bdrv_new("");
1050         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1051             backing_drv);
1052         if (ret < 0) {
1053             bdrv_delete(bs_ro);
1054             /* drive not functional anymore */
1055             bs->drv = NULL;
1056             return ret;
1057         }
1058         bs->backing_hd = bs_ro;
1059         bs->backing_hd->keep_read_only = 0;
1060     }
1061 
1062     return ret;
1063 }
1064 
1065 void bdrv_commit_all(void)
1066 {
1067     BlockDriverState *bs;
1068 
1069     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1070         bdrv_commit(bs);
1071     }
1072 }
1073 
1074 /*
1075  * Return values:
1076  * 0        - success
1077  * -EINVAL  - backing format specified, but no file
1078  * -ENOSPC  - can't update the backing file because no space is left in the
1079  *            image file header
1080  * -ENOTSUP - format driver doesn't support changing the backing file
1081  */
1082 int bdrv_change_backing_file(BlockDriverState *bs,
1083     const char *backing_file, const char *backing_fmt)
1084 {
1085     BlockDriver *drv = bs->drv;
1086 
1087     if (drv->bdrv_change_backing_file != NULL) {
1088         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1089     } else {
1090         return -ENOTSUP;
1091     }
1092 }
1093 
1094 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1095                                    size_t size)
1096 {
1097     int64_t len;
1098 
1099     if (!bdrv_is_inserted(bs))
1100         return -ENOMEDIUM;
1101 
1102     if (bs->growable)
1103         return 0;
1104 
1105     len = bdrv_getlength(bs);
1106 
1107     if (offset < 0)
1108         return -EIO;
1109 
1110     if ((offset > len) || (len - offset < size))
1111         return -EIO;
1112 
1113     return 0;
1114 }
1115 
1116 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1117                               int nb_sectors)
1118 {
1119     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1120                                    nb_sectors * BDRV_SECTOR_SIZE);
1121 }
1122 
/*
 * Argument/result bundle handed to bdrv_rw_co_entry() when a synchronous
 * read or write is run through the coroutine request path.
 */
typedef struct RwCo {
    BlockDriverState *bs;   /* device the request is issued against */
    int64_t sector_num;     /* first sector of the request */
    int nb_sectors;         /* number of sectors to transfer */
    QEMUIOVector *qiov;     /* I/O vector describing the data buffer */
    bool is_write;          /* true: write request, false: read request */
    int ret;                /* request result; NOT_DONE until it completes */
} RwCo;
1131 
1132 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1133 {
1134     RwCo *rwco = opaque;
1135 
1136     if (!rwco->is_write) {
1137         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1138                                      rwco->nb_sectors, rwco->qiov);
1139     } else {
1140         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1141                                       rwco->nb_sectors, rwco->qiov);
1142     }
1143 }
1144 
1145 /*
1146  * Process a synchronous request using coroutines
1147  */
1148 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1149                       int nb_sectors, bool is_write)
1150 {
1151     QEMUIOVector qiov;
1152     struct iovec iov = {
1153         .iov_base = (void *)buf,
1154         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1155     };
1156     Coroutine *co;
1157     RwCo rwco = {
1158         .bs = bs,
1159         .sector_num = sector_num,
1160         .nb_sectors = nb_sectors,
1161         .qiov = &qiov,
1162         .is_write = is_write,
1163         .ret = NOT_DONE,
1164     };
1165 
1166     qemu_iovec_init_external(&qiov, &iov, 1);
1167 
1168     if (qemu_in_coroutine()) {
1169         /* Fast-path if already in coroutine context */
1170         bdrv_rw_co_entry(&rwco);
1171     } else {
1172         co = qemu_coroutine_create(bdrv_rw_co_entry);
1173         qemu_coroutine_enter(co, &rwco);
1174         while (rwco.ret == NOT_DONE) {
1175             qemu_aio_wait();
1176         }
1177     }
1178     return rwco.ret;
1179 }
1180 
1181 /* return < 0 if error. See bdrv_write() for the return codes */
1182 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1183               uint8_t *buf, int nb_sectors)
1184 {
1185     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1186 }
1187 
1188 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1189                              int nb_sectors, int dirty)
1190 {
1191     int64_t start, end;
1192     unsigned long val, idx, bit;
1193 
1194     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1195     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1196 
1197     for (; start <= end; start++) {
1198         idx = start / (sizeof(unsigned long) * 8);
1199         bit = start % (sizeof(unsigned long) * 8);
1200         val = bs->dirty_bitmap[idx];
1201         if (dirty) {
1202             if (!(val & (1UL << bit))) {
1203                 bs->dirty_count++;
1204                 val |= 1UL << bit;
1205             }
1206         } else {
1207             if (val & (1UL << bit)) {
1208                 bs->dirty_count--;
1209                 val &= ~(1UL << bit);
1210             }
1211         }
1212         bs->dirty_bitmap[idx] = val;
1213     }
1214 }
1215 
1216 /* Return < 0 if error. Important errors are:
1217   -EIO         generic I/O error (may happen for all errors)
1218   -ENOMEDIUM   No media inserted.
1219   -EINVAL      Invalid sector number or nb_sectors
1220   -EACCES      Trying to write a read-only device
1221 */
1222 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1223                const uint8_t *buf, int nb_sectors)
1224 {
1225     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1226 }
1227 
1228 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1229                void *buf, int count1)
1230 {
1231     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1232     int len, nb_sectors, count;
1233     int64_t sector_num;
1234     int ret;
1235 
1236     count = count1;
1237     /* first read to align to sector start */
1238     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1239     if (len > count)
1240         len = count;
1241     sector_num = offset >> BDRV_SECTOR_BITS;
1242     if (len > 0) {
1243         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1244             return ret;
1245         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1246         count -= len;
1247         if (count == 0)
1248             return count1;
1249         sector_num++;
1250         buf += len;
1251     }
1252 
1253     /* read the sectors "in place" */
1254     nb_sectors = count >> BDRV_SECTOR_BITS;
1255     if (nb_sectors > 0) {
1256         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1257             return ret;
1258         sector_num += nb_sectors;
1259         len = nb_sectors << BDRV_SECTOR_BITS;
1260         buf += len;
1261         count -= len;
1262     }
1263 
1264     /* add data from the last sector */
1265     if (count > 0) {
1266         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1267             return ret;
1268         memcpy(buf, tmp_buf, count);
1269     }
1270     return count1;
1271 }
1272 
1273 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1274                 const void *buf, int count1)
1275 {
1276     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1277     int len, nb_sectors, count;
1278     int64_t sector_num;
1279     int ret;
1280 
1281     count = count1;
1282     /* first write to align to sector start */
1283     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1284     if (len > count)
1285         len = count;
1286     sector_num = offset >> BDRV_SECTOR_BITS;
1287     if (len > 0) {
1288         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1289             return ret;
1290         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1291         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1292             return ret;
1293         count -= len;
1294         if (count == 0)
1295             return count1;
1296         sector_num++;
1297         buf += len;
1298     }
1299 
1300     /* write the sectors "in place" */
1301     nb_sectors = count >> BDRV_SECTOR_BITS;
1302     if (nb_sectors > 0) {
1303         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1304             return ret;
1305         sector_num += nb_sectors;
1306         len = nb_sectors << BDRV_SECTOR_BITS;
1307         buf += len;
1308         count -= len;
1309     }
1310 
1311     /* add data from the last sector */
1312     if (count > 0) {
1313         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1314             return ret;
1315         memcpy(tmp_buf, buf, count);
1316         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1317             return ret;
1318     }
1319     return count1;
1320 }
1321 
1322 /*
1323  * Writes to the file and ensures that no writes are reordered across this
1324  * request (acts as a barrier)
1325  *
1326  * Returns 0 on success, -errno in error cases.
1327  */
1328 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1329     const void *buf, int count)
1330 {
1331     int ret;
1332 
1333     ret = bdrv_pwrite(bs, offset, buf, count);
1334     if (ret < 0) {
1335         return ret;
1336     }
1337 
1338     /* No flush needed for cache modes that use O_DSYNC */
1339     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1340         bdrv_flush(bs);
1341     }
1342 
1343     return 0;
1344 }
1345 
1346 /*
1347  * Handle a read request in coroutine context
1348  */
1349 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1350     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1351 {
1352     BlockDriver *drv = bs->drv;
1353 
1354     if (!drv) {
1355         return -ENOMEDIUM;
1356     }
1357     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1358         return -EIO;
1359     }
1360 
1361     /* throttling disk read I/O */
1362     if (bs->io_limits_enabled) {
1363         bdrv_io_limits_intercept(bs, false, nb_sectors);
1364     }
1365 
1366     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1367 }
1368 
1369 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1370     int nb_sectors, QEMUIOVector *qiov)
1371 {
1372     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1373 
1374     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
1375 }
1376 
1377 /*
1378  * Handle a write request in coroutine context
1379  */
1380 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1381     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1382 {
1383     BlockDriver *drv = bs->drv;
1384     int ret;
1385 
1386     if (!bs->drv) {
1387         return -ENOMEDIUM;
1388     }
1389     if (bs->read_only) {
1390         return -EACCES;
1391     }
1392     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1393         return -EIO;
1394     }
1395 
1396     /* throttling disk write I/O */
1397     if (bs->io_limits_enabled) {
1398         bdrv_io_limits_intercept(bs, true, nb_sectors);
1399     }
1400 
1401     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1402 
1403     if (bs->dirty_bitmap) {
1404         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1405     }
1406 
1407     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1408         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1409     }
1410 
1411     return ret;
1412 }
1413 
1414 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1415     int nb_sectors, QEMUIOVector *qiov)
1416 {
1417     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1418 
1419     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
1420 }
1421 
1422 /**
1423  * Truncate file to 'offset' bytes (needed only for file protocols)
1424  */
1425 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1426 {
1427     BlockDriver *drv = bs->drv;
1428     int ret;
1429     if (!drv)
1430         return -ENOMEDIUM;
1431     if (!drv->bdrv_truncate)
1432         return -ENOTSUP;
1433     if (bs->read_only)
1434         return -EACCES;
1435     if (bdrv_in_use(bs))
1436         return -EBUSY;
1437     ret = drv->bdrv_truncate(bs, offset);
1438     if (ret == 0) {
1439         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1440         bdrv_dev_resize_cb(bs);
1441     }
1442     return ret;
1443 }
1444 
1445 /**
1446  * Length of a allocated file in bytes. Sparse files are counted by actual
1447  * allocated space. Return < 0 if error or unknown.
1448  */
1449 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1450 {
1451     BlockDriver *drv = bs->drv;
1452     if (!drv) {
1453         return -ENOMEDIUM;
1454     }
1455     if (drv->bdrv_get_allocated_file_size) {
1456         return drv->bdrv_get_allocated_file_size(bs);
1457     }
1458     if (bs->file) {
1459         return bdrv_get_allocated_file_size(bs->file);
1460     }
1461     return -ENOTSUP;
1462 }
1463 
1464 /**
1465  * Length of a file in bytes. Return < 0 if error or unknown.
1466  */
1467 int64_t bdrv_getlength(BlockDriverState *bs)
1468 {
1469     BlockDriver *drv = bs->drv;
1470     if (!drv)
1471         return -ENOMEDIUM;
1472 
1473     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1474         if (drv->bdrv_getlength) {
1475             return drv->bdrv_getlength(bs);
1476         }
1477     }
1478     return bs->total_sectors * BDRV_SECTOR_SIZE;
1479 }
1480 
1481 /* return 0 as number of sectors if no device present or error */
1482 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1483 {
1484     int64_t length;
1485     length = bdrv_getlength(bs);
1486     if (length < 0)
1487         length = 0;
1488     else
1489         length = length >> BDRV_SECTOR_BITS;
1490     *nb_sectors_ptr = length;
1491 }
1492 
/*
 * One entry of the MSDOS partition table as laid out on disk.  The struct is
 * packed so it can be overlaid directly on the sector buffer; nr_sects is
 * converted with le32_to_cpu() by guess_disk_lchs() before use.
 */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
1505 
1506 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1507 static int guess_disk_lchs(BlockDriverState *bs,
1508                            int *pcylinders, int *pheads, int *psectors)
1509 {
1510     uint8_t buf[BDRV_SECTOR_SIZE];
1511     int ret, i, heads, sectors, cylinders;
1512     struct partition *p;
1513     uint32_t nr_sects;
1514     uint64_t nb_sectors;
1515 
1516     bdrv_get_geometry(bs, &nb_sectors);
1517 
1518     ret = bdrv_read(bs, 0, buf, 1);
1519     if (ret < 0)
1520         return -1;
1521     /* test msdos magic */
1522     if (buf[510] != 0x55 || buf[511] != 0xaa)
1523         return -1;
1524     for(i = 0; i < 4; i++) {
1525         p = ((struct partition *)(buf + 0x1be)) + i;
1526         nr_sects = le32_to_cpu(p->nr_sects);
1527         if (nr_sects && p->end_head) {
1528             /* We make the assumption that the partition terminates on
1529                a cylinder boundary */
1530             heads = p->end_head + 1;
1531             sectors = p->end_sector & 63;
1532             if (sectors == 0)
1533                 continue;
1534             cylinders = nb_sectors / (heads * sectors);
1535             if (cylinders < 1 || cylinders > 16383)
1536                 continue;
1537             *pheads = heads;
1538             *psectors = sectors;
1539             *pcylinders = cylinders;
1540 #if 0
1541             printf("guessed geometry: LCHS=%d %d %d\n",
1542                    cylinders, heads, sectors);
1543 #endif
1544             return 0;
1545         }
1546     }
1547     return -1;
1548 }
1549 
/*
 * Determine a cylinders/heads/sectors geometry for 'bs' and return it
 * through pcyls/pheads/psecs.
 *
 * Order of preference:
 *  1. an existing geometry hint (non-zero cylinder count) is returned as is;
 *  2. a geometry guessed from the partition table by guess_disk_lchs(),
 *     provided it reports at most 16 heads;
 *  3. a default 16-head, 63-sector geometry with the cylinder count derived
 *     from the device size and clamped to the range 2..16383.
 *
 * In cases 2 and 3 the result is stored back as the geometry hint, and the
 * translation hint may be updated when it is BIOS_ATA_TRANSLATION_AUTO.
 */
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                /* jumps into the else branch below to share its code */
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            /* clamp the cylinder count to 2..16383 */
            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            /* only pick a translation when the partition table suggested
               LBA and the user left the translation on auto */
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        /* remember the result so later calls take the hint path */
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
1608 
1609 void bdrv_set_geometry_hint(BlockDriverState *bs,
1610                             int cyls, int heads, int secs)
1611 {
1612     bs->cyls = cyls;
1613     bs->heads = heads;
1614     bs->secs = secs;
1615 }
1616 
1617 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1618 {
1619     bs->translation = translation;
1620 }
1621 
1622 void bdrv_get_geometry_hint(BlockDriverState *bs,
1623                             int *pcyls, int *pheads, int *psecs)
1624 {
1625     *pcyls = bs->cyls;
1626     *pheads = bs->heads;
1627     *psecs = bs->secs;
1628 }
1629 
1630 /* throttling disk io limits */
1631 void bdrv_set_io_limits(BlockDriverState *bs,
1632                         BlockIOLimit *io_limits)
1633 {
1634     bs->io_limits = *io_limits;
1635     bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
1636 }
1637 
/*
 * Recognize floppy formats.
 *
 * One known floppy layout.  bdrv_get_floppy_geometry_hint() computes the
 * total sector count of a format as (max_head + 1) * max_track * last_sect.
 */
typedef struct FDFormat {
    FDriveType drive;   /* drive type this format belongs to */
    uint8_t last_sect;  /* sectors per track */
    uint8_t max_track;  /* number of tracks */
    uint8_t max_head;   /* highest head index (heads = max_head + 1) */
} FDFormat;
1645 
/*
 * Table of known floppy formats, scanned in order by
 * bdrv_get_floppy_geometry_hint().  Columns: drive type, sectors per track,
 * tracks, highest head index.  The list is terminated by an entry whose
 * drive type is FDRIVE_DRV_NONE; the -1 initializers of that entry are
 * stored in uint8_t fields and are never read as geometry.
 */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
1692 
1693 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1694                                    int *max_track, int *last_sect,
1695                                    FDriveType drive_in, FDriveType *drive)
1696 {
1697     const FDFormat *parse;
1698     uint64_t nb_sectors, size;
1699     int i, first_match, match;
1700 
1701     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1702     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1703         /* User defined disk */
1704     } else {
1705         bdrv_get_geometry(bs, &nb_sectors);
1706         match = -1;
1707         first_match = -1;
1708         for (i = 0; ; i++) {
1709             parse = &fd_formats[i];
1710             if (parse->drive == FDRIVE_DRV_NONE) {
1711                 break;
1712             }
1713             if (drive_in == parse->drive ||
1714                 drive_in == FDRIVE_DRV_NONE) {
1715                 size = (parse->max_head + 1) * parse->max_track *
1716                     parse->last_sect;
1717                 if (nb_sectors == size) {
1718                     match = i;
1719                     break;
1720                 }
1721                 if (first_match == -1) {
1722                     first_match = i;
1723                 }
1724             }
1725         }
1726         if (match == -1) {
1727             if (first_match == -1) {
1728                 match = 1;
1729             } else {
1730                 match = first_match;
1731             }
1732             parse = &fd_formats[match];
1733         }
1734         *nb_heads = parse->max_head + 1;
1735         *max_track = parse->max_track;
1736         *last_sect = parse->last_sect;
1737         *drive = parse->drive;
1738     }
1739 }
1740 
1741 int bdrv_get_translation_hint(BlockDriverState *bs)
1742 {
1743     return bs->translation;
1744 }
1745 
1746 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1747                        BlockErrorAction on_write_error)
1748 {
1749     bs->on_read_error = on_read_error;
1750     bs->on_write_error = on_write_error;
1751 }
1752 
1753 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1754 {
1755     return is_read ? bs->on_read_error : bs->on_write_error;
1756 }
1757 
1758 int bdrv_is_read_only(BlockDriverState *bs)
1759 {
1760     return bs->read_only;
1761 }
1762 
1763 int bdrv_is_sg(BlockDriverState *bs)
1764 {
1765     return bs->sg;
1766 }
1767 
1768 int bdrv_enable_write_cache(BlockDriverState *bs)
1769 {
1770     return bs->enable_write_cache;
1771 }
1772 
1773 int bdrv_is_encrypted(BlockDriverState *bs)
1774 {
1775     if (bs->backing_hd && bs->backing_hd->encrypted)
1776         return 1;
1777     return bs->encrypted;
1778 }
1779 
1780 int bdrv_key_required(BlockDriverState *bs)
1781 {
1782     BlockDriverState *backing_hd = bs->backing_hd;
1783 
1784     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1785         return 1;
1786     return (bs->encrypted && !bs->valid_key);
1787 }
1788 
1789 int bdrv_set_key(BlockDriverState *bs, const char *key)
1790 {
1791     int ret;
1792     if (bs->backing_hd && bs->backing_hd->encrypted) {
1793         ret = bdrv_set_key(bs->backing_hd, key);
1794         if (ret < 0)
1795             return ret;
1796         if (!bs->encrypted)
1797             return 0;
1798     }
1799     if (!bs->encrypted) {
1800         return -EINVAL;
1801     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1802         return -ENOMEDIUM;
1803     }
1804     ret = bs->drv->bdrv_set_key(bs, key);
1805     if (ret < 0) {
1806         bs->valid_key = 0;
1807     } else if (!bs->valid_key) {
1808         bs->valid_key = 1;
1809         /* call the change callback now, we skipped it on open */
1810         bdrv_dev_change_media_cb(bs, true);
1811     }
1812     return ret;
1813 }
1814 
1815 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1816 {
1817     if (!bs->drv) {
1818         buf[0] = '\0';
1819     } else {
1820         pstrcpy(buf, buf_size, bs->drv->format_name);
1821     }
1822 }
1823 
1824 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1825                          void *opaque)
1826 {
1827     BlockDriver *drv;
1828 
1829     QLIST_FOREACH(drv, &bdrv_drivers, list) {
1830         it(opaque, drv->format_name);
1831     }
1832 }
1833 
1834 BlockDriverState *bdrv_find(const char *name)
1835 {
1836     BlockDriverState *bs;
1837 
1838     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1839         if (!strcmp(name, bs->device_name)) {
1840             return bs;
1841         }
1842     }
1843     return NULL;
1844 }
1845 
1846 BlockDriverState *bdrv_next(BlockDriverState *bs)
1847 {
1848     if (!bs) {
1849         return QTAILQ_FIRST(&bdrv_states);
1850     }
1851     return QTAILQ_NEXT(bs, list);
1852 }
1853 
1854 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1855 {
1856     BlockDriverState *bs;
1857 
1858     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1859         it(opaque, bs);
1860     }
1861 }
1862 
1863 const char *bdrv_get_device_name(BlockDriverState *bs)
1864 {
1865     return bs->device_name;
1866 }
1867 
1868 void bdrv_flush_all(void)
1869 {
1870     BlockDriverState *bs;
1871 
1872     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1873         if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
1874             bdrv_flush(bs);
1875         }
1876     }
1877 }
1878 
1879 int bdrv_has_zero_init(BlockDriverState *bs)
1880 {
1881     assert(bs->drv);
1882 
1883     if (bs->drv->bdrv_has_zero_init) {
1884         return bs->drv->bdrv_has_zero_init(bs);
1885     }
1886 
1887     return 1;
1888 }
1889 
1890 /*
1891  * Returns true iff the specified sector is present in the disk image. Drivers
1892  * not implementing the functionality are assumed to not support backing files,
1893  * hence all their sectors are reported as allocated.
1894  *
1895  * 'pnum' is set to the number of sectors (including and immediately following
1896  * the specified sector) that are known to be in the same
1897  * allocated/unallocated state.
1898  *
1899  * 'nb_sectors' is the max value 'pnum' should be set to.
1900  */
1901 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1902 	int *pnum)
1903 {
1904     int64_t n;
1905     if (!bs->drv->bdrv_is_allocated) {
1906         if (sector_num >= bs->total_sectors) {
1907             *pnum = 0;
1908             return 0;
1909         }
1910         n = bs->total_sectors - sector_num;
1911         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1912         return 1;
1913     }
1914     return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1915 }
1916 
1917 void bdrv_mon_event(const BlockDriverState *bdrv,
1918                     BlockMonEventAction action, int is_read)
1919 {
1920     QObject *data;
1921     const char *action_str;
1922 
1923     switch (action) {
1924     case BDRV_ACTION_REPORT:
1925         action_str = "report";
1926         break;
1927     case BDRV_ACTION_IGNORE:
1928         action_str = "ignore";
1929         break;
1930     case BDRV_ACTION_STOP:
1931         action_str = "stop";
1932         break;
1933     default:
1934         abort();
1935     }
1936 
1937     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1938                               bdrv->device_name,
1939                               action_str,
1940                               is_read ? "read" : "write");
1941     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1942 
1943     qobject_decref(data);
1944 }
1945 
1946 BlockInfoList *qmp_query_block(Error **errp)
1947 {
1948     BlockInfoList *head = NULL, *cur_item = NULL;
1949     BlockDriverState *bs;
1950 
1951     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1952         BlockInfoList *info = g_malloc0(sizeof(*info));
1953 
1954         info->value = g_malloc0(sizeof(*info->value));
1955         info->value->device = g_strdup(bs->device_name);
1956         info->value->type = g_strdup("unknown");
1957         info->value->locked = bdrv_dev_is_medium_locked(bs);
1958         info->value->removable = bdrv_dev_has_removable_media(bs);
1959 
1960         if (bdrv_dev_has_removable_media(bs)) {
1961             info->value->has_tray_open = true;
1962             info->value->tray_open = bdrv_dev_is_tray_open(bs);
1963         }
1964 
1965         if (bdrv_iostatus_is_enabled(bs)) {
1966             info->value->has_io_status = true;
1967             info->value->io_status = bs->iostatus;
1968         }
1969 
1970         if (bs->drv) {
1971             info->value->has_inserted = true;
1972             info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
1973             info->value->inserted->file = g_strdup(bs->filename);
1974             info->value->inserted->ro = bs->read_only;
1975             info->value->inserted->drv = g_strdup(bs->drv->format_name);
1976             info->value->inserted->encrypted = bs->encrypted;
1977             if (bs->backing_file[0]) {
1978                 info->value->inserted->has_backing_file = true;
1979                 info->value->inserted->backing_file = g_strdup(bs->backing_file);
1980             }
1981 
1982             if (bs->io_limits_enabled) {
1983                 info->value->inserted->bps =
1984                                bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
1985                 info->value->inserted->bps_rd =
1986                                bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
1987                 info->value->inserted->bps_wr =
1988                                bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
1989                 info->value->inserted->iops =
1990                                bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
1991                 info->value->inserted->iops_rd =
1992                                bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
1993                 info->value->inserted->iops_wr =
1994                                bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
1995             }
1996         }
1997 
1998         /* XXX: waiting for the qapi to support GSList */
1999         if (!cur_item) {
2000             head = cur_item = info;
2001         } else {
2002             cur_item->next = info;
2003             cur_item = info;
2004         }
2005     }
2006 
2007     return head;
2008 }
2009 
2010 /* Consider exposing this as a full fledged QMP command */
2011 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2012 {
2013     BlockStats *s;
2014 
2015     s = g_malloc0(sizeof(*s));
2016 
2017     if (bs->device_name[0]) {
2018         s->has_device = true;
2019         s->device = g_strdup(bs->device_name);
2020     }
2021 
2022     s->stats = g_malloc0(sizeof(*s->stats));
2023     s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2024     s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2025     s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2026     s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2027     s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2028     s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2029     s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2030     s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2031     s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2032 
2033     if (bs->file) {
2034         s->has_parent = true;
2035         s->parent = qmp_query_blockstat(bs->file, NULL);
2036     }
2037 
2038     return s;
2039 }
2040 
2041 BlockStatsList *qmp_query_blockstats(Error **errp)
2042 {
2043     BlockStatsList *head = NULL, *cur_item = NULL;
2044     BlockDriverState *bs;
2045 
2046     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2047         BlockStatsList *info = g_malloc0(sizeof(*info));
2048         info->value = qmp_query_blockstat(bs, NULL);
2049 
2050         /* XXX: waiting for the qapi to support GSList */
2051         if (!cur_item) {
2052             head = cur_item = info;
2053         } else {
2054             cur_item->next = info;
2055             cur_item = info;
2056         }
2057     }
2058 
2059     return head;
2060 }
2061 
2062 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2063 {
2064     if (bs->backing_hd && bs->backing_hd->encrypted)
2065         return bs->backing_file;
2066     else if (bs->encrypted)
2067         return bs->filename;
2068     else
2069         return NULL;
2070 }
2071 
2072 void bdrv_get_backing_filename(BlockDriverState *bs,
2073                                char *filename, int filename_size)
2074 {
2075     pstrcpy(filename, filename_size, bs->backing_file);
2076 }
2077 
2078 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2079                           const uint8_t *buf, int nb_sectors)
2080 {
2081     BlockDriver *drv = bs->drv;
2082     if (!drv)
2083         return -ENOMEDIUM;
2084     if (!drv->bdrv_write_compressed)
2085         return -ENOTSUP;
2086     if (bdrv_check_request(bs, sector_num, nb_sectors))
2087         return -EIO;
2088 
2089     if (bs->dirty_bitmap) {
2090         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2091     }
2092 
2093     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2094 }
2095 
2096 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2097 {
2098     BlockDriver *drv = bs->drv;
2099     if (!drv)
2100         return -ENOMEDIUM;
2101     if (!drv->bdrv_get_info)
2102         return -ENOTSUP;
2103     memset(bdi, 0, sizeof(*bdi));
2104     return drv->bdrv_get_info(bs, bdi);
2105 }
2106 
2107 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2108                       int64_t pos, int size)
2109 {
2110     BlockDriver *drv = bs->drv;
2111     if (!drv)
2112         return -ENOMEDIUM;
2113     if (drv->bdrv_save_vmstate)
2114         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2115     if (bs->file)
2116         return bdrv_save_vmstate(bs->file, buf, pos, size);
2117     return -ENOTSUP;
2118 }
2119 
2120 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2121                       int64_t pos, int size)
2122 {
2123     BlockDriver *drv = bs->drv;
2124     if (!drv)
2125         return -ENOMEDIUM;
2126     if (drv->bdrv_load_vmstate)
2127         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2128     if (bs->file)
2129         return bdrv_load_vmstate(bs->file, buf, pos, size);
2130     return -ENOTSUP;
2131 }
2132 
2133 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2134 {
2135     BlockDriver *drv = bs->drv;
2136 
2137     if (!drv || !drv->bdrv_debug_event) {
2138         return;
2139     }
2140 
2141     return drv->bdrv_debug_event(bs, event);
2142 
2143 }
2144 
2145 /**************************************************************/
2146 /* handling of snapshots */
2147 
2148 int bdrv_can_snapshot(BlockDriverState *bs)
2149 {
2150     BlockDriver *drv = bs->drv;
2151     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2152         return 0;
2153     }
2154 
2155     if (!drv->bdrv_snapshot_create) {
2156         if (bs->file != NULL) {
2157             return bdrv_can_snapshot(bs->file);
2158         }
2159         return 0;
2160     }
2161 
2162     return 1;
2163 }
2164 
2165 int bdrv_is_snapshot(BlockDriverState *bs)
2166 {
2167     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2168 }
2169 
2170 BlockDriverState *bdrv_snapshots(void)
2171 {
2172     BlockDriverState *bs;
2173 
2174     if (bs_snapshots) {
2175         return bs_snapshots;
2176     }
2177 
2178     bs = NULL;
2179     while ((bs = bdrv_next(bs))) {
2180         if (bdrv_can_snapshot(bs)) {
2181             bs_snapshots = bs;
2182             return bs;
2183         }
2184     }
2185     return NULL;
2186 }
2187 
2188 int bdrv_snapshot_create(BlockDriverState *bs,
2189                          QEMUSnapshotInfo *sn_info)
2190 {
2191     BlockDriver *drv = bs->drv;
2192     if (!drv)
2193         return -ENOMEDIUM;
2194     if (drv->bdrv_snapshot_create)
2195         return drv->bdrv_snapshot_create(bs, sn_info);
2196     if (bs->file)
2197         return bdrv_snapshot_create(bs->file, sn_info);
2198     return -ENOTSUP;
2199 }
2200 
2201 int bdrv_snapshot_goto(BlockDriverState *bs,
2202                        const char *snapshot_id)
2203 {
2204     BlockDriver *drv = bs->drv;
2205     int ret, open_ret;
2206 
2207     if (!drv)
2208         return -ENOMEDIUM;
2209     if (drv->bdrv_snapshot_goto)
2210         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2211 
2212     if (bs->file) {
2213         drv->bdrv_close(bs);
2214         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2215         open_ret = drv->bdrv_open(bs, bs->open_flags);
2216         if (open_ret < 0) {
2217             bdrv_delete(bs->file);
2218             bs->drv = NULL;
2219             return open_ret;
2220         }
2221         return ret;
2222     }
2223 
2224     return -ENOTSUP;
2225 }
2226 
2227 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2228 {
2229     BlockDriver *drv = bs->drv;
2230     if (!drv)
2231         return -ENOMEDIUM;
2232     if (drv->bdrv_snapshot_delete)
2233         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2234     if (bs->file)
2235         return bdrv_snapshot_delete(bs->file, snapshot_id);
2236     return -ENOTSUP;
2237 }
2238 
2239 int bdrv_snapshot_list(BlockDriverState *bs,
2240                        QEMUSnapshotInfo **psn_info)
2241 {
2242     BlockDriver *drv = bs->drv;
2243     if (!drv)
2244         return -ENOMEDIUM;
2245     if (drv->bdrv_snapshot_list)
2246         return drv->bdrv_snapshot_list(bs, psn_info);
2247     if (bs->file)
2248         return bdrv_snapshot_list(bs->file, psn_info);
2249     return -ENOTSUP;
2250 }
2251 
2252 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2253         const char *snapshot_name)
2254 {
2255     BlockDriver *drv = bs->drv;
2256     if (!drv) {
2257         return -ENOMEDIUM;
2258     }
2259     if (!bs->read_only) {
2260         return -EINVAL;
2261     }
2262     if (drv->bdrv_snapshot_load_tmp) {
2263         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2264     }
2265     return -ENOTSUP;
2266 }
2267 
#define NB_SUFFIXES 4

/*
 * Format size into buf (at most buf_size bytes, via snprintf) in a compact
 * form and return buf.
 *
 * Values up to 999 are printed as plain integers. Larger values are scaled
 * by powers of 1024 and tagged with K, M, G or T: one decimal digit is shown
 * while the scaled value is below 10, a rounded integer otherwise. Anything
 * beyond the T range is still expressed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < (10 * unit)) {
            /* Small multiple of the unit: keep one decimal digit */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit,
                     suffixes[idx]);
            return buf;
        }
        if (size < (1000 * unit) || idx == (NB_SUFFIXES - 1)) {
            /* Round to the nearest whole unit; the last suffix takes all */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (unit >> 1)) / unit),
                     suffixes[idx]);
            return buf;
        }
    }

    return buf;
}
2297 
2298 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2299 {
2300     char buf1[128], date_buf[128], clock_buf[128];
2301 #ifdef _WIN32
2302     struct tm *ptm;
2303 #else
2304     struct tm tm;
2305 #endif
2306     time_t ti;
2307     int64_t secs;
2308 
2309     if (!sn) {
2310         snprintf(buf, buf_size,
2311                  "%-10s%-20s%7s%20s%15s",
2312                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2313     } else {
2314         ti = sn->date_sec;
2315 #ifdef _WIN32
2316         ptm = localtime(&ti);
2317         strftime(date_buf, sizeof(date_buf),
2318                  "%Y-%m-%d %H:%M:%S", ptm);
2319 #else
2320         localtime_r(&ti, &tm);
2321         strftime(date_buf, sizeof(date_buf),
2322                  "%Y-%m-%d %H:%M:%S", &tm);
2323 #endif
2324         secs = sn->vm_clock_nsec / 1000000000;
2325         snprintf(clock_buf, sizeof(clock_buf),
2326                  "%02d:%02d:%02d.%03d",
2327                  (int)(secs / 3600),
2328                  (int)((secs / 60) % 60),
2329                  (int)(secs % 60),
2330                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2331         snprintf(buf, buf_size,
2332                  "%-10s%-20s%7s%20s%15s",
2333                  sn->id_str, sn->name,
2334                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2335                  date_buf,
2336                  clock_buf);
2337     }
2338     return buf;
2339 }
2340 
2341 /**************************************************************/
2342 /* async I/Os */
2343 
2344 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2345                                  QEMUIOVector *qiov, int nb_sectors,
2346                                  BlockDriverCompletionFunc *cb, void *opaque)
2347 {
2348     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2349 
2350     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2351                                  cb, opaque, false);
2352 }
2353 
2354 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2355                                   QEMUIOVector *qiov, int nb_sectors,
2356                                   BlockDriverCompletionFunc *cb, void *opaque)
2357 {
2358     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2359 
2360     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2361                                  cb, opaque, true);
2362 }
2363 
2364 
2365 typedef struct MultiwriteCB {
2366     int error;
2367     int num_requests;
2368     int num_callbacks;
2369     struct {
2370         BlockDriverCompletionFunc *cb;
2371         void *opaque;
2372         QEMUIOVector *free_qiov;
2373         void *free_buf;
2374     } callbacks[];
2375 } MultiwriteCB;
2376 
2377 static void multiwrite_user_cb(MultiwriteCB *mcb)
2378 {
2379     int i;
2380 
2381     for (i = 0; i < mcb->num_callbacks; i++) {
2382         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2383         if (mcb->callbacks[i].free_qiov) {
2384             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2385         }
2386         g_free(mcb->callbacks[i].free_qiov);
2387         qemu_vfree(mcb->callbacks[i].free_buf);
2388     }
2389 }
2390 
2391 static void multiwrite_cb(void *opaque, int ret)
2392 {
2393     MultiwriteCB *mcb = opaque;
2394 
2395     trace_multiwrite_cb(mcb, ret);
2396 
2397     if (ret < 0 && !mcb->error) {
2398         mcb->error = ret;
2399     }
2400 
2401     mcb->num_requests--;
2402     if (mcb->num_requests == 0) {
2403         multiwrite_user_cb(mcb);
2404         g_free(mcb);
2405     }
2406 }
2407 
2408 static int multiwrite_req_compare(const void *a, const void *b)
2409 {
2410     const BlockRequest *req1 = a, *req2 = b;
2411 
2412     /*
2413      * Note that we can't simply subtract req2->sector from req1->sector
2414      * here as that could overflow the return value.
2415      */
2416     if (req1->sector > req2->sector) {
2417         return 1;
2418     } else if (req1->sector < req2->sector) {
2419         return -1;
2420     } else {
2421         return 0;
2422     }
2423 }
2424 
2425 /*
2426  * Takes a bunch of requests and tries to merge them. Returns the number of
2427  * requests that remain after merging.
2428  */
2429 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2430     int num_reqs, MultiwriteCB *mcb)
2431 {
2432     int i, outidx;
2433 
2434     // Sort requests by start sector
2435     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2436 
2437     // Check if adjacent requests touch the same clusters. If so, combine them,
2438     // filling up gaps with zero sectors.
2439     outidx = 0;
2440     for (i = 1; i < num_reqs; i++) {
2441         int merge = 0;
2442         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2443 
2444         // This handles the cases that are valid for all block drivers, namely
2445         // exactly sequential writes and overlapping writes.
2446         if (reqs[i].sector <= oldreq_last) {
2447             merge = 1;
2448         }
2449 
2450         // The block driver may decide that it makes sense to combine requests
2451         // even if there is a gap of some sectors between them. In this case,
2452         // the gap is filled with zeros (therefore only applicable for yet
2453         // unused space in format like qcow2).
2454         if (!merge && bs->drv->bdrv_merge_requests) {
2455             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2456         }
2457 
2458         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2459             merge = 0;
2460         }
2461 
2462         if (merge) {
2463             size_t size;
2464             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2465             qemu_iovec_init(qiov,
2466                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2467 
2468             // Add the first request to the merged one. If the requests are
2469             // overlapping, drop the last sectors of the first request.
2470             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2471             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2472 
2473             // We might need to add some zeros between the two requests
2474             if (reqs[i].sector > oldreq_last) {
2475                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2476                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2477                 memset(buf, 0, zero_bytes);
2478                 qemu_iovec_add(qiov, buf, zero_bytes);
2479                 mcb->callbacks[i].free_buf = buf;
2480             }
2481 
2482             // Add the second request
2483             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2484 
2485             reqs[outidx].nb_sectors = qiov->size >> 9;
2486             reqs[outidx].qiov = qiov;
2487 
2488             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2489         } else {
2490             outidx++;
2491             reqs[outidx].sector     = reqs[i].sector;
2492             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2493             reqs[outidx].qiov       = reqs[i].qiov;
2494         }
2495     }
2496 
2497     return outidx + 1;
2498 }
2499 
2500 /*
2501  * Submit multiple AIO write requests at once.
2502  *
2503  * On success, the function returns 0 and all requests in the reqs array have
2504  * been submitted. In error case this function returns -1, and any of the
2505  * requests may or may not be submitted yet. In particular, this means that the
2506  * callback will be called for some of the requests, for others it won't. The
2507  * caller must check the error field of the BlockRequest to wait for the right
2508  * callbacks (if error != 0, no callback will be called).
2509  *
2510  * The implementation may modify the contents of the reqs array, e.g. to merge
2511  * requests. However, the fields opaque and error are left unmodified as they
2512  * are used to signal failure for a single request to the caller.
2513  */
2514 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2515 {
2516     BlockDriverAIOCB *acb;
2517     MultiwriteCB *mcb;
2518     int i;
2519 
2520     /* don't submit writes if we don't have a medium */
2521     if (bs->drv == NULL) {
2522         for (i = 0; i < num_reqs; i++) {
2523             reqs[i].error = -ENOMEDIUM;
2524         }
2525         return -1;
2526     }
2527 
2528     if (num_reqs == 0) {
2529         return 0;
2530     }
2531 
2532     // Create MultiwriteCB structure
2533     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2534     mcb->num_requests = 0;
2535     mcb->num_callbacks = num_reqs;
2536 
2537     for (i = 0; i < num_reqs; i++) {
2538         mcb->callbacks[i].cb = reqs[i].cb;
2539         mcb->callbacks[i].opaque = reqs[i].opaque;
2540     }
2541 
2542     // Check for mergable requests
2543     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2544 
2545     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2546 
2547     /*
2548      * Run the aio requests. As soon as one request can't be submitted
2549      * successfully, fail all requests that are not yet submitted (we must
2550      * return failure for all requests anyway)
2551      *
2552      * num_requests cannot be set to the right value immediately: If
2553      * bdrv_aio_writev fails for some request, num_requests would be too high
2554      * and therefore multiwrite_cb() would never recognize the multiwrite
2555      * request as completed. We also cannot use the loop variable i to set it
2556      * when the first request fails because the callback may already have been
2557      * called for previously submitted requests. Thus, num_requests must be
2558      * incremented for each request that is submitted.
2559      *
2560      * The problem that callbacks may be called early also means that we need
2561      * to take care that num_requests doesn't become 0 before all requests are
2562      * submitted - multiwrite_cb() would consider the multiwrite request
2563      * completed. A dummy request that is "completed" by a manual call to
2564      * multiwrite_cb() takes care of this.
2565      */
2566     mcb->num_requests = 1;
2567 
2568     // Run the aio requests
2569     for (i = 0; i < num_reqs; i++) {
2570         mcb->num_requests++;
2571         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2572             reqs[i].nb_sectors, multiwrite_cb, mcb);
2573 
2574         if (acb == NULL) {
2575             // We can only fail the whole thing if no request has been
2576             // submitted yet. Otherwise we'll wait for the submitted AIOs to
2577             // complete and report the error in the callback.
2578             if (i == 0) {
2579                 trace_bdrv_aio_multiwrite_earlyfail(mcb);
2580                 goto fail;
2581             } else {
2582                 trace_bdrv_aio_multiwrite_latefail(mcb, i);
2583                 multiwrite_cb(mcb, -EIO);
2584                 break;
2585             }
2586         }
2587     }
2588 
2589     /* Complete the dummy request */
2590     multiwrite_cb(mcb, 0);
2591 
2592     return 0;
2593 
2594 fail:
2595     for (i = 0; i < mcb->num_callbacks; i++) {
2596         reqs[i].error = -EIO;
2597     }
2598     g_free(mcb);
2599     return -1;
2600 }
2601 
2602 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2603 {
2604     acb->pool->cancel(acb);
2605 }
2606 
2607 /* block I/O throttling */
2608 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
2609                  bool is_write, double elapsed_time, uint64_t *wait)
2610 {
2611     uint64_t bps_limit = 0;
2612     double   bytes_limit, bytes_base, bytes_res;
2613     double   slice_time, wait_time;
2614 
2615     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2616         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2617     } else if (bs->io_limits.bps[is_write]) {
2618         bps_limit = bs->io_limits.bps[is_write];
2619     } else {
2620         if (wait) {
2621             *wait = 0;
2622         }
2623 
2624         return false;
2625     }
2626 
2627     slice_time = bs->slice_end - bs->slice_start;
2628     slice_time /= (NANOSECONDS_PER_SECOND);
2629     bytes_limit = bps_limit * slice_time;
2630     bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
2631     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2632         bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
2633     }
2634 
2635     /* bytes_base: the bytes of data which have been read/written; and
2636      *             it is obtained from the history statistic info.
2637      * bytes_res: the remaining bytes of data which need to be read/written.
2638      * (bytes_base + bytes_res) / bps_limit: used to calcuate
2639      *             the total time for completing reading/writting all data.
2640      */
2641     bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2642 
2643     if (bytes_base + bytes_res <= bytes_limit) {
2644         if (wait) {
2645             *wait = 0;
2646         }
2647 
2648         return false;
2649     }
2650 
2651     /* Calc approx time to dispatch */
2652     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
2653 
2654     /* When the I/O rate at runtime exceeds the limits,
2655      * bs->slice_end need to be extended in order that the current statistic
2656      * info can be kept until the timer fire, so it is increased and tuned
2657      * based on the result of experiment.
2658      */
2659     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
2660     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
2661     if (wait) {
2662         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
2663     }
2664 
2665     return true;
2666 }
2667 
2668 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
2669                              double elapsed_time, uint64_t *wait)
2670 {
2671     uint64_t iops_limit = 0;
2672     double   ios_limit, ios_base;
2673     double   slice_time, wait_time;
2674 
2675     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2676         iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2677     } else if (bs->io_limits.iops[is_write]) {
2678         iops_limit = bs->io_limits.iops[is_write];
2679     } else {
2680         if (wait) {
2681             *wait = 0;
2682         }
2683 
2684         return false;
2685     }
2686 
2687     slice_time = bs->slice_end - bs->slice_start;
2688     slice_time /= (NANOSECONDS_PER_SECOND);
2689     ios_limit  = iops_limit * slice_time;
2690     ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
2691     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2692         ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
2693     }
2694 
2695     if (ios_base + 1 <= ios_limit) {
2696         if (wait) {
2697             *wait = 0;
2698         }
2699 
2700         return false;
2701     }
2702 
2703     /* Calc approx time to dispatch */
2704     wait_time = (ios_base + 1) / iops_limit;
2705     if (wait_time > elapsed_time) {
2706         wait_time = wait_time - elapsed_time;
2707     } else {
2708         wait_time = 0;
2709     }
2710 
2711     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
2712     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
2713     if (wait) {
2714         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
2715     }
2716 
2717     return true;
2718 }
2719 
2720 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
2721                            bool is_write, int64_t *wait)
2722 {
2723     int64_t  now, max_wait;
2724     uint64_t bps_wait = 0, iops_wait = 0;
2725     double   elapsed_time;
2726     int      bps_ret, iops_ret;
2727 
2728     now = qemu_get_clock_ns(vm_clock);
2729     if ((bs->slice_start < now)
2730         && (bs->slice_end > now)) {
2731         bs->slice_end = now + bs->slice_time;
2732     } else {
2733         bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
2734         bs->slice_start = now;
2735         bs->slice_end   = now + bs->slice_time;
2736 
2737         bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
2738         bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
2739 
2740         bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
2741         bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
2742     }
2743 
2744     elapsed_time  = now - bs->slice_start;
2745     elapsed_time  /= (NANOSECONDS_PER_SECOND);
2746 
2747     bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
2748                                       is_write, elapsed_time, &bps_wait);
2749     iops_ret = bdrv_exceed_iops_limits(bs, is_write,
2750                                       elapsed_time, &iops_wait);
2751     if (bps_ret || iops_ret) {
2752         max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
2753         if (wait) {
2754             *wait = max_wait;
2755         }
2756 
2757         now = qemu_get_clock_ns(vm_clock);
2758         if (bs->slice_end < now + max_wait) {
2759             bs->slice_end = now + max_wait;
2760         }
2761 
2762         return true;
2763     }
2764 
2765     if (wait) {
2766         *wait = 0;
2767     }
2768 
2769     return false;
2770 }
2771 
2772 /**************************************************************/
2773 /* async block device emulation */
2774 
2775 typedef struct BlockDriverAIOCBSync {
2776     BlockDriverAIOCB common;
2777     QEMUBH *bh;
2778     int ret;
2779     /* vector translation state */
2780     QEMUIOVector *qiov;
2781     uint8_t *bounce;
2782     int is_write;
2783 } BlockDriverAIOCBSync;
2784 
2785 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2786 {
2787     BlockDriverAIOCBSync *acb =
2788         container_of(blockacb, BlockDriverAIOCBSync, common);
2789     qemu_bh_delete(acb->bh);
2790     acb->bh = NULL;
2791     qemu_aio_release(acb);
2792 }
2793 
2794 static AIOPool bdrv_em_aio_pool = {
2795     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
2796     .cancel             = bdrv_aio_cancel_em,
2797 };
2798 
2799 static void bdrv_aio_bh_cb(void *opaque)
2800 {
2801     BlockDriverAIOCBSync *acb = opaque;
2802 
2803     if (!acb->is_write)
2804         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2805     qemu_vfree(acb->bounce);
2806     acb->common.cb(acb->common.opaque, acb->ret);
2807     qemu_bh_delete(acb->bh);
2808     acb->bh = NULL;
2809     qemu_aio_release(acb);
2810 }
2811 
2812 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2813                                             int64_t sector_num,
2814                                             QEMUIOVector *qiov,
2815                                             int nb_sectors,
2816                                             BlockDriverCompletionFunc *cb,
2817                                             void *opaque,
2818                                             int is_write)
2819 
2820 {
2821     BlockDriverAIOCBSync *acb;
2822 
2823     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2824     acb->is_write = is_write;
2825     acb->qiov = qiov;
2826     acb->bounce = qemu_blockalign(bs, qiov->size);
2827 
2828     if (!acb->bh)
2829         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2830 
2831     if (is_write) {
2832         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2833         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2834     } else {
2835         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2836     }
2837 
2838     qemu_bh_schedule(acb->bh);
2839 
2840     return &acb->common;
2841 }
2842 
2843 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2844         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2845         BlockDriverCompletionFunc *cb, void *opaque)
2846 {
2847     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2848 }
2849 
2850 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2851         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2852         BlockDriverCompletionFunc *cb, void *opaque)
2853 {
2854     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2855 }
2856 
2857 
2858 typedef struct BlockDriverAIOCBCoroutine {
2859     BlockDriverAIOCB common;
2860     BlockRequest req;
2861     bool is_write;
2862     QEMUBH* bh;
2863 } BlockDriverAIOCBCoroutine;
2864 
2865 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
2866 {
2867     qemu_aio_flush();
2868 }
2869 
2870 static AIOPool bdrv_em_co_aio_pool = {
2871     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
2872     .cancel             = bdrv_aio_co_cancel_em,
2873 };
2874 
2875 static void bdrv_co_em_bh(void *opaque)
2876 {
2877     BlockDriverAIOCBCoroutine *acb = opaque;
2878 
2879     acb->common.cb(acb->common.opaque, acb->req.error);
2880     qemu_bh_delete(acb->bh);
2881     qemu_aio_release(acb);
2882 }
2883 
2884 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2885 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2886 {
2887     BlockDriverAIOCBCoroutine *acb = opaque;
2888     BlockDriverState *bs = acb->common.bs;
2889 
2890     if (!acb->is_write) {
2891         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2892             acb->req.nb_sectors, acb->req.qiov);
2893     } else {
2894         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2895             acb->req.nb_sectors, acb->req.qiov);
2896     }
2897 
2898     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2899     qemu_bh_schedule(acb->bh);
2900 }
2901 
2902 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2903                                                int64_t sector_num,
2904                                                QEMUIOVector *qiov,
2905                                                int nb_sectors,
2906                                                BlockDriverCompletionFunc *cb,
2907                                                void *opaque,
2908                                                bool is_write)
2909 {
2910     Coroutine *co;
2911     BlockDriverAIOCBCoroutine *acb;
2912 
2913     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2914     acb->req.sector = sector_num;
2915     acb->req.nb_sectors = nb_sectors;
2916     acb->req.qiov = qiov;
2917     acb->is_write = is_write;
2918 
2919     co = qemu_coroutine_create(bdrv_co_do_rw);
2920     qemu_coroutine_enter(co, acb);
2921 
2922     return &acb->common;
2923 }
2924 
2925 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2926 {
2927     BlockDriverAIOCBCoroutine *acb = opaque;
2928     BlockDriverState *bs = acb->common.bs;
2929 
2930     acb->req.error = bdrv_co_flush(bs);
2931     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2932     qemu_bh_schedule(acb->bh);
2933 }
2934 
2935 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2936         BlockDriverCompletionFunc *cb, void *opaque)
2937 {
2938     trace_bdrv_aio_flush(bs, opaque);
2939 
2940     Coroutine *co;
2941     BlockDriverAIOCBCoroutine *acb;
2942 
2943     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2944     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2945     qemu_coroutine_enter(co, acb);
2946 
2947     return &acb->common;
2948 }
2949 
2950 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2951 {
2952     BlockDriverAIOCBCoroutine *acb = opaque;
2953     BlockDriverState *bs = acb->common.bs;
2954 
2955     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2956     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2957     qemu_bh_schedule(acb->bh);
2958 }
2959 
2960 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2961         int64_t sector_num, int nb_sectors,
2962         BlockDriverCompletionFunc *cb, void *opaque)
2963 {
2964     Coroutine *co;
2965     BlockDriverAIOCBCoroutine *acb;
2966 
2967     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2968 
2969     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2970     acb->req.sector = sector_num;
2971     acb->req.nb_sectors = nb_sectors;
2972     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2973     qemu_coroutine_enter(co, acb);
2974 
2975     return &acb->common;
2976 }
2977 
2978 void bdrv_init(void)
2979 {
2980     module_call_init(MODULE_INIT_BLOCK);
2981 }
2982 
2983 void bdrv_init_with_whitelist(void)
2984 {
2985     use_bdrv_whitelist = 1;
2986     bdrv_init();
2987 }
2988 
2989 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2990                    BlockDriverCompletionFunc *cb, void *opaque)
2991 {
2992     BlockDriverAIOCB *acb;
2993 
2994     if (pool->free_aiocb) {
2995         acb = pool->free_aiocb;
2996         pool->free_aiocb = acb->next;
2997     } else {
2998         acb = g_malloc0(pool->aiocb_size);
2999         acb->pool = pool;
3000     }
3001     acb->bs = bs;
3002     acb->cb = cb;
3003     acb->opaque = opaque;
3004     return acb;
3005 }
3006 
3007 void qemu_aio_release(void *p)
3008 {
3009     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3010     AIOPool *pool = acb->pool;
3011     acb->next = pool->free_aiocb;
3012     pool->free_aiocb = acb;
3013 }
3014 
3015 /**************************************************************/
3016 /* Coroutine block device emulation */
3017 
/*
 * Completion record used when a coroutine waits for a callback-based
 * request: the callback (bdrv_co_io_em_complete) stores the result here
 * and re-enters the waiting coroutine.
 */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine re-entered on completion */
    int ret;                /* result reported by the completion callback */
} CoroutineIOCompletion;
3022 
3023 static void bdrv_co_io_em_complete(void *opaque, int ret)
3024 {
3025     CoroutineIOCompletion *co = opaque;
3026 
3027     co->ret = ret;
3028     qemu_coroutine_enter(co->coroutine, NULL);
3029 }
3030 
3031 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3032                                       int nb_sectors, QEMUIOVector *iov,
3033                                       bool is_write)
3034 {
3035     CoroutineIOCompletion co = {
3036         .coroutine = qemu_coroutine_self(),
3037     };
3038     BlockDriverAIOCB *acb;
3039 
3040     if (is_write) {
3041         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3042                                        bdrv_co_io_em_complete, &co);
3043     } else {
3044         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3045                                       bdrv_co_io_em_complete, &co);
3046     }
3047 
3048     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3049     if (!acb) {
3050         return -EIO;
3051     }
3052     qemu_coroutine_yield();
3053 
3054     return co.ret;
3055 }
3056 
3057 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3058                                          int64_t sector_num, int nb_sectors,
3059                                          QEMUIOVector *iov)
3060 {
3061     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3062 }
3063 
3064 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3065                                          int64_t sector_num, int nb_sectors,
3066                                          QEMUIOVector *iov)
3067 {
3068     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3069 }
3070 
3071 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3072 {
3073     RwCo *rwco = opaque;
3074 
3075     rwco->ret = bdrv_co_flush(rwco->bs);
3076 }
3077 
3078 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3079 {
3080     int ret;
3081 
3082     if (!bs->drv) {
3083         return 0;
3084     }
3085 
3086     /* Write back cached data to the OS even with cache=unsafe */
3087     if (bs->drv->bdrv_co_flush_to_os) {
3088         ret = bs->drv->bdrv_co_flush_to_os(bs);
3089         if (ret < 0) {
3090             return ret;
3091         }
3092     }
3093 
3094     /* But don't actually force it to the disk with cache=unsafe */
3095     if (bs->open_flags & BDRV_O_NO_FLUSH) {
3096         return 0;
3097     }
3098 
3099     if (bs->drv->bdrv_co_flush_to_disk) {
3100         return bs->drv->bdrv_co_flush_to_disk(bs);
3101     } else if (bs->drv->bdrv_aio_flush) {
3102         BlockDriverAIOCB *acb;
3103         CoroutineIOCompletion co = {
3104             .coroutine = qemu_coroutine_self(),
3105         };
3106 
3107         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3108         if (acb == NULL) {
3109             return -EIO;
3110         } else {
3111             qemu_coroutine_yield();
3112             return co.ret;
3113         }
3114     } else {
3115         /*
3116          * Some block drivers always operate in either writethrough or unsafe
3117          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3118          * know how the server works (because the behaviour is hardcoded or
3119          * depends on server-side configuration), so we can't ensure that
3120          * everything is safe on disk. Returning an error doesn't work because
3121          * that would break guests even if the server operates in writethrough
3122          * mode.
3123          *
3124          * Let's hope the user knows what he's doing.
3125          */
3126         return 0;
3127     }
3128 }
3129 
3130 void bdrv_invalidate_cache(BlockDriverState *bs)
3131 {
3132     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3133         bs->drv->bdrv_invalidate_cache(bs);
3134     }
3135 }
3136 
3137 void bdrv_invalidate_cache_all(void)
3138 {
3139     BlockDriverState *bs;
3140 
3141     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3142         bdrv_invalidate_cache(bs);
3143     }
3144 }
3145 
3146 int bdrv_flush(BlockDriverState *bs)
3147 {
3148     Coroutine *co;
3149     RwCo rwco = {
3150         .bs = bs,
3151         .ret = NOT_DONE,
3152     };
3153 
3154     if (qemu_in_coroutine()) {
3155         /* Fast-path if already in coroutine context */
3156         bdrv_flush_co_entry(&rwco);
3157     } else {
3158         co = qemu_coroutine_create(bdrv_flush_co_entry);
3159         qemu_coroutine_enter(co, &rwco);
3160         while (rwco.ret == NOT_DONE) {
3161             qemu_aio_wait();
3162         }
3163     }
3164 
3165     return rwco.ret;
3166 }
3167 
3168 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3169 {
3170     RwCo *rwco = opaque;
3171 
3172     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3173 }
3174 
3175 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3176                                  int nb_sectors)
3177 {
3178     if (!bs->drv) {
3179         return -ENOMEDIUM;
3180     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3181         return -EIO;
3182     } else if (bs->read_only) {
3183         return -EROFS;
3184     } else if (bs->drv->bdrv_co_discard) {
3185         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3186     } else if (bs->drv->bdrv_aio_discard) {
3187         BlockDriverAIOCB *acb;
3188         CoroutineIOCompletion co = {
3189             .coroutine = qemu_coroutine_self(),
3190         };
3191 
3192         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3193                                         bdrv_co_io_em_complete, &co);
3194         if (acb == NULL) {
3195             return -EIO;
3196         } else {
3197             qemu_coroutine_yield();
3198             return co.ret;
3199         }
3200     } else {
3201         return 0;
3202     }
3203 }
3204 
3205 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3206 {
3207     Coroutine *co;
3208     RwCo rwco = {
3209         .bs = bs,
3210         .sector_num = sector_num,
3211         .nb_sectors = nb_sectors,
3212         .ret = NOT_DONE,
3213     };
3214 
3215     if (qemu_in_coroutine()) {
3216         /* Fast-path if already in coroutine context */
3217         bdrv_discard_co_entry(&rwco);
3218     } else {
3219         co = qemu_coroutine_create(bdrv_discard_co_entry);
3220         qemu_coroutine_enter(co, &rwco);
3221         while (rwco.ret == NOT_DONE) {
3222             qemu_aio_wait();
3223         }
3224     }
3225 
3226     return rwco.ret;
3227 }
3228 
3229 /**************************************************************/
3230 /* removable device support */
3231 
3232 /**
3233  * Return TRUE if the media is present
3234  */
3235 int bdrv_is_inserted(BlockDriverState *bs)
3236 {
3237     BlockDriver *drv = bs->drv;
3238 
3239     if (!drv)
3240         return 0;
3241     if (!drv->bdrv_is_inserted)
3242         return 1;
3243     return drv->bdrv_is_inserted(bs);
3244 }
3245 
3246 /**
3247  * Return whether the media changed since the last call to this
3248  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3249  */
3250 int bdrv_media_changed(BlockDriverState *bs)
3251 {
3252     BlockDriver *drv = bs->drv;
3253 
3254     if (drv && drv->bdrv_media_changed) {
3255         return drv->bdrv_media_changed(bs);
3256     }
3257     return -ENOTSUP;
3258 }
3259 
3260 /**
3261  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3262  */
3263 void bdrv_eject(BlockDriverState *bs, int eject_flag)
3264 {
3265     BlockDriver *drv = bs->drv;
3266 
3267     if (drv && drv->bdrv_eject) {
3268         drv->bdrv_eject(bs, eject_flag);
3269     }
3270 }
3271 
3272 /**
3273  * Lock or unlock the media (if it is locked, the user won't be able
3274  * to eject it manually).
3275  */
3276 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3277 {
3278     BlockDriver *drv = bs->drv;
3279 
3280     trace_bdrv_lock_medium(bs, locked);
3281 
3282     if (drv && drv->bdrv_lock_medium) {
3283         drv->bdrv_lock_medium(bs, locked);
3284     }
3285 }
3286 
3287 /* needed for generic scsi interface */
3288 
3289 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3290 {
3291     BlockDriver *drv = bs->drv;
3292 
3293     if (drv && drv->bdrv_ioctl)
3294         return drv->bdrv_ioctl(bs, req, buf);
3295     return -ENOTSUP;
3296 }
3297 
3298 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3299         unsigned long int req, void *buf,
3300         BlockDriverCompletionFunc *cb, void *opaque)
3301 {
3302     BlockDriver *drv = bs->drv;
3303 
3304     if (drv && drv->bdrv_aio_ioctl)
3305         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3306     return NULL;
3307 }
3308 
3309 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3310 {
3311     bs->buffer_alignment = align;
3312 }
3313 
3314 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3315 {
3316     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3317 }
3318 
3319 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3320 {
3321     int64_t bitmap_size;
3322 
3323     bs->dirty_count = 0;
3324     if (enable) {
3325         if (!bs->dirty_bitmap) {
3326             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3327                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3328             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3329 
3330             bs->dirty_bitmap = g_malloc0(bitmap_size);
3331         }
3332     } else {
3333         if (bs->dirty_bitmap) {
3334             g_free(bs->dirty_bitmap);
3335             bs->dirty_bitmap = NULL;
3336         }
3337     }
3338 }
3339 
3340 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3341 {
3342     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3343 
3344     if (bs->dirty_bitmap &&
3345         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3346         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3347             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3348     } else {
3349         return 0;
3350     }
3351 }
3352 
3353 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3354                       int nr_sectors)
3355 {
3356     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3357 }
3358 
3359 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3360 {
3361     return bs->dirty_count;
3362 }
3363 
3364 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3365 {
3366     assert(bs->in_use != in_use);
3367     bs->in_use = in_use;
3368 }
3369 
3370 int bdrv_in_use(BlockDriverState *bs)
3371 {
3372     return bs->in_use;
3373 }
3374 
3375 void bdrv_iostatus_enable(BlockDriverState *bs)
3376 {
3377     bs->iostatus_enabled = true;
3378     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3379 }
3380 
3381 /* The I/O status is only enabled if the drive explicitly
3382  * enables it _and_ the VM is configured to stop on errors */
3383 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3384 {
3385     return (bs->iostatus_enabled &&
3386            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3387             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3388             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3389 }
3390 
3391 void bdrv_iostatus_disable(BlockDriverState *bs)
3392 {
3393     bs->iostatus_enabled = false;
3394 }
3395 
3396 void bdrv_iostatus_reset(BlockDriverState *bs)
3397 {
3398     if (bdrv_iostatus_is_enabled(bs)) {
3399         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3400     }
3401 }
3402 
3403 /* XXX: Today this is set by device models because it makes the implementation
3404    quite simple. However, the block layer knows about the error, so it's
3405    possible to implement this without device models being involved */
3406 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3407 {
3408     if (bdrv_iostatus_is_enabled(bs) &&
3409         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3410         assert(error >= 0);
3411         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3412                                          BLOCK_DEVICE_IO_STATUS_FAILED;
3413     }
3414 }
3415 
3416 void
3417 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3418         enum BlockAcctType type)
3419 {
3420     assert(type < BDRV_MAX_IOTYPE);
3421 
3422     cookie->bytes = bytes;
3423     cookie->start_time_ns = get_clock();
3424     cookie->type = type;
3425 }
3426 
3427 void
3428 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3429 {
3430     assert(cookie->type < BDRV_MAX_IOTYPE);
3431 
3432     bs->nr_bytes[cookie->type] += cookie->bytes;
3433     bs->nr_ops[cookie->type]++;
3434     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3435 }
3436 
3437 int bdrv_img_create(const char *filename, const char *fmt,
3438                     const char *base_filename, const char *base_fmt,
3439                     char *options, uint64_t img_size, int flags)
3440 {
3441     QEMUOptionParameter *param = NULL, *create_options = NULL;
3442     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3443     BlockDriverState *bs = NULL;
3444     BlockDriver *drv, *proto_drv;
3445     BlockDriver *backing_drv = NULL;
3446     int ret = 0;
3447 
3448     /* Find driver and parse its options */
3449     drv = bdrv_find_format(fmt);
3450     if (!drv) {
3451         error_report("Unknown file format '%s'", fmt);
3452         ret = -EINVAL;
3453         goto out;
3454     }
3455 
3456     proto_drv = bdrv_find_protocol(filename);
3457     if (!proto_drv) {
3458         error_report("Unknown protocol '%s'", filename);
3459         ret = -EINVAL;
3460         goto out;
3461     }
3462 
3463     create_options = append_option_parameters(create_options,
3464                                               drv->create_options);
3465     create_options = append_option_parameters(create_options,
3466                                               proto_drv->create_options);
3467 
3468     /* Create parameter list with default values */
3469     param = parse_option_parameters("", create_options, param);
3470 
3471     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3472 
3473     /* Parse -o options */
3474     if (options) {
3475         param = parse_option_parameters(options, create_options, param);
3476         if (param == NULL) {
3477             error_report("Invalid options for file format '%s'.", fmt);
3478             ret = -EINVAL;
3479             goto out;
3480         }
3481     }
3482 
3483     if (base_filename) {
3484         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3485                                  base_filename)) {
3486             error_report("Backing file not supported for file format '%s'",
3487                          fmt);
3488             ret = -EINVAL;
3489             goto out;
3490         }
3491     }
3492 
3493     if (base_fmt) {
3494         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3495             error_report("Backing file format not supported for file "
3496                          "format '%s'", fmt);
3497             ret = -EINVAL;
3498             goto out;
3499         }
3500     }
3501 
3502     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3503     if (backing_file && backing_file->value.s) {
3504         if (!strcmp(filename, backing_file->value.s)) {
3505             error_report("Error: Trying to create an image with the "
3506                          "same filename as the backing file");
3507             ret = -EINVAL;
3508             goto out;
3509         }
3510     }
3511 
3512     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3513     if (backing_fmt && backing_fmt->value.s) {
3514         backing_drv = bdrv_find_format(backing_fmt->value.s);
3515         if (!backing_drv) {
3516             error_report("Unknown backing file format '%s'",
3517                          backing_fmt->value.s);
3518             ret = -EINVAL;
3519             goto out;
3520         }
3521     }
3522 
3523     // The size for the image must always be specified, with one exception:
3524     // If we are using a backing file, we can obtain the size from there
3525     size = get_option_parameter(param, BLOCK_OPT_SIZE);
3526     if (size && size->value.n == -1) {
3527         if (backing_file && backing_file->value.s) {
3528             uint64_t size;
3529             char buf[32];
3530 
3531             bs = bdrv_new("");
3532 
3533             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3534             if (ret < 0) {
3535                 error_report("Could not open '%s'", backing_file->value.s);
3536                 goto out;
3537             }
3538             bdrv_get_geometry(bs, &size);
3539             size *= 512;
3540 
3541             snprintf(buf, sizeof(buf), "%" PRId64, size);
3542             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3543         } else {
3544             error_report("Image creation needs a size parameter");
3545             ret = -EINVAL;
3546             goto out;
3547         }
3548     }
3549 
3550     printf("Formatting '%s', fmt=%s ", filename, fmt);
3551     print_option_parameters(param);
3552     puts("");
3553 
3554     ret = bdrv_create(drv, filename, param);
3555 
3556     if (ret < 0) {
3557         if (ret == -ENOTSUP) {
3558             error_report("Formatting or formatting option not supported for "
3559                          "file format '%s'", fmt);
3560         } else if (ret == -EFBIG) {
3561             error_report("The image size is too large for file format '%s'",
3562                          fmt);
3563         } else {
3564             error_report("%s: error while creating %s: %s", filename, fmt,
3565                          strerror(-ret));
3566         }
3567     }
3568 
3569 out:
3570     free_option_parameters(create_options);
3571     free_option_parameters(param);
3572 
3573     if (bs) {
3574         bdrv_delete(bs);
3575     }
3576 
3577     return ret;
3578 }
3579