xref: /openbmc/qemu/block.c (revision dbffbdcfff69431b622866ac5ea78df74fdc02d4)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34 
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44 
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48 
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50 
51 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
52 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
53         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
54         BlockDriverCompletionFunc *cb, void *opaque);
55 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
56         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
57         BlockDriverCompletionFunc *cb, void *opaque);
58 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
59                                          int64_t sector_num, int nb_sectors,
60                                          QEMUIOVector *iov);
61 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
62                                          int64_t sector_num, int nb_sectors,
63                                          QEMUIOVector *iov);
64 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
65     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
66 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
67     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
68 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
69                                                int64_t sector_num,
70                                                QEMUIOVector *qiov,
71                                                int nb_sectors,
72                                                BlockDriverCompletionFunc *cb,
73                                                void *opaque,
74                                                bool is_write);
75 static void coroutine_fn bdrv_co_do_rw(void *opaque);
76 
77 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
78         bool is_write, double elapsed_time, uint64_t *wait);
79 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
80         double elapsed_time, uint64_t *wait);
81 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
82         bool is_write, int64_t *wait);
83 
/* All BlockDriverStates created with a non-empty device name, in creation
 * order (anonymous BDSes are kept off this list, see bdrv_new()) */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* All block drivers registered via bdrv_register() */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
95 
96 #ifdef _WIN32
/* Return non-zero if @filename begins with a drive letter followed by ':' */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];
    int is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');

    return is_letter && filename[1] == ':';
}
103 
104 int is_windows_drive(const char *filename)
105 {
106     if (is_windows_drive_prefix(filename) &&
107         filename[2] == '\0')
108         return 1;
109     if (strstart(filename, "\\\\.\\", NULL) ||
110         strstart(filename, "//./", NULL))
111         return 1;
112     return 0;
113 }
114 #endif
115 
116 /* throttling disk I/O limits */
117 void bdrv_io_limits_disable(BlockDriverState *bs)
118 {
119     bs->io_limits_enabled = false;
120 
121     while (qemu_co_queue_next(&bs->throttled_reqs));
122 
123     if (bs->block_timer) {
124         qemu_del_timer(bs->block_timer);
125         qemu_free_timer(bs->block_timer);
126         bs->block_timer = NULL;
127     }
128 
129     bs->slice_start = 0;
130     bs->slice_end   = 0;
131     bs->slice_time  = 0;
132     memset(&bs->io_base, 0, sizeof(bs->io_base));
133 }
134 
135 static void bdrv_block_timer(void *opaque)
136 {
137     BlockDriverState *bs = opaque;
138 
139     qemu_co_queue_next(&bs->throttled_reqs);
140 }
141 
/* Switch on I/O throttling for @bs: set up the queue of throttled requests,
 * the wakeup timer and the first accounting slice. */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    /* the timer re-kicks queued requests, see bdrv_block_timer() */
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}
152 
153 bool bdrv_io_limits_enabled(BlockDriverState *bs)
154 {
155     BlockIOLimit *io_limits = &bs->io_limits;
156     return io_limits->bps[BLOCK_IO_LIMIT_READ]
157          || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
158          || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
159          || io_limits->iops[BLOCK_IO_LIMIT_READ]
160          || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
161          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
162 }
163 
/* Block the calling coroutine until this request fits within the configured
 * I/O limits, keeping throttled requests in FIFO order. */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* join the back of the queue behind already-throttled requests */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted to the head. All requests followed it will
     * be still in throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        /* sleep until bdrv_block_timer() fires and wakes us again */
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* this request is now allowed; let the next queued one try */
    qemu_co_queue_next(&bs->throttled_reqs);
}
188 
/* check if the path starts with "<protocol>:"
 *
 * A colon only introduces a protocol when it appears before any directory
 * separator; a bare strchr() would misdetect relative paths whose directory
 * component contains ':' (e.g. "a/b:c") as protocol specifications.
 * Windows drive specifications are never protocols. */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
201 
/* Return non-zero if @path is absolute.  A leading "<protocol>:" prefix is
 * skipped first, so "file:/x" also counts as absolute. */
int path_is_absolute(const char *path)
{
    const char *p;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif
    p = strchr(path, ':');
    p = p ? p + 1 : path;
#ifdef _WIN32
    return *p == '/' || *p == '\\';
#else
    return *p == '/';
#endif
}
221 
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *proto_end, *dir_end;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* never truncate inside a "<protocol>:" prefix */
    proto_end = strchr(base_path, ':');
    proto_end = proto_end ? proto_end + 1 : base_path;

    /* find the last directory separator of base_path, if any */
    dir_end = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!dir_end || bslash > dir_end) {
            dir_end = bslash;
        }
    }
#endif
    dir_end = dir_end ? dir_end + 1 : base_path;

    if (dir_end > proto_end) {
        proto_end = dir_end;
    }

    /* copy the directory (or protocol) part, then append filename */
    len = proto_end - base_path;
    if (len > dest_size - 1) {
        len = dest_size - 1;
    }
    memcpy(dest, base_path, len);
    dest[len] = '\0';
    pstrcat(dest, dest_size, filename);
}
265 
/* Register @bdrv on the global driver list, filling in missing coroutine
 * and AIO callbacks with emulation wrappers first. */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
285 
286 /* create a new block device (by default it is empty) */
287 BlockDriverState *bdrv_new(const char *device_name)
288 {
289     BlockDriverState *bs;
290 
291     bs = g_malloc0(sizeof(BlockDriverState));
292     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
293     if (device_name[0] != '\0') {
294         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
295     }
296     bdrv_iostatus_disable(bs);
297     return bs;
298 }
299 
300 BlockDriver *bdrv_find_format(const char *format_name)
301 {
302     BlockDriver *drv1;
303     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
304         if (!strcmp(drv1->format_name, format_name)) {
305             return drv1;
306         }
307     }
308     return NULL;
309 }
310 
311 static int bdrv_is_whitelisted(BlockDriver *drv)
312 {
313     static const char *whitelist[] = {
314         CONFIG_BDRV_WHITELIST
315     };
316     const char **p;
317 
318     if (!whitelist[0])
319         return 1;               /* no whitelist, anything goes */
320 
321     for (p = whitelist; *p; p++) {
322         if (!strcmp(drv->format_name, *p)) {
323             return 1;
324         }
325     }
326     return 0;
327 }
328 
329 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
330 {
331     BlockDriver *drv = bdrv_find_format(format_name);
332     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
333 }
334 
335 int bdrv_create(BlockDriver *drv, const char* filename,
336     QEMUOptionParameter *options)
337 {
338     if (!drv->bdrv_create)
339         return -ENOTSUP;
340 
341     return drv->bdrv_create(filename, options);
342 }
343 
344 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
345 {
346     BlockDriver *drv;
347 
348     drv = bdrv_find_protocol(filename);
349     if (drv == NULL) {
350         return -ENOENT;
351     }
352 
353     return bdrv_create(drv, filename, options);
354 }
355 
356 #ifdef _WIN32
/* Fill @filename with the name of a freshly created temporary file.
 * NOTE(review): @size is ignored on this path; GetTempFileName() assumes
 * the buffer holds at least MAX_PATH characters -- verify callers always
 * pass a buffer that large. */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
364 #else
/* Fill @filename (a buffer of @size bytes) with the path of a newly
 * created unique temporary file under $TMPDIR (default /tmp).  The file
 * itself is created by mkstemp() and left on disk; the caller owns it. */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* only the unique name is wanted, not the descriptor -- but do not
     * call close(-1) if mkstemp() failed */
    if (fd >= 0) {
        close(fd);
    }
}
377 #endif
378 
379 /*
380  * Detect host devices. By convention, /dev/cdrom[N] is always
381  * recognized as a host CDROM.
382  */
383 static BlockDriver *find_hdev_driver(const char *filename)
384 {
385     int score_max = 0, score;
386     BlockDriver *drv = NULL, *d;
387 
388     QLIST_FOREACH(d, &bdrv_drivers, list) {
389         if (d->bdrv_probe_device) {
390             score = d->bdrv_probe_device(filename);
391             if (score > score_max) {
392                 score_max = score;
393                 drv = d;
394             }
395         }
396     }
397 
398     return drv;
399 }
400 
401 BlockDriver *bdrv_find_protocol(const char *filename)
402 {
403     BlockDriver *drv1;
404     char protocol[128];
405     int len;
406     const char *p;
407 
408     /* TODO Drivers without bdrv_file_open must be specified explicitly */
409 
410     /*
411      * XXX(hch): we really should not let host device detection
412      * override an explicit protocol specification, but moving this
413      * later breaks access to device names with colons in them.
414      * Thanks to the brain-dead persistent naming schemes on udev-
415      * based Linux systems those actually are quite common.
416      */
417     drv1 = find_hdev_driver(filename);
418     if (drv1) {
419         return drv1;
420     }
421 
422     if (!path_has_protocol(filename)) {
423         return bdrv_find_format("file");
424     }
425     p = strchr(filename, ':');
426     assert(p != NULL);
427     len = p - filename;
428     if (len > sizeof(protocol) - 1)
429         len = sizeof(protocol) - 1;
430     memcpy(protocol, filename, len);
431     protocol[len] = '\0';
432     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
433         if (drv1->protocol_name &&
434             !strcmp(drv1->protocol_name, protocol)) {
435             return drv1;
436         }
437     }
438     return NULL;
439 }
440 
/* Probe the image format of @filename.
 *
 * On success, stores the best-scoring driver in *pdrv and returns 0.
 * On failure, stores NULL in *pdrv and returns a negative errno. */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    /* read the image header and let every driver score it */
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
489 
490 /**
491  * Set the current 'total_sectors' value
492  */
493 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
494 {
495     BlockDriver *drv = bs->drv;
496 
497     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
498     if (bs->sg)
499         return 0;
500 
501     /* query actual device if possible, otherwise just trust the hint */
502     if (drv->bdrv_getlength) {
503         int64_t length = drv->bdrv_getlength(bs);
504         if (length < 0) {
505             return length;
506         }
507         hint = length >> BDRV_SECTOR_BITS;
508     }
509 
510     bs->total_sectors = hint;
511     return 0;
512 }
513 
514 /**
515  * Set open flags for a given cache mode
516  *
517  * Return 0 on success, -1 if the cache mode was invalid.
518  */
519 int bdrv_parse_cache_flags(const char *mode, int *flags)
520 {
521     *flags &= ~BDRV_O_CACHE_MASK;
522 
523     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
524         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
525     } else if (!strcmp(mode, "directsync")) {
526         *flags |= BDRV_O_NOCACHE;
527     } else if (!strcmp(mode, "writeback")) {
528         *flags |= BDRV_O_CACHE_WB;
529     } else if (!strcmp(mode, "unsafe")) {
530         *flags |= BDRV_O_CACHE_WB;
531         *flags |= BDRV_O_NO_FLUSH;
532     } else if (!strcmp(mode, "writethrough")) {
533         /* this is the default */
534     } else {
535         return -1;
536     }
537 
538     return 0;
539 }
540 
/*
 * Common part for opening disk images and files
 *
 * Resets the per-open state of @bs, then opens @filename either directly
 * (protocol drivers with bdrv_file_open) or through a protocol-level
 * bs->file.  Returns 0 on success; on failure returns a negative errno
 * and leaves @bs without a driver.
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    /* reset any state a previous open may have left behind */
    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* a temporary snapshot file can be unlinked immediately; the open
     * file descriptor keeps it alive */
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
625 
626 /*
627  * Opens a file using a protocol (file, host_device, nbd, ...)
628  */
629 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
630 {
631     BlockDriverState *bs;
632     BlockDriver *drv;
633     int ret;
634 
635     drv = bdrv_find_protocol(filename);
636     if (!drv) {
637         return -ENOENT;
638     }
639 
640     bs = bdrv_new("");
641     ret = bdrv_open_common(bs, filename, flags, drv);
642     if (ret < 0) {
643         bdrv_delete(bs);
644         return ret;
645     }
646     bs->growable = 1;
647     *pbs = bs;
648     return 0;
649 }
650 
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * @drv may be NULL, in which case the format is probed from the image.
 * With BDRV_O_SNAPSHOT a temporary qcow2 overlay backed by @filename is
 * created and opened instead.  Returns 0 on success or a negative errno.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        /* size of the overlay, rounded down to sector granularity */
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        /* the temporary overlay is always qcow2, backed by @filename */
        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        /* from here on, open the temporary overlay instead */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        /* protocol names are used as-is; plain paths are interpreted
         * relative to the image file */
        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    /* a device without an encryption key is usable immediately */
    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
784 
/* Close @bs: drop the backing file, shut down the format driver, then the
 * protocol level, and notify the attached device model of medium removal. */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        /* a closed device can no longer be the snapshot device */
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* on POSIX hosts the temporary file was already unlinked at open
         * time, see bdrv_open_common() */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;

        /* close the protocol level after the format level */
        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
817 
818 void bdrv_close_all(void)
819 {
820     BlockDriverState *bs;
821 
822     QTAILQ_FOREACH(bs, &bdrv_states, list) {
823         bdrv_close(bs);
824     }
825 }
826 
827 /* make a BlockDriverState anonymous by removing from bdrv_state list.
828    Also, NULL terminate the device_name to prevent double remove */
829 void bdrv_make_anon(BlockDriverState *bs)
830 {
831     if (bs->device_name[0] != '\0') {
832         QTAILQ_REMOVE(&bdrv_states, bs, list);
833     }
834     bs->device_name[0] = '\0';
835 }
836 
837 void bdrv_delete(BlockDriverState *bs)
838 {
839     assert(!bs->dev);
840 
841     /* remove from list, if necessary */
842     bdrv_make_anon(bs);
843 
844     bdrv_close(bs);
845     if (bs->file != NULL) {
846         bdrv_delete(bs->file);
847     }
848 
849     assert(bs != bs_snapshots);
850     g_free(bs);
851 }
852 
853 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
854 /* TODO change to DeviceState *dev when all users are qdevified */
855 {
856     if (bs->dev) {
857         return -EBUSY;
858     }
859     bs->dev = dev;
860     bdrv_iostatus_reset(bs);
861     return 0;
862 }
863 
864 /* TODO qdevified devices don't use this, remove when devices are qdevified */
865 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
866 {
867     if (bdrv_attach_dev(bs, dev) < 0) {
868         abort();
869     }
870 }
871 
/* Detach the device model from @bs.  @dev must be the device currently
 * attached; the callbacks and buffer alignment are reset to defaults. */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    /* same default as set in bdrv_open_common() */
    bs->buffer_alignment = 512;
}
881 
/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    /* NULL when no device model is attached */
    return bs->dev;
}
887 
/* Install device-model callbacks for @bs.  A device that now counts as
 * having removable media can no longer serve as the cached VM snapshot
 * device, so bs_snapshots is dropped in that case. */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}
897 
898 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
899 {
900     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
901         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
902     }
903 }
904 
905 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
906 {
907     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
908 }
909 
910 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
911 {
912     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
913         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
914     }
915 }
916 
917 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
918 {
919     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
920         return bs->dev_ops->is_tray_open(bs->dev_opaque);
921     }
922     return false;
923 }
924 
925 static void bdrv_dev_resize_cb(BlockDriverState *bs)
926 {
927     if (bs->dev_ops && bs->dev_ops->resize_cb) {
928         bs->dev_ops->resize_cb(bs->dev_opaque);
929     }
930 }
931 
932 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
933 {
934     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
935         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
936     }
937     return false;
938 }
939 
940 /*
941  * Run consistency checks on an image
942  *
943  * Returns 0 if the check could be completed (it doesn't mean that the image is
944  * free of errors) or -errno when an internal error occurred. The results of the
945  * check are stored in res.
946  */
947 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
948 {
949     if (bs->drv->bdrv_check == NULL) {
950         return -ENOTSUP;
951     }
952 
953     memset(res, 0, sizeof(*res));
954     return bs->drv->bdrv_check(bs, res);
955 }
956 
957 #define COMMIT_BUF_SECTORS 2048
958 
959 /* commit COW file into the raw image */
960 int bdrv_commit(BlockDriverState *bs)
961 {
962     BlockDriver *drv = bs->drv;
963     BlockDriver *backing_drv;
964     int64_t sector, total_sectors;
965     int n, ro, open_flags;
966     int ret = 0, rw_ret = 0;
967     uint8_t *buf;
968     char filename[1024];
969     BlockDriverState *bs_rw, *bs_ro;
970 
971     if (!drv)
972         return -ENOMEDIUM;
973 
974     if (!bs->backing_hd) {
975         return -ENOTSUP;
976     }
977 
978     if (bs->backing_hd->keep_read_only) {
979         return -EACCES;
980     }
981 
982     backing_drv = bs->backing_hd->drv;
983     ro = bs->backing_hd->read_only;
984     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
985     open_flags =  bs->backing_hd->open_flags;
986 
987     if (ro) {
988         /* re-open as RW */
989         bdrv_delete(bs->backing_hd);
990         bs->backing_hd = NULL;
991         bs_rw = bdrv_new("");
992         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
993             backing_drv);
994         if (rw_ret < 0) {
995             bdrv_delete(bs_rw);
996             /* try to re-open read-only */
997             bs_ro = bdrv_new("");
998             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
999                 backing_drv);
1000             if (ret < 0) {
1001                 bdrv_delete(bs_ro);
1002                 /* drive not functional anymore */
1003                 bs->drv = NULL;
1004                 return ret;
1005             }
1006             bs->backing_hd = bs_ro;
1007             return rw_ret;
1008         }
1009         bs->backing_hd = bs_rw;
1010     }
1011 
1012     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1013     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1014 
1015     for (sector = 0; sector < total_sectors; sector += n) {
1016         if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1017 
1018             if (bdrv_read(bs, sector, buf, n) != 0) {
1019                 ret = -EIO;
1020                 goto ro_cleanup;
1021             }
1022 
1023             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1024                 ret = -EIO;
1025                 goto ro_cleanup;
1026             }
1027         }
1028     }
1029 
1030     if (drv->bdrv_make_empty) {
1031         ret = drv->bdrv_make_empty(bs);
1032         bdrv_flush(bs);
1033     }
1034 
1035     /*
1036      * Make sure all data we wrote to the backing device is actually
1037      * stable on disk.
1038      */
1039     if (bs->backing_hd)
1040         bdrv_flush(bs->backing_hd);
1041 
1042 ro_cleanup:
1043     g_free(buf);
1044 
1045     if (ro) {
1046         /* re-open as RO */
1047         bdrv_delete(bs->backing_hd);
1048         bs->backing_hd = NULL;
1049         bs_ro = bdrv_new("");
1050         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1051             backing_drv);
1052         if (ret < 0) {
1053             bdrv_delete(bs_ro);
1054             /* drive not functional anymore */
1055             bs->drv = NULL;
1056             return ret;
1057         }
1058         bs->backing_hd = bs_ro;
1059         bs->backing_hd->keep_read_only = 0;
1060     }
1061 
1062     return ret;
1063 }
1064 
1065 void bdrv_commit_all(void)
1066 {
1067     BlockDriverState *bs;
1068 
1069     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1070         bdrv_commit(bs);
1071     }
1072 }
1073 
/*
 * One in-flight I/O request, linked on bs->tracked_requests for the
 * duration of the request (see tracked_request_begin/end below).
 */
struct BdrvTrackedRequest {
    BlockDriverState *bs;    /* device the request targets */
    int64_t sector_num;      /* first sector of the request */
    int nb_sectors;          /* request length in sectors */
    bool is_write;           /* true for writes, false for reads */
    QLIST_ENTRY(BdrvTrackedRequest) list;  /* link in bs->tracked_requests */
};
1081 
1082 /**
1083  * Remove an active request from the tracked requests list
1084  *
1085  * This function should be called when a tracked request is completing.
1086  */
1087 static void tracked_request_end(BdrvTrackedRequest *req)
1088 {
1089     QLIST_REMOVE(req, list);
1090 }
1091 
1092 /**
1093  * Add an active request to the tracked requests list
1094  */
1095 static void tracked_request_begin(BdrvTrackedRequest *req,
1096                                   BlockDriverState *bs,
1097                                   int64_t sector_num,
1098                                   int nb_sectors, bool is_write)
1099 {
1100     *req = (BdrvTrackedRequest){
1101         .bs = bs,
1102         .sector_num = sector_num,
1103         .nb_sectors = nb_sectors,
1104         .is_write = is_write,
1105     };
1106 
1107     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1108 }
1109 
1110 /*
1111  * Return values:
1112  * 0        - success
1113  * -EINVAL  - backing format specified, but no file
1114  * -ENOSPC  - can't update the backing file because no space is left in the
1115  *            image file header
1116  * -ENOTSUP - format driver doesn't support changing the backing file
1117  */
1118 int bdrv_change_backing_file(BlockDriverState *bs,
1119     const char *backing_file, const char *backing_fmt)
1120 {
1121     BlockDriver *drv = bs->drv;
1122 
1123     if (drv->bdrv_change_backing_file != NULL) {
1124         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1125     } else {
1126         return -ENOTSUP;
1127     }
1128 }
1129 
1130 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1131                                    size_t size)
1132 {
1133     int64_t len;
1134 
1135     if (!bdrv_is_inserted(bs))
1136         return -ENOMEDIUM;
1137 
1138     if (bs->growable)
1139         return 0;
1140 
1141     len = bdrv_getlength(bs);
1142 
1143     if (offset < 0)
1144         return -EIO;
1145 
1146     if ((offset > len) || (len - offset < size))
1147         return -EIO;
1148 
1149     return 0;
1150 }
1151 
1152 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1153                               int nb_sectors)
1154 {
1155     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1156                                    nb_sectors * BDRV_SECTOR_SIZE);
1157 }
1158 
/*
 * Bundle of arguments/result for bdrv_rw_co_entry(): lets a synchronous
 * caller hand its request to a coroutine and collect the return value.
 */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;   /* selects bdrv_co_do_writev() vs bdrv_co_do_readv() */
    int ret;         /* stays NOT_DONE until the coroutine completes */
} RwCo;
1167 
1168 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1169 {
1170     RwCo *rwco = opaque;
1171 
1172     if (!rwco->is_write) {
1173         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1174                                      rwco->nb_sectors, rwco->qiov);
1175     } else {
1176         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1177                                       rwco->nb_sectors, rwco->qiov);
1178     }
1179 }
1180 
1181 /*
1182  * Process a synchronous request using coroutines
1183  */
1184 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1185                       int nb_sectors, bool is_write)
1186 {
1187     QEMUIOVector qiov;
1188     struct iovec iov = {
1189         .iov_base = (void *)buf,
1190         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1191     };
1192     Coroutine *co;
1193     RwCo rwco = {
1194         .bs = bs,
1195         .sector_num = sector_num,
1196         .nb_sectors = nb_sectors,
1197         .qiov = &qiov,
1198         .is_write = is_write,
1199         .ret = NOT_DONE,
1200     };
1201 
1202     qemu_iovec_init_external(&qiov, &iov, 1);
1203 
1204     if (qemu_in_coroutine()) {
1205         /* Fast-path if already in coroutine context */
1206         bdrv_rw_co_entry(&rwco);
1207     } else {
1208         co = qemu_coroutine_create(bdrv_rw_co_entry);
1209         qemu_coroutine_enter(co, &rwco);
1210         while (rwco.ret == NOT_DONE) {
1211             qemu_aio_wait();
1212         }
1213     }
1214     return rwco.ret;
1215 }
1216 
/* Synchronous read of nb_sectors starting at sector_num into buf.
 * return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
1223 
1224 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1225                              int nb_sectors, int dirty)
1226 {
1227     int64_t start, end;
1228     unsigned long val, idx, bit;
1229 
1230     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1231     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1232 
1233     for (; start <= end; start++) {
1234         idx = start / (sizeof(unsigned long) * 8);
1235         bit = start % (sizeof(unsigned long) * 8);
1236         val = bs->dirty_bitmap[idx];
1237         if (dirty) {
1238             if (!(val & (1UL << bit))) {
1239                 bs->dirty_count++;
1240                 val |= 1UL << bit;
1241             }
1242         } else {
1243             if (val & (1UL << bit)) {
1244                 bs->dirty_count--;
1245                 val &= ~(1UL << bit);
1246             }
1247         }
1248         bs->dirty_bitmap[idx] = val;
1249     }
1250 }
1251 
/* Synchronous write counterpart of bdrv_read().
   Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    /* Cast away const: bdrv_rw_co() takes uint8_t * for both directions
     * and does not modify the buffer on the write path. */
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}
1263 
1264 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1265                void *buf, int count1)
1266 {
1267     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1268     int len, nb_sectors, count;
1269     int64_t sector_num;
1270     int ret;
1271 
1272     count = count1;
1273     /* first read to align to sector start */
1274     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1275     if (len > count)
1276         len = count;
1277     sector_num = offset >> BDRV_SECTOR_BITS;
1278     if (len > 0) {
1279         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1280             return ret;
1281         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1282         count -= len;
1283         if (count == 0)
1284             return count1;
1285         sector_num++;
1286         buf += len;
1287     }
1288 
1289     /* read the sectors "in place" */
1290     nb_sectors = count >> BDRV_SECTOR_BITS;
1291     if (nb_sectors > 0) {
1292         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1293             return ret;
1294         sector_num += nb_sectors;
1295         len = nb_sectors << BDRV_SECTOR_BITS;
1296         buf += len;
1297         count -= len;
1298     }
1299 
1300     /* add data from the last sector */
1301     if (count > 0) {
1302         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1303             return ret;
1304         memcpy(buf, tmp_buf, count);
1305     }
1306     return count1;
1307 }
1308 
1309 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1310                 const void *buf, int count1)
1311 {
1312     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1313     int len, nb_sectors, count;
1314     int64_t sector_num;
1315     int ret;
1316 
1317     count = count1;
1318     /* first write to align to sector start */
1319     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1320     if (len > count)
1321         len = count;
1322     sector_num = offset >> BDRV_SECTOR_BITS;
1323     if (len > 0) {
1324         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1325             return ret;
1326         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1327         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1328             return ret;
1329         count -= len;
1330         if (count == 0)
1331             return count1;
1332         sector_num++;
1333         buf += len;
1334     }
1335 
1336     /* write the sectors "in place" */
1337     nb_sectors = count >> BDRV_SECTOR_BITS;
1338     if (nb_sectors > 0) {
1339         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1340             return ret;
1341         sector_num += nb_sectors;
1342         len = nb_sectors << BDRV_SECTOR_BITS;
1343         buf += len;
1344         count -= len;
1345     }
1346 
1347     /* add data from the last sector */
1348     if (count > 0) {
1349         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1350             return ret;
1351         memcpy(tmp_buf, buf, count);
1352         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1353             return ret;
1354     }
1355     return count1;
1356 }
1357 
1358 /*
1359  * Writes to the file and ensures that no writes are reordered across this
1360  * request (acts as a barrier)
1361  *
1362  * Returns 0 on success, -errno in error cases.
1363  */
1364 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1365     const void *buf, int count)
1366 {
1367     int ret;
1368 
1369     ret = bdrv_pwrite(bs, offset, buf, count);
1370     if (ret < 0) {
1371         return ret;
1372     }
1373 
1374     /* No flush needed for cache modes that use O_DSYNC */
1375     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1376         bdrv_flush(bs);
1377     }
1378 
1379     return 0;
1380 }
1381 
1382 /*
1383  * Handle a read request in coroutine context
1384  */
1385 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1386     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1387 {
1388     BlockDriver *drv = bs->drv;
1389     BdrvTrackedRequest req;
1390     int ret;
1391 
1392     if (!drv) {
1393         return -ENOMEDIUM;
1394     }
1395     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1396         return -EIO;
1397     }
1398 
1399     /* throttling disk read I/O */
1400     if (bs->io_limits_enabled) {
1401         bdrv_io_limits_intercept(bs, false, nb_sectors);
1402     }
1403 
1404     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1405     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1406     tracked_request_end(&req);
1407     return ret;
1408 }
1409 
/* Public coroutine read entry point: trace the request, then forward to
 * bdrv_co_do_readv(). */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
}
1417 
1418 /*
1419  * Handle a write request in coroutine context
1420  */
1421 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1422     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1423 {
1424     BlockDriver *drv = bs->drv;
1425     BdrvTrackedRequest req;
1426     int ret;
1427 
1428     if (!bs->drv) {
1429         return -ENOMEDIUM;
1430     }
1431     if (bs->read_only) {
1432         return -EACCES;
1433     }
1434     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1435         return -EIO;
1436     }
1437 
1438     /* throttling disk write I/O */
1439     if (bs->io_limits_enabled) {
1440         bdrv_io_limits_intercept(bs, true, nb_sectors);
1441     }
1442 
1443     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1444 
1445     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1446 
1447     if (bs->dirty_bitmap) {
1448         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1449     }
1450 
1451     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1452         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1453     }
1454 
1455     tracked_request_end(&req);
1456 
1457     return ret;
1458 }
1459 
/* Public coroutine write entry point: trace the request, then forward to
 * bdrv_co_do_writev(). */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
}
1467 
1468 /**
1469  * Truncate file to 'offset' bytes (needed only for file protocols)
1470  */
1471 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1472 {
1473     BlockDriver *drv = bs->drv;
1474     int ret;
1475     if (!drv)
1476         return -ENOMEDIUM;
1477     if (!drv->bdrv_truncate)
1478         return -ENOTSUP;
1479     if (bs->read_only)
1480         return -EACCES;
1481     if (bdrv_in_use(bs))
1482         return -EBUSY;
1483     ret = drv->bdrv_truncate(bs, offset);
1484     if (ret == 0) {
1485         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1486         bdrv_dev_resize_cb(bs);
1487     }
1488     return ret;
1489 }
1490 
1491 /**
1492  * Length of a allocated file in bytes. Sparse files are counted by actual
1493  * allocated space. Return < 0 if error or unknown.
1494  */
1495 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1496 {
1497     BlockDriver *drv = bs->drv;
1498     if (!drv) {
1499         return -ENOMEDIUM;
1500     }
1501     if (drv->bdrv_get_allocated_file_size) {
1502         return drv->bdrv_get_allocated_file_size(bs);
1503     }
1504     if (bs->file) {
1505         return bdrv_get_allocated_file_size(bs->file);
1506     }
1507     return -ENOTSUP;
1508 }
1509 
1510 /**
1511  * Length of a file in bytes. Return < 0 if error or unknown.
1512  */
1513 int64_t bdrv_getlength(BlockDriverState *bs)
1514 {
1515     BlockDriver *drv = bs->drv;
1516     if (!drv)
1517         return -ENOMEDIUM;
1518 
1519     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1520         if (drv->bdrv_getlength) {
1521             return drv->bdrv_getlength(bs);
1522         }
1523     }
1524     return bs->total_sectors * BDRV_SECTOR_SIZE;
1525 }
1526 
1527 /* return 0 as number of sectors if no device present or error */
1528 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1529 {
1530     int64_t length;
1531     length = bdrv_getlength(bs);
1532     if (length < 0)
1533         length = 0;
1534     else
1535         length = length >> BDRV_SECTOR_BITS;
1536     *nb_sectors_ptr = length;
1537 }
1538 
/* On-disk layout of one MS-DOS/MBR partition table entry (packed, 16
 * bytes; four of these live at offset 0x1be of sector 0). */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
1551 
1552 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1553 static int guess_disk_lchs(BlockDriverState *bs,
1554                            int *pcylinders, int *pheads, int *psectors)
1555 {
1556     uint8_t buf[BDRV_SECTOR_SIZE];
1557     int ret, i, heads, sectors, cylinders;
1558     struct partition *p;
1559     uint32_t nr_sects;
1560     uint64_t nb_sectors;
1561 
1562     bdrv_get_geometry(bs, &nb_sectors);
1563 
1564     ret = bdrv_read(bs, 0, buf, 1);
1565     if (ret < 0)
1566         return -1;
1567     /* test msdos magic */
1568     if (buf[510] != 0x55 || buf[511] != 0xaa)
1569         return -1;
1570     for(i = 0; i < 4; i++) {
1571         p = ((struct partition *)(buf + 0x1be)) + i;
1572         nr_sects = le32_to_cpu(p->nr_sects);
1573         if (nr_sects && p->end_head) {
1574             /* We make the assumption that the partition terminates on
1575                a cylinder boundary */
1576             heads = p->end_head + 1;
1577             sectors = p->end_sector & 63;
1578             if (sectors == 0)
1579                 continue;
1580             cylinders = nb_sectors / (heads * sectors);
1581             if (cylinders < 1 || cylinders > 16383)
1582                 continue;
1583             *pheads = heads;
1584             *psectors = sectors;
1585             *pcylinders = cylinders;
1586 #if 0
1587             printf("guessed geometry: LCHS=%d %d %d\n",
1588                    cylinders, heads, sectors);
1589 #endif
1590             return 0;
1591         }
1592     }
1593     return -1;
1594 }
1595 
/*
 * Fill *pcyls/*pheads/*psecs with a BIOS disk geometry for @bs.
 *
 * Priority: an explicit geometry hint on the device, then the logical
 * geometry guessed from the MSDOS partition table, then a standard
 * 16-heads/63-sectors geometry derived from the disk size.  Whatever
 * is chosen (when not user supplied) is stored back as the device's
 * hint, and the BIOS ATA translation hint is adjusted when it is still
 * set to AUTO.
 */
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            /* clamp to the 2..16383 cylinder range */
            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                /* 131072 presumably is the cylinder*head limit of the
                 * LARGE translation scheme -- TODO confirm */
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
1654 
/* Record a disk geometry hint (cylinders/heads/sectors) on the device. */
void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}
1662 
/* Record the BIOS ATA translation hint (BIOS_ATA_TRANSLATION_*). */
void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}
1667 
/* Retrieve the stored geometry hint; values are 0 when none was set. */
void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}
1675 
/* throttling disk io limits */
/* Install the given I/O limits on the device and (re)compute whether
 * throttling is active for it. */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}
1683 
/* Recognize floppy formats */
/* One known floppy format: drive type plus geometry (sectors per track,
 * track count, highest head index). */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;   /* sectors per track */
    uint8_t max_track;   /* number of tracks */
    uint8_t max_head;    /* highest head index (heads - 1) */
} FDFormat;
1691 
/*
 * Known floppy formats, matched against the image size in
 * bdrv_get_floppy_geometry_hint().  Order matters: the first entry is
 * the default format and earlier entries win when several are
 * compatible with the drive type.  Terminated by the FDRIVE_DRV_NONE
 * entry.
 */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
1738 
1739 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1740                                    int *max_track, int *last_sect,
1741                                    FDriveType drive_in, FDriveType *drive)
1742 {
1743     const FDFormat *parse;
1744     uint64_t nb_sectors, size;
1745     int i, first_match, match;
1746 
1747     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1748     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1749         /* User defined disk */
1750     } else {
1751         bdrv_get_geometry(bs, &nb_sectors);
1752         match = -1;
1753         first_match = -1;
1754         for (i = 0; ; i++) {
1755             parse = &fd_formats[i];
1756             if (parse->drive == FDRIVE_DRV_NONE) {
1757                 break;
1758             }
1759             if (drive_in == parse->drive ||
1760                 drive_in == FDRIVE_DRV_NONE) {
1761                 size = (parse->max_head + 1) * parse->max_track *
1762                     parse->last_sect;
1763                 if (nb_sectors == size) {
1764                     match = i;
1765                     break;
1766                 }
1767                 if (first_match == -1) {
1768                     first_match = i;
1769                 }
1770             }
1771         }
1772         if (match == -1) {
1773             if (first_match == -1) {
1774                 match = 1;
1775             } else {
1776                 match = first_match;
1777             }
1778             parse = &fd_formats[match];
1779         }
1780         *nb_heads = parse->max_head + 1;
1781         *max_track = parse->max_track;
1782         *last_sect = parse->last_sect;
1783         *drive = parse->drive;
1784     }
1785 }
1786 
/* Return the stored BIOS ATA translation hint (BIOS_ATA_TRANSLATION_*). */
int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}
1791 
/* Configure the actions to take on read and write errors. */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}
1798 
/* Return the configured error action for reads (is_read != 0) or writes. */
BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}
1803 
/* Return nonzero if the device was opened read-only. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}
1808 
/* Return nonzero if the device is an SCSI generic (sg) device. */
int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}
1813 
/* Return nonzero if the write cache is enabled for this device. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}
1818 
1819 int bdrv_is_encrypted(BlockDriverState *bs)
1820 {
1821     if (bs->backing_hd && bs->backing_hd->encrypted)
1822         return 1;
1823     return bs->encrypted;
1824 }
1825 
1826 int bdrv_key_required(BlockDriverState *bs)
1827 {
1828     BlockDriverState *backing_hd = bs->backing_hd;
1829 
1830     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1831         return 1;
1832     return (bs->encrypted && !bs->valid_key);
1833 }
1834 
1835 int bdrv_set_key(BlockDriverState *bs, const char *key)
1836 {
1837     int ret;
1838     if (bs->backing_hd && bs->backing_hd->encrypted) {
1839         ret = bdrv_set_key(bs->backing_hd, key);
1840         if (ret < 0)
1841             return ret;
1842         if (!bs->encrypted)
1843             return 0;
1844     }
1845     if (!bs->encrypted) {
1846         return -EINVAL;
1847     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1848         return -ENOMEDIUM;
1849     }
1850     ret = bs->drv->bdrv_set_key(bs, key);
1851     if (ret < 0) {
1852         bs->valid_key = 0;
1853     } else if (!bs->valid_key) {
1854         bs->valid_key = 1;
1855         /* call the change callback now, we skipped it on open */
1856         bdrv_dev_change_media_cb(bs, true);
1857     }
1858     return ret;
1859 }
1860 
1861 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1862 {
1863     if (!bs->drv) {
1864         buf[0] = '\0';
1865     } else {
1866         pstrcpy(buf, buf_size, bs->drv->format_name);
1867     }
1868 }
1869 
/* Invoke it(opaque, format_name) for every registered block driver. */
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}
1879 
1880 BlockDriverState *bdrv_find(const char *name)
1881 {
1882     BlockDriverState *bs;
1883 
1884     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1885         if (!strcmp(name, bs->device_name)) {
1886             return bs;
1887         }
1888     }
1889     return NULL;
1890 }
1891 
1892 BlockDriverState *bdrv_next(BlockDriverState *bs)
1893 {
1894     if (!bs) {
1895         return QTAILQ_FIRST(&bdrv_states);
1896     }
1897     return QTAILQ_NEXT(bs, list);
1898 }
1899 
/* Invoke it(opaque, bs) for every registered block device. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}
1908 
/* Return the device name of @bs (owned by bs; do not free). */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}
1913 
1914 void bdrv_flush_all(void)
1915 {
1916     BlockDriverState *bs;
1917 
1918     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1919         if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
1920             bdrv_flush(bs);
1921         }
1922     }
1923 }
1924 
1925 int bdrv_has_zero_init(BlockDriverState *bs)
1926 {
1927     assert(bs->drv);
1928 
1929     if (bs->drv->bdrv_has_zero_init) {
1930         return bs->drv->bdrv_has_zero_init(bs);
1931     }
1932 
1933     return 1;
1934 }
1935 
/* Argument/result bundle passed to the bdrv_is_allocated() coroutine. */
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;    /* out: contiguous sectors in the same state */
    int ret;      /* out: result of bdrv_co_is_allocated() */
    bool done;    /* set once the coroutine has finished */
} BdrvCoIsAllocatedData;
1944 
1945 /*
1946  * Returns true iff the specified sector is present in the disk image. Drivers
1947  * not implementing the functionality are assumed to not support backing files,
1948  * hence all their sectors are reported as allocated.
1949  *
1950  * 'pnum' is set to the number of sectors (including and immediately following
1951  * the specified sector) that are known to be in the same
1952  * allocated/unallocated state.
1953  *
1954  * 'nb_sectors' is the max value 'pnum' should be set to.
1955  */
1956 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
1957                                       int nb_sectors, int *pnum)
1958 {
1959     if (!bs->drv->bdrv_co_is_allocated) {
1960         int64_t n;
1961         if (sector_num >= bs->total_sectors) {
1962             *pnum = 0;
1963             return 0;
1964         }
1965         n = bs->total_sectors - sector_num;
1966         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1967         return 1;
1968     }
1969 
1970     return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
1971 }
1972 
/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    /* 'done' is written last so the synchronous caller only observes it
     * after 'ret' is valid. */
    data->done = true;
}
1983 
1984 /*
1985  * Synchronous wrapper around bdrv_co_is_allocated().
1986  *
1987  * See bdrv_co_is_allocated() for details.
1988  */
1989 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1990                       int *pnum)
1991 {
1992     Coroutine *co;
1993     BdrvCoIsAllocatedData data = {
1994         .bs = bs,
1995         .sector_num = sector_num,
1996         .nb_sectors = nb_sectors,
1997         .pnum = pnum,
1998         .done = false,
1999     };
2000 
2001     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2002     qemu_coroutine_enter(co, &data);
2003     while (!data.done) {
2004         qemu_aio_wait();
2005     }
2006     return data.ret;
2007 }
2008 
/*
 * Emit a QEVENT_BLOCK_IO_ERROR monitor event for @bdrv, describing the
 * action taken ("report"/"ignore"/"stop") and whether the failing
 * operation was a read or a write.  Aborts on an unknown action value.
 */
void bdrv_mon_event(const BlockDriverState *bdrv,
                    BlockMonEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    /* Drop our reference; the event machinery took its own. */
    qobject_decref(data);
}
2037 
/* QMP 'query-block': build a BlockInfoList describing every registered
 * block device.  The caller owns the returned list (qapi_free_BlockInfoList).
 * @errp is currently unused; the function cannot fail short of OOM abort. */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        /* Device type is not tracked at this layer. */
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        /* Tray state is only meaningful for removable media. */
        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        /* bs->drv != NULL means a medium is inserted; report its details. */
        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            /* Report I/O throttling limits when enabled. */
            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
2101 
/* Consider exposing this as a full fledged QMP command */
/* Build a BlockStats object for @bs from its accounting counters.
 * Recurses into bs->file so protocol-level statistics are reported as
 * the 'parent' member.  Caller owns the result.  @errp is unused. */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    /* Anonymous BDSes (e.g. bs->file) have an empty device name. */
    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    /* Highest offset is tracked in sectors; convert to bytes. */
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}
2132 
2133 BlockStatsList *qmp_query_blockstats(Error **errp)
2134 {
2135     BlockStatsList *head = NULL, *cur_item = NULL;
2136     BlockDriverState *bs;
2137 
2138     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2139         BlockStatsList *info = g_malloc0(sizeof(*info));
2140         info->value = qmp_query_blockstat(bs, NULL);
2141 
2142         /* XXX: waiting for the qapi to support GSList */
2143         if (!cur_item) {
2144             head = cur_item = info;
2145         } else {
2146             cur_item->next = info;
2147             cur_item = info;
2148         }
2149     }
2150 
2151     return head;
2152 }
2153 
2154 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2155 {
2156     if (bs->backing_hd && bs->backing_hd->encrypted)
2157         return bs->backing_file;
2158     else if (bs->encrypted)
2159         return bs->filename;
2160     else
2161         return NULL;
2162 }
2163 
/* Copy the backing file name of @bs into @filename, truncating to
 * @filename_size (pstrcpy always NUL-terminates). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
2169 
2170 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2171                           const uint8_t *buf, int nb_sectors)
2172 {
2173     BlockDriver *drv = bs->drv;
2174     if (!drv)
2175         return -ENOMEDIUM;
2176     if (!drv->bdrv_write_compressed)
2177         return -ENOTSUP;
2178     if (bdrv_check_request(bs, sector_num, nb_sectors))
2179         return -EIO;
2180 
2181     if (bs->dirty_bitmap) {
2182         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2183     }
2184 
2185     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2186 }
2187 
2188 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2189 {
2190     BlockDriver *drv = bs->drv;
2191     if (!drv)
2192         return -ENOMEDIUM;
2193     if (!drv->bdrv_get_info)
2194         return -ENOTSUP;
2195     memset(bdi, 0, sizeof(*bdi));
2196     return drv->bdrv_get_info(bs, bdi);
2197 }
2198 
2199 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2200                       int64_t pos, int size)
2201 {
2202     BlockDriver *drv = bs->drv;
2203     if (!drv)
2204         return -ENOMEDIUM;
2205     if (drv->bdrv_save_vmstate)
2206         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2207     if (bs->file)
2208         return bdrv_save_vmstate(bs->file, buf, pos, size);
2209     return -ENOTSUP;
2210 }
2211 
2212 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2213                       int64_t pos, int size)
2214 {
2215     BlockDriver *drv = bs->drv;
2216     if (!drv)
2217         return -ENOMEDIUM;
2218     if (drv->bdrv_load_vmstate)
2219         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2220     if (bs->file)
2221         return bdrv_load_vmstate(bs->file, buf, pos, size);
2222     return -ENOTSUP;
2223 }
2224 
2225 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2226 {
2227     BlockDriver *drv = bs->drv;
2228 
2229     if (!drv || !drv->bdrv_debug_event) {
2230         return;
2231     }
2232 
2233     return drv->bdrv_debug_event(bs, event);
2234 
2235 }
2236 
2237 /**************************************************************/
2238 /* handling of snapshots */
2239 
2240 int bdrv_can_snapshot(BlockDriverState *bs)
2241 {
2242     BlockDriver *drv = bs->drv;
2243     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2244         return 0;
2245     }
2246 
2247     if (!drv->bdrv_snapshot_create) {
2248         if (bs->file != NULL) {
2249             return bdrv_can_snapshot(bs->file);
2250         }
2251         return 0;
2252     }
2253 
2254     return 1;
2255 }
2256 
2257 int bdrv_is_snapshot(BlockDriverState *bs)
2258 {
2259     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2260 }
2261 
2262 BlockDriverState *bdrv_snapshots(void)
2263 {
2264     BlockDriverState *bs;
2265 
2266     if (bs_snapshots) {
2267         return bs_snapshots;
2268     }
2269 
2270     bs = NULL;
2271     while ((bs = bdrv_next(bs))) {
2272         if (bdrv_can_snapshot(bs)) {
2273             bs_snapshots = bs;
2274             return bs;
2275         }
2276     }
2277     return NULL;
2278 }
2279 
2280 int bdrv_snapshot_create(BlockDriverState *bs,
2281                          QEMUSnapshotInfo *sn_info)
2282 {
2283     BlockDriver *drv = bs->drv;
2284     if (!drv)
2285         return -ENOMEDIUM;
2286     if (drv->bdrv_snapshot_create)
2287         return drv->bdrv_snapshot_create(bs, sn_info);
2288     if (bs->file)
2289         return bdrv_snapshot_create(bs->file, sn_info);
2290     return -ENOTSUP;
2291 }
2292 
/* Revert @bs to the internal snapshot named @snapshot_id.
 *
 * If the format driver lacks native support, the format layer is closed,
 * the snapshot is applied on the protocol layer (bs->file), and the
 * format layer is reopened so its in-memory metadata matches the
 * reverted image.  On reopen failure the BDS is left without a medium
 * (bs->drv == NULL) and the reopen error is returned.
 */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        /* Close the format layer before mutating the image underneath it;
         * reopen afterwards to reload metadata.  The order matters. */
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            /* Reopen failed: drop the protocol BDS and mark no medium. */
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}
2318 
2319 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2320 {
2321     BlockDriver *drv = bs->drv;
2322     if (!drv)
2323         return -ENOMEDIUM;
2324     if (drv->bdrv_snapshot_delete)
2325         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2326     if (bs->file)
2327         return bdrv_snapshot_delete(bs->file, snapshot_id);
2328     return -ENOTSUP;
2329 }
2330 
2331 int bdrv_snapshot_list(BlockDriverState *bs,
2332                        QEMUSnapshotInfo **psn_info)
2333 {
2334     BlockDriver *drv = bs->drv;
2335     if (!drv)
2336         return -ENOMEDIUM;
2337     if (drv->bdrv_snapshot_list)
2338         return drv->bdrv_snapshot_list(bs, psn_info);
2339     if (bs->file)
2340         return bdrv_snapshot_list(bs->file, psn_info);
2341     return -ENOTSUP;
2342 }
2343 
2344 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2345         const char *snapshot_name)
2346 {
2347     BlockDriver *drv = bs->drv;
2348     if (!drv) {
2349         return -ENOMEDIUM;
2350     }
2351     if (!bs->read_only) {
2352         return -EINVAL;
2353     }
2354     if (drv->bdrv_snapshot_load_tmp) {
2355         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2356     }
2357     return -ENOTSUP;
2358 }
2359 
2360 #define NB_SUFFIXES 4
2361 
2362 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2363 {
2364     static const char suffixes[NB_SUFFIXES] = "KMGT";
2365     int64_t base;
2366     int i;
2367 
2368     if (size <= 999) {
2369         snprintf(buf, buf_size, "%" PRId64, size);
2370     } else {
2371         base = 1024;
2372         for(i = 0; i < NB_SUFFIXES; i++) {
2373             if (size < (10 * base)) {
2374                 snprintf(buf, buf_size, "%0.1f%c",
2375                          (double)size / base,
2376                          suffixes[i]);
2377                 break;
2378             } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2379                 snprintf(buf, buf_size, "%" PRId64 "%c",
2380                          ((size + (base >> 1)) / base),
2381                          suffixes[i]);
2382                 break;
2383             }
2384             base = base * 1024;
2385         }
2386     }
2387     return buf;
2388 }
2389 
/* Format one snapshot line for 'info snapshots' into @buf and return it.
 * With @sn == NULL, a column-header line is produced instead.  Columns:
 * ID, TAG, VM SIZE, DATE, VM CLOCK. */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    /* Windows lacks localtime_r; use localtime's static buffer. */
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        /* Render the guest clock (nanoseconds) as HH:MM:SS.mmm. */
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}
2432 
2433 /**************************************************************/
2434 /* async I/Os */
2435 
2436 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2437                                  QEMUIOVector *qiov, int nb_sectors,
2438                                  BlockDriverCompletionFunc *cb, void *opaque)
2439 {
2440     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2441 
2442     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2443                                  cb, opaque, false);
2444 }
2445 
2446 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2447                                   QEMUIOVector *qiov, int nb_sectors,
2448                                   BlockDriverCompletionFunc *cb, void *opaque)
2449 {
2450     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2451 
2452     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2453                                  cb, opaque, true);
2454 }
2455 
2456 
/* Shared completion state for one bdrv_aio_multiwrite() batch. */
typedef struct MultiwriteCB {
    int error;          /* first error seen, 0 if none so far */
    int num_requests;   /* submitted-but-uncompleted requests (+1 dummy) */
    int num_callbacks;  /* number of original caller requests */
    struct {
        BlockDriverCompletionFunc *cb;  /* caller's completion callback */
        void *opaque;                   /* caller's callback argument */
        QEMUIOVector *free_qiov;        /* merged qiov to destroy, or NULL */
        void *free_buf;                 /* zero-fill bounce buffer, or NULL */
    } callbacks[];
} MultiwriteCB;
2468 
2469 static void multiwrite_user_cb(MultiwriteCB *mcb)
2470 {
2471     int i;
2472 
2473     for (i = 0; i < mcb->num_callbacks; i++) {
2474         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2475         if (mcb->callbacks[i].free_qiov) {
2476             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2477         }
2478         g_free(mcb->callbacks[i].free_qiov);
2479         qemu_vfree(mcb->callbacks[i].free_buf);
2480     }
2481 }
2482 
2483 static void multiwrite_cb(void *opaque, int ret)
2484 {
2485     MultiwriteCB *mcb = opaque;
2486 
2487     trace_multiwrite_cb(mcb, ret);
2488 
2489     if (ret < 0 && !mcb->error) {
2490         mcb->error = ret;
2491     }
2492 
2493     mcb->num_requests--;
2494     if (mcb->num_requests == 0) {
2495         multiwrite_user_cb(mcb);
2496         g_free(mcb);
2497     }
2498 }
2499 
2500 static int multiwrite_req_compare(const void *a, const void *b)
2501 {
2502     const BlockRequest *req1 = a, *req2 = b;
2503 
2504     /*
2505      * Note that we can't simply subtract req2->sector from req1->sector
2506      * here as that could overflow the return value.
2507      */
2508     if (req1->sector > req2->sector) {
2509         return 1;
2510     } else if (req1->sector < req2->sector) {
2511         return -1;
2512     } else {
2513         return 0;
2514     }
2515 }
2516 
2517 /*
2518  * Takes a bunch of requests and tries to merge them. Returns the number of
2519  * requests that remain after merging.
2520  */
2521 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2522     int num_reqs, MultiwriteCB *mcb)
2523 {
2524     int i, outidx;
2525 
2526     // Sort requests by start sector
2527     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2528 
2529     // Check if adjacent requests touch the same clusters. If so, combine them,
2530     // filling up gaps with zero sectors.
2531     outidx = 0;
2532     for (i = 1; i < num_reqs; i++) {
2533         int merge = 0;
2534         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2535 
2536         // This handles the cases that are valid for all block drivers, namely
2537         // exactly sequential writes and overlapping writes.
2538         if (reqs[i].sector <= oldreq_last) {
2539             merge = 1;
2540         }
2541 
2542         // The block driver may decide that it makes sense to combine requests
2543         // even if there is a gap of some sectors between them. In this case,
2544         // the gap is filled with zeros (therefore only applicable for yet
2545         // unused space in format like qcow2).
2546         if (!merge && bs->drv->bdrv_merge_requests) {
2547             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2548         }
2549 
2550         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2551             merge = 0;
2552         }
2553 
2554         if (merge) {
2555             size_t size;
2556             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2557             qemu_iovec_init(qiov,
2558                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2559 
2560             // Add the first request to the merged one. If the requests are
2561             // overlapping, drop the last sectors of the first request.
2562             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2563             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2564 
2565             // We might need to add some zeros between the two requests
2566             if (reqs[i].sector > oldreq_last) {
2567                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2568                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2569                 memset(buf, 0, zero_bytes);
2570                 qemu_iovec_add(qiov, buf, zero_bytes);
2571                 mcb->callbacks[i].free_buf = buf;
2572             }
2573 
2574             // Add the second request
2575             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2576 
2577             reqs[outidx].nb_sectors = qiov->size >> 9;
2578             reqs[outidx].qiov = qiov;
2579 
2580             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2581         } else {
2582             outidx++;
2583             reqs[outidx].sector     = reqs[i].sector;
2584             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2585             reqs[outidx].qiov       = reqs[i].qiov;
2586         }
2587     }
2588 
2589     return outidx + 1;
2590 }
2591 
2592 /*
2593  * Submit multiple AIO write requests at once.
2594  *
2595  * On success, the function returns 0 and all requests in the reqs array have
2596  * been submitted. In error case this function returns -1, and any of the
2597  * requests may or may not be submitted yet. In particular, this means that the
2598  * callback will be called for some of the requests, for others it won't. The
2599  * caller must check the error field of the BlockRequest to wait for the right
2600  * callbacks (if error != 0, no callback will be called).
2601  *
2602  * The implementation may modify the contents of the reqs array, e.g. to merge
2603  * requests. However, the fields opaque and error are left unmodified as they
2604  * are used to signal failure for a single request to the caller.
2605  */
2606 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2607 {
2608     BlockDriverAIOCB *acb;
2609     MultiwriteCB *mcb;
2610     int i;
2611 
2612     /* don't submit writes if we don't have a medium */
2613     if (bs->drv == NULL) {
2614         for (i = 0; i < num_reqs; i++) {
2615             reqs[i].error = -ENOMEDIUM;
2616         }
2617         return -1;
2618     }
2619 
2620     if (num_reqs == 0) {
2621         return 0;
2622     }
2623 
2624     // Create MultiwriteCB structure
2625     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2626     mcb->num_requests = 0;
2627     mcb->num_callbacks = num_reqs;
2628 
2629     for (i = 0; i < num_reqs; i++) {
2630         mcb->callbacks[i].cb = reqs[i].cb;
2631         mcb->callbacks[i].opaque = reqs[i].opaque;
2632     }
2633 
2634     // Check for mergable requests
2635     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2636 
2637     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2638 
2639     /*
2640      * Run the aio requests. As soon as one request can't be submitted
2641      * successfully, fail all requests that are not yet submitted (we must
2642      * return failure for all requests anyway)
2643      *
2644      * num_requests cannot be set to the right value immediately: If
2645      * bdrv_aio_writev fails for some request, num_requests would be too high
2646      * and therefore multiwrite_cb() would never recognize the multiwrite
2647      * request as completed. We also cannot use the loop variable i to set it
2648      * when the first request fails because the callback may already have been
2649      * called for previously submitted requests. Thus, num_requests must be
2650      * incremented for each request that is submitted.
2651      *
2652      * The problem that callbacks may be called early also means that we need
2653      * to take care that num_requests doesn't become 0 before all requests are
2654      * submitted - multiwrite_cb() would consider the multiwrite request
2655      * completed. A dummy request that is "completed" by a manual call to
2656      * multiwrite_cb() takes care of this.
2657      */
2658     mcb->num_requests = 1;
2659 
2660     // Run the aio requests
2661     for (i = 0; i < num_reqs; i++) {
2662         mcb->num_requests++;
2663         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2664             reqs[i].nb_sectors, multiwrite_cb, mcb);
2665 
2666         if (acb == NULL) {
2667             // We can only fail the whole thing if no request has been
2668             // submitted yet. Otherwise we'll wait for the submitted AIOs to
2669             // complete and report the error in the callback.
2670             if (i == 0) {
2671                 trace_bdrv_aio_multiwrite_earlyfail(mcb);
2672                 goto fail;
2673             } else {
2674                 trace_bdrv_aio_multiwrite_latefail(mcb, i);
2675                 multiwrite_cb(mcb, -EIO);
2676                 break;
2677             }
2678         }
2679     }
2680 
2681     /* Complete the dummy request */
2682     multiwrite_cb(mcb, 0);
2683 
2684     return 0;
2685 
2686 fail:
2687     for (i = 0; i < mcb->num_callbacks; i++) {
2688         reqs[i].error = -EIO;
2689     }
2690     g_free(mcb);
2691     return -1;
2692 }
2693 
/* Cancel an in-flight AIO request via its pool's cancel callback. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}
2698 
/* block I/O throttling */
/* Check whether a request of @nb_sectors would exceed the configured
 * bytes-per-second limit for this slice.
 *
 * Returns true if the request must be delayed; *wait (if non-NULL) is
 * then set to an estimated wait time.  Returns false (and zeroes *wait)
 * when the request may proceed now. */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    /* Total limit takes precedence over the per-direction one.  Note
     * that is_write doubles as the array index (read==0, write==1). */
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Byte budget for the current throttling slice. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        /* For a total limit, both directions count against the budget. */
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; and
     *             it is obtained from the history statistic info.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calcuate
     *             the total time for completing reading/writting all data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits,
     * bs->slice_end need to be extended in order that the current statistic
     * info can be kept until the timer fire, so it is increased and tuned
     * based on the result of experiment.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
2759 
/* Check whether one more operation would exceed the configured
 * IOPS limit for this slice.  Same contract as
 * bdrv_exceed_bps_limits(), but counting operations instead of bytes. */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    /* Total limit takes precedence; is_write indexes the per-direction
     * limit (read==0, write==1). */
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Operation budget for the current throttling slice. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    /* The pending request counts as one more operation (+1). */
    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Extend the slice so the statistics survive until the timer fires;
     * the factor is empirically tuned (see bdrv_exceed_bps_limits()). */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
2811 
/* Combined throttling check: returns true when either the BPS or the
 * IOPS limit would be exceeded.  *wait (if non-NULL) receives the larger
 * of the two estimated delays, 0 if the request may proceed.  Also
 * manages the throttling slice window (start/end/baseline counters). */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* Still inside the current slice: just slide its end forward. */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Slice expired (or not started): open a fresh one and snapshot
         * the accounting counters as the new baseline. */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        /* Throttled: report the longer of the two waits and make sure
         * the slice lasts at least that long. */
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}
2863 
2864 /**************************************************************/
2865 /* async block device emulation */
2866 
/* AIOCB for the synchronous-emulation AIO path (bdrv_aio_rw_vector). */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;         /* bottom half that delivers the completion */
    int ret;            /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov; /* caller's scatter/gather list */
    uint8_t *bounce;    /* linear bounce buffer for the whole request */
    int is_write;       /* non-zero for writes */
} BlockDriverAIOCBSync;
2876 
2877 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2878 {
2879     BlockDriverAIOCBSync *acb =
2880         container_of(blockacb, BlockDriverAIOCBSync, common);
2881     qemu_bh_delete(acb->bh);
2882     acb->bh = NULL;
2883     qemu_aio_release(acb);
2884 }
2885 
/* ACB pool for the synchronous-emulation AIO path. */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
2890 
2891 static void bdrv_aio_bh_cb(void *opaque)
2892 {
2893     BlockDriverAIOCBSync *acb = opaque;
2894 
2895     if (!acb->is_write)
2896         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2897     qemu_vfree(acb->bounce);
2898     acb->common.cb(acb->common.opaque, acb->ret);
2899     qemu_bh_delete(acb->bh);
2900     acb->bh = NULL;
2901     qemu_aio_release(acb);
2902 }
2903 
2904 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2905                                             int64_t sector_num,
2906                                             QEMUIOVector *qiov,
2907                                             int nb_sectors,
2908                                             BlockDriverCompletionFunc *cb,
2909                                             void *opaque,
2910                                             int is_write)
2911 
2912 {
2913     BlockDriverAIOCBSync *acb;
2914 
2915     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2916     acb->is_write = is_write;
2917     acb->qiov = qiov;
2918     acb->bounce = qemu_blockalign(bs, qiov->size);
2919 
2920     if (!acb->bh)
2921         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2922 
2923     if (is_write) {
2924         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2925         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2926     } else {
2927         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2928     }
2929 
2930     qemu_bh_schedule(acb->bh);
2931 
2932     return &acb->common;
2933 }
2934 
/* Emulated AIO read: forward to bdrv_aio_rw_vector() with is_write = 0. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
2941 
/* Emulated AIO write: forward to bdrv_aio_rw_vector() with is_write = 1. */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
2948 
2949 
/* AIOCB for requests emulated on top of the coroutine interface. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;       /* request parameters and its result (req.error) */
    bool is_write;
    QEMUBH* bh;             /* bottom half that delivers completion */
} BlockDriverAIOCBCoroutine;
2956 
/* Cancel hook for coroutine-emulated requests.  There is no per-request
 * cancellation here; draining all pending AIO ensures the request (and
 * its completion BH) has run before we return. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
2961 
/* Pool for BlockDriverAIOCBCoroutine requests (coroutine-based AIO
 * emulation below). */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
2966 
/* Bottom half: deliver the result of a coroutine-emulated AIO request to
 * the caller's callback, then tear down the BH and release the AIOCB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
2975 
2976 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2977 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2978 {
2979     BlockDriverAIOCBCoroutine *acb = opaque;
2980     BlockDriverState *bs = acb->common.bs;
2981 
2982     if (!acb->is_write) {
2983         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2984             acb->req.nb_sectors, acb->req.qiov);
2985     } else {
2986         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2987             acb->req.nb_sectors, acb->req.qiov);
2988     }
2989 
2990     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2991     qemu_bh_schedule(acb->bh);
2992 }
2993 
2994 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2995                                                int64_t sector_num,
2996                                                QEMUIOVector *qiov,
2997                                                int nb_sectors,
2998                                                BlockDriverCompletionFunc *cb,
2999                                                void *opaque,
3000                                                bool is_write)
3001 {
3002     Coroutine *co;
3003     BlockDriverAIOCBCoroutine *acb;
3004 
3005     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3006     acb->req.sector = sector_num;
3007     acb->req.nb_sectors = nb_sectors;
3008     acb->req.qiov = qiov;
3009     acb->is_write = is_write;
3010 
3011     co = qemu_coroutine_create(bdrv_co_do_rw);
3012     qemu_coroutine_enter(co, acb);
3013 
3014     return &acb->common;
3015 }
3016 
/* Coroutine entry point for bdrv_aio_flush(): run the flush and schedule
 * the completion bottom half with its result. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3026 
3027 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3028         BlockDriverCompletionFunc *cb, void *opaque)
3029 {
3030     trace_bdrv_aio_flush(bs, opaque);
3031 
3032     Coroutine *co;
3033     BlockDriverAIOCBCoroutine *acb;
3034 
3035     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3036     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3037     qemu_coroutine_enter(co, acb);
3038 
3039     return &acb->common;
3040 }
3041 
/* Coroutine entry point for bdrv_aio_discard(): run the discard and
 * schedule the completion bottom half with its result. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3051 
3052 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3053         int64_t sector_num, int nb_sectors,
3054         BlockDriverCompletionFunc *cb, void *opaque)
3055 {
3056     Coroutine *co;
3057     BlockDriverAIOCBCoroutine *acb;
3058 
3059     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3060 
3061     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3062     acb->req.sector = sector_num;
3063     acb->req.nb_sectors = nb_sectors;
3064     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3065     qemu_coroutine_enter(co, acb);
3066 
3067     return &acb->common;
3068 }
3069 
/* Run all MODULE_INIT_BLOCK initializers (registers the block drivers). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
3074 
/* Like bdrv_init(), but set the use_bdrv_whitelist flag first so only
 * whitelisted drivers are used. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
3080 
3081 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3082                    BlockDriverCompletionFunc *cb, void *opaque)
3083 {
3084     BlockDriverAIOCB *acb;
3085 
3086     if (pool->free_aiocb) {
3087         acb = pool->free_aiocb;
3088         pool->free_aiocb = acb->next;
3089     } else {
3090         acb = g_malloc0(pool->aiocb_size);
3091         acb->pool = pool;
3092     }
3093     acb->bs = bs;
3094     acb->cb = cb;
3095     acb->opaque = opaque;
3096     return acb;
3097 }
3098 
/* Return an AIOCB to its owning pool's free list.  The memory is kept
 * for reuse by qemu_aio_get(), never handed back to the allocator. */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}
3106 
3107 /**************************************************************/
3108 /* Coroutine block device emulation */
3109 
/* Bookkeeping for an AIO request issued on behalf of a coroutine: the
 * coroutine to re-enter on completion and the request's result code. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;
3114 
/* AIO completion callback: record the result code and wake the waiting
 * coroutine (used by bdrv_co_io_em and the flush/discard emulation). */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
3122 
3123 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3124                                       int nb_sectors, QEMUIOVector *iov,
3125                                       bool is_write)
3126 {
3127     CoroutineIOCompletion co = {
3128         .coroutine = qemu_coroutine_self(),
3129     };
3130     BlockDriverAIOCB *acb;
3131 
3132     if (is_write) {
3133         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3134                                        bdrv_co_io_em_complete, &co);
3135     } else {
3136         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3137                                       bdrv_co_io_em_complete, &co);
3138     }
3139 
3140     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3141     if (!acb) {
3142         return -EIO;
3143     }
3144     qemu_coroutine_yield();
3145 
3146     return co.ret;
3147 }
3148 
/* Coroutine read emulated on the driver's AIO interface (is_write=false). */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
3155 
/* Coroutine write emulated on the driver's AIO interface (is_write=true). */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
3162 
/* Coroutine entry point for the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
3169 
/* Flush @bs to stable storage, using the best interface the driver
 * offers: native coroutine flush, AIO flush (bridged through
 * bdrv_co_io_em_complete), or nothing at all.
 *
 * Returns 0 on success (including when there is nothing to do) or a
 * negative errno.  Must be called in coroutine context. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* No medium: nothing to flush. */
    if (!bs->drv) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        return bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Emulate the coroutine interface on top of AIO: submit, then
         * yield until the completion callback wakes us. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        return 0;
    }
}
3221 
3222 void bdrv_invalidate_cache(BlockDriverState *bs)
3223 {
3224     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3225         bs->drv->bdrv_invalidate_cache(bs);
3226     }
3227 }
3228 
/* Invalidate cached metadata on every registered BlockDriverState
 * (walks the global bdrv_states list). */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}
3237 
3238 int bdrv_flush(BlockDriverState *bs)
3239 {
3240     Coroutine *co;
3241     RwCo rwco = {
3242         .bs = bs,
3243         .ret = NOT_DONE,
3244     };
3245 
3246     if (qemu_in_coroutine()) {
3247         /* Fast-path if already in coroutine context */
3248         bdrv_flush_co_entry(&rwco);
3249     } else {
3250         co = qemu_coroutine_create(bdrv_flush_co_entry);
3251         qemu_coroutine_enter(co, &rwco);
3252         while (rwco.ret == NOT_DONE) {
3253             qemu_aio_wait();
3254         }
3255     }
3256 
3257     return rwco.ret;
3258 }
3259 
/* Coroutine entry point for the synchronous bdrv_discard() wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
3266 
3267 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3268                                  int nb_sectors)
3269 {
3270     if (!bs->drv) {
3271         return -ENOMEDIUM;
3272     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3273         return -EIO;
3274     } else if (bs->read_only) {
3275         return -EROFS;
3276     } else if (bs->drv->bdrv_co_discard) {
3277         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3278     } else if (bs->drv->bdrv_aio_discard) {
3279         BlockDriverAIOCB *acb;
3280         CoroutineIOCompletion co = {
3281             .coroutine = qemu_coroutine_self(),
3282         };
3283 
3284         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3285                                         bdrv_co_io_em_complete, &co);
3286         if (acb == NULL) {
3287             return -EIO;
3288         } else {
3289             qemu_coroutine_yield();
3290             return co.ret;
3291         }
3292     } else {
3293         return 0;
3294     }
3295 }
3296 
3297 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3298 {
3299     Coroutine *co;
3300     RwCo rwco = {
3301         .bs = bs,
3302         .sector_num = sector_num,
3303         .nb_sectors = nb_sectors,
3304         .ret = NOT_DONE,
3305     };
3306 
3307     if (qemu_in_coroutine()) {
3308         /* Fast-path if already in coroutine context */
3309         bdrv_discard_co_entry(&rwco);
3310     } else {
3311         co = qemu_coroutine_create(bdrv_discard_co_entry);
3312         qemu_coroutine_enter(co, &rwco);
3313         while (rwco.ret == NOT_DONE) {
3314             qemu_aio_wait();
3315         }
3316     }
3317 
3318     return rwco.ret;
3319 }
3320 
3321 /**************************************************************/
3322 /* removable device support */
3323 
3324 /**
3325  * Return TRUE if the media is present
3326  */
3327 int bdrv_is_inserted(BlockDriverState *bs)
3328 {
3329     BlockDriver *drv = bs->drv;
3330 
3331     if (!drv)
3332         return 0;
3333     if (!drv->bdrv_is_inserted)
3334         return 1;
3335     return drv->bdrv_is_inserted(bs);
3336 }
3337 
3338 /**
3339  * Return whether the media changed since the last call to this
3340  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3341  */
3342 int bdrv_media_changed(BlockDriverState *bs)
3343 {
3344     BlockDriver *drv = bs->drv;
3345 
3346     if (drv && drv->bdrv_media_changed) {
3347         return drv->bdrv_media_changed(bs);
3348     }
3349     return -ENOTSUP;
3350 }
3351 
3352 /**
3353  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3354  */
3355 void bdrv_eject(BlockDriverState *bs, int eject_flag)
3356 {
3357     BlockDriver *drv = bs->drv;
3358 
3359     if (drv && drv->bdrv_eject) {
3360         drv->bdrv_eject(bs, eject_flag);
3361     }
3362 }
3363 
3364 /**
3365  * Lock or unlock the media (if it is locked, the user won't be able
3366  * to eject it manually).
3367  */
3368 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3369 {
3370     BlockDriver *drv = bs->drv;
3371 
3372     trace_bdrv_lock_medium(bs, locked);
3373 
3374     if (drv && drv->bdrv_lock_medium) {
3375         drv->bdrv_lock_medium(bs, locked);
3376     }
3377 }
3378 
3379 /* needed for generic scsi interface */
3380 
3381 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3382 {
3383     BlockDriver *drv = bs->drv;
3384 
3385     if (drv && drv->bdrv_ioctl)
3386         return drv->bdrv_ioctl(bs, req, buf);
3387     return -ENOTSUP;
3388 }
3389 
3390 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3391         unsigned long int req, void *buf,
3392         BlockDriverCompletionFunc *cb, void *opaque)
3393 {
3394     BlockDriver *drv = bs->drv;
3395 
3396     if (drv && drv->bdrv_aio_ioctl)
3397         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3398     return NULL;
3399 }
3400 
/* Record the memory alignment required for I/O buffers on @bs;
 * consumed by qemu_blockalign(). */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
3405 
3406 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3407 {
3408     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3409 }
3410 
3411 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3412 {
3413     int64_t bitmap_size;
3414 
3415     bs->dirty_count = 0;
3416     if (enable) {
3417         if (!bs->dirty_bitmap) {
3418             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3419                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3420             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3421 
3422             bs->dirty_bitmap = g_malloc0(bitmap_size);
3423         }
3424     } else {
3425         if (bs->dirty_bitmap) {
3426             g_free(bs->dirty_bitmap);
3427             bs->dirty_bitmap = NULL;
3428         }
3429     }
3430 }
3431 
3432 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3433 {
3434     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3435 
3436     if (bs->dirty_bitmap &&
3437         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3438         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3439             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3440     } else {
3441         return 0;
3442     }
3443 }
3444 
/* Clear the dirty-bitmap bits covering [cur_sector, cur_sector+nr_sectors). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
3450 
/* Return the current dirty count for @bs (maintained as bits are set
 * and cleared in the dirty bitmap). */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
3455 
/* Mark or unmark @bs as in use; asserts that the flag actually changes,
 * i.e. no redundant transitions. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
3461 
/* Return the in-use flag set by bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
3466 
/* Enable I/O status reporting on @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
3472 
3473 /* The I/O status is only enabled if the drive explicitly
3474  * enables it _and_ the VM is configured to stop on errors */
3475 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3476 {
3477     return (bs->iostatus_enabled &&
3478            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3479             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3480             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3481 }
3482 
/* Disable I/O status reporting on @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
3487 
/* Reset the I/O status to OK, but only while status reporting is
 * effectively enabled (see bdrv_iostatus_is_enabled()). */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}
3494 
3495 /* XXX: Today this is set by device models because it makes the implementation
3496    quite simple. However, the block layer knows about the error, so it's
3497    possible to implement this without device models being involved */
3498 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3499 {
3500     if (bdrv_iostatus_is_enabled(bs) &&
3501         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3502         assert(error >= 0);
3503         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3504                                          BLOCK_DEVICE_IO_STATUS_FAILED;
3505     }
3506 }
3507 
/* Begin accounting an I/O operation of @bytes bytes and kind @type:
 * record size, start timestamp and type in @cookie for bdrv_acct_done(). */
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}
3518 
/* Finish accounting: add the cookie's byte count, one operation, and
 * the elapsed time since bdrv_acct_start() to @bs's per-type totals. */
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
3528 
3529 int bdrv_img_create(const char *filename, const char *fmt,
3530                     const char *base_filename, const char *base_fmt,
3531                     char *options, uint64_t img_size, int flags)
3532 {
3533     QEMUOptionParameter *param = NULL, *create_options = NULL;
3534     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3535     BlockDriverState *bs = NULL;
3536     BlockDriver *drv, *proto_drv;
3537     BlockDriver *backing_drv = NULL;
3538     int ret = 0;
3539 
3540     /* Find driver and parse its options */
3541     drv = bdrv_find_format(fmt);
3542     if (!drv) {
3543         error_report("Unknown file format '%s'", fmt);
3544         ret = -EINVAL;
3545         goto out;
3546     }
3547 
3548     proto_drv = bdrv_find_protocol(filename);
3549     if (!proto_drv) {
3550         error_report("Unknown protocol '%s'", filename);
3551         ret = -EINVAL;
3552         goto out;
3553     }
3554 
3555     create_options = append_option_parameters(create_options,
3556                                               drv->create_options);
3557     create_options = append_option_parameters(create_options,
3558                                               proto_drv->create_options);
3559 
3560     /* Create parameter list with default values */
3561     param = parse_option_parameters("", create_options, param);
3562 
3563     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3564 
3565     /* Parse -o options */
3566     if (options) {
3567         param = parse_option_parameters(options, create_options, param);
3568         if (param == NULL) {
3569             error_report("Invalid options for file format '%s'.", fmt);
3570             ret = -EINVAL;
3571             goto out;
3572         }
3573     }
3574 
3575     if (base_filename) {
3576         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3577                                  base_filename)) {
3578             error_report("Backing file not supported for file format '%s'",
3579                          fmt);
3580             ret = -EINVAL;
3581             goto out;
3582         }
3583     }
3584 
3585     if (base_fmt) {
3586         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3587             error_report("Backing file format not supported for file "
3588                          "format '%s'", fmt);
3589             ret = -EINVAL;
3590             goto out;
3591         }
3592     }
3593 
3594     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3595     if (backing_file && backing_file->value.s) {
3596         if (!strcmp(filename, backing_file->value.s)) {
3597             error_report("Error: Trying to create an image with the "
3598                          "same filename as the backing file");
3599             ret = -EINVAL;
3600             goto out;
3601         }
3602     }
3603 
3604     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3605     if (backing_fmt && backing_fmt->value.s) {
3606         backing_drv = bdrv_find_format(backing_fmt->value.s);
3607         if (!backing_drv) {
3608             error_report("Unknown backing file format '%s'",
3609                          backing_fmt->value.s);
3610             ret = -EINVAL;
3611             goto out;
3612         }
3613     }
3614 
3615     // The size for the image must always be specified, with one exception:
3616     // If we are using a backing file, we can obtain the size from there
3617     size = get_option_parameter(param, BLOCK_OPT_SIZE);
3618     if (size && size->value.n == -1) {
3619         if (backing_file && backing_file->value.s) {
3620             uint64_t size;
3621             char buf[32];
3622 
3623             bs = bdrv_new("");
3624 
3625             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3626             if (ret < 0) {
3627                 error_report("Could not open '%s'", backing_file->value.s);
3628                 goto out;
3629             }
3630             bdrv_get_geometry(bs, &size);
3631             size *= 512;
3632 
3633             snprintf(buf, sizeof(buf), "%" PRId64, size);
3634             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3635         } else {
3636             error_report("Image creation needs a size parameter");
3637             ret = -EINVAL;
3638             goto out;
3639         }
3640     }
3641 
3642     printf("Formatting '%s', fmt=%s ", filename, fmt);
3643     print_option_parameters(param);
3644     puts("");
3645 
3646     ret = bdrv_create(drv, filename, param);
3647 
3648     if (ret < 0) {
3649         if (ret == -ENOTSUP) {
3650             error_report("Formatting or formatting option not supported for "
3651                          "file format '%s'", fmt);
3652         } else if (ret == -EFBIG) {
3653             error_report("The image size is too large for file format '%s'",
3654                          fmt);
3655         } else {
3656             error_report("%s: error while creating %s: %s", filename, fmt,
3657                          strerror(-ret));
3658         }
3659     }
3660 
3661 out:
3662     free_option_parameters(create_options);
3663     free_option_parameters(param);
3664 
3665     if (bs) {
3666         bdrv_delete(bs);
3667     }
3668 
3669     return ret;
3670 }
3671