xref: /openbmc/qemu/block.c (revision 5f6caa4f2ba45c8a99c915c09c4d56bd1621a450)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qemu-objects.h"
31 #include "qemu-coroutine.h"
32 
33 #ifdef CONFIG_BSD
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include <sys/ioctl.h>
37 #include <sys/queue.h>
38 #ifndef __DragonFly__
39 #include <sys/disk.h>
40 #endif
41 #endif
42 
43 #ifdef _WIN32
44 #include <windows.h>
45 #endif
46 
47 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
48 
49 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
50 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
51         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
52         BlockDriverCompletionFunc *cb, void *opaque);
53 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
54         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
55         BlockDriverCompletionFunc *cb, void *opaque);
56 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
57         BlockDriverCompletionFunc *cb, void *opaque);
58 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
59         BlockDriverCompletionFunc *cb, void *opaque);
60 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
61                                          int64_t sector_num, int nb_sectors,
62                                          QEMUIOVector *iov);
63 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
64                                          int64_t sector_num, int nb_sectors,
65                                          QEMUIOVector *iov);
66 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
67 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
68     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
69 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
70     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
71 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
72                                                int64_t sector_num,
73                                                QEMUIOVector *qiov,
74                                                int nb_sectors,
75                                                BlockDriverCompletionFunc *cb,
76                                                void *opaque,
77                                                bool is_write);
78 static void coroutine_fn bdrv_co_do_rw(void *opaque);
79 
/* All named block device states; bdrv_new() appends to this list only
   when the device name is non-empty, bdrv_make_anon() removes from it */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* All block drivers registered through bdrv_register() */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
91 
92 #ifdef _WIN32
/*
 * Return non-zero when filename starts with an ASCII letter directly
 * followed by ':' (a drive prefix such as "c:"), zero otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    int is_lower = (letter >= 'a' && letter <= 'z');
    int is_upper = (letter >= 'A' && letter <= 'Z');

    if (!is_lower && !is_upper) {
        return 0;
    }
    return filename[1] == ':';
}
99 
100 int is_windows_drive(const char *filename)
101 {
102     if (is_windows_drive_prefix(filename) &&
103         filename[2] == '\0')
104         return 1;
105     if (strstart(filename, "\\\\.\\", NULL) ||
106         strstart(filename, "//./", NULL))
107         return 1;
108     return 0;
109 }
110 #endif
111 
/*
 * Check whether the path starts with "<protocol>:".
 *
 * Any ':' in the path counts as a protocol separator. On Windows, drive
 * names and drive-letter prefixes are ruled out first.
 */
static int path_has_protocol(const char *path)
{
    const char *colon;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    colon = strchr(path, ':');
    return colon ? 1 : 0;
}
124 
/*
 * Return non-zero when path is absolute.
 *
 * Everything up to and including the first ':' (if any) is skipped; the
 * remainder is absolute when it starts with '/' (or '\\' on Windows).
 */
int path_is_absolute(const char *path)
{
    const char *colon;
    const char *start;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    start = colon ? colon + 1 : path;

#ifdef _WIN32
    if (*start == '\\') {
        return 1;
    }
#endif
    return (*start == '/');
}
144 
/*
 * Build in dest (at most dest_size bytes, always NUL-terminated when
 * dest_size > 0) the path of filename relative to base_path.
 *
 * An absolute filename is copied unchanged. Otherwise the directory part
 * of base_path -- everything up to and including the last path separator,
 * or the "<protocol>:" prefix if that ends later -- is prepended.
 * URLs are supported.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto, *after_sep, *dir_end;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* first character after "<protocol>:", or the start of base_path */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* first character after the last path separator, or the start */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever prefix reaches further into base_path */
    dir_end = (after_sep > after_proto) ? after_sep : after_proto;

    prefix_len = dir_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
188 
189 void bdrv_register(BlockDriver *bdrv)
190 {
191     /* Block drivers without coroutine functions need emulation */
192     if (!bdrv->bdrv_co_readv) {
193         bdrv->bdrv_co_readv = bdrv_co_readv_em;
194         bdrv->bdrv_co_writev = bdrv_co_writev_em;
195 
196         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
197          * the block driver lacks aio we need to emulate that too.
198          */
199         if (!bdrv->bdrv_aio_readv) {
200             /* add AIO emulation layer */
201             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
202             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
203         }
204     }
205 
206     if (!bdrv->bdrv_aio_flush)
207         bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
208 
209     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
210 }
211 
212 /* create a new block device (by default it is empty) */
213 BlockDriverState *bdrv_new(const char *device_name)
214 {
215     BlockDriverState *bs;
216 
217     bs = g_malloc0(sizeof(BlockDriverState));
218     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
219     if (device_name[0] != '\0') {
220         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
221     }
222     bdrv_iostatus_disable(bs);
223     return bs;
224 }
225 
226 BlockDriver *bdrv_find_format(const char *format_name)
227 {
228     BlockDriver *drv1;
229     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
230         if (!strcmp(drv1->format_name, format_name)) {
231             return drv1;
232         }
233     }
234     return NULL;
235 }
236 
237 static int bdrv_is_whitelisted(BlockDriver *drv)
238 {
239     static const char *whitelist[] = {
240         CONFIG_BDRV_WHITELIST
241     };
242     const char **p;
243 
244     if (!whitelist[0])
245         return 1;               /* no whitelist, anything goes */
246 
247     for (p = whitelist; *p; p++) {
248         if (!strcmp(drv->format_name, *p)) {
249             return 1;
250         }
251     }
252     return 0;
253 }
254 
255 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
256 {
257     BlockDriver *drv = bdrv_find_format(format_name);
258     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
259 }
260 
261 int bdrv_create(BlockDriver *drv, const char* filename,
262     QEMUOptionParameter *options)
263 {
264     if (!drv->bdrv_create)
265         return -ENOTSUP;
266 
267     return drv->bdrv_create(filename, options);
268 }
269 
270 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
271 {
272     BlockDriver *drv;
273 
274     drv = bdrv_find_protocol(filename);
275     if (drv == NULL) {
276         return -ENOENT;
277     }
278 
279     return bdrv_create(drv, filename, options);
280 }
281 
282 #ifdef _WIN32
283 void get_tmp_filename(char *filename, int size)
284 {
285     char temp_dir[MAX_PATH];
286 
287     GetTempPath(MAX_PATH, temp_dir);
288     GetTempFileName(temp_dir, "qem", 0, filename);
289 }
290 #else
/*
 * Create a temporary file under $TMPDIR (or /tmp when TMPDIR is unset)
 * and store its name in filename, a buffer of size bytes.
 *
 * The file is created by mkstemp() and closed again; only its name is
 * handed back. On failure (including a template truncated because the
 * buffer is too small, which mkstemp() rejects) filename is set to the
 * empty string instead of being left as an unusable template.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;

    if (size <= 0) {
        return;
    }

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);

    fd = mkstemp(filename);
    if (fd < 0) {
        /* no file was created: do not hand back the template name and
           do not call close() on an invalid descriptor */
        filename[0] = '\0';
        return;
    }
    close(fd);
}
303 #endif
304 
305 /*
306  * Detect host devices. By convention, /dev/cdrom[N] is always
307  * recognized as a host CDROM.
308  */
309 static BlockDriver *find_hdev_driver(const char *filename)
310 {
311     int score_max = 0, score;
312     BlockDriver *drv = NULL, *d;
313 
314     QLIST_FOREACH(d, &bdrv_drivers, list) {
315         if (d->bdrv_probe_device) {
316             score = d->bdrv_probe_device(filename);
317             if (score > score_max) {
318                 score_max = score;
319                 drv = d;
320             }
321         }
322     }
323 
324     return drv;
325 }
326 
327 BlockDriver *bdrv_find_protocol(const char *filename)
328 {
329     BlockDriver *drv1;
330     char protocol[128];
331     int len;
332     const char *p;
333 
334     /* TODO Drivers without bdrv_file_open must be specified explicitly */
335 
336     /*
337      * XXX(hch): we really should not let host device detection
338      * override an explicit protocol specification, but moving this
339      * later breaks access to device names with colons in them.
340      * Thanks to the brain-dead persistent naming schemes on udev-
341      * based Linux systems those actually are quite common.
342      */
343     drv1 = find_hdev_driver(filename);
344     if (drv1) {
345         return drv1;
346     }
347 
348     if (!path_has_protocol(filename)) {
349         return bdrv_find_format("file");
350     }
351     p = strchr(filename, ':');
352     assert(p != NULL);
353     len = p - filename;
354     if (len > sizeof(protocol) - 1)
355         len = sizeof(protocol) - 1;
356     memcpy(protocol, filename, len);
357     protocol[len] = '\0';
358     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
359         if (drv1->protocol_name &&
360             !strcmp(drv1->protocol_name, protocol)) {
361             return drv1;
362         }
363     }
364     return NULL;
365 }
366 
367 static int find_image_format(const char *filename, BlockDriver **pdrv)
368 {
369     int ret, score, score_max;
370     BlockDriver *drv1, *drv;
371     uint8_t buf[2048];
372     BlockDriverState *bs;
373 
374     ret = bdrv_file_open(&bs, filename, 0);
375     if (ret < 0) {
376         *pdrv = NULL;
377         return ret;
378     }
379 
380     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
381     if (bs->sg || !bdrv_is_inserted(bs)) {
382         bdrv_delete(bs);
383         drv = bdrv_find_format("raw");
384         if (!drv) {
385             ret = -ENOENT;
386         }
387         *pdrv = drv;
388         return ret;
389     }
390 
391     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
392     bdrv_delete(bs);
393     if (ret < 0) {
394         *pdrv = NULL;
395         return ret;
396     }
397 
398     score_max = 0;
399     drv = NULL;
400     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
401         if (drv1->bdrv_probe) {
402             score = drv1->bdrv_probe(buf, ret, filename);
403             if (score > score_max) {
404                 score_max = score;
405                 drv = drv1;
406             }
407         }
408     }
409     if (!drv) {
410         ret = -ENOENT;
411     }
412     *pdrv = drv;
413     return ret;
414 }
415 
416 /**
417  * Set the current 'total_sectors' value
418  */
419 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
420 {
421     BlockDriver *drv = bs->drv;
422 
423     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
424     if (bs->sg)
425         return 0;
426 
427     /* query actual device if possible, otherwise just trust the hint */
428     if (drv->bdrv_getlength) {
429         int64_t length = drv->bdrv_getlength(bs);
430         if (length < 0) {
431             return length;
432         }
433         hint = length >> BDRV_SECTOR_BITS;
434     }
435 
436     bs->total_sectors = hint;
437     return 0;
438 }
439 
440 /**
441  * Set open flags for a given cache mode
442  *
443  * Return 0 on success, -1 if the cache mode was invalid.
444  */
445 int bdrv_parse_cache_flags(const char *mode, int *flags)
446 {
447     *flags &= ~BDRV_O_CACHE_MASK;
448 
449     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
450         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
451     } else if (!strcmp(mode, "directsync")) {
452         *flags |= BDRV_O_NOCACHE;
453     } else if (!strcmp(mode, "writeback")) {
454         *flags |= BDRV_O_CACHE_WB;
455     } else if (!strcmp(mode, "unsafe")) {
456         *flags |= BDRV_O_CACHE_WB;
457         *flags |= BDRV_O_NO_FLUSH;
458     } else if (!strcmp(mode, "writethrough")) {
459         /* this is the default */
460     } else {
461         return -1;
462     }
463 
464     return 0;
465 }
466 
467 /*
468  * Common part for opening disk images and files
469  */
470 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
471     int flags, BlockDriver *drv)
472 {
473     int ret, open_flags;
474 
475     assert(drv != NULL);
476 
477     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
478 
479     bs->file = NULL;
480     bs->total_sectors = 0;
481     bs->encrypted = 0;
482     bs->valid_key = 0;
483     bs->open_flags = flags;
484     bs->buffer_alignment = 512;
485 
486     pstrcpy(bs->filename, sizeof(bs->filename), filename);
487 
488     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
489         return -ENOTSUP;
490     }
491 
492     bs->drv = drv;
493     bs->opaque = g_malloc0(drv->instance_size);
494 
495     if (flags & BDRV_O_CACHE_WB)
496         bs->enable_write_cache = 1;
497 
498     /*
499      * Clear flags that are internal to the block layer before opening the
500      * image.
501      */
502     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
503 
504     /*
505      * Snapshots should be writable.
506      */
507     if (bs->is_temporary) {
508         open_flags |= BDRV_O_RDWR;
509     }
510 
511     /* Open the image, either directly or using a protocol */
512     if (drv->bdrv_file_open) {
513         ret = drv->bdrv_file_open(bs, filename, open_flags);
514     } else {
515         ret = bdrv_file_open(&bs->file, filename, open_flags);
516         if (ret >= 0) {
517             ret = drv->bdrv_open(bs, open_flags);
518         }
519     }
520 
521     if (ret < 0) {
522         goto free_and_fail;
523     }
524 
525     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
526 
527     ret = refresh_total_sectors(bs, bs->total_sectors);
528     if (ret < 0) {
529         goto free_and_fail;
530     }
531 
532 #ifndef _WIN32
533     if (bs->is_temporary) {
534         unlink(filename);
535     }
536 #endif
537     return 0;
538 
539 free_and_fail:
540     if (bs->file) {
541         bdrv_delete(bs->file);
542         bs->file = NULL;
543     }
544     g_free(bs->opaque);
545     bs->opaque = NULL;
546     bs->drv = NULL;
547     return ret;
548 }
549 
550 /*
551  * Opens a file using a protocol (file, host_device, nbd, ...)
552  */
553 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
554 {
555     BlockDriverState *bs;
556     BlockDriver *drv;
557     int ret;
558 
559     drv = bdrv_find_protocol(filename);
560     if (!drv) {
561         return -ENOENT;
562     }
563 
564     bs = bdrv_new("");
565     ret = bdrv_open_common(bs, filename, flags, drv);
566     if (ret < 0) {
567         bdrv_delete(bs);
568         return ret;
569     }
570     bs->growable = 1;
571     *pbs = bs;
572     return 0;
573 }
574 
575 /*
576  * Opens a disk image (raw, qcow2, vmdk, ...)
577  */
578 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
579               BlockDriver *drv)
580 {
581     int ret;
582 
583     if (flags & BDRV_O_SNAPSHOT) {
584         BlockDriverState *bs1;
585         int64_t total_size;
586         int is_protocol = 0;
587         BlockDriver *bdrv_qcow2;
588         QEMUOptionParameter *options;
589         char tmp_filename[PATH_MAX];
590         char backing_filename[PATH_MAX];
591 
592         /* if snapshot, we create a temporary backing file and open it
593            instead of opening 'filename' directly */
594 
595         /* if there is a backing file, use it */
596         bs1 = bdrv_new("");
597         ret = bdrv_open(bs1, filename, 0, drv);
598         if (ret < 0) {
599             bdrv_delete(bs1);
600             return ret;
601         }
602         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
603 
604         if (bs1->drv && bs1->drv->protocol_name)
605             is_protocol = 1;
606 
607         bdrv_delete(bs1);
608 
609         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
610 
611         /* Real path is meaningless for protocols */
612         if (is_protocol)
613             snprintf(backing_filename, sizeof(backing_filename),
614                      "%s", filename);
615         else if (!realpath(filename, backing_filename))
616             return -errno;
617 
618         bdrv_qcow2 = bdrv_find_format("qcow2");
619         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
620 
621         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
622         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
623         if (drv) {
624             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
625                 drv->format_name);
626         }
627 
628         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
629         free_option_parameters(options);
630         if (ret < 0) {
631             return ret;
632         }
633 
634         filename = tmp_filename;
635         drv = bdrv_qcow2;
636         bs->is_temporary = 1;
637     }
638 
639     /* Find the right image format driver */
640     if (!drv) {
641         ret = find_image_format(filename, &drv);
642     }
643 
644     if (!drv) {
645         goto unlink_and_fail;
646     }
647 
648     /* Open the image */
649     ret = bdrv_open_common(bs, filename, flags, drv);
650     if (ret < 0) {
651         goto unlink_and_fail;
652     }
653 
654     /* If there is a backing file, use it */
655     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
656         char backing_filename[PATH_MAX];
657         int back_flags;
658         BlockDriver *back_drv = NULL;
659 
660         bs->backing_hd = bdrv_new("");
661 
662         if (path_has_protocol(bs->backing_file)) {
663             pstrcpy(backing_filename, sizeof(backing_filename),
664                     bs->backing_file);
665         } else {
666             path_combine(backing_filename, sizeof(backing_filename),
667                          filename, bs->backing_file);
668         }
669 
670         if (bs->backing_format[0] != '\0') {
671             back_drv = bdrv_find_format(bs->backing_format);
672         }
673 
674         /* backing files always opened read-only */
675         back_flags =
676             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
677 
678         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
679         if (ret < 0) {
680             bdrv_close(bs);
681             return ret;
682         }
683         if (bs->is_temporary) {
684             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
685         } else {
686             /* base image inherits from "parent" */
687             bs->backing_hd->keep_read_only = bs->keep_read_only;
688         }
689     }
690 
691     if (!bdrv_key_required(bs)) {
692         bdrv_dev_change_media_cb(bs, true);
693     }
694 
695     return 0;
696 
697 unlink_and_fail:
698     if (bs->is_temporary) {
699         unlink(filename);
700     }
701     return ret;
702 }
703 
704 void bdrv_close(BlockDriverState *bs)
705 {
706     if (bs->drv) {
707         if (bs == bs_snapshots) {
708             bs_snapshots = NULL;
709         }
710         if (bs->backing_hd) {
711             bdrv_delete(bs->backing_hd);
712             bs->backing_hd = NULL;
713         }
714         bs->drv->bdrv_close(bs);
715         g_free(bs->opaque);
716 #ifdef _WIN32
717         if (bs->is_temporary) {
718             unlink(bs->filename);
719         }
720 #endif
721         bs->opaque = NULL;
722         bs->drv = NULL;
723 
724         if (bs->file != NULL) {
725             bdrv_close(bs->file);
726         }
727 
728         bdrv_dev_change_media_cb(bs, false);
729     }
730 }
731 
732 void bdrv_close_all(void)
733 {
734     BlockDriverState *bs;
735 
736     QTAILQ_FOREACH(bs, &bdrv_states, list) {
737         bdrv_close(bs);
738     }
739 }
740 
741 /* make a BlockDriverState anonymous by removing from bdrv_state list.
742    Also, NULL terminate the device_name to prevent double remove */
743 void bdrv_make_anon(BlockDriverState *bs)
744 {
745     if (bs->device_name[0] != '\0') {
746         QTAILQ_REMOVE(&bdrv_states, bs, list);
747     }
748     bs->device_name[0] = '\0';
749 }
750 
751 void bdrv_delete(BlockDriverState *bs)
752 {
753     assert(!bs->dev);
754 
755     /* remove from list, if necessary */
756     bdrv_make_anon(bs);
757 
758     bdrv_close(bs);
759     if (bs->file != NULL) {
760         bdrv_delete(bs->file);
761     }
762 
763     assert(bs != bs_snapshots);
764     g_free(bs);
765 }
766 
767 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
768 /* TODO change to DeviceState *dev when all users are qdevified */
769 {
770     if (bs->dev) {
771         return -EBUSY;
772     }
773     bs->dev = dev;
774     bdrv_iostatus_reset(bs);
775     return 0;
776 }
777 
778 /* TODO qdevified devices don't use this, remove when devices are qdevified */
779 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
780 {
781     if (bdrv_attach_dev(bs, dev) < 0) {
782         abort();
783     }
784 }
785 
786 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
787 /* TODO change to DeviceState *dev when all users are qdevified */
788 {
789     assert(bs->dev == dev);
790     bs->dev = NULL;
791     bs->dev_ops = NULL;
792     bs->dev_opaque = NULL;
793     bs->buffer_alignment = 512;
794 }
795 
796 /* TODO change to return DeviceState * when all users are qdevified */
797 void *bdrv_get_attached_dev(BlockDriverState *bs)
798 {
799     return bs->dev;
800 }
801 
802 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
803                       void *opaque)
804 {
805     bs->dev_ops = ops;
806     bs->dev_opaque = opaque;
807     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
808         bs_snapshots = NULL;
809     }
810 }
811 
812 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
813 {
814     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
815         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
816     }
817 }
818 
819 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
820 {
821     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
822 }
823 
824 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
825 {
826     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
827         return bs->dev_ops->is_tray_open(bs->dev_opaque);
828     }
829     return false;
830 }
831 
832 static void bdrv_dev_resize_cb(BlockDriverState *bs)
833 {
834     if (bs->dev_ops && bs->dev_ops->resize_cb) {
835         bs->dev_ops->resize_cb(bs->dev_opaque);
836     }
837 }
838 
839 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
840 {
841     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
842         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
843     }
844     return false;
845 }
846 
847 /*
848  * Run consistency checks on an image
849  *
850  * Returns 0 if the check could be completed (it doesn't mean that the image is
851  * free of errors) or -errno when an internal error occurred. The results of the
852  * check are stored in res.
853  */
854 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
855 {
856     if (bs->drv->bdrv_check == NULL) {
857         return -ENOTSUP;
858     }
859 
860     memset(res, 0, sizeof(*res));
861     return bs->drv->bdrv_check(bs, res);
862 }
863 
864 #define COMMIT_BUF_SECTORS 2048
865 
866 /* commit COW file into the raw image */
867 int bdrv_commit(BlockDriverState *bs)
868 {
869     BlockDriver *drv = bs->drv;
870     BlockDriver *backing_drv;
871     int64_t sector, total_sectors;
872     int n, ro, open_flags;
873     int ret = 0, rw_ret = 0;
874     uint8_t *buf;
875     char filename[1024];
876     BlockDriverState *bs_rw, *bs_ro;
877 
878     if (!drv)
879         return -ENOMEDIUM;
880 
881     if (!bs->backing_hd) {
882         return -ENOTSUP;
883     }
884 
885     if (bs->backing_hd->keep_read_only) {
886         return -EACCES;
887     }
888 
889     backing_drv = bs->backing_hd->drv;
890     ro = bs->backing_hd->read_only;
891     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
892     open_flags =  bs->backing_hd->open_flags;
893 
894     if (ro) {
895         /* re-open as RW */
896         bdrv_delete(bs->backing_hd);
897         bs->backing_hd = NULL;
898         bs_rw = bdrv_new("");
899         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
900             backing_drv);
901         if (rw_ret < 0) {
902             bdrv_delete(bs_rw);
903             /* try to re-open read-only */
904             bs_ro = bdrv_new("");
905             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
906                 backing_drv);
907             if (ret < 0) {
908                 bdrv_delete(bs_ro);
909                 /* drive not functional anymore */
910                 bs->drv = NULL;
911                 return ret;
912             }
913             bs->backing_hd = bs_ro;
914             return rw_ret;
915         }
916         bs->backing_hd = bs_rw;
917     }
918 
919     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
920     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
921 
922     for (sector = 0; sector < total_sectors; sector += n) {
923         if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
924 
925             if (bdrv_read(bs, sector, buf, n) != 0) {
926                 ret = -EIO;
927                 goto ro_cleanup;
928             }
929 
930             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
931                 ret = -EIO;
932                 goto ro_cleanup;
933             }
934         }
935     }
936 
937     if (drv->bdrv_make_empty) {
938         ret = drv->bdrv_make_empty(bs);
939         bdrv_flush(bs);
940     }
941 
942     /*
943      * Make sure all data we wrote to the backing device is actually
944      * stable on disk.
945      */
946     if (bs->backing_hd)
947         bdrv_flush(bs->backing_hd);
948 
949 ro_cleanup:
950     g_free(buf);
951 
952     if (ro) {
953         /* re-open as RO */
954         bdrv_delete(bs->backing_hd);
955         bs->backing_hd = NULL;
956         bs_ro = bdrv_new("");
957         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
958             backing_drv);
959         if (ret < 0) {
960             bdrv_delete(bs_ro);
961             /* drive not functional anymore */
962             bs->drv = NULL;
963             return ret;
964         }
965         bs->backing_hd = bs_ro;
966         bs->backing_hd->keep_read_only = 0;
967     }
968 
969     return ret;
970 }
971 
972 void bdrv_commit_all(void)
973 {
974     BlockDriverState *bs;
975 
976     QTAILQ_FOREACH(bs, &bdrv_states, list) {
977         bdrv_commit(bs);
978     }
979 }
980 
981 /*
982  * Return values:
983  * 0        - success
984  * -EINVAL  - backing format specified, but no file
985  * -ENOSPC  - can't update the backing file because no space is left in the
986  *            image file header
987  * -ENOTSUP - format driver doesn't support changing the backing file
988  */
989 int bdrv_change_backing_file(BlockDriverState *bs,
990     const char *backing_file, const char *backing_fmt)
991 {
992     BlockDriver *drv = bs->drv;
993 
994     if (drv->bdrv_change_backing_file != NULL) {
995         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
996     } else {
997         return -ENOTSUP;
998     }
999 }
1000 
1001 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1002                                    size_t size)
1003 {
1004     int64_t len;
1005 
1006     if (!bdrv_is_inserted(bs))
1007         return -ENOMEDIUM;
1008 
1009     if (bs->growable)
1010         return 0;
1011 
1012     len = bdrv_getlength(bs);
1013 
1014     if (offset < 0)
1015         return -EIO;
1016 
1017     if ((offset > len) || (len - offset < size))
1018         return -EIO;
1019 
1020     return 0;
1021 }
1022 
1023 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1024                               int nb_sectors)
1025 {
1026     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1027                                    nb_sectors * BDRV_SECTOR_SIZE);
1028 }
1029 
1030 static inline bool bdrv_has_async_flush(BlockDriver *drv)
1031 {
1032     return drv->bdrv_aio_flush != bdrv_aio_flush_em;
1033 }
1034 
/*
 * Parameters and result of one synchronous read/write request that is
 * carried out by bdrv_rw_co_entry().
 */
typedef struct RwCo {
    BlockDriverState *bs;   /* device the request is issued on */
    int64_t sector_num;     /* first sector of the request */
    int nb_sectors;         /* number of sectors */
    QEMUIOVector *qiov;     /* data buffer(s) */
    bool is_write;          /* true: write, false: read */
    int ret;                /* NOT_DONE until the request has completed */
} RwCo;
1043 
1044 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1045 {
1046     RwCo *rwco = opaque;
1047 
1048     if (!rwco->is_write) {
1049         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1050                                      rwco->nb_sectors, rwco->qiov);
1051     } else {
1052         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1053                                       rwco->nb_sectors, rwco->qiov);
1054     }
1055 }
1056 
1057 /*
1058  * Process a synchronous request using coroutines
1059  */
1060 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1061                       int nb_sectors, bool is_write)
1062 {
1063     QEMUIOVector qiov;
1064     struct iovec iov = {
1065         .iov_base = (void *)buf,
1066         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1067     };
1068     Coroutine *co;
1069     RwCo rwco = {
1070         .bs = bs,
1071         .sector_num = sector_num,
1072         .nb_sectors = nb_sectors,
1073         .qiov = &qiov,
1074         .is_write = is_write,
1075         .ret = NOT_DONE,
1076     };
1077 
1078     qemu_iovec_init_external(&qiov, &iov, 1);
1079 
1080     if (qemu_in_coroutine()) {
1081         /* Fast-path if already in coroutine context */
1082         bdrv_rw_co_entry(&rwco);
1083     } else {
1084         co = qemu_coroutine_create(bdrv_rw_co_entry);
1085         qemu_coroutine_enter(co, &rwco);
1086         while (rwco.ret == NOT_DONE) {
1087             qemu_aio_wait();
1088         }
1089     }
1090     return rwco.ret;
1091 }
1092 
1093 /* return < 0 if error. See bdrv_write() for the return codes */
1094 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1095               uint8_t *buf, int nb_sectors)
1096 {
1097     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1098 }
1099 
1100 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1101                              int nb_sectors, int dirty)
1102 {
1103     int64_t start, end;
1104     unsigned long val, idx, bit;
1105 
1106     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1107     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1108 
1109     for (; start <= end; start++) {
1110         idx = start / (sizeof(unsigned long) * 8);
1111         bit = start % (sizeof(unsigned long) * 8);
1112         val = bs->dirty_bitmap[idx];
1113         if (dirty) {
1114             if (!(val & (1UL << bit))) {
1115                 bs->dirty_count++;
1116                 val |= 1UL << bit;
1117             }
1118         } else {
1119             if (val & (1UL << bit)) {
1120                 bs->dirty_count--;
1121                 val &= ~(1UL << bit);
1122             }
1123         }
1124         bs->dirty_bitmap[idx] = val;
1125     }
1126 }
1127 
1128 /* Return < 0 if error. Important errors are:
1129   -EIO         generic I/O error (may happen for all errors)
1130   -ENOMEDIUM   No media inserted.
1131   -EINVAL      Invalid sector number or nb_sectors
1132   -EACCES      Trying to write a read-only device
1133 */
1134 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1135                const uint8_t *buf, int nb_sectors)
1136 {
1137     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1138 }
1139 
1140 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1141                void *buf, int count1)
1142 {
1143     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1144     int len, nb_sectors, count;
1145     int64_t sector_num;
1146     int ret;
1147 
1148     count = count1;
1149     /* first read to align to sector start */
1150     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1151     if (len > count)
1152         len = count;
1153     sector_num = offset >> BDRV_SECTOR_BITS;
1154     if (len > 0) {
1155         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1156             return ret;
1157         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1158         count -= len;
1159         if (count == 0)
1160             return count1;
1161         sector_num++;
1162         buf += len;
1163     }
1164 
1165     /* read the sectors "in place" */
1166     nb_sectors = count >> BDRV_SECTOR_BITS;
1167     if (nb_sectors > 0) {
1168         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1169             return ret;
1170         sector_num += nb_sectors;
1171         len = nb_sectors << BDRV_SECTOR_BITS;
1172         buf += len;
1173         count -= len;
1174     }
1175 
1176     /* add data from the last sector */
1177     if (count > 0) {
1178         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1179             return ret;
1180         memcpy(buf, tmp_buf, count);
1181     }
1182     return count1;
1183 }
1184 
1185 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1186                 const void *buf, int count1)
1187 {
1188     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1189     int len, nb_sectors, count;
1190     int64_t sector_num;
1191     int ret;
1192 
1193     count = count1;
1194     /* first write to align to sector start */
1195     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1196     if (len > count)
1197         len = count;
1198     sector_num = offset >> BDRV_SECTOR_BITS;
1199     if (len > 0) {
1200         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1201             return ret;
1202         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1203         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1204             return ret;
1205         count -= len;
1206         if (count == 0)
1207             return count1;
1208         sector_num++;
1209         buf += len;
1210     }
1211 
1212     /* write the sectors "in place" */
1213     nb_sectors = count >> BDRV_SECTOR_BITS;
1214     if (nb_sectors > 0) {
1215         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1216             return ret;
1217         sector_num += nb_sectors;
1218         len = nb_sectors << BDRV_SECTOR_BITS;
1219         buf += len;
1220         count -= len;
1221     }
1222 
1223     /* add data from the last sector */
1224     if (count > 0) {
1225         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1226             return ret;
1227         memcpy(tmp_buf, buf, count);
1228         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1229             return ret;
1230     }
1231     return count1;
1232 }
1233 
1234 /*
1235  * Writes to the file and ensures that no writes are reordered across this
1236  * request (acts as a barrier)
1237  *
1238  * Returns 0 on success, -errno in error cases.
1239  */
1240 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1241     const void *buf, int count)
1242 {
1243     int ret;
1244 
1245     ret = bdrv_pwrite(bs, offset, buf, count);
1246     if (ret < 0) {
1247         return ret;
1248     }
1249 
1250     /* No flush needed for cache modes that use O_DSYNC */
1251     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1252         bdrv_flush(bs);
1253     }
1254 
1255     return 0;
1256 }
1257 
1258 /*
1259  * Handle a read request in coroutine context
1260  */
1261 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1262     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1263 {
1264     BlockDriver *drv = bs->drv;
1265 
1266     if (!drv) {
1267         return -ENOMEDIUM;
1268     }
1269     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1270         return -EIO;
1271     }
1272 
1273     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1274 }
1275 
1276 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1277     int nb_sectors, QEMUIOVector *qiov)
1278 {
1279     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1280 
1281     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
1282 }
1283 
1284 /*
1285  * Handle a write request in coroutine context
1286  */
1287 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1288     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1289 {
1290     BlockDriver *drv = bs->drv;
1291     int ret;
1292 
1293     if (!bs->drv) {
1294         return -ENOMEDIUM;
1295     }
1296     if (bs->read_only) {
1297         return -EACCES;
1298     }
1299     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1300         return -EIO;
1301     }
1302 
1303     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1304 
1305     if (bs->dirty_bitmap) {
1306         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1307     }
1308 
1309     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1310         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1311     }
1312 
1313     return ret;
1314 }
1315 
1316 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1317     int nb_sectors, QEMUIOVector *qiov)
1318 {
1319     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1320 
1321     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
1322 }
1323 
1324 /**
1325  * Truncate file to 'offset' bytes (needed only for file protocols)
1326  */
1327 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1328 {
1329     BlockDriver *drv = bs->drv;
1330     int ret;
1331     if (!drv)
1332         return -ENOMEDIUM;
1333     if (!drv->bdrv_truncate)
1334         return -ENOTSUP;
1335     if (bs->read_only)
1336         return -EACCES;
1337     if (bdrv_in_use(bs))
1338         return -EBUSY;
1339     ret = drv->bdrv_truncate(bs, offset);
1340     if (ret == 0) {
1341         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1342         bdrv_dev_resize_cb(bs);
1343     }
1344     return ret;
1345 }
1346 
1347 /**
1348  * Length of a allocated file in bytes. Sparse files are counted by actual
1349  * allocated space. Return < 0 if error or unknown.
1350  */
1351 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1352 {
1353     BlockDriver *drv = bs->drv;
1354     if (!drv) {
1355         return -ENOMEDIUM;
1356     }
1357     if (drv->bdrv_get_allocated_file_size) {
1358         return drv->bdrv_get_allocated_file_size(bs);
1359     }
1360     if (bs->file) {
1361         return bdrv_get_allocated_file_size(bs->file);
1362     }
1363     return -ENOTSUP;
1364 }
1365 
1366 /**
1367  * Length of a file in bytes. Return < 0 if error or unknown.
1368  */
1369 int64_t bdrv_getlength(BlockDriverState *bs)
1370 {
1371     BlockDriver *drv = bs->drv;
1372     if (!drv)
1373         return -ENOMEDIUM;
1374 
1375     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1376         if (drv->bdrv_getlength) {
1377             return drv->bdrv_getlength(bs);
1378         }
1379     }
1380     return bs->total_sectors * BDRV_SECTOR_SIZE;
1381 }
1382 
1383 /* return 0 as number of sectors if no device present or error */
1384 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1385 {
1386     int64_t length;
1387     length = bdrv_getlength(bs);
1388     if (length < 0)
1389         length = 0;
1390     else
1391         length = length >> BDRV_SECTOR_BITS;
1392     *nb_sectors_ptr = length;
1393 }
1394 
/*
 * One entry of the MSDOS partition table as read by guess_disk_lchs().
 * The structure is packed because it is overlaid on raw sector data.
 */
struct partition {
    uint8_t boot_ind;           /* 0x80 - active */
    uint8_t head;               /* starting head */
    uint8_t sector;             /* starting sector */
    uint8_t cyl;                /* starting cylinder */
    uint8_t sys_ind;            /* What partition type */
    uint8_t end_head;           /* end head */
    uint8_t end_sector;         /* end sector */
    uint8_t end_cyl;            /* end cylinder */
    uint32_t start_sect;        /* starting sector counting from 0 */
    uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
1407 
1408 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1409 static int guess_disk_lchs(BlockDriverState *bs,
1410                            int *pcylinders, int *pheads, int *psectors)
1411 {
1412     uint8_t buf[BDRV_SECTOR_SIZE];
1413     int ret, i, heads, sectors, cylinders;
1414     struct partition *p;
1415     uint32_t nr_sects;
1416     uint64_t nb_sectors;
1417 
1418     bdrv_get_geometry(bs, &nb_sectors);
1419 
1420     ret = bdrv_read(bs, 0, buf, 1);
1421     if (ret < 0)
1422         return -1;
1423     /* test msdos magic */
1424     if (buf[510] != 0x55 || buf[511] != 0xaa)
1425         return -1;
1426     for(i = 0; i < 4; i++) {
1427         p = ((struct partition *)(buf + 0x1be)) + i;
1428         nr_sects = le32_to_cpu(p->nr_sects);
1429         if (nr_sects && p->end_head) {
1430             /* We make the assumption that the partition terminates on
1431                a cylinder boundary */
1432             heads = p->end_head + 1;
1433             sectors = p->end_sector & 63;
1434             if (sectors == 0)
1435                 continue;
1436             cylinders = nb_sectors / (heads * sectors);
1437             if (cylinders < 1 || cylinders > 16383)
1438                 continue;
1439             *pheads = heads;
1440             *psectors = sectors;
1441             *pcylinders = cylinders;
1442 #if 0
1443             printf("guessed geometry: LCHS=%d %d %d\n",
1444                    cylinders, heads, sectors);
1445 #endif
1446             return 0;
1447         }
1448     }
1449     return -1;
1450 }
1451 
1452 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1453 {
1454     int translation, lba_detected = 0;
1455     int cylinders, heads, secs;
1456     uint64_t nb_sectors;
1457 
1458     /* if a geometry hint is available, use it */
1459     bdrv_get_geometry(bs, &nb_sectors);
1460     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1461     translation = bdrv_get_translation_hint(bs);
1462     if (cylinders != 0) {
1463         *pcyls = cylinders;
1464         *pheads = heads;
1465         *psecs = secs;
1466     } else {
1467         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1468             if (heads > 16) {
1469                 /* if heads > 16, it means that a BIOS LBA
1470                    translation was active, so the default
1471                    hardware geometry is OK */
1472                 lba_detected = 1;
1473                 goto default_geometry;
1474             } else {
1475                 *pcyls = cylinders;
1476                 *pheads = heads;
1477                 *psecs = secs;
1478                 /* disable any translation to be in sync with
1479                    the logical geometry */
1480                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1481                     bdrv_set_translation_hint(bs,
1482                                               BIOS_ATA_TRANSLATION_NONE);
1483                 }
1484             }
1485         } else {
1486         default_geometry:
1487             /* if no geometry, use a standard physical disk geometry */
1488             cylinders = nb_sectors / (16 * 63);
1489 
1490             if (cylinders > 16383)
1491                 cylinders = 16383;
1492             else if (cylinders < 2)
1493                 cylinders = 2;
1494             *pcyls = cylinders;
1495             *pheads = 16;
1496             *psecs = 63;
1497             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1498                 if ((*pcyls * *pheads) <= 131072) {
1499                     bdrv_set_translation_hint(bs,
1500                                               BIOS_ATA_TRANSLATION_LARGE);
1501                 } else {
1502                     bdrv_set_translation_hint(bs,
1503                                               BIOS_ATA_TRANSLATION_LBA);
1504                 }
1505             }
1506         }
1507         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1508     }
1509 }
1510 
1511 void bdrv_set_geometry_hint(BlockDriverState *bs,
1512                             int cyls, int heads, int secs)
1513 {
1514     bs->cyls = cyls;
1515     bs->heads = heads;
1516     bs->secs = secs;
1517 }
1518 
1519 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1520 {
1521     bs->translation = translation;
1522 }
1523 
1524 void bdrv_get_geometry_hint(BlockDriverState *bs,
1525                             int *pcyls, int *pheads, int *psecs)
1526 {
1527     *pcyls = bs->cyls;
1528     *pheads = bs->heads;
1529     *psecs = bs->secs;
1530 }
1531 
1532 /* Recognize floppy formats */
1533 typedef struct FDFormat {
1534     FDriveType drive;
1535     uint8_t last_sect;
1536     uint8_t max_track;
1537     uint8_t max_head;
1538 } FDFormat;
1539 
1540 static const FDFormat fd_formats[] = {
1541     /* First entry is default format */
1542     /* 1.44 MB 3"1/2 floppy disks */
1543     { FDRIVE_DRV_144, 18, 80, 1, },
1544     { FDRIVE_DRV_144, 20, 80, 1, },
1545     { FDRIVE_DRV_144, 21, 80, 1, },
1546     { FDRIVE_DRV_144, 21, 82, 1, },
1547     { FDRIVE_DRV_144, 21, 83, 1, },
1548     { FDRIVE_DRV_144, 22, 80, 1, },
1549     { FDRIVE_DRV_144, 23, 80, 1, },
1550     { FDRIVE_DRV_144, 24, 80, 1, },
1551     /* 2.88 MB 3"1/2 floppy disks */
1552     { FDRIVE_DRV_288, 36, 80, 1, },
1553     { FDRIVE_DRV_288, 39, 80, 1, },
1554     { FDRIVE_DRV_288, 40, 80, 1, },
1555     { FDRIVE_DRV_288, 44, 80, 1, },
1556     { FDRIVE_DRV_288, 48, 80, 1, },
1557     /* 720 kB 3"1/2 floppy disks */
1558     { FDRIVE_DRV_144,  9, 80, 1, },
1559     { FDRIVE_DRV_144, 10, 80, 1, },
1560     { FDRIVE_DRV_144, 10, 82, 1, },
1561     { FDRIVE_DRV_144, 10, 83, 1, },
1562     { FDRIVE_DRV_144, 13, 80, 1, },
1563     { FDRIVE_DRV_144, 14, 80, 1, },
1564     /* 1.2 MB 5"1/4 floppy disks */
1565     { FDRIVE_DRV_120, 15, 80, 1, },
1566     { FDRIVE_DRV_120, 18, 80, 1, },
1567     { FDRIVE_DRV_120, 18, 82, 1, },
1568     { FDRIVE_DRV_120, 18, 83, 1, },
1569     { FDRIVE_DRV_120, 20, 80, 1, },
1570     /* 720 kB 5"1/4 floppy disks */
1571     { FDRIVE_DRV_120,  9, 80, 1, },
1572     { FDRIVE_DRV_120, 11, 80, 1, },
1573     /* 360 kB 5"1/4 floppy disks */
1574     { FDRIVE_DRV_120,  9, 40, 1, },
1575     { FDRIVE_DRV_120,  9, 40, 0, },
1576     { FDRIVE_DRV_120, 10, 41, 1, },
1577     { FDRIVE_DRV_120, 10, 42, 1, },
1578     /* 320 kB 5"1/4 floppy disks */
1579     { FDRIVE_DRV_120,  8, 40, 1, },
1580     { FDRIVE_DRV_120,  8, 40, 0, },
1581     /* 360 kB must match 5"1/4 better than 3"1/2... */
1582     { FDRIVE_DRV_144,  9, 80, 0, },
1583     /* end */
1584     { FDRIVE_DRV_NONE, -1, -1, 0, },
1585 };
1586 
1587 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1588                                    int *max_track, int *last_sect,
1589                                    FDriveType drive_in, FDriveType *drive)
1590 {
1591     const FDFormat *parse;
1592     uint64_t nb_sectors, size;
1593     int i, first_match, match;
1594 
1595     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1596     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1597         /* User defined disk */
1598     } else {
1599         bdrv_get_geometry(bs, &nb_sectors);
1600         match = -1;
1601         first_match = -1;
1602         for (i = 0; ; i++) {
1603             parse = &fd_formats[i];
1604             if (parse->drive == FDRIVE_DRV_NONE) {
1605                 break;
1606             }
1607             if (drive_in == parse->drive ||
1608                 drive_in == FDRIVE_DRV_NONE) {
1609                 size = (parse->max_head + 1) * parse->max_track *
1610                     parse->last_sect;
1611                 if (nb_sectors == size) {
1612                     match = i;
1613                     break;
1614                 }
1615                 if (first_match == -1) {
1616                     first_match = i;
1617                 }
1618             }
1619         }
1620         if (match == -1) {
1621             if (first_match == -1) {
1622                 match = 1;
1623             } else {
1624                 match = first_match;
1625             }
1626             parse = &fd_formats[match];
1627         }
1628         *nb_heads = parse->max_head + 1;
1629         *max_track = parse->max_track;
1630         *last_sect = parse->last_sect;
1631         *drive = parse->drive;
1632     }
1633 }
1634 
1635 int bdrv_get_translation_hint(BlockDriverState *bs)
1636 {
1637     return bs->translation;
1638 }
1639 
1640 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1641                        BlockErrorAction on_write_error)
1642 {
1643     bs->on_read_error = on_read_error;
1644     bs->on_write_error = on_write_error;
1645 }
1646 
1647 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1648 {
1649     return is_read ? bs->on_read_error : bs->on_write_error;
1650 }
1651 
1652 int bdrv_is_read_only(BlockDriverState *bs)
1653 {
1654     return bs->read_only;
1655 }
1656 
1657 int bdrv_is_sg(BlockDriverState *bs)
1658 {
1659     return bs->sg;
1660 }
1661 
1662 int bdrv_enable_write_cache(BlockDriverState *bs)
1663 {
1664     return bs->enable_write_cache;
1665 }
1666 
1667 int bdrv_is_encrypted(BlockDriverState *bs)
1668 {
1669     if (bs->backing_hd && bs->backing_hd->encrypted)
1670         return 1;
1671     return bs->encrypted;
1672 }
1673 
1674 int bdrv_key_required(BlockDriverState *bs)
1675 {
1676     BlockDriverState *backing_hd = bs->backing_hd;
1677 
1678     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1679         return 1;
1680     return (bs->encrypted && !bs->valid_key);
1681 }
1682 
1683 int bdrv_set_key(BlockDriverState *bs, const char *key)
1684 {
1685     int ret;
1686     if (bs->backing_hd && bs->backing_hd->encrypted) {
1687         ret = bdrv_set_key(bs->backing_hd, key);
1688         if (ret < 0)
1689             return ret;
1690         if (!bs->encrypted)
1691             return 0;
1692     }
1693     if (!bs->encrypted) {
1694         return -EINVAL;
1695     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1696         return -ENOMEDIUM;
1697     }
1698     ret = bs->drv->bdrv_set_key(bs, key);
1699     if (ret < 0) {
1700         bs->valid_key = 0;
1701     } else if (!bs->valid_key) {
1702         bs->valid_key = 1;
1703         /* call the change callback now, we skipped it on open */
1704         bdrv_dev_change_media_cb(bs, true);
1705     }
1706     return ret;
1707 }
1708 
1709 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1710 {
1711     if (!bs->drv) {
1712         buf[0] = '\0';
1713     } else {
1714         pstrcpy(buf, buf_size, bs->drv->format_name);
1715     }
1716 }
1717 
1718 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1719                          void *opaque)
1720 {
1721     BlockDriver *drv;
1722 
1723     QLIST_FOREACH(drv, &bdrv_drivers, list) {
1724         it(opaque, drv->format_name);
1725     }
1726 }
1727 
1728 BlockDriverState *bdrv_find(const char *name)
1729 {
1730     BlockDriverState *bs;
1731 
1732     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1733         if (!strcmp(name, bs->device_name)) {
1734             return bs;
1735         }
1736     }
1737     return NULL;
1738 }
1739 
1740 BlockDriverState *bdrv_next(BlockDriverState *bs)
1741 {
1742     if (!bs) {
1743         return QTAILQ_FIRST(&bdrv_states);
1744     }
1745     return QTAILQ_NEXT(bs, list);
1746 }
1747 
1748 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1749 {
1750     BlockDriverState *bs;
1751 
1752     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1753         it(opaque, bs);
1754     }
1755 }
1756 
1757 const char *bdrv_get_device_name(BlockDriverState *bs)
1758 {
1759     return bs->device_name;
1760 }
1761 
1762 int bdrv_flush(BlockDriverState *bs)
1763 {
1764     if (bs->open_flags & BDRV_O_NO_FLUSH) {
1765         return 0;
1766     }
1767 
1768     if (bs->drv && bdrv_has_async_flush(bs->drv) && qemu_in_coroutine()) {
1769         return bdrv_co_flush_em(bs);
1770     }
1771 
1772     if (bs->drv && bs->drv->bdrv_flush) {
1773         return bs->drv->bdrv_flush(bs);
1774     }
1775 
1776     /*
1777      * Some block drivers always operate in either writethrough or unsafe mode
1778      * and don't support bdrv_flush therefore. Usually qemu doesn't know how
1779      * the server works (because the behaviour is hardcoded or depends on
1780      * server-side configuration), so we can't ensure that everything is safe
1781      * on disk. Returning an error doesn't work because that would break guests
1782      * even if the server operates in writethrough mode.
1783      *
1784      * Let's hope the user knows what he's doing.
1785      */
1786     return 0;
1787 }
1788 
1789 void bdrv_flush_all(void)
1790 {
1791     BlockDriverState *bs;
1792 
1793     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1794         if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
1795             bdrv_flush(bs);
1796         }
1797     }
1798 }
1799 
1800 int bdrv_has_zero_init(BlockDriverState *bs)
1801 {
1802     assert(bs->drv);
1803 
1804     if (bs->drv->bdrv_has_zero_init) {
1805         return bs->drv->bdrv_has_zero_init(bs);
1806     }
1807 
1808     return 1;
1809 }
1810 
1811 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
1812 {
1813     if (!bs->drv) {
1814         return -ENOMEDIUM;
1815     }
1816     if (!bs->drv->bdrv_discard) {
1817         return 0;
1818     }
1819     return bs->drv->bdrv_discard(bs, sector_num, nb_sectors);
1820 }
1821 
1822 /*
1823  * Returns true iff the specified sector is present in the disk image. Drivers
1824  * not implementing the functionality are assumed to not support backing files,
1825  * hence all their sectors are reported as allocated.
1826  *
1827  * 'pnum' is set to the number of sectors (including and immediately following
1828  * the specified sector) that are known to be in the same
1829  * allocated/unallocated state.
1830  *
1831  * 'nb_sectors' is the max value 'pnum' should be set to.
1832  */
1833 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1834 	int *pnum)
1835 {
1836     int64_t n;
1837     if (!bs->drv->bdrv_is_allocated) {
1838         if (sector_num >= bs->total_sectors) {
1839             *pnum = 0;
1840             return 0;
1841         }
1842         n = bs->total_sectors - sector_num;
1843         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1844         return 1;
1845     }
1846     return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1847 }
1848 
1849 void bdrv_mon_event(const BlockDriverState *bdrv,
1850                     BlockMonEventAction action, int is_read)
1851 {
1852     QObject *data;
1853     const char *action_str;
1854 
1855     switch (action) {
1856     case BDRV_ACTION_REPORT:
1857         action_str = "report";
1858         break;
1859     case BDRV_ACTION_IGNORE:
1860         action_str = "ignore";
1861         break;
1862     case BDRV_ACTION_STOP:
1863         action_str = "stop";
1864         break;
1865     default:
1866         abort();
1867     }
1868 
1869     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1870                               bdrv->device_name,
1871                               action_str,
1872                               is_read ? "read" : "write");
1873     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1874 
1875     qobject_decref(data);
1876 }
1877 
1878 static void bdrv_print_dict(QObject *obj, void *opaque)
1879 {
1880     QDict *bs_dict;
1881     Monitor *mon = opaque;
1882 
1883     bs_dict = qobject_to_qdict(obj);
1884 
1885     monitor_printf(mon, "%s: removable=%d",
1886                         qdict_get_str(bs_dict, "device"),
1887                         qdict_get_bool(bs_dict, "removable"));
1888 
1889     if (qdict_get_bool(bs_dict, "removable")) {
1890         monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked"));
1891         monitor_printf(mon, " tray-open=%d",
1892                        qdict_get_bool(bs_dict, "tray-open"));
1893     }
1894 
1895     if (qdict_haskey(bs_dict, "io-status")) {
1896         monitor_printf(mon, " io-status=%s", qdict_get_str(bs_dict, "io-status"));
1897     }
1898 
1899     if (qdict_haskey(bs_dict, "inserted")) {
1900         QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted"));
1901 
1902         monitor_printf(mon, " file=");
1903         monitor_print_filename(mon, qdict_get_str(qdict, "file"));
1904         if (qdict_haskey(qdict, "backing_file")) {
1905             monitor_printf(mon, " backing_file=");
1906             monitor_print_filename(mon, qdict_get_str(qdict, "backing_file"));
1907         }
1908         monitor_printf(mon, " ro=%d drv=%s encrypted=%d",
1909                             qdict_get_bool(qdict, "ro"),
1910                             qdict_get_str(qdict, "drv"),
1911                             qdict_get_bool(qdict, "encrypted"));
1912     } else {
1913         monitor_printf(mon, " [not inserted]");
1914     }
1915 
1916     monitor_printf(mon, "\n");
1917 }
1918 
1919 void bdrv_info_print(Monitor *mon, const QObject *data)
1920 {
1921     qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon);
1922 }
1923 
1924 static const char *const io_status_name[BDRV_IOS_MAX] = {
1925     [BDRV_IOS_OK] = "ok",
1926     [BDRV_IOS_FAILED] = "failed",
1927     [BDRV_IOS_ENOSPC] = "nospace",
1928 };
1929 
1930 void bdrv_info(Monitor *mon, QObject **ret_data)
1931 {
1932     QList *bs_list;
1933     BlockDriverState *bs;
1934 
1935     bs_list = qlist_new();
1936 
1937     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1938         QObject *bs_obj;
1939         QDict *bs_dict;
1940 
1941         bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': 'unknown', "
1942                                     "'removable': %i, 'locked': %i }",
1943                                     bs->device_name,
1944                                     bdrv_dev_has_removable_media(bs),
1945                                     bdrv_dev_is_medium_locked(bs));
1946         bs_dict = qobject_to_qdict(bs_obj);
1947 
1948         if (bdrv_dev_has_removable_media(bs)) {
1949             qdict_put(bs_dict, "tray-open",
1950                       qbool_from_int(bdrv_dev_is_tray_open(bs)));
1951         }
1952 
1953         if (bdrv_iostatus_is_enabled(bs)) {
1954             qdict_put(bs_dict, "io-status",
1955                       qstring_from_str(io_status_name[bs->iostatus]));
1956         }
1957 
1958         if (bs->drv) {
1959             QObject *obj;
1960 
1961             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
1962                                      "'encrypted': %i }",
1963                                      bs->filename, bs->read_only,
1964                                      bs->drv->format_name,
1965                                      bdrv_is_encrypted(bs));
1966             if (bs->backing_file[0] != '\0') {
1967                 QDict *qdict = qobject_to_qdict(obj);
1968                 qdict_put(qdict, "backing_file",
1969                           qstring_from_str(bs->backing_file));
1970             }
1971 
1972             qdict_put_obj(bs_dict, "inserted", obj);
1973         }
1974         qlist_append_obj(bs_list, bs_obj);
1975     }
1976 
1977     *ret_data = QOBJECT(bs_list);
1978 }
1979 
1980 static void bdrv_stats_iter(QObject *data, void *opaque)
1981 {
1982     QDict *qdict;
1983     Monitor *mon = opaque;
1984 
1985     qdict = qobject_to_qdict(data);
1986     monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1987 
1988     qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1989     monitor_printf(mon, " rd_bytes=%" PRId64
1990                         " wr_bytes=%" PRId64
1991                         " rd_operations=%" PRId64
1992                         " wr_operations=%" PRId64
1993                         " flush_operations=%" PRId64
1994                         " wr_total_time_ns=%" PRId64
1995                         " rd_total_time_ns=%" PRId64
1996                         " flush_total_time_ns=%" PRId64
1997                         "\n",
1998                         qdict_get_int(qdict, "rd_bytes"),
1999                         qdict_get_int(qdict, "wr_bytes"),
2000                         qdict_get_int(qdict, "rd_operations"),
2001                         qdict_get_int(qdict, "wr_operations"),
2002                         qdict_get_int(qdict, "flush_operations"),
2003                         qdict_get_int(qdict, "wr_total_time_ns"),
2004                         qdict_get_int(qdict, "rd_total_time_ns"),
2005                         qdict_get_int(qdict, "flush_total_time_ns"));
2006 }
2007 
2008 void bdrv_stats_print(Monitor *mon, const QObject *data)
2009 {
2010     qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
2011 }
2012 
2013 static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
2014 {
2015     QObject *res;
2016     QDict *dict;
2017 
2018     res = qobject_from_jsonf("{ 'stats': {"
2019                              "'rd_bytes': %" PRId64 ","
2020                              "'wr_bytes': %" PRId64 ","
2021                              "'rd_operations': %" PRId64 ","
2022                              "'wr_operations': %" PRId64 ","
2023                              "'wr_highest_offset': %" PRId64 ","
2024                              "'flush_operations': %" PRId64 ","
2025                              "'wr_total_time_ns': %" PRId64 ","
2026                              "'rd_total_time_ns': %" PRId64 ","
2027                              "'flush_total_time_ns': %" PRId64
2028                              "} }",
2029                              bs->nr_bytes[BDRV_ACCT_READ],
2030                              bs->nr_bytes[BDRV_ACCT_WRITE],
2031                              bs->nr_ops[BDRV_ACCT_READ],
2032                              bs->nr_ops[BDRV_ACCT_WRITE],
2033                              bs->wr_highest_sector *
2034                              (uint64_t)BDRV_SECTOR_SIZE,
2035                              bs->nr_ops[BDRV_ACCT_FLUSH],
2036                              bs->total_time_ns[BDRV_ACCT_WRITE],
2037                              bs->total_time_ns[BDRV_ACCT_READ],
2038                              bs->total_time_ns[BDRV_ACCT_FLUSH]);
2039     dict  = qobject_to_qdict(res);
2040 
2041     if (*bs->device_name) {
2042         qdict_put(dict, "device", qstring_from_str(bs->device_name));
2043     }
2044 
2045     if (bs->file) {
2046         QObject *parent = bdrv_info_stats_bs(bs->file);
2047         qdict_put_obj(dict, "parent", parent);
2048     }
2049 
2050     return res;
2051 }
2052 
2053 void bdrv_info_stats(Monitor *mon, QObject **ret_data)
2054 {
2055     QObject *obj;
2056     QList *devices;
2057     BlockDriverState *bs;
2058 
2059     devices = qlist_new();
2060 
2061     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2062         obj = bdrv_info_stats_bs(bs);
2063         qlist_append_obj(devices, obj);
2064     }
2065 
2066     *ret_data = QOBJECT(devices);
2067 }
2068 
2069 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2070 {
2071     if (bs->backing_hd && bs->backing_hd->encrypted)
2072         return bs->backing_file;
2073     else if (bs->encrypted)
2074         return bs->filename;
2075     else
2076         return NULL;
2077 }
2078 
2079 void bdrv_get_backing_filename(BlockDriverState *bs,
2080                                char *filename, int filename_size)
2081 {
2082     if (!bs->backing_file) {
2083         pstrcpy(filename, filename_size, "");
2084     } else {
2085         pstrcpy(filename, filename_size, bs->backing_file);
2086     }
2087 }
2088 
2089 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2090                           const uint8_t *buf, int nb_sectors)
2091 {
2092     BlockDriver *drv = bs->drv;
2093     if (!drv)
2094         return -ENOMEDIUM;
2095     if (!drv->bdrv_write_compressed)
2096         return -ENOTSUP;
2097     if (bdrv_check_request(bs, sector_num, nb_sectors))
2098         return -EIO;
2099 
2100     if (bs->dirty_bitmap) {
2101         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2102     }
2103 
2104     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2105 }
2106 
2107 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2108 {
2109     BlockDriver *drv = bs->drv;
2110     if (!drv)
2111         return -ENOMEDIUM;
2112     if (!drv->bdrv_get_info)
2113         return -ENOTSUP;
2114     memset(bdi, 0, sizeof(*bdi));
2115     return drv->bdrv_get_info(bs, bdi);
2116 }
2117 
2118 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2119                       int64_t pos, int size)
2120 {
2121     BlockDriver *drv = bs->drv;
2122     if (!drv)
2123         return -ENOMEDIUM;
2124     if (drv->bdrv_save_vmstate)
2125         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2126     if (bs->file)
2127         return bdrv_save_vmstate(bs->file, buf, pos, size);
2128     return -ENOTSUP;
2129 }
2130 
2131 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2132                       int64_t pos, int size)
2133 {
2134     BlockDriver *drv = bs->drv;
2135     if (!drv)
2136         return -ENOMEDIUM;
2137     if (drv->bdrv_load_vmstate)
2138         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2139     if (bs->file)
2140         return bdrv_load_vmstate(bs->file, buf, pos, size);
2141     return -ENOTSUP;
2142 }
2143 
2144 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2145 {
2146     BlockDriver *drv = bs->drv;
2147 
2148     if (!drv || !drv->bdrv_debug_event) {
2149         return;
2150     }
2151 
2152     return drv->bdrv_debug_event(bs, event);
2153 
2154 }
2155 
2156 /**************************************************************/
2157 /* handling of snapshots */
2158 
2159 int bdrv_can_snapshot(BlockDriverState *bs)
2160 {
2161     BlockDriver *drv = bs->drv;
2162     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2163         return 0;
2164     }
2165 
2166     if (!drv->bdrv_snapshot_create) {
2167         if (bs->file != NULL) {
2168             return bdrv_can_snapshot(bs->file);
2169         }
2170         return 0;
2171     }
2172 
2173     return 1;
2174 }
2175 
2176 int bdrv_is_snapshot(BlockDriverState *bs)
2177 {
2178     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2179 }
2180 
2181 BlockDriverState *bdrv_snapshots(void)
2182 {
2183     BlockDriverState *bs;
2184 
2185     if (bs_snapshots) {
2186         return bs_snapshots;
2187     }
2188 
2189     bs = NULL;
2190     while ((bs = bdrv_next(bs))) {
2191         if (bdrv_can_snapshot(bs)) {
2192             bs_snapshots = bs;
2193             return bs;
2194         }
2195     }
2196     return NULL;
2197 }
2198 
2199 int bdrv_snapshot_create(BlockDriverState *bs,
2200                          QEMUSnapshotInfo *sn_info)
2201 {
2202     BlockDriver *drv = bs->drv;
2203     if (!drv)
2204         return -ENOMEDIUM;
2205     if (drv->bdrv_snapshot_create)
2206         return drv->bdrv_snapshot_create(bs, sn_info);
2207     if (bs->file)
2208         return bdrv_snapshot_create(bs->file, sn_info);
2209     return -ENOTSUP;
2210 }
2211 
2212 int bdrv_snapshot_goto(BlockDriverState *bs,
2213                        const char *snapshot_id)
2214 {
2215     BlockDriver *drv = bs->drv;
2216     int ret, open_ret;
2217 
2218     if (!drv)
2219         return -ENOMEDIUM;
2220     if (drv->bdrv_snapshot_goto)
2221         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2222 
2223     if (bs->file) {
2224         drv->bdrv_close(bs);
2225         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2226         open_ret = drv->bdrv_open(bs, bs->open_flags);
2227         if (open_ret < 0) {
2228             bdrv_delete(bs->file);
2229             bs->drv = NULL;
2230             return open_ret;
2231         }
2232         return ret;
2233     }
2234 
2235     return -ENOTSUP;
2236 }
2237 
2238 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2239 {
2240     BlockDriver *drv = bs->drv;
2241     if (!drv)
2242         return -ENOMEDIUM;
2243     if (drv->bdrv_snapshot_delete)
2244         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2245     if (bs->file)
2246         return bdrv_snapshot_delete(bs->file, snapshot_id);
2247     return -ENOTSUP;
2248 }
2249 
2250 int bdrv_snapshot_list(BlockDriverState *bs,
2251                        QEMUSnapshotInfo **psn_info)
2252 {
2253     BlockDriver *drv = bs->drv;
2254     if (!drv)
2255         return -ENOMEDIUM;
2256     if (drv->bdrv_snapshot_list)
2257         return drv->bdrv_snapshot_list(bs, psn_info);
2258     if (bs->file)
2259         return bdrv_snapshot_list(bs->file, psn_info);
2260     return -ENOTSUP;
2261 }
2262 
2263 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2264         const char *snapshot_name)
2265 {
2266     BlockDriver *drv = bs->drv;
2267     if (!drv) {
2268         return -ENOMEDIUM;
2269     }
2270     if (!bs->read_only) {
2271         return -EINVAL;
2272     }
2273     if (drv->bdrv_snapshot_load_tmp) {
2274         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2275     }
2276     return -ENOTSUP;
2277 }
2278 
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (capacity @buf_size) as a short human-readable
 * string and return @buf.
 *
 * Values up to 999 are printed as plain integers.  Larger values are scaled
 * by powers of 1024 and suffixed with K, M, G or T: one decimal is shown
 * while the scaled value is below 10, otherwise the value is rounded to the
 * nearest integer.  Anything too large for the other units is expressed
 * in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < 10 * unit) {
            /* Small multiple of the unit: keep one fractional digit */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            return buf;
        }
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            /* Round to nearest; the last unit absorbs everything larger */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            return buf;
        }
    }
    return buf;
}
2308 
2309 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2310 {
2311     char buf1[128], date_buf[128], clock_buf[128];
2312 #ifdef _WIN32
2313     struct tm *ptm;
2314 #else
2315     struct tm tm;
2316 #endif
2317     time_t ti;
2318     int64_t secs;
2319 
2320     if (!sn) {
2321         snprintf(buf, buf_size,
2322                  "%-10s%-20s%7s%20s%15s",
2323                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2324     } else {
2325         ti = sn->date_sec;
2326 #ifdef _WIN32
2327         ptm = localtime(&ti);
2328         strftime(date_buf, sizeof(date_buf),
2329                  "%Y-%m-%d %H:%M:%S", ptm);
2330 #else
2331         localtime_r(&ti, &tm);
2332         strftime(date_buf, sizeof(date_buf),
2333                  "%Y-%m-%d %H:%M:%S", &tm);
2334 #endif
2335         secs = sn->vm_clock_nsec / 1000000000;
2336         snprintf(clock_buf, sizeof(clock_buf),
2337                  "%02d:%02d:%02d.%03d",
2338                  (int)(secs / 3600),
2339                  (int)((secs / 60) % 60),
2340                  (int)(secs % 60),
2341                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2342         snprintf(buf, buf_size,
2343                  "%-10s%-20s%7s%20s%15s",
2344                  sn->id_str, sn->name,
2345                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2346                  date_buf,
2347                  clock_buf);
2348     }
2349     return buf;
2350 }
2351 
2352 /**************************************************************/
2353 /* async I/Os */
2354 
2355 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2356                                  QEMUIOVector *qiov, int nb_sectors,
2357                                  BlockDriverCompletionFunc *cb, void *opaque)
2358 {
2359     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2360 
2361     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2362                                  cb, opaque, false);
2363 }
2364 
2365 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2366                                   QEMUIOVector *qiov, int nb_sectors,
2367                                   BlockDriverCompletionFunc *cb, void *opaque)
2368 {
2369     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2370 
2371     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2372                                  cb, opaque, true);
2373 }
2374 
2375 
/*
 * Shared completion state of one bdrv_aio_multiwrite() call.
 */
typedef struct MultiwriteCB {
    int error;          /* first negative result seen; 0 if none */
    int num_requests;   /* submitted requests still outstanding */
    int num_callbacks;  /* number of entries in callbacks[] */
    struct {
        BlockDriverCompletionFunc *cb;  /* caller's completion function */
        void *opaque;                   /* argument passed to cb */
        QEMUIOVector *free_qiov;        /* merged vector to destroy and free */
        void *free_buf;                 /* zero-fill buffer to free */
    } callbacks[];
} MultiwriteCB;
2387 
2388 static void multiwrite_user_cb(MultiwriteCB *mcb)
2389 {
2390     int i;
2391 
2392     for (i = 0; i < mcb->num_callbacks; i++) {
2393         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2394         if (mcb->callbacks[i].free_qiov) {
2395             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2396         }
2397         g_free(mcb->callbacks[i].free_qiov);
2398         qemu_vfree(mcb->callbacks[i].free_buf);
2399     }
2400 }
2401 
2402 static void multiwrite_cb(void *opaque, int ret)
2403 {
2404     MultiwriteCB *mcb = opaque;
2405 
2406     trace_multiwrite_cb(mcb, ret);
2407 
2408     if (ret < 0 && !mcb->error) {
2409         mcb->error = ret;
2410     }
2411 
2412     mcb->num_requests--;
2413     if (mcb->num_requests == 0) {
2414         multiwrite_user_cb(mcb);
2415         g_free(mcb);
2416     }
2417 }
2418 
2419 static int multiwrite_req_compare(const void *a, const void *b)
2420 {
2421     const BlockRequest *req1 = a, *req2 = b;
2422 
2423     /*
2424      * Note that we can't simply subtract req2->sector from req1->sector
2425      * here as that could overflow the return value.
2426      */
2427     if (req1->sector > req2->sector) {
2428         return 1;
2429     } else if (req1->sector < req2->sector) {
2430         return -1;
2431     } else {
2432         return 0;
2433     }
2434 }
2435 
2436 /*
2437  * Takes a bunch of requests and tries to merge them. Returns the number of
2438  * requests that remain after merging.
2439  */
2440 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2441     int num_reqs, MultiwriteCB *mcb)
2442 {
2443     int i, outidx;
2444 
2445     // Sort requests by start sector
2446     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2447 
2448     // Check if adjacent requests touch the same clusters. If so, combine them,
2449     // filling up gaps with zero sectors.
2450     outidx = 0;
2451     for (i = 1; i < num_reqs; i++) {
2452         int merge = 0;
2453         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2454 
2455         // This handles the cases that are valid for all block drivers, namely
2456         // exactly sequential writes and overlapping writes.
2457         if (reqs[i].sector <= oldreq_last) {
2458             merge = 1;
2459         }
2460 
2461         // The block driver may decide that it makes sense to combine requests
2462         // even if there is a gap of some sectors between them. In this case,
2463         // the gap is filled with zeros (therefore only applicable for yet
2464         // unused space in format like qcow2).
2465         if (!merge && bs->drv->bdrv_merge_requests) {
2466             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2467         }
2468 
2469         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2470             merge = 0;
2471         }
2472 
2473         if (merge) {
2474             size_t size;
2475             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2476             qemu_iovec_init(qiov,
2477                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2478 
2479             // Add the first request to the merged one. If the requests are
2480             // overlapping, drop the last sectors of the first request.
2481             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2482             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2483 
2484             // We might need to add some zeros between the two requests
2485             if (reqs[i].sector > oldreq_last) {
2486                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2487                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2488                 memset(buf, 0, zero_bytes);
2489                 qemu_iovec_add(qiov, buf, zero_bytes);
2490                 mcb->callbacks[i].free_buf = buf;
2491             }
2492 
2493             // Add the second request
2494             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2495 
2496             reqs[outidx].nb_sectors = qiov->size >> 9;
2497             reqs[outidx].qiov = qiov;
2498 
2499             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2500         } else {
2501             outidx++;
2502             reqs[outidx].sector     = reqs[i].sector;
2503             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2504             reqs[outidx].qiov       = reqs[i].qiov;
2505         }
2506     }
2507 
2508     return outidx + 1;
2509 }
2510 
2511 /*
2512  * Submit multiple AIO write requests at once.
2513  *
2514  * On success, the function returns 0 and all requests in the reqs array have
2515  * been submitted. In error case this function returns -1, and any of the
2516  * requests may or may not be submitted yet. In particular, this means that the
2517  * callback will be called for some of the requests, for others it won't. The
2518  * caller must check the error field of the BlockRequest to wait for the right
2519  * callbacks (if error != 0, no callback will be called).
2520  *
2521  * The implementation may modify the contents of the reqs array, e.g. to merge
2522  * requests. However, the fields opaque and error are left unmodified as they
2523  * are used to signal failure for a single request to the caller.
2524  */
2525 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2526 {
2527     BlockDriverAIOCB *acb;
2528     MultiwriteCB *mcb;
2529     int i;
2530 
2531     /* don't submit writes if we don't have a medium */
2532     if (bs->drv == NULL) {
2533         for (i = 0; i < num_reqs; i++) {
2534             reqs[i].error = -ENOMEDIUM;
2535         }
2536         return -1;
2537     }
2538 
2539     if (num_reqs == 0) {
2540         return 0;
2541     }
2542 
2543     // Create MultiwriteCB structure
2544     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2545     mcb->num_requests = 0;
2546     mcb->num_callbacks = num_reqs;
2547 
2548     for (i = 0; i < num_reqs; i++) {
2549         mcb->callbacks[i].cb = reqs[i].cb;
2550         mcb->callbacks[i].opaque = reqs[i].opaque;
2551     }
2552 
2553     // Check for mergable requests
2554     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2555 
2556     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2557 
2558     /*
2559      * Run the aio requests. As soon as one request can't be submitted
2560      * successfully, fail all requests that are not yet submitted (we must
2561      * return failure for all requests anyway)
2562      *
2563      * num_requests cannot be set to the right value immediately: If
2564      * bdrv_aio_writev fails for some request, num_requests would be too high
2565      * and therefore multiwrite_cb() would never recognize the multiwrite
2566      * request as completed. We also cannot use the loop variable i to set it
2567      * when the first request fails because the callback may already have been
2568      * called for previously submitted requests. Thus, num_requests must be
2569      * incremented for each request that is submitted.
2570      *
2571      * The problem that callbacks may be called early also means that we need
2572      * to take care that num_requests doesn't become 0 before all requests are
2573      * submitted - multiwrite_cb() would consider the multiwrite request
2574      * completed. A dummy request that is "completed" by a manual call to
2575      * multiwrite_cb() takes care of this.
2576      */
2577     mcb->num_requests = 1;
2578 
2579     // Run the aio requests
2580     for (i = 0; i < num_reqs; i++) {
2581         mcb->num_requests++;
2582         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2583             reqs[i].nb_sectors, multiwrite_cb, mcb);
2584 
2585         if (acb == NULL) {
2586             // We can only fail the whole thing if no request has been
2587             // submitted yet. Otherwise we'll wait for the submitted AIOs to
2588             // complete and report the error in the callback.
2589             if (i == 0) {
2590                 trace_bdrv_aio_multiwrite_earlyfail(mcb);
2591                 goto fail;
2592             } else {
2593                 trace_bdrv_aio_multiwrite_latefail(mcb, i);
2594                 multiwrite_cb(mcb, -EIO);
2595                 break;
2596             }
2597         }
2598     }
2599 
2600     /* Complete the dummy request */
2601     multiwrite_cb(mcb, 0);
2602 
2603     return 0;
2604 
2605 fail:
2606     for (i = 0; i < mcb->num_callbacks; i++) {
2607         reqs[i].error = -EIO;
2608     }
2609     g_free(mcb);
2610     return -1;
2611 }
2612 
2613 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2614         BlockDriverCompletionFunc *cb, void *opaque)
2615 {
2616     BlockDriver *drv = bs->drv;
2617 
2618     trace_bdrv_aio_flush(bs, opaque);
2619 
2620     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2621         return bdrv_aio_noop_em(bs, cb, opaque);
2622     }
2623 
2624     if (!drv)
2625         return NULL;
2626     return drv->bdrv_aio_flush(bs, cb, opaque);
2627 }
2628 
2629 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2630 {
2631     acb->pool->cancel(acb);
2632 }
2633 
2634 
2635 /**************************************************************/
2636 /* async block device emulation */
2637 
/*
 * AIOCB used by the emulated asynchronous operations in this section: the
 * result is stored in ret and delivered from the bottom half bh.
 */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;         /* bottom half that delivers the completion */
    int ret;            /* result passed to the completion callback */
    /* vector translation state */
    QEMUIOVector *qiov; /* caller's vector */
    uint8_t *bounce;    /* linear buffer used in place of qiov */
    int is_write;       /* non-zero: no copy back to qiov on completion */
} BlockDriverAIOCBSync;
2647 
2648 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2649 {
2650     BlockDriverAIOCBSync *acb =
2651         container_of(blockacb, BlockDriverAIOCBSync, common);
2652     qemu_bh_delete(acb->bh);
2653     acb->bh = NULL;
2654     qemu_aio_release(acb);
2655 }
2656 
/* AIOCB pool for the bottom-half based emulation (BlockDriverAIOCBSync) */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
2661 
2662 static void bdrv_aio_bh_cb(void *opaque)
2663 {
2664     BlockDriverAIOCBSync *acb = opaque;
2665 
2666     if (!acb->is_write)
2667         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2668     qemu_vfree(acb->bounce);
2669     acb->common.cb(acb->common.opaque, acb->ret);
2670     qemu_bh_delete(acb->bh);
2671     acb->bh = NULL;
2672     qemu_aio_release(acb);
2673 }
2674 
2675 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2676                                             int64_t sector_num,
2677                                             QEMUIOVector *qiov,
2678                                             int nb_sectors,
2679                                             BlockDriverCompletionFunc *cb,
2680                                             void *opaque,
2681                                             int is_write)
2682 
2683 {
2684     BlockDriverAIOCBSync *acb;
2685 
2686     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2687     acb->is_write = is_write;
2688     acb->qiov = qiov;
2689     acb->bounce = qemu_blockalign(bs, qiov->size);
2690 
2691     if (!acb->bh)
2692         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2693 
2694     if (is_write) {
2695         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2696         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2697     } else {
2698         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2699     }
2700 
2701     qemu_bh_schedule(acb->bh);
2702 
2703     return &acb->common;
2704 }
2705 
2706 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2707         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2708         BlockDriverCompletionFunc *cb, void *opaque)
2709 {
2710     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2711 }
2712 
2713 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2714         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2715         BlockDriverCompletionFunc *cb, void *opaque)
2716 {
2717     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2718 }
2719 
2720 
/*
 * AIOCB used by the coroutine based AIO emulation (bdrv_co_aio_rw_vector).
 */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* sector, nb_sectors, qiov and resulting error */
    bool is_write;      /* true for writes, false for reads */
    QEMUBH* bh;         /* bottom half that delivers the completion */
} BlockDriverAIOCBCoroutine;
2727 
/*
 * Cancel hook of bdrv_em_co_aio_pool.  @blockacb is not inspected; the
 * function only calls qemu_aio_flush().
 */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
2732 
/* AIOCB pool for the coroutine based emulation (BlockDriverAIOCBCoroutine) */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
2737 
2738 static void bdrv_co_rw_bh(void *opaque)
2739 {
2740     BlockDriverAIOCBCoroutine *acb = opaque;
2741 
2742     acb->common.cb(acb->common.opaque, acb->req.error);
2743     qemu_bh_delete(acb->bh);
2744     qemu_aio_release(acb);
2745 }
2746 
2747 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2748 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2749 {
2750     BlockDriverAIOCBCoroutine *acb = opaque;
2751     BlockDriverState *bs = acb->common.bs;
2752 
2753     if (!acb->is_write) {
2754         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2755             acb->req.nb_sectors, acb->req.qiov);
2756     } else {
2757         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2758             acb->req.nb_sectors, acb->req.qiov);
2759     }
2760 
2761     acb->bh = qemu_bh_new(bdrv_co_rw_bh, acb);
2762     qemu_bh_schedule(acb->bh);
2763 }
2764 
2765 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2766                                                int64_t sector_num,
2767                                                QEMUIOVector *qiov,
2768                                                int nb_sectors,
2769                                                BlockDriverCompletionFunc *cb,
2770                                                void *opaque,
2771                                                bool is_write)
2772 {
2773     Coroutine *co;
2774     BlockDriverAIOCBCoroutine *acb;
2775 
2776     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2777     acb->req.sector = sector_num;
2778     acb->req.nb_sectors = nb_sectors;
2779     acb->req.qiov = qiov;
2780     acb->is_write = is_write;
2781 
2782     co = qemu_coroutine_create(bdrv_co_do_rw);
2783     qemu_coroutine_enter(co, acb);
2784 
2785     return &acb->common;
2786 }
2787 
2788 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
2789         BlockDriverCompletionFunc *cb, void *opaque)
2790 {
2791     BlockDriverAIOCBSync *acb;
2792 
2793     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2794     acb->is_write = 1; /* don't bounce in the completion hadler */
2795     acb->qiov = NULL;
2796     acb->bounce = NULL;
2797     acb->ret = 0;
2798 
2799     if (!acb->bh)
2800         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2801 
2802     bdrv_flush(bs);
2803     qemu_bh_schedule(acb->bh);
2804     return &acb->common;
2805 }
2806 
2807 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
2808         BlockDriverCompletionFunc *cb, void *opaque)
2809 {
2810     BlockDriverAIOCBSync *acb;
2811 
2812     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2813     acb->is_write = 1; /* don't bounce in the completion handler */
2814     acb->qiov = NULL;
2815     acb->bounce = NULL;
2816     acb->ret = 0;
2817 
2818     if (!acb->bh) {
2819         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2820     }
2821 
2822     qemu_bh_schedule(acb->bh);
2823     return &acb->common;
2824 }
2825 
/* Run the module initialisers registered for MODULE_INIT_BLOCK */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
2830 
/* Like bdrv_init(), but sets use_bdrv_whitelist to 1 first */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
2836 
2837 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2838                    BlockDriverCompletionFunc *cb, void *opaque)
2839 {
2840     BlockDriverAIOCB *acb;
2841 
2842     if (pool->free_aiocb) {
2843         acb = pool->free_aiocb;
2844         pool->free_aiocb = acb->next;
2845     } else {
2846         acb = g_malloc0(pool->aiocb_size);
2847         acb->pool = pool;
2848     }
2849     acb->bs = bs;
2850     acb->cb = cb;
2851     acb->opaque = opaque;
2852     return acb;
2853 }
2854 
2855 void qemu_aio_release(void *p)
2856 {
2857     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2858     AIOPool *pool = acb->pool;
2859     acb->next = pool->free_aiocb;
2860     pool->free_aiocb = acb;
2861 }
2862 
2863 /**************************************************************/
2864 /* Coroutine block device emulation */
2865 
/* Hand-over record between an AIO completion callback and a coroutine */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* result of the AIO request */
} CoroutineIOCompletion;
2870 
2871 static void bdrv_co_io_em_complete(void *opaque, int ret)
2872 {
2873     CoroutineIOCompletion *co = opaque;
2874 
2875     co->ret = ret;
2876     qemu_coroutine_enter(co->coroutine, NULL);
2877 }
2878 
2879 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2880                                       int nb_sectors, QEMUIOVector *iov,
2881                                       bool is_write)
2882 {
2883     CoroutineIOCompletion co = {
2884         .coroutine = qemu_coroutine_self(),
2885     };
2886     BlockDriverAIOCB *acb;
2887 
2888     if (is_write) {
2889         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2890                                        bdrv_co_io_em_complete, &co);
2891     } else {
2892         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2893                                       bdrv_co_io_em_complete, &co);
2894     }
2895 
2896     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2897     if (!acb) {
2898         return -EIO;
2899     }
2900     qemu_coroutine_yield();
2901 
2902     return co.ret;
2903 }
2904 
2905 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2906                                          int64_t sector_num, int nb_sectors,
2907                                          QEMUIOVector *iov)
2908 {
2909     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2910 }
2911 
2912 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2913                                          int64_t sector_num, int nb_sectors,
2914                                          QEMUIOVector *iov)
2915 {
2916     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2917 }
2918 
2919 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs)
2920 {
2921     CoroutineIOCompletion co = {
2922         .coroutine = qemu_coroutine_self(),
2923     };
2924     BlockDriverAIOCB *acb;
2925 
2926     acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2927     if (!acb) {
2928         return -EIO;
2929     }
2930     qemu_coroutine_yield();
2931     return co.ret;
2932 }
2933 
2934 /**************************************************************/
2935 /* removable device support */
2936 
2937 /**
2938  * Return TRUE if the media is present
2939  */
2940 int bdrv_is_inserted(BlockDriverState *bs)
2941 {
2942     BlockDriver *drv = bs->drv;
2943 
2944     if (!drv)
2945         return 0;
2946     if (!drv->bdrv_is_inserted)
2947         return 1;
2948     return drv->bdrv_is_inserted(bs);
2949 }
2950 
2951 /**
2952  * Return whether the media changed since the last call to this
2953  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
2954  */
2955 int bdrv_media_changed(BlockDriverState *bs)
2956 {
2957     BlockDriver *drv = bs->drv;
2958 
2959     if (drv && drv->bdrv_media_changed) {
2960         return drv->bdrv_media_changed(bs);
2961     }
2962     return -ENOTSUP;
2963 }
2964 
2965 /**
2966  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
2967  */
2968 void bdrv_eject(BlockDriverState *bs, int eject_flag)
2969 {
2970     BlockDriver *drv = bs->drv;
2971 
2972     if (drv && drv->bdrv_eject) {
2973         drv->bdrv_eject(bs, eject_flag);
2974     }
2975 }
2976 
2977 /**
2978  * Lock or unlock the media (if it is locked, the user won't be able
2979  * to eject it manually).
2980  */
2981 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
2982 {
2983     BlockDriver *drv = bs->drv;
2984 
2985     trace_bdrv_lock_medium(bs, locked);
2986 
2987     if (drv && drv->bdrv_lock_medium) {
2988         drv->bdrv_lock_medium(bs, locked);
2989     }
2990 }
2991 
2992 /* needed for generic scsi interface */
2993 
2994 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2995 {
2996     BlockDriver *drv = bs->drv;
2997 
2998     if (drv && drv->bdrv_ioctl)
2999         return drv->bdrv_ioctl(bs, req, buf);
3000     return -ENOTSUP;
3001 }
3002 
3003 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3004         unsigned long int req, void *buf,
3005         BlockDriverCompletionFunc *cb, void *opaque)
3006 {
3007     BlockDriver *drv = bs->drv;
3008 
3009     if (drv && drv->bdrv_aio_ioctl)
3010         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3011     return NULL;
3012 }
3013 
3014 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3015 {
3016     bs->buffer_alignment = align;
3017 }
3018 
3019 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3020 {
3021     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3022 }
3023 
3024 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3025 {
3026     int64_t bitmap_size;
3027 
3028     bs->dirty_count = 0;
3029     if (enable) {
3030         if (!bs->dirty_bitmap) {
3031             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3032                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3033             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3034 
3035             bs->dirty_bitmap = g_malloc0(bitmap_size);
3036         }
3037     } else {
3038         if (bs->dirty_bitmap) {
3039             g_free(bs->dirty_bitmap);
3040             bs->dirty_bitmap = NULL;
3041         }
3042     }
3043 }
3044 
3045 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3046 {
3047     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3048 
3049     if (bs->dirty_bitmap &&
3050         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3051         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3052             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3053     } else {
3054         return 0;
3055     }
3056 }
3057 
3058 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3059                       int nr_sectors)
3060 {
3061     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3062 }
3063 
3064 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3065 {
3066     return bs->dirty_count;
3067 }
3068 
3069 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3070 {
3071     assert(bs->in_use != in_use);
3072     bs->in_use = in_use;
3073 }
3074 
3075 int bdrv_in_use(BlockDriverState *bs)
3076 {
3077     return bs->in_use;
3078 }
3079 
3080 void bdrv_iostatus_enable(BlockDriverState *bs)
3081 {
3082     bs->iostatus = BDRV_IOS_OK;
3083 }
3084 
3085 /* The I/O status is only enabled if the drive explicitly
3086  * enables it _and_ the VM is configured to stop on errors */
3087 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3088 {
3089     return (bs->iostatus != BDRV_IOS_INVAL &&
3090            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3091             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3092             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3093 }
3094 
3095 void bdrv_iostatus_disable(BlockDriverState *bs)
3096 {
3097     bs->iostatus = BDRV_IOS_INVAL;
3098 }
3099 
3100 void bdrv_iostatus_reset(BlockDriverState *bs)
3101 {
3102     if (bdrv_iostatus_is_enabled(bs)) {
3103         bs->iostatus = BDRV_IOS_OK;
3104     }
3105 }
3106 
3107 /* XXX: Today this is set by device models because it makes the implementation
3108    quite simple. However, the block layer knows about the error, so it's
3109    possible to implement this without device models being involved */
3110 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3111 {
3112     if (bdrv_iostatus_is_enabled(bs) && bs->iostatus == BDRV_IOS_OK) {
3113         assert(error >= 0);
3114         bs->iostatus = error == ENOSPC ? BDRV_IOS_ENOSPC : BDRV_IOS_FAILED;
3115     }
3116 }
3117 
3118 void
3119 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3120         enum BlockAcctType type)
3121 {
3122     assert(type < BDRV_MAX_IOTYPE);
3123 
3124     cookie->bytes = bytes;
3125     cookie->start_time_ns = get_clock();
3126     cookie->type = type;
3127 }
3128 
3129 void
3130 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3131 {
3132     assert(cookie->type < BDRV_MAX_IOTYPE);
3133 
3134     bs->nr_bytes[cookie->type] += cookie->bytes;
3135     bs->nr_ops[cookie->type]++;
3136     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3137 }
3138 
3139 int bdrv_img_create(const char *filename, const char *fmt,
3140                     const char *base_filename, const char *base_fmt,
3141                     char *options, uint64_t img_size, int flags)
3142 {
3143     QEMUOptionParameter *param = NULL, *create_options = NULL;
3144     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3145     BlockDriverState *bs = NULL;
3146     BlockDriver *drv, *proto_drv;
3147     BlockDriver *backing_drv = NULL;
3148     int ret = 0;
3149 
3150     /* Find driver and parse its options */
3151     drv = bdrv_find_format(fmt);
3152     if (!drv) {
3153         error_report("Unknown file format '%s'", fmt);
3154         ret = -EINVAL;
3155         goto out;
3156     }
3157 
3158     proto_drv = bdrv_find_protocol(filename);
3159     if (!proto_drv) {
3160         error_report("Unknown protocol '%s'", filename);
3161         ret = -EINVAL;
3162         goto out;
3163     }
3164 
3165     create_options = append_option_parameters(create_options,
3166                                               drv->create_options);
3167     create_options = append_option_parameters(create_options,
3168                                               proto_drv->create_options);
3169 
3170     /* Create parameter list with default values */
3171     param = parse_option_parameters("", create_options, param);
3172 
3173     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3174 
3175     /* Parse -o options */
3176     if (options) {
3177         param = parse_option_parameters(options, create_options, param);
3178         if (param == NULL) {
3179             error_report("Invalid options for file format '%s'.", fmt);
3180             ret = -EINVAL;
3181             goto out;
3182         }
3183     }
3184 
3185     if (base_filename) {
3186         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3187                                  base_filename)) {
3188             error_report("Backing file not supported for file format '%s'",
3189                          fmt);
3190             ret = -EINVAL;
3191             goto out;
3192         }
3193     }
3194 
3195     if (base_fmt) {
3196         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3197             error_report("Backing file format not supported for file "
3198                          "format '%s'", fmt);
3199             ret = -EINVAL;
3200             goto out;
3201         }
3202     }
3203 
3204     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3205     if (backing_file && backing_file->value.s) {
3206         if (!strcmp(filename, backing_file->value.s)) {
3207             error_report("Error: Trying to create an image with the "
3208                          "same filename as the backing file");
3209             ret = -EINVAL;
3210             goto out;
3211         }
3212     }
3213 
3214     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3215     if (backing_fmt && backing_fmt->value.s) {
3216         backing_drv = bdrv_find_format(backing_fmt->value.s);
3217         if (!backing_drv) {
3218             error_report("Unknown backing file format '%s'",
3219                          backing_fmt->value.s);
3220             ret = -EINVAL;
3221             goto out;
3222         }
3223     }
3224 
3225     // The size for the image must always be specified, with one exception:
3226     // If we are using a backing file, we can obtain the size from there
3227     size = get_option_parameter(param, BLOCK_OPT_SIZE);
3228     if (size && size->value.n == -1) {
3229         if (backing_file && backing_file->value.s) {
3230             uint64_t size;
3231             char buf[32];
3232 
3233             bs = bdrv_new("");
3234 
3235             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3236             if (ret < 0) {
3237                 error_report("Could not open '%s'", backing_file->value.s);
3238                 goto out;
3239             }
3240             bdrv_get_geometry(bs, &size);
3241             size *= 512;
3242 
3243             snprintf(buf, sizeof(buf), "%" PRId64, size);
3244             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3245         } else {
3246             error_report("Image creation needs a size parameter");
3247             ret = -EINVAL;
3248             goto out;
3249         }
3250     }
3251 
3252     printf("Formatting '%s', fmt=%s ", filename, fmt);
3253     print_option_parameters(param);
3254     puts("");
3255 
3256     ret = bdrv_create(drv, filename, param);
3257 
3258     if (ret < 0) {
3259         if (ret == -ENOTSUP) {
3260             error_report("Formatting or formatting option not supported for "
3261                          "file format '%s'", fmt);
3262         } else if (ret == -EFBIG) {
3263             error_report("The image size is too large for file format '%s'",
3264                          fmt);
3265         } else {
3266             error_report("%s: error while creating %s: %s", filename, fmt,
3267                          strerror(-ret));
3268         }
3269     }
3270 
3271 out:
3272     free_option_parameters(create_options);
3273     free_option_parameters(param);
3274 
3275     if (bs) {
3276         bdrv_delete(bs);
3277     }
3278 
3279     return ret;
3280 }
3281