xref: /openbmc/qemu/block.c (revision b202381800d81fbff9978abbdea95760dd363bb6)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qemu-objects.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 
34 #ifdef CONFIG_BSD
35 #include <sys/types.h>
36 #include <sys/stat.h>
37 #include <sys/ioctl.h>
38 #include <sys/queue.h>
39 #ifndef __DragonFly__
40 #include <sys/disk.h>
41 #endif
42 #endif
43 
44 #ifdef _WIN32
45 #include <windows.h>
46 #endif
47 
48 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
49 
50 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
51 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
52         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
53         BlockDriverCompletionFunc *cb, void *opaque);
54 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
55         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
56         BlockDriverCompletionFunc *cb, void *opaque);
57 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
58                                          int64_t sector_num, int nb_sectors,
59                                          QEMUIOVector *iov);
60 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
61                                          int64_t sector_num, int nb_sectors,
62                                          QEMUIOVector *iov);
63 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
64     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
65 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
66     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
67 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
68                                                int64_t sector_num,
69                                                QEMUIOVector *qiov,
70                                                int nb_sectors,
71                                                BlockDriverCompletionFunc *cb,
72                                                void *opaque,
73                                                bool is_write);
74 static void coroutine_fn bdrv_co_do_rw(void *opaque);
75 
76 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
77     QTAILQ_HEAD_INITIALIZER(bdrv_states);
78 
79 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
80     QLIST_HEAD_INITIALIZER(bdrv_drivers);
81 
82 /* The device to use for VM snapshots */
83 static BlockDriverState *bs_snapshots;
84 
85 /* If non-zero, use only whitelisted block drivers */
86 static int use_bdrv_whitelist;
87 
88 #ifdef _WIN32
/* Return 1 if filename starts with an ASCII letter followed by ':', else 0 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected once the first character is a letter */
    return is_letter && filename[1] == ':';
}
95 
/*
 * Return 1 if filename is a bare drive specification ("d:") or starts with
 * the "\\.\" or "//./" prefix, 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    /* exactly "<letter>:" with nothing after it */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    return strstart(filename, "\\\\.\\", NULL) ||
           strstart(filename, "//./", NULL);
}
106 #endif
107 
/*
 * Check whether the path starts with "<protocol>:".
 *
 * On Windows, drive specifications are never treated as protocols.
 * Returns 1 when a ':' is present in the path, 0 otherwise.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') ? 1 : 0;
}
120 
/*
 * Return non-zero if path is absolute.
 *
 * Everything up to and including the first ':' (if there is one) is skipped
 * before the leading directory separator is looked for.  On Windows both '/'
 * and '\\' count as separators.
 */
int path_is_absolute(const char *path)
{
    const char *colon;
    const char *start;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    start = colon ? colon + 1 : path;

#ifdef _WIN32
    return (*start == '/' || *start == '\\');
#else
    return (*start == '/');
#endif
}
140 
/*
 * Build the path of 'filename' in 'dest' (at most dest_size bytes, always
 * NUL-terminated when dest_size > 0).
 *
 * If filename is absolute it is copied as is.  Otherwise it is appended to
 * the directory part of base_path; a leading "<protocol>:" in base_path is
 * kept, so URLs are supported.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_proto, *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* first character after the "<protocol>:" part, if any */
    after_proto = strchr(base_path, ':');
    after_proto = after_proto ? after_proto + 1 : base_path;

    /* first character after the last directory separator, if any */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever prefix of base_path is longer */
    if (after_sep > after_proto) {
        after_proto = after_sep;
    }

    prefix_len = after_proto - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
184 
185 void bdrv_register(BlockDriver *bdrv)
186 {
187     /* Block drivers without coroutine functions need emulation */
188     if (!bdrv->bdrv_co_readv) {
189         bdrv->bdrv_co_readv = bdrv_co_readv_em;
190         bdrv->bdrv_co_writev = bdrv_co_writev_em;
191 
192         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
193          * the block driver lacks aio we need to emulate that too.
194          */
195         if (!bdrv->bdrv_aio_readv) {
196             /* add AIO emulation layer */
197             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
198             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
199         }
200     }
201 
202     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
203 }
204 
205 /* create a new block device (by default it is empty) */
206 BlockDriverState *bdrv_new(const char *device_name)
207 {
208     BlockDriverState *bs;
209 
210     bs = g_malloc0(sizeof(BlockDriverState));
211     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
212     if (device_name[0] != '\0') {
213         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
214     }
215     bdrv_iostatus_disable(bs);
216     return bs;
217 }
218 
219 BlockDriver *bdrv_find_format(const char *format_name)
220 {
221     BlockDriver *drv1;
222     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
223         if (!strcmp(drv1->format_name, format_name)) {
224             return drv1;
225         }
226     }
227     return NULL;
228 }
229 
230 static int bdrv_is_whitelisted(BlockDriver *drv)
231 {
232     static const char *whitelist[] = {
233         CONFIG_BDRV_WHITELIST
234     };
235     const char **p;
236 
237     if (!whitelist[0])
238         return 1;               /* no whitelist, anything goes */
239 
240     for (p = whitelist; *p; p++) {
241         if (!strcmp(drv->format_name, *p)) {
242             return 1;
243         }
244     }
245     return 0;
246 }
247 
248 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
249 {
250     BlockDriver *drv = bdrv_find_format(format_name);
251     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
252 }
253 
254 int bdrv_create(BlockDriver *drv, const char* filename,
255     QEMUOptionParameter *options)
256 {
257     if (!drv->bdrv_create)
258         return -ENOTSUP;
259 
260     return drv->bdrv_create(filename, options);
261 }
262 
263 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
264 {
265     BlockDriver *drv;
266 
267     drv = bdrv_find_protocol(filename);
268     if (drv == NULL) {
269         return -ENOENT;
270     }
271 
272     return bdrv_create(drv, filename, options);
273 }
274 
#ifdef _WIN32
/*
 * Create a temporary file and store its name in 'filename' ('size' bytes).
 *
 * On any failure 'filename' is set to the empty string (when size > 0) so
 * that callers never operate on an uninitialised or truncated name.
 */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];
    DWORD dir_len;

    if (size <= 0) {
        return;
    }
    filename[0] = '\0';

    /* GetTempFileName() needs room for MAX_PATH characters in its output */
    if (size < MAX_PATH) {
        return;
    }

    dir_len = GetTempPath(MAX_PATH, temp_dir);
    if (dir_len == 0 || dir_len > MAX_PATH) {
        return;
    }

    if (GetTempFileName(temp_dir, "qem", 0, filename) == 0) {
        filename[0] = '\0';
    }
}
#else
/*
 * Create a temporary file below $TMPDIR (or /tmp when TMPDIR is unset) and
 * store its name in 'filename' ('size' bytes).
 *
 * On any failure 'filename' is set to the empty string (when size > 0) so
 * that callers never operate on an unexpanded or truncated template.
 */
void get_tmp_filename(char *filename, int size)
{
    const char *tmpdir;
    int fd;
    int len;

    if (size <= 0) {
        return;
    }

    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }

    len = snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    if (len < 0 || len >= size) {
        /* a truncated template is not usable by mkstemp() */
        filename[0] = '\0';
        return;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        filename[0] = '\0';
        return;
    }
    close(fd);
}
#endif
297 
298 /*
299  * Detect host devices. By convention, /dev/cdrom[N] is always
300  * recognized as a host CDROM.
301  */
302 static BlockDriver *find_hdev_driver(const char *filename)
303 {
304     int score_max = 0, score;
305     BlockDriver *drv = NULL, *d;
306 
307     QLIST_FOREACH(d, &bdrv_drivers, list) {
308         if (d->bdrv_probe_device) {
309             score = d->bdrv_probe_device(filename);
310             if (score > score_max) {
311                 score_max = score;
312                 drv = d;
313             }
314         }
315     }
316 
317     return drv;
318 }
319 
320 BlockDriver *bdrv_find_protocol(const char *filename)
321 {
322     BlockDriver *drv1;
323     char protocol[128];
324     int len;
325     const char *p;
326 
327     /* TODO Drivers without bdrv_file_open must be specified explicitly */
328 
329     /*
330      * XXX(hch): we really should not let host device detection
331      * override an explicit protocol specification, but moving this
332      * later breaks access to device names with colons in them.
333      * Thanks to the brain-dead persistent naming schemes on udev-
334      * based Linux systems those actually are quite common.
335      */
336     drv1 = find_hdev_driver(filename);
337     if (drv1) {
338         return drv1;
339     }
340 
341     if (!path_has_protocol(filename)) {
342         return bdrv_find_format("file");
343     }
344     p = strchr(filename, ':');
345     assert(p != NULL);
346     len = p - filename;
347     if (len > sizeof(protocol) - 1)
348         len = sizeof(protocol) - 1;
349     memcpy(protocol, filename, len);
350     protocol[len] = '\0';
351     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
352         if (drv1->protocol_name &&
353             !strcmp(drv1->protocol_name, protocol)) {
354             return drv1;
355         }
356     }
357     return NULL;
358 }
359 
360 static int find_image_format(const char *filename, BlockDriver **pdrv)
361 {
362     int ret, score, score_max;
363     BlockDriver *drv1, *drv;
364     uint8_t buf[2048];
365     BlockDriverState *bs;
366 
367     ret = bdrv_file_open(&bs, filename, 0);
368     if (ret < 0) {
369         *pdrv = NULL;
370         return ret;
371     }
372 
373     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
374     if (bs->sg || !bdrv_is_inserted(bs)) {
375         bdrv_delete(bs);
376         drv = bdrv_find_format("raw");
377         if (!drv) {
378             ret = -ENOENT;
379         }
380         *pdrv = drv;
381         return ret;
382     }
383 
384     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
385     bdrv_delete(bs);
386     if (ret < 0) {
387         *pdrv = NULL;
388         return ret;
389     }
390 
391     score_max = 0;
392     drv = NULL;
393     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
394         if (drv1->bdrv_probe) {
395             score = drv1->bdrv_probe(buf, ret, filename);
396             if (score > score_max) {
397                 score_max = score;
398                 drv = drv1;
399             }
400         }
401     }
402     if (!drv) {
403         ret = -ENOENT;
404     }
405     *pdrv = drv;
406     return ret;
407 }
408 
409 /**
410  * Set the current 'total_sectors' value
411  */
412 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
413 {
414     BlockDriver *drv = bs->drv;
415 
416     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
417     if (bs->sg)
418         return 0;
419 
420     /* query actual device if possible, otherwise just trust the hint */
421     if (drv->bdrv_getlength) {
422         int64_t length = drv->bdrv_getlength(bs);
423         if (length < 0) {
424             return length;
425         }
426         hint = length >> BDRV_SECTOR_BITS;
427     }
428 
429     bs->total_sectors = hint;
430     return 0;
431 }
432 
433 /**
434  * Set open flags for a given cache mode
435  *
436  * Return 0 on success, -1 if the cache mode was invalid.
437  */
438 int bdrv_parse_cache_flags(const char *mode, int *flags)
439 {
440     *flags &= ~BDRV_O_CACHE_MASK;
441 
442     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
443         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
444     } else if (!strcmp(mode, "directsync")) {
445         *flags |= BDRV_O_NOCACHE;
446     } else if (!strcmp(mode, "writeback")) {
447         *flags |= BDRV_O_CACHE_WB;
448     } else if (!strcmp(mode, "unsafe")) {
449         *flags |= BDRV_O_CACHE_WB;
450         *flags |= BDRV_O_NO_FLUSH;
451     } else if (!strcmp(mode, "writethrough")) {
452         /* this is the default */
453     } else {
454         return -1;
455     }
456 
457     return 0;
458 }
459 
460 /*
461  * Common part for opening disk images and files
462  */
463 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
464     int flags, BlockDriver *drv)
465 {
466     int ret, open_flags;
467 
468     assert(drv != NULL);
469 
470     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
471 
472     bs->file = NULL;
473     bs->total_sectors = 0;
474     bs->encrypted = 0;
475     bs->valid_key = 0;
476     bs->open_flags = flags;
477     bs->buffer_alignment = 512;
478 
479     pstrcpy(bs->filename, sizeof(bs->filename), filename);
480 
481     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
482         return -ENOTSUP;
483     }
484 
485     bs->drv = drv;
486     bs->opaque = g_malloc0(drv->instance_size);
487 
488     if (flags & BDRV_O_CACHE_WB)
489         bs->enable_write_cache = 1;
490 
491     /*
492      * Clear flags that are internal to the block layer before opening the
493      * image.
494      */
495     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
496 
497     /*
498      * Snapshots should be writable.
499      */
500     if (bs->is_temporary) {
501         open_flags |= BDRV_O_RDWR;
502     }
503 
504     /* Open the image, either directly or using a protocol */
505     if (drv->bdrv_file_open) {
506         ret = drv->bdrv_file_open(bs, filename, open_flags);
507     } else {
508         ret = bdrv_file_open(&bs->file, filename, open_flags);
509         if (ret >= 0) {
510             ret = drv->bdrv_open(bs, open_flags);
511         }
512     }
513 
514     if (ret < 0) {
515         goto free_and_fail;
516     }
517 
518     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
519 
520     ret = refresh_total_sectors(bs, bs->total_sectors);
521     if (ret < 0) {
522         goto free_and_fail;
523     }
524 
525 #ifndef _WIN32
526     if (bs->is_temporary) {
527         unlink(filename);
528     }
529 #endif
530     return 0;
531 
532 free_and_fail:
533     if (bs->file) {
534         bdrv_delete(bs->file);
535         bs->file = NULL;
536     }
537     g_free(bs->opaque);
538     bs->opaque = NULL;
539     bs->drv = NULL;
540     return ret;
541 }
542 
543 /*
544  * Opens a file using a protocol (file, host_device, nbd, ...)
545  */
546 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
547 {
548     BlockDriverState *bs;
549     BlockDriver *drv;
550     int ret;
551 
552     drv = bdrv_find_protocol(filename);
553     if (!drv) {
554         return -ENOENT;
555     }
556 
557     bs = bdrv_new("");
558     ret = bdrv_open_common(bs, filename, flags, drv);
559     if (ret < 0) {
560         bdrv_delete(bs);
561         return ret;
562     }
563     bs->growable = 1;
564     *pbs = bs;
565     return 0;
566 }
567 
568 /*
569  * Opens a disk image (raw, qcow2, vmdk, ...)
570  */
571 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
572               BlockDriver *drv)
573 {
574     int ret;
575 
576     if (flags & BDRV_O_SNAPSHOT) {
577         BlockDriverState *bs1;
578         int64_t total_size;
579         int is_protocol = 0;
580         BlockDriver *bdrv_qcow2;
581         QEMUOptionParameter *options;
582         char tmp_filename[PATH_MAX];
583         char backing_filename[PATH_MAX];
584 
585         /* if snapshot, we create a temporary backing file and open it
586            instead of opening 'filename' directly */
587 
588         /* if there is a backing file, use it */
589         bs1 = bdrv_new("");
590         ret = bdrv_open(bs1, filename, 0, drv);
591         if (ret < 0) {
592             bdrv_delete(bs1);
593             return ret;
594         }
595         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
596 
597         if (bs1->drv && bs1->drv->protocol_name)
598             is_protocol = 1;
599 
600         bdrv_delete(bs1);
601 
602         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
603 
604         /* Real path is meaningless for protocols */
605         if (is_protocol)
606             snprintf(backing_filename, sizeof(backing_filename),
607                      "%s", filename);
608         else if (!realpath(filename, backing_filename))
609             return -errno;
610 
611         bdrv_qcow2 = bdrv_find_format("qcow2");
612         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
613 
614         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
615         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
616         if (drv) {
617             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
618                 drv->format_name);
619         }
620 
621         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
622         free_option_parameters(options);
623         if (ret < 0) {
624             return ret;
625         }
626 
627         filename = tmp_filename;
628         drv = bdrv_qcow2;
629         bs->is_temporary = 1;
630     }
631 
632     /* Find the right image format driver */
633     if (!drv) {
634         ret = find_image_format(filename, &drv);
635     }
636 
637     if (!drv) {
638         goto unlink_and_fail;
639     }
640 
641     /* Open the image */
642     ret = bdrv_open_common(bs, filename, flags, drv);
643     if (ret < 0) {
644         goto unlink_and_fail;
645     }
646 
647     /* If there is a backing file, use it */
648     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
649         char backing_filename[PATH_MAX];
650         int back_flags;
651         BlockDriver *back_drv = NULL;
652 
653         bs->backing_hd = bdrv_new("");
654 
655         if (path_has_protocol(bs->backing_file)) {
656             pstrcpy(backing_filename, sizeof(backing_filename),
657                     bs->backing_file);
658         } else {
659             path_combine(backing_filename, sizeof(backing_filename),
660                          filename, bs->backing_file);
661         }
662 
663         if (bs->backing_format[0] != '\0') {
664             back_drv = bdrv_find_format(bs->backing_format);
665         }
666 
667         /* backing files always opened read-only */
668         back_flags =
669             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
670 
671         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
672         if (ret < 0) {
673             bdrv_close(bs);
674             return ret;
675         }
676         if (bs->is_temporary) {
677             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
678         } else {
679             /* base image inherits from "parent" */
680             bs->backing_hd->keep_read_only = bs->keep_read_only;
681         }
682     }
683 
684     if (!bdrv_key_required(bs)) {
685         bdrv_dev_change_media_cb(bs, true);
686     }
687 
688     return 0;
689 
690 unlink_and_fail:
691     if (bs->is_temporary) {
692         unlink(filename);
693     }
694     return ret;
695 }
696 
697 void bdrv_close(BlockDriverState *bs)
698 {
699     if (bs->drv) {
700         if (bs == bs_snapshots) {
701             bs_snapshots = NULL;
702         }
703         if (bs->backing_hd) {
704             bdrv_delete(bs->backing_hd);
705             bs->backing_hd = NULL;
706         }
707         bs->drv->bdrv_close(bs);
708         g_free(bs->opaque);
709 #ifdef _WIN32
710         if (bs->is_temporary) {
711             unlink(bs->filename);
712         }
713 #endif
714         bs->opaque = NULL;
715         bs->drv = NULL;
716 
717         if (bs->file != NULL) {
718             bdrv_close(bs->file);
719         }
720 
721         bdrv_dev_change_media_cb(bs, false);
722     }
723 }
724 
725 void bdrv_close_all(void)
726 {
727     BlockDriverState *bs;
728 
729     QTAILQ_FOREACH(bs, &bdrv_states, list) {
730         bdrv_close(bs);
731     }
732 }
733 
734 /* make a BlockDriverState anonymous by removing from bdrv_state list.
735    Also, NULL terminate the device_name to prevent double remove */
736 void bdrv_make_anon(BlockDriverState *bs)
737 {
738     if (bs->device_name[0] != '\0') {
739         QTAILQ_REMOVE(&bdrv_states, bs, list);
740     }
741     bs->device_name[0] = '\0';
742 }
743 
744 void bdrv_delete(BlockDriverState *bs)
745 {
746     assert(!bs->dev);
747 
748     /* remove from list, if necessary */
749     bdrv_make_anon(bs);
750 
751     bdrv_close(bs);
752     if (bs->file != NULL) {
753         bdrv_delete(bs->file);
754     }
755 
756     assert(bs != bs_snapshots);
757     g_free(bs);
758 }
759 
760 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
761 /* TODO change to DeviceState *dev when all users are qdevified */
762 {
763     if (bs->dev) {
764         return -EBUSY;
765     }
766     bs->dev = dev;
767     bdrv_iostatus_reset(bs);
768     return 0;
769 }
770 
771 /* TODO qdevified devices don't use this, remove when devices are qdevified */
772 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
773 {
774     if (bdrv_attach_dev(bs, dev) < 0) {
775         abort();
776     }
777 }
778 
779 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
780 /* TODO change to DeviceState *dev when all users are qdevified */
781 {
782     assert(bs->dev == dev);
783     bs->dev = NULL;
784     bs->dev_ops = NULL;
785     bs->dev_opaque = NULL;
786     bs->buffer_alignment = 512;
787 }
788 
789 /* TODO change to return DeviceState * when all users are qdevified */
790 void *bdrv_get_attached_dev(BlockDriverState *bs)
791 {
792     return bs->dev;
793 }
794 
795 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
796                       void *opaque)
797 {
798     bs->dev_ops = ops;
799     bs->dev_opaque = opaque;
800     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
801         bs_snapshots = NULL;
802     }
803 }
804 
805 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
806 {
807     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
808         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
809     }
810 }
811 
812 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
813 {
814     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
815 }
816 
817 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
818 {
819     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
820         return bs->dev_ops->is_tray_open(bs->dev_opaque);
821     }
822     return false;
823 }
824 
825 static void bdrv_dev_resize_cb(BlockDriverState *bs)
826 {
827     if (bs->dev_ops && bs->dev_ops->resize_cb) {
828         bs->dev_ops->resize_cb(bs->dev_opaque);
829     }
830 }
831 
832 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
833 {
834     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
835         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
836     }
837     return false;
838 }
839 
840 /*
841  * Run consistency checks on an image
842  *
843  * Returns 0 if the check could be completed (it doesn't mean that the image is
844  * free of errors) or -errno when an internal error occurred. The results of the
845  * check are stored in res.
846  */
847 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
848 {
849     if (bs->drv->bdrv_check == NULL) {
850         return -ENOTSUP;
851     }
852 
853     memset(res, 0, sizeof(*res));
854     return bs->drv->bdrv_check(bs, res);
855 }
856 
857 #define COMMIT_BUF_SECTORS 2048
858 
859 /* commit COW file into the raw image */
860 int bdrv_commit(BlockDriverState *bs)
861 {
862     BlockDriver *drv = bs->drv;
863     BlockDriver *backing_drv;
864     int64_t sector, total_sectors;
865     int n, ro, open_flags;
866     int ret = 0, rw_ret = 0;
867     uint8_t *buf;
868     char filename[1024];
869     BlockDriverState *bs_rw, *bs_ro;
870 
871     if (!drv)
872         return -ENOMEDIUM;
873 
874     if (!bs->backing_hd) {
875         return -ENOTSUP;
876     }
877 
878     if (bs->backing_hd->keep_read_only) {
879         return -EACCES;
880     }
881 
882     backing_drv = bs->backing_hd->drv;
883     ro = bs->backing_hd->read_only;
884     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
885     open_flags =  bs->backing_hd->open_flags;
886 
887     if (ro) {
888         /* re-open as RW */
889         bdrv_delete(bs->backing_hd);
890         bs->backing_hd = NULL;
891         bs_rw = bdrv_new("");
892         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
893             backing_drv);
894         if (rw_ret < 0) {
895             bdrv_delete(bs_rw);
896             /* try to re-open read-only */
897             bs_ro = bdrv_new("");
898             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
899                 backing_drv);
900             if (ret < 0) {
901                 bdrv_delete(bs_ro);
902                 /* drive not functional anymore */
903                 bs->drv = NULL;
904                 return ret;
905             }
906             bs->backing_hd = bs_ro;
907             return rw_ret;
908         }
909         bs->backing_hd = bs_rw;
910     }
911 
912     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
913     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
914 
915     for (sector = 0; sector < total_sectors; sector += n) {
916         if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
917 
918             if (bdrv_read(bs, sector, buf, n) != 0) {
919                 ret = -EIO;
920                 goto ro_cleanup;
921             }
922 
923             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
924                 ret = -EIO;
925                 goto ro_cleanup;
926             }
927         }
928     }
929 
930     if (drv->bdrv_make_empty) {
931         ret = drv->bdrv_make_empty(bs);
932         bdrv_flush(bs);
933     }
934 
935     /*
936      * Make sure all data we wrote to the backing device is actually
937      * stable on disk.
938      */
939     if (bs->backing_hd)
940         bdrv_flush(bs->backing_hd);
941 
942 ro_cleanup:
943     g_free(buf);
944 
945     if (ro) {
946         /* re-open as RO */
947         bdrv_delete(bs->backing_hd);
948         bs->backing_hd = NULL;
949         bs_ro = bdrv_new("");
950         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
951             backing_drv);
952         if (ret < 0) {
953             bdrv_delete(bs_ro);
954             /* drive not functional anymore */
955             bs->drv = NULL;
956             return ret;
957         }
958         bs->backing_hd = bs_ro;
959         bs->backing_hd->keep_read_only = 0;
960     }
961 
962     return ret;
963 }
964 
965 void bdrv_commit_all(void)
966 {
967     BlockDriverState *bs;
968 
969     QTAILQ_FOREACH(bs, &bdrv_states, list) {
970         bdrv_commit(bs);
971     }
972 }
973 
974 /*
975  * Return values:
976  * 0        - success
977  * -EINVAL  - backing format specified, but no file
978  * -ENOSPC  - can't update the backing file because no space is left in the
979  *            image file header
980  * -ENOTSUP - format driver doesn't support changing the backing file
981  */
982 int bdrv_change_backing_file(BlockDriverState *bs,
983     const char *backing_file, const char *backing_fmt)
984 {
985     BlockDriver *drv = bs->drv;
986 
987     if (drv->bdrv_change_backing_file != NULL) {
988         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
989     } else {
990         return -ENOTSUP;
991     }
992 }
993 
994 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
995                                    size_t size)
996 {
997     int64_t len;
998 
999     if (!bdrv_is_inserted(bs))
1000         return -ENOMEDIUM;
1001 
1002     if (bs->growable)
1003         return 0;
1004 
1005     len = bdrv_getlength(bs);
1006 
1007     if (offset < 0)
1008         return -EIO;
1009 
1010     if ((offset > len) || (len - offset < size))
1011         return -EIO;
1012 
1013     return 0;
1014 }
1015 
1016 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1017                               int nb_sectors)
1018 {
1019     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1020                                    nb_sectors * BDRV_SECTOR_SIZE);
1021 }
1022 
1023 typedef struct RwCo {
1024     BlockDriverState *bs;
1025     int64_t sector_num;
1026     int nb_sectors;
1027     QEMUIOVector *qiov;
1028     bool is_write;
1029     int ret;
1030 } RwCo;
1031 
1032 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1033 {
1034     RwCo *rwco = opaque;
1035 
1036     if (!rwco->is_write) {
1037         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1038                                      rwco->nb_sectors, rwco->qiov);
1039     } else {
1040         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1041                                       rwco->nb_sectors, rwco->qiov);
1042     }
1043 }
1044 
1045 /*
1046  * Process a synchronous request using coroutines
1047  */
1048 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1049                       int nb_sectors, bool is_write)
1050 {
1051     QEMUIOVector qiov;
1052     struct iovec iov = {
1053         .iov_base = (void *)buf,
1054         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1055     };
1056     Coroutine *co;
1057     RwCo rwco = {
1058         .bs = bs,
1059         .sector_num = sector_num,
1060         .nb_sectors = nb_sectors,
1061         .qiov = &qiov,
1062         .is_write = is_write,
1063         .ret = NOT_DONE,
1064     };
1065 
1066     qemu_iovec_init_external(&qiov, &iov, 1);
1067 
1068     if (qemu_in_coroutine()) {
1069         /* Fast-path if already in coroutine context */
1070         bdrv_rw_co_entry(&rwco);
1071     } else {
1072         co = qemu_coroutine_create(bdrv_rw_co_entry);
1073         qemu_coroutine_enter(co, &rwco);
1074         while (rwco.ret == NOT_DONE) {
1075             qemu_aio_wait();
1076         }
1077     }
1078     return rwco.ret;
1079 }
1080 
1081 /* return < 0 if error. See bdrv_write() for the return codes */
1082 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1083               uint8_t *buf, int nb_sectors)
1084 {
1085     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1086 }
1087 
1088 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1089                              int nb_sectors, int dirty)
1090 {
1091     int64_t start, end;
1092     unsigned long val, idx, bit;
1093 
1094     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1095     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1096 
1097     for (; start <= end; start++) {
1098         idx = start / (sizeof(unsigned long) * 8);
1099         bit = start % (sizeof(unsigned long) * 8);
1100         val = bs->dirty_bitmap[idx];
1101         if (dirty) {
1102             if (!(val & (1UL << bit))) {
1103                 bs->dirty_count++;
1104                 val |= 1UL << bit;
1105             }
1106         } else {
1107             if (val & (1UL << bit)) {
1108                 bs->dirty_count--;
1109                 val &= ~(1UL << bit);
1110             }
1111         }
1112         bs->dirty_bitmap[idx] = val;
1113     }
1114 }
1115 
1116 /* Return < 0 if error. Important errors are:
1117   -EIO         generic I/O error (may happen for all errors)
1118   -ENOMEDIUM   No media inserted.
1119   -EINVAL      Invalid sector number or nb_sectors
1120   -EACCES      Trying to write a read-only device
1121 */
1122 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1123                const uint8_t *buf, int nb_sectors)
1124 {
1125     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1126 }
1127 
1128 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1129                void *buf, int count1)
1130 {
1131     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1132     int len, nb_sectors, count;
1133     int64_t sector_num;
1134     int ret;
1135 
1136     count = count1;
1137     /* first read to align to sector start */
1138     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1139     if (len > count)
1140         len = count;
1141     sector_num = offset >> BDRV_SECTOR_BITS;
1142     if (len > 0) {
1143         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1144             return ret;
1145         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1146         count -= len;
1147         if (count == 0)
1148             return count1;
1149         sector_num++;
1150         buf += len;
1151     }
1152 
1153     /* read the sectors "in place" */
1154     nb_sectors = count >> BDRV_SECTOR_BITS;
1155     if (nb_sectors > 0) {
1156         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1157             return ret;
1158         sector_num += nb_sectors;
1159         len = nb_sectors << BDRV_SECTOR_BITS;
1160         buf += len;
1161         count -= len;
1162     }
1163 
1164     /* add data from the last sector */
1165     if (count > 0) {
1166         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1167             return ret;
1168         memcpy(buf, tmp_buf, count);
1169     }
1170     return count1;
1171 }
1172 
1173 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1174                 const void *buf, int count1)
1175 {
1176     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1177     int len, nb_sectors, count;
1178     int64_t sector_num;
1179     int ret;
1180 
1181     count = count1;
1182     /* first write to align to sector start */
1183     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1184     if (len > count)
1185         len = count;
1186     sector_num = offset >> BDRV_SECTOR_BITS;
1187     if (len > 0) {
1188         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1189             return ret;
1190         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1191         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1192             return ret;
1193         count -= len;
1194         if (count == 0)
1195             return count1;
1196         sector_num++;
1197         buf += len;
1198     }
1199 
1200     /* write the sectors "in place" */
1201     nb_sectors = count >> BDRV_SECTOR_BITS;
1202     if (nb_sectors > 0) {
1203         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1204             return ret;
1205         sector_num += nb_sectors;
1206         len = nb_sectors << BDRV_SECTOR_BITS;
1207         buf += len;
1208         count -= len;
1209     }
1210 
1211     /* add data from the last sector */
1212     if (count > 0) {
1213         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1214             return ret;
1215         memcpy(tmp_buf, buf, count);
1216         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1217             return ret;
1218     }
1219     return count1;
1220 }
1221 
1222 /*
1223  * Writes to the file and ensures that no writes are reordered across this
1224  * request (acts as a barrier)
1225  *
1226  * Returns 0 on success, -errno in error cases.
1227  */
1228 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1229     const void *buf, int count)
1230 {
1231     int ret;
1232 
1233     ret = bdrv_pwrite(bs, offset, buf, count);
1234     if (ret < 0) {
1235         return ret;
1236     }
1237 
1238     /* No flush needed for cache modes that use O_DSYNC */
1239     if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1240         bdrv_flush(bs);
1241     }
1242 
1243     return 0;
1244 }
1245 
1246 /*
1247  * Handle a read request in coroutine context
1248  */
1249 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1250     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1251 {
1252     BlockDriver *drv = bs->drv;
1253 
1254     if (!drv) {
1255         return -ENOMEDIUM;
1256     }
1257     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1258         return -EIO;
1259     }
1260 
1261     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1262 }
1263 
1264 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1265     int nb_sectors, QEMUIOVector *qiov)
1266 {
1267     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1268 
1269     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
1270 }
1271 
1272 /*
1273  * Handle a write request in coroutine context
1274  */
1275 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1276     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1277 {
1278     BlockDriver *drv = bs->drv;
1279     int ret;
1280 
1281     if (!bs->drv) {
1282         return -ENOMEDIUM;
1283     }
1284     if (bs->read_only) {
1285         return -EACCES;
1286     }
1287     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1288         return -EIO;
1289     }
1290 
1291     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1292 
1293     if (bs->dirty_bitmap) {
1294         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1295     }
1296 
1297     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1298         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1299     }
1300 
1301     return ret;
1302 }
1303 
1304 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1305     int nb_sectors, QEMUIOVector *qiov)
1306 {
1307     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1308 
1309     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
1310 }
1311 
1312 /**
1313  * Truncate file to 'offset' bytes (needed only for file protocols)
1314  */
1315 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1316 {
1317     BlockDriver *drv = bs->drv;
1318     int ret;
1319     if (!drv)
1320         return -ENOMEDIUM;
1321     if (!drv->bdrv_truncate)
1322         return -ENOTSUP;
1323     if (bs->read_only)
1324         return -EACCES;
1325     if (bdrv_in_use(bs))
1326         return -EBUSY;
1327     ret = drv->bdrv_truncate(bs, offset);
1328     if (ret == 0) {
1329         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1330         bdrv_dev_resize_cb(bs);
1331     }
1332     return ret;
1333 }
1334 
1335 /**
1336  * Length of a allocated file in bytes. Sparse files are counted by actual
1337  * allocated space. Return < 0 if error or unknown.
1338  */
1339 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1340 {
1341     BlockDriver *drv = bs->drv;
1342     if (!drv) {
1343         return -ENOMEDIUM;
1344     }
1345     if (drv->bdrv_get_allocated_file_size) {
1346         return drv->bdrv_get_allocated_file_size(bs);
1347     }
1348     if (bs->file) {
1349         return bdrv_get_allocated_file_size(bs->file);
1350     }
1351     return -ENOTSUP;
1352 }
1353 
1354 /**
1355  * Length of a file in bytes. Return < 0 if error or unknown.
1356  */
1357 int64_t bdrv_getlength(BlockDriverState *bs)
1358 {
1359     BlockDriver *drv = bs->drv;
1360     if (!drv)
1361         return -ENOMEDIUM;
1362 
1363     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1364         if (drv->bdrv_getlength) {
1365             return drv->bdrv_getlength(bs);
1366         }
1367     }
1368     return bs->total_sectors * BDRV_SECTOR_SIZE;
1369 }
1370 
1371 /* return 0 as number of sectors if no device present or error */
1372 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1373 {
1374     int64_t length;
1375     length = bdrv_getlength(bs);
1376     if (length < 0)
1377         length = 0;
1378     else
1379         length = length >> BDRV_SECTOR_BITS;
1380     *nb_sectors_ptr = length;
1381 }
1382 
/*
 * One entry of the MSDOS partition table as read from the first sector by
 * guess_disk_lchs().  The structure is packed so that it can be overlaid
 * directly on the sector buffer; the 32-bit fields are converted with
 * le32_to_cpu() before use.
 */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
1395 
1396 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1397 static int guess_disk_lchs(BlockDriverState *bs,
1398                            int *pcylinders, int *pheads, int *psectors)
1399 {
1400     uint8_t buf[BDRV_SECTOR_SIZE];
1401     int ret, i, heads, sectors, cylinders;
1402     struct partition *p;
1403     uint32_t nr_sects;
1404     uint64_t nb_sectors;
1405 
1406     bdrv_get_geometry(bs, &nb_sectors);
1407 
1408     ret = bdrv_read(bs, 0, buf, 1);
1409     if (ret < 0)
1410         return -1;
1411     /* test msdos magic */
1412     if (buf[510] != 0x55 || buf[511] != 0xaa)
1413         return -1;
1414     for(i = 0; i < 4; i++) {
1415         p = ((struct partition *)(buf + 0x1be)) + i;
1416         nr_sects = le32_to_cpu(p->nr_sects);
1417         if (nr_sects && p->end_head) {
1418             /* We make the assumption that the partition terminates on
1419                a cylinder boundary */
1420             heads = p->end_head + 1;
1421             sectors = p->end_sector & 63;
1422             if (sectors == 0)
1423                 continue;
1424             cylinders = nb_sectors / (heads * sectors);
1425             if (cylinders < 1 || cylinders > 16383)
1426                 continue;
1427             *pheads = heads;
1428             *psectors = sectors;
1429             *pcylinders = cylinders;
1430 #if 0
1431             printf("guessed geometry: LCHS=%d %d %d\n",
1432                    cylinders, heads, sectors);
1433 #endif
1434             return 0;
1435         }
1436     }
1437     return -1;
1438 }
1439 
1440 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1441 {
1442     int translation, lba_detected = 0;
1443     int cylinders, heads, secs;
1444     uint64_t nb_sectors;
1445 
1446     /* if a geometry hint is available, use it */
1447     bdrv_get_geometry(bs, &nb_sectors);
1448     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1449     translation = bdrv_get_translation_hint(bs);
1450     if (cylinders != 0) {
1451         *pcyls = cylinders;
1452         *pheads = heads;
1453         *psecs = secs;
1454     } else {
1455         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1456             if (heads > 16) {
1457                 /* if heads > 16, it means that a BIOS LBA
1458                    translation was active, so the default
1459                    hardware geometry is OK */
1460                 lba_detected = 1;
1461                 goto default_geometry;
1462             } else {
1463                 *pcyls = cylinders;
1464                 *pheads = heads;
1465                 *psecs = secs;
1466                 /* disable any translation to be in sync with
1467                    the logical geometry */
1468                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1469                     bdrv_set_translation_hint(bs,
1470                                               BIOS_ATA_TRANSLATION_NONE);
1471                 }
1472             }
1473         } else {
1474         default_geometry:
1475             /* if no geometry, use a standard physical disk geometry */
1476             cylinders = nb_sectors / (16 * 63);
1477 
1478             if (cylinders > 16383)
1479                 cylinders = 16383;
1480             else if (cylinders < 2)
1481                 cylinders = 2;
1482             *pcyls = cylinders;
1483             *pheads = 16;
1484             *psecs = 63;
1485             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1486                 if ((*pcyls * *pheads) <= 131072) {
1487                     bdrv_set_translation_hint(bs,
1488                                               BIOS_ATA_TRANSLATION_LARGE);
1489                 } else {
1490                     bdrv_set_translation_hint(bs,
1491                                               BIOS_ATA_TRANSLATION_LBA);
1492                 }
1493             }
1494         }
1495         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1496     }
1497 }
1498 
1499 void bdrv_set_geometry_hint(BlockDriverState *bs,
1500                             int cyls, int heads, int secs)
1501 {
1502     bs->cyls = cyls;
1503     bs->heads = heads;
1504     bs->secs = secs;
1505 }
1506 
1507 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1508 {
1509     bs->translation = translation;
1510 }
1511 
1512 void bdrv_get_geometry_hint(BlockDriverState *bs,
1513                             int *pcyls, int *pheads, int *psecs)
1514 {
1515     *pcyls = bs->cyls;
1516     *pheads = bs->heads;
1517     *psecs = bs->secs;
1518 }
1519 
/* Recognize floppy formats */
/* One known floppy format: the drive type plus the geometry it implies */
typedef struct FDFormat {
    FDriveType drive; /* drive type this format belongs to */
    uint8_t last_sect; /* reported as *last_sect by the geometry hint lookup */
    uint8_t max_track; /* reported as *max_track by the geometry hint lookup */
    uint8_t max_head; /* number of heads minus one (lookup reports max_head + 1) */
} FDFormat;
1527 
/*
 * Table of known floppy formats, searched in order by
 * bdrv_get_floppy_geometry_hint().  Columns are
 * { drive, last_sect, max_track, max_head }.  The table is terminated by
 * an entry whose drive is FDRIVE_DRV_NONE.
 */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    /* the -1 values are stored in uint8_t fields; only 'drive' is tested */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
1574 
1575 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1576                                    int *max_track, int *last_sect,
1577                                    FDriveType drive_in, FDriveType *drive)
1578 {
1579     const FDFormat *parse;
1580     uint64_t nb_sectors, size;
1581     int i, first_match, match;
1582 
1583     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1584     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1585         /* User defined disk */
1586     } else {
1587         bdrv_get_geometry(bs, &nb_sectors);
1588         match = -1;
1589         first_match = -1;
1590         for (i = 0; ; i++) {
1591             parse = &fd_formats[i];
1592             if (parse->drive == FDRIVE_DRV_NONE) {
1593                 break;
1594             }
1595             if (drive_in == parse->drive ||
1596                 drive_in == FDRIVE_DRV_NONE) {
1597                 size = (parse->max_head + 1) * parse->max_track *
1598                     parse->last_sect;
1599                 if (nb_sectors == size) {
1600                     match = i;
1601                     break;
1602                 }
1603                 if (first_match == -1) {
1604                     first_match = i;
1605                 }
1606             }
1607         }
1608         if (match == -1) {
1609             if (first_match == -1) {
1610                 match = 1;
1611             } else {
1612                 match = first_match;
1613             }
1614             parse = &fd_formats[match];
1615         }
1616         *nb_heads = parse->max_head + 1;
1617         *max_track = parse->max_track;
1618         *last_sect = parse->last_sect;
1619         *drive = parse->drive;
1620     }
1621 }
1622 
1623 int bdrv_get_translation_hint(BlockDriverState *bs)
1624 {
1625     return bs->translation;
1626 }
1627 
1628 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1629                        BlockErrorAction on_write_error)
1630 {
1631     bs->on_read_error = on_read_error;
1632     bs->on_write_error = on_write_error;
1633 }
1634 
1635 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1636 {
1637     return is_read ? bs->on_read_error : bs->on_write_error;
1638 }
1639 
1640 int bdrv_is_read_only(BlockDriverState *bs)
1641 {
1642     return bs->read_only;
1643 }
1644 
1645 int bdrv_is_sg(BlockDriverState *bs)
1646 {
1647     return bs->sg;
1648 }
1649 
1650 int bdrv_enable_write_cache(BlockDriverState *bs)
1651 {
1652     return bs->enable_write_cache;
1653 }
1654 
1655 int bdrv_is_encrypted(BlockDriverState *bs)
1656 {
1657     if (bs->backing_hd && bs->backing_hd->encrypted)
1658         return 1;
1659     return bs->encrypted;
1660 }
1661 
1662 int bdrv_key_required(BlockDriverState *bs)
1663 {
1664     BlockDriverState *backing_hd = bs->backing_hd;
1665 
1666     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1667         return 1;
1668     return (bs->encrypted && !bs->valid_key);
1669 }
1670 
1671 int bdrv_set_key(BlockDriverState *bs, const char *key)
1672 {
1673     int ret;
1674     if (bs->backing_hd && bs->backing_hd->encrypted) {
1675         ret = bdrv_set_key(bs->backing_hd, key);
1676         if (ret < 0)
1677             return ret;
1678         if (!bs->encrypted)
1679             return 0;
1680     }
1681     if (!bs->encrypted) {
1682         return -EINVAL;
1683     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1684         return -ENOMEDIUM;
1685     }
1686     ret = bs->drv->bdrv_set_key(bs, key);
1687     if (ret < 0) {
1688         bs->valid_key = 0;
1689     } else if (!bs->valid_key) {
1690         bs->valid_key = 1;
1691         /* call the change callback now, we skipped it on open */
1692         bdrv_dev_change_media_cb(bs, true);
1693     }
1694     return ret;
1695 }
1696 
1697 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1698 {
1699     if (!bs->drv) {
1700         buf[0] = '\0';
1701     } else {
1702         pstrcpy(buf, buf_size, bs->drv->format_name);
1703     }
1704 }
1705 
1706 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1707                          void *opaque)
1708 {
1709     BlockDriver *drv;
1710 
1711     QLIST_FOREACH(drv, &bdrv_drivers, list) {
1712         it(opaque, drv->format_name);
1713     }
1714 }
1715 
1716 BlockDriverState *bdrv_find(const char *name)
1717 {
1718     BlockDriverState *bs;
1719 
1720     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1721         if (!strcmp(name, bs->device_name)) {
1722             return bs;
1723         }
1724     }
1725     return NULL;
1726 }
1727 
1728 BlockDriverState *bdrv_next(BlockDriverState *bs)
1729 {
1730     if (!bs) {
1731         return QTAILQ_FIRST(&bdrv_states);
1732     }
1733     return QTAILQ_NEXT(bs, list);
1734 }
1735 
1736 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1737 {
1738     BlockDriverState *bs;
1739 
1740     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1741         it(opaque, bs);
1742     }
1743 }
1744 
1745 const char *bdrv_get_device_name(BlockDriverState *bs)
1746 {
1747     return bs->device_name;
1748 }
1749 
1750 void bdrv_flush_all(void)
1751 {
1752     BlockDriverState *bs;
1753 
1754     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1755         if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
1756             bdrv_flush(bs);
1757         }
1758     }
1759 }
1760 
1761 int bdrv_has_zero_init(BlockDriverState *bs)
1762 {
1763     assert(bs->drv);
1764 
1765     if (bs->drv->bdrv_has_zero_init) {
1766         return bs->drv->bdrv_has_zero_init(bs);
1767     }
1768 
1769     return 1;
1770 }
1771 
1772 /*
1773  * Returns true iff the specified sector is present in the disk image. Drivers
1774  * not implementing the functionality are assumed to not support backing files,
1775  * hence all their sectors are reported as allocated.
1776  *
1777  * 'pnum' is set to the number of sectors (including and immediately following
1778  * the specified sector) that are known to be in the same
1779  * allocated/unallocated state.
1780  *
1781  * 'nb_sectors' is the max value 'pnum' should be set to.
1782  */
1783 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1784 	int *pnum)
1785 {
1786     int64_t n;
1787     if (!bs->drv->bdrv_is_allocated) {
1788         if (sector_num >= bs->total_sectors) {
1789             *pnum = 0;
1790             return 0;
1791         }
1792         n = bs->total_sectors - sector_num;
1793         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1794         return 1;
1795     }
1796     return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1797 }
1798 
1799 void bdrv_mon_event(const BlockDriverState *bdrv,
1800                     BlockMonEventAction action, int is_read)
1801 {
1802     QObject *data;
1803     const char *action_str;
1804 
1805     switch (action) {
1806     case BDRV_ACTION_REPORT:
1807         action_str = "report";
1808         break;
1809     case BDRV_ACTION_IGNORE:
1810         action_str = "ignore";
1811         break;
1812     case BDRV_ACTION_STOP:
1813         action_str = "stop";
1814         break;
1815     default:
1816         abort();
1817     }
1818 
1819     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1820                               bdrv->device_name,
1821                               action_str,
1822                               is_read ? "read" : "write");
1823     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1824 
1825     qobject_decref(data);
1826 }
1827 
1828 BlockInfoList *qmp_query_block(Error **errp)
1829 {
1830     BlockInfoList *head = NULL, *cur_item = NULL;
1831     BlockDriverState *bs;
1832 
1833     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1834         BlockInfoList *info = g_malloc0(sizeof(*info));
1835 
1836         info->value = g_malloc0(sizeof(*info->value));
1837         info->value->device = g_strdup(bs->device_name);
1838         info->value->type = g_strdup("unknown");
1839         info->value->locked = bdrv_dev_is_medium_locked(bs);
1840         info->value->removable = bdrv_dev_has_removable_media(bs);
1841 
1842         if (bdrv_dev_has_removable_media(bs)) {
1843             info->value->has_tray_open = true;
1844             info->value->tray_open = bdrv_dev_is_tray_open(bs);
1845         }
1846 
1847         if (bdrv_iostatus_is_enabled(bs)) {
1848             info->value->has_io_status = true;
1849             info->value->io_status = bs->iostatus;
1850         }
1851 
1852         if (bs->drv) {
1853             info->value->has_inserted = true;
1854             info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
1855             info->value->inserted->file = g_strdup(bs->filename);
1856             info->value->inserted->ro = bs->read_only;
1857             info->value->inserted->drv = g_strdup(bs->drv->format_name);
1858             info->value->inserted->encrypted = bs->encrypted;
1859             if (bs->backing_file[0]) {
1860                 info->value->inserted->has_backing_file = true;
1861                 info->value->inserted->backing_file = g_strdup(bs->backing_file);
1862             }
1863         }
1864 
1865         /* XXX: waiting for the qapi to support GSList */
1866         if (!cur_item) {
1867             head = cur_item = info;
1868         } else {
1869             cur_item->next = info;
1870             cur_item = info;
1871         }
1872     }
1873 
1874     return head;
1875 }
1876 
1877 static void bdrv_stats_iter(QObject *data, void *opaque)
1878 {
1879     QDict *qdict;
1880     Monitor *mon = opaque;
1881 
1882     qdict = qobject_to_qdict(data);
1883     monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1884 
1885     qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1886     monitor_printf(mon, " rd_bytes=%" PRId64
1887                         " wr_bytes=%" PRId64
1888                         " rd_operations=%" PRId64
1889                         " wr_operations=%" PRId64
1890                         " flush_operations=%" PRId64
1891                         " wr_total_time_ns=%" PRId64
1892                         " rd_total_time_ns=%" PRId64
1893                         " flush_total_time_ns=%" PRId64
1894                         "\n",
1895                         qdict_get_int(qdict, "rd_bytes"),
1896                         qdict_get_int(qdict, "wr_bytes"),
1897                         qdict_get_int(qdict, "rd_operations"),
1898                         qdict_get_int(qdict, "wr_operations"),
1899                         qdict_get_int(qdict, "flush_operations"),
1900                         qdict_get_int(qdict, "wr_total_time_ns"),
1901                         qdict_get_int(qdict, "rd_total_time_ns"),
1902                         qdict_get_int(qdict, "flush_total_time_ns"));
1903 }
1904 
1905 void bdrv_stats_print(Monitor *mon, const QObject *data)
1906 {
1907     qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
1908 }
1909 
1910 static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
1911 {
1912     QObject *res;
1913     QDict *dict;
1914 
1915     res = qobject_from_jsonf("{ 'stats': {"
1916                              "'rd_bytes': %" PRId64 ","
1917                              "'wr_bytes': %" PRId64 ","
1918                              "'rd_operations': %" PRId64 ","
1919                              "'wr_operations': %" PRId64 ","
1920                              "'wr_highest_offset': %" PRId64 ","
1921                              "'flush_operations': %" PRId64 ","
1922                              "'wr_total_time_ns': %" PRId64 ","
1923                              "'rd_total_time_ns': %" PRId64 ","
1924                              "'flush_total_time_ns': %" PRId64
1925                              "} }",
1926                              bs->nr_bytes[BDRV_ACCT_READ],
1927                              bs->nr_bytes[BDRV_ACCT_WRITE],
1928                              bs->nr_ops[BDRV_ACCT_READ],
1929                              bs->nr_ops[BDRV_ACCT_WRITE],
1930                              bs->wr_highest_sector *
1931                              (uint64_t)BDRV_SECTOR_SIZE,
1932                              bs->nr_ops[BDRV_ACCT_FLUSH],
1933                              bs->total_time_ns[BDRV_ACCT_WRITE],
1934                              bs->total_time_ns[BDRV_ACCT_READ],
1935                              bs->total_time_ns[BDRV_ACCT_FLUSH]);
1936     dict  = qobject_to_qdict(res);
1937 
1938     if (*bs->device_name) {
1939         qdict_put(dict, "device", qstring_from_str(bs->device_name));
1940     }
1941 
1942     if (bs->file) {
1943         QObject *parent = bdrv_info_stats_bs(bs->file);
1944         qdict_put_obj(dict, "parent", parent);
1945     }
1946 
1947     return res;
1948 }
1949 
1950 void bdrv_info_stats(Monitor *mon, QObject **ret_data)
1951 {
1952     QObject *obj;
1953     QList *devices;
1954     BlockDriverState *bs;
1955 
1956     devices = qlist_new();
1957 
1958     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1959         obj = bdrv_info_stats_bs(bs);
1960         qlist_append_obj(devices, obj);
1961     }
1962 
1963     *ret_data = QOBJECT(devices);
1964 }
1965 
1966 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
1967 {
1968     if (bs->backing_hd && bs->backing_hd->encrypted)
1969         return bs->backing_file;
1970     else if (bs->encrypted)
1971         return bs->filename;
1972     else
1973         return NULL;
1974 }
1975 
1976 void bdrv_get_backing_filename(BlockDriverState *bs,
1977                                char *filename, int filename_size)
1978 {
1979     if (!bs->backing_file) {
1980         pstrcpy(filename, filename_size, "");
1981     } else {
1982         pstrcpy(filename, filename_size, bs->backing_file);
1983     }
1984 }
1985 
1986 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1987                           const uint8_t *buf, int nb_sectors)
1988 {
1989     BlockDriver *drv = bs->drv;
1990     if (!drv)
1991         return -ENOMEDIUM;
1992     if (!drv->bdrv_write_compressed)
1993         return -ENOTSUP;
1994     if (bdrv_check_request(bs, sector_num, nb_sectors))
1995         return -EIO;
1996 
1997     if (bs->dirty_bitmap) {
1998         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1999     }
2000 
2001     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2002 }
2003 
2004 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2005 {
2006     BlockDriver *drv = bs->drv;
2007     if (!drv)
2008         return -ENOMEDIUM;
2009     if (!drv->bdrv_get_info)
2010         return -ENOTSUP;
2011     memset(bdi, 0, sizeof(*bdi));
2012     return drv->bdrv_get_info(bs, bdi);
2013 }
2014 
2015 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2016                       int64_t pos, int size)
2017 {
2018     BlockDriver *drv = bs->drv;
2019     if (!drv)
2020         return -ENOMEDIUM;
2021     if (drv->bdrv_save_vmstate)
2022         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2023     if (bs->file)
2024         return bdrv_save_vmstate(bs->file, buf, pos, size);
2025     return -ENOTSUP;
2026 }
2027 
2028 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2029                       int64_t pos, int size)
2030 {
2031     BlockDriver *drv = bs->drv;
2032     if (!drv)
2033         return -ENOMEDIUM;
2034     if (drv->bdrv_load_vmstate)
2035         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2036     if (bs->file)
2037         return bdrv_load_vmstate(bs->file, buf, pos, size);
2038     return -ENOTSUP;
2039 }
2040 
2041 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2042 {
2043     BlockDriver *drv = bs->drv;
2044 
2045     if (!drv || !drv->bdrv_debug_event) {
2046         return;
2047     }
2048 
2049     return drv->bdrv_debug_event(bs, event);
2050 
2051 }
2052 
2053 /**************************************************************/
2054 /* handling of snapshots */
2055 
2056 int bdrv_can_snapshot(BlockDriverState *bs)
2057 {
2058     BlockDriver *drv = bs->drv;
2059     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2060         return 0;
2061     }
2062 
2063     if (!drv->bdrv_snapshot_create) {
2064         if (bs->file != NULL) {
2065             return bdrv_can_snapshot(bs->file);
2066         }
2067         return 0;
2068     }
2069 
2070     return 1;
2071 }
2072 
2073 int bdrv_is_snapshot(BlockDriverState *bs)
2074 {
2075     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2076 }
2077 
2078 BlockDriverState *bdrv_snapshots(void)
2079 {
2080     BlockDriverState *bs;
2081 
2082     if (bs_snapshots) {
2083         return bs_snapshots;
2084     }
2085 
2086     bs = NULL;
2087     while ((bs = bdrv_next(bs))) {
2088         if (bdrv_can_snapshot(bs)) {
2089             bs_snapshots = bs;
2090             return bs;
2091         }
2092     }
2093     return NULL;
2094 }
2095 
2096 int bdrv_snapshot_create(BlockDriverState *bs,
2097                          QEMUSnapshotInfo *sn_info)
2098 {
2099     BlockDriver *drv = bs->drv;
2100     if (!drv)
2101         return -ENOMEDIUM;
2102     if (drv->bdrv_snapshot_create)
2103         return drv->bdrv_snapshot_create(bs, sn_info);
2104     if (bs->file)
2105         return bdrv_snapshot_create(bs->file, sn_info);
2106     return -ENOTSUP;
2107 }
2108 
2109 int bdrv_snapshot_goto(BlockDriverState *bs,
2110                        const char *snapshot_id)
2111 {
2112     BlockDriver *drv = bs->drv;
2113     int ret, open_ret;
2114 
2115     if (!drv)
2116         return -ENOMEDIUM;
2117     if (drv->bdrv_snapshot_goto)
2118         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2119 
2120     if (bs->file) {
2121         drv->bdrv_close(bs);
2122         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2123         open_ret = drv->bdrv_open(bs, bs->open_flags);
2124         if (open_ret < 0) {
2125             bdrv_delete(bs->file);
2126             bs->drv = NULL;
2127             return open_ret;
2128         }
2129         return ret;
2130     }
2131 
2132     return -ENOTSUP;
2133 }
2134 
2135 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2136 {
2137     BlockDriver *drv = bs->drv;
2138     if (!drv)
2139         return -ENOMEDIUM;
2140     if (drv->bdrv_snapshot_delete)
2141         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2142     if (bs->file)
2143         return bdrv_snapshot_delete(bs->file, snapshot_id);
2144     return -ENOTSUP;
2145 }
2146 
2147 int bdrv_snapshot_list(BlockDriverState *bs,
2148                        QEMUSnapshotInfo **psn_info)
2149 {
2150     BlockDriver *drv = bs->drv;
2151     if (!drv)
2152         return -ENOMEDIUM;
2153     if (drv->bdrv_snapshot_list)
2154         return drv->bdrv_snapshot_list(bs, psn_info);
2155     if (bs->file)
2156         return bdrv_snapshot_list(bs->file, psn_info);
2157     return -ENOTSUP;
2158 }
2159 
2160 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2161         const char *snapshot_name)
2162 {
2163     BlockDriver *drv = bs->drv;
2164     if (!drv) {
2165         return -ENOMEDIUM;
2166     }
2167     if (!bs->read_only) {
2168         return -EINVAL;
2169     }
2170     if (drv->bdrv_snapshot_load_tmp) {
2171         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2172     }
2173     return -ENOTSUP;
2174 }
2175 
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (capacity @buf_size) in a short human-readable
 * form and return @buf.
 *
 * Values up to 999 are printed as plain integers.  Larger values are scaled
 * by powers of 1024 and suffixed with K, M, G or T: one decimal digit is
 * shown while the scaled value is below 10, otherwise the value is rounded
 * to the nearest integer.  T is the largest suffix used.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = { 'K', 'M', 'G', 'T' };
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < 10 * unit) {
            /* Small multiples keep one fractional digit. */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            break;
        }
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            /* Round to nearest; the last suffix absorbs everything above. */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            break;
        }
    }
    return buf;
}
2205 
2206 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2207 {
2208     char buf1[128], date_buf[128], clock_buf[128];
2209 #ifdef _WIN32
2210     struct tm *ptm;
2211 #else
2212     struct tm tm;
2213 #endif
2214     time_t ti;
2215     int64_t secs;
2216 
2217     if (!sn) {
2218         snprintf(buf, buf_size,
2219                  "%-10s%-20s%7s%20s%15s",
2220                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2221     } else {
2222         ti = sn->date_sec;
2223 #ifdef _WIN32
2224         ptm = localtime(&ti);
2225         strftime(date_buf, sizeof(date_buf),
2226                  "%Y-%m-%d %H:%M:%S", ptm);
2227 #else
2228         localtime_r(&ti, &tm);
2229         strftime(date_buf, sizeof(date_buf),
2230                  "%Y-%m-%d %H:%M:%S", &tm);
2231 #endif
2232         secs = sn->vm_clock_nsec / 1000000000;
2233         snprintf(clock_buf, sizeof(clock_buf),
2234                  "%02d:%02d:%02d.%03d",
2235                  (int)(secs / 3600),
2236                  (int)((secs / 60) % 60),
2237                  (int)(secs % 60),
2238                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2239         snprintf(buf, buf_size,
2240                  "%-10s%-20s%7s%20s%15s",
2241                  sn->id_str, sn->name,
2242                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2243                  date_buf,
2244                  clock_buf);
2245     }
2246     return buf;
2247 }
2248 
2249 /**************************************************************/
2250 /* async I/Os */
2251 
2252 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2253                                  QEMUIOVector *qiov, int nb_sectors,
2254                                  BlockDriverCompletionFunc *cb, void *opaque)
2255 {
2256     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2257 
2258     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2259                                  cb, opaque, false);
2260 }
2261 
2262 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2263                                   QEMUIOVector *qiov, int nb_sectors,
2264                                   BlockDriverCompletionFunc *cb, void *opaque)
2265 {
2266     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2267 
2268     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2269                                  cb, opaque, true);
2270 }
2271 
2272 
/*
 * Shared completion state for one bdrv_aio_multiwrite() call.  The struct is
 * allocated zeroed with one callbacks[] entry per original request.
 */
2273 typedef struct MultiwriteCB {
2274     int error;          /* first negative result recorded by multiwrite_cb() */
2275     int num_requests;   /* submitted requests not yet completed */
2276     int num_callbacks;  /* number of entries in callbacks[] */
2277     struct {
2278         BlockDriverCompletionFunc *cb;  /* caller's completion function */
2279         void *opaque;                   /* argument passed to cb */
2280         QEMUIOVector *free_qiov;        /* merged qiov to destroy and free, or NULL */
2281         void *free_buf;                 /* zero-fill buffer released with qemu_vfree() */
2282     } callbacks[];
2283 } MultiwriteCB;
2284 
2285 static void multiwrite_user_cb(MultiwriteCB *mcb)
2286 {
2287     int i;
2288 
2289     for (i = 0; i < mcb->num_callbacks; i++) {
2290         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2291         if (mcb->callbacks[i].free_qiov) {
2292             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2293         }
2294         g_free(mcb->callbacks[i].free_qiov);
2295         qemu_vfree(mcb->callbacks[i].free_buf);
2296     }
2297 }
2298 
2299 static void multiwrite_cb(void *opaque, int ret)
2300 {
2301     MultiwriteCB *mcb = opaque;
2302 
2303     trace_multiwrite_cb(mcb, ret);
2304 
2305     if (ret < 0 && !mcb->error) {
2306         mcb->error = ret;
2307     }
2308 
2309     mcb->num_requests--;
2310     if (mcb->num_requests == 0) {
2311         multiwrite_user_cb(mcb);
2312         g_free(mcb);
2313     }
2314 }
2315 
2316 static int multiwrite_req_compare(const void *a, const void *b)
2317 {
2318     const BlockRequest *req1 = a, *req2 = b;
2319 
2320     /*
2321      * Note that we can't simply subtract req2->sector from req1->sector
2322      * here as that could overflow the return value.
2323      */
2324     if (req1->sector > req2->sector) {
2325         return 1;
2326     } else if (req1->sector < req2->sector) {
2327         return -1;
2328     } else {
2329         return 0;
2330     }
2331 }
2332 
2333 /*
2334  * Takes a bunch of requests and tries to merge them. Returns the number of
2335  * requests that remain after merging.
2336  */
2337 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2338     int num_reqs, MultiwriteCB *mcb)
2339 {
2340     int i, outidx;
2341 
2342     // Sort requests by start sector
2343     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2344 
2345     // Check if adjacent requests touch the same clusters. If so, combine them,
2346     // filling up gaps with zero sectors.
2347     outidx = 0;
2348     for (i = 1; i < num_reqs; i++) {
2349         int merge = 0;
2350         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2351 
2352         // This handles the cases that are valid for all block drivers, namely
2353         // exactly sequential writes and overlapping writes.
2354         if (reqs[i].sector <= oldreq_last) {
2355             merge = 1;
2356         }
2357 
2358         // The block driver may decide that it makes sense to combine requests
2359         // even if there is a gap of some sectors between them. In this case,
2360         // the gap is filled with zeros (therefore only applicable for yet
2361         // unused space in format like qcow2).
2362         if (!merge && bs->drv->bdrv_merge_requests) {
2363             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2364         }
2365 
2366         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2367             merge = 0;
2368         }
2369 
2370         if (merge) {
2371             size_t size;
2372             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2373             qemu_iovec_init(qiov,
2374                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2375 
2376             // Add the first request to the merged one. If the requests are
2377             // overlapping, drop the last sectors of the first request.
2378             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2379             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2380 
2381             // We might need to add some zeros between the two requests
2382             if (reqs[i].sector > oldreq_last) {
2383                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2384                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2385                 memset(buf, 0, zero_bytes);
2386                 qemu_iovec_add(qiov, buf, zero_bytes);
2387                 mcb->callbacks[i].free_buf = buf;
2388             }
2389 
2390             // Add the second request
2391             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2392 
2393             reqs[outidx].nb_sectors = qiov->size >> 9;
2394             reqs[outidx].qiov = qiov;
2395 
2396             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2397         } else {
2398             outidx++;
2399             reqs[outidx].sector     = reqs[i].sector;
2400             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2401             reqs[outidx].qiov       = reqs[i].qiov;
2402         }
2403     }
2404 
2405     return outidx + 1;
2406 }
2407 
2408 /*
2409  * Submit multiple AIO write requests at once.
2410  *
2411  * On success, the function returns 0 and all requests in the reqs array have
2412  * been submitted. In error case this function returns -1, and any of the
2413  * requests may or may not be submitted yet. In particular, this means that the
2414  * callback will be called for some of the requests, for others it won't. The
2415  * caller must check the error field of the BlockRequest to wait for the right
2416  * callbacks (if error != 0, no callback will be called).
2417  *
2418  * The implementation may modify the contents of the reqs array, e.g. to merge
2419  * requests. However, the fields opaque and error are left unmodified as they
2420  * are used to signal failure for a single request to the caller.
2421  */
2422 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2423 {
2424     BlockDriverAIOCB *acb;
2425     MultiwriteCB *mcb;
2426     int i;
2427 
2428     /* don't submit writes if we don't have a medium */
2429     if (bs->drv == NULL) {
2430         for (i = 0; i < num_reqs; i++) {
2431             reqs[i].error = -ENOMEDIUM;
2432         }
2433         return -1;
2434     }
2435 
2436     if (num_reqs == 0) {
2437         return 0;
2438     }
2439 
2440     // Create MultiwriteCB structure
2441     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2442     mcb->num_requests = 0;
2443     mcb->num_callbacks = num_reqs;
2444 
2445     for (i = 0; i < num_reqs; i++) {
2446         mcb->callbacks[i].cb = reqs[i].cb;
2447         mcb->callbacks[i].opaque = reqs[i].opaque;
2448     }
2449 
2450     // Check for mergable requests
2451     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2452 
2453     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2454 
2455     /*
2456      * Run the aio requests. As soon as one request can't be submitted
2457      * successfully, fail all requests that are not yet submitted (we must
2458      * return failure for all requests anyway)
2459      *
2460      * num_requests cannot be set to the right value immediately: If
2461      * bdrv_aio_writev fails for some request, num_requests would be too high
2462      * and therefore multiwrite_cb() would never recognize the multiwrite
2463      * request as completed. We also cannot use the loop variable i to set it
2464      * when the first request fails because the callback may already have been
2465      * called for previously submitted requests. Thus, num_requests must be
2466      * incremented for each request that is submitted.
2467      *
2468      * The problem that callbacks may be called early also means that we need
2469      * to take care that num_requests doesn't become 0 before all requests are
2470      * submitted - multiwrite_cb() would consider the multiwrite request
2471      * completed. A dummy request that is "completed" by a manual call to
2472      * multiwrite_cb() takes care of this.
2473      */
2474     mcb->num_requests = 1;
2475 
2476     // Run the aio requests
2477     for (i = 0; i < num_reqs; i++) {
2478         mcb->num_requests++;
2479         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2480             reqs[i].nb_sectors, multiwrite_cb, mcb);
2481 
2482         if (acb == NULL) {
2483             // We can only fail the whole thing if no request has been
2484             // submitted yet. Otherwise we'll wait for the submitted AIOs to
2485             // complete and report the error in the callback.
2486             if (i == 0) {
2487                 trace_bdrv_aio_multiwrite_earlyfail(mcb);
2488                 goto fail;
2489             } else {
2490                 trace_bdrv_aio_multiwrite_latefail(mcb, i);
2491                 multiwrite_cb(mcb, -EIO);
2492                 break;
2493             }
2494         }
2495     }
2496 
2497     /* Complete the dummy request */
2498     multiwrite_cb(mcb, 0);
2499 
2500     return 0;
2501 
2502 fail:
2503     for (i = 0; i < mcb->num_callbacks; i++) {
2504         reqs[i].error = -EIO;
2505     }
2506     g_free(mcb);
2507     return -1;
2508 }
2509 
2510 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2511 {
2512     acb->pool->cancel(acb);
2513 }
2514 
2515 
2516 /**************************************************************/
2517 /* async block device emulation */
2518 
/*
 * AIOCB used to emulate asynchronous I/O on top of a driver's synchronous
 * bdrv_read/bdrv_write hooks (see bdrv_aio_rw_vector()).
 */
2519 typedef struct BlockDriverAIOCBSync {
2520     BlockDriverAIOCB common;
2521     QEMUBH *bh;             /* bottom half that runs bdrv_aio_bh_cb() */
2522     int ret;                /* result of the synchronous read/write */
2523     /* vector translation state */
2524     QEMUIOVector *qiov;     /* caller's I/O vector */
2525     uint8_t *bounce;        /* linear buffer from qemu_blockalign() */
2526     int is_write;           /* non-zero for a write request */
2527 } BlockDriverAIOCBSync;
2528 
2529 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2530 {
2531     BlockDriverAIOCBSync *acb =
2532         container_of(blockacb, BlockDriverAIOCBSync, common);
2533     qemu_bh_delete(acb->bh);
2534     acb->bh = NULL;
2535     qemu_aio_release(acb);
2536 }
2537 
/* AIOCB pool for BlockDriverAIOCBSync requests; cancelled via bdrv_aio_cancel_em(). */
2538 static AIOPool bdrv_em_aio_pool = {
2539     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
2540     .cancel             = bdrv_aio_cancel_em,
2541 };
2542 
2543 static void bdrv_aio_bh_cb(void *opaque)
2544 {
2545     BlockDriverAIOCBSync *acb = opaque;
2546 
2547     if (!acb->is_write)
2548         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2549     qemu_vfree(acb->bounce);
2550     acb->common.cb(acb->common.opaque, acb->ret);
2551     qemu_bh_delete(acb->bh);
2552     acb->bh = NULL;
2553     qemu_aio_release(acb);
2554 }
2555 
2556 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2557                                             int64_t sector_num,
2558                                             QEMUIOVector *qiov,
2559                                             int nb_sectors,
2560                                             BlockDriverCompletionFunc *cb,
2561                                             void *opaque,
2562                                             int is_write)
2563 
2564 {
2565     BlockDriverAIOCBSync *acb;
2566 
2567     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2568     acb->is_write = is_write;
2569     acb->qiov = qiov;
2570     acb->bounce = qemu_blockalign(bs, qiov->size);
2571 
2572     if (!acb->bh)
2573         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2574 
2575     if (is_write) {
2576         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2577         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2578     } else {
2579         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2580     }
2581 
2582     qemu_bh_schedule(acb->bh);
2583 
2584     return &acb->common;
2585 }
2586 
2587 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2588         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2589         BlockDriverCompletionFunc *cb, void *opaque)
2590 {
2591     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2592 }
2593 
2594 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2595         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2596         BlockDriverCompletionFunc *cb, void *opaque)
2597 {
2598     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2599 }
2600 
2601 
/* AIOCB for requests that are carried out inside a coroutine. */
2602 typedef struct BlockDriverAIOCBCoroutine {
2603     BlockDriverAIOCB common;
2604     BlockRequest req;   /* sector, nb_sectors, qiov and the resulting error */
2605     bool is_write;      /* true for a write, false for a read */
2606     QEMUBH* bh;         /* bottom half that runs bdrv_co_em_bh() on completion */
2607 } BlockDriverAIOCBCoroutine;
2608 
2609 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
2610 {
2611     qemu_aio_flush();
2612 }
2613 
/* AIOCB pool for BlockDriverAIOCBCoroutine requests; cancelled via bdrv_aio_co_cancel_em(). */
2614 static AIOPool bdrv_em_co_aio_pool = {
2615     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
2616     .cancel             = bdrv_aio_co_cancel_em,
2617 };
2618 
2619 static void bdrv_co_em_bh(void *opaque)
2620 {
2621     BlockDriverAIOCBCoroutine *acb = opaque;
2622 
2623     acb->common.cb(acb->common.opaque, acb->req.error);
2624     qemu_bh_delete(acb->bh);
2625     qemu_aio_release(acb);
2626 }
2627 
2628 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2629 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2630 {
2631     BlockDriverAIOCBCoroutine *acb = opaque;
2632     BlockDriverState *bs = acb->common.bs;
2633 
2634     if (!acb->is_write) {
2635         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2636             acb->req.nb_sectors, acb->req.qiov);
2637     } else {
2638         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2639             acb->req.nb_sectors, acb->req.qiov);
2640     }
2641 
2642     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2643     qemu_bh_schedule(acb->bh);
2644 }
2645 
2646 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2647                                                int64_t sector_num,
2648                                                QEMUIOVector *qiov,
2649                                                int nb_sectors,
2650                                                BlockDriverCompletionFunc *cb,
2651                                                void *opaque,
2652                                                bool is_write)
2653 {
2654     Coroutine *co;
2655     BlockDriverAIOCBCoroutine *acb;
2656 
2657     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2658     acb->req.sector = sector_num;
2659     acb->req.nb_sectors = nb_sectors;
2660     acb->req.qiov = qiov;
2661     acb->is_write = is_write;
2662 
2663     co = qemu_coroutine_create(bdrv_co_do_rw);
2664     qemu_coroutine_enter(co, acb);
2665 
2666     return &acb->common;
2667 }
2668 
2669 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2670 {
2671     BlockDriverAIOCBCoroutine *acb = opaque;
2672     BlockDriverState *bs = acb->common.bs;
2673 
2674     acb->req.error = bdrv_co_flush(bs);
2675     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2676     qemu_bh_schedule(acb->bh);
2677 }
2678 
2679 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2680         BlockDriverCompletionFunc *cb, void *opaque)
2681 {
2682     trace_bdrv_aio_flush(bs, opaque);
2683 
2684     Coroutine *co;
2685     BlockDriverAIOCBCoroutine *acb;
2686 
2687     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2688     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2689     qemu_coroutine_enter(co, acb);
2690 
2691     return &acb->common;
2692 }
2693 
2694 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2695 {
2696     BlockDriverAIOCBCoroutine *acb = opaque;
2697     BlockDriverState *bs = acb->common.bs;
2698 
2699     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2700     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2701     qemu_bh_schedule(acb->bh);
2702 }
2703 
2704 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2705         int64_t sector_num, int nb_sectors,
2706         BlockDriverCompletionFunc *cb, void *opaque)
2707 {
2708     Coroutine *co;
2709     BlockDriverAIOCBCoroutine *acb;
2710 
2711     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2712 
2713     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2714     acb->req.sector = sector_num;
2715     acb->req.nb_sectors = nb_sectors;
2716     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2717     qemu_coroutine_enter(co, acb);
2718 
2719     return &acb->common;
2720 }
2721 
2722 void bdrv_init(void)
2723 {
2724     module_call_init(MODULE_INIT_BLOCK);
2725 }
2726 
2727 void bdrv_init_with_whitelist(void)
2728 {
2729     use_bdrv_whitelist = 1;
2730     bdrv_init();
2731 }
2732 
2733 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2734                    BlockDriverCompletionFunc *cb, void *opaque)
2735 {
2736     BlockDriverAIOCB *acb;
2737 
2738     if (pool->free_aiocb) {
2739         acb = pool->free_aiocb;
2740         pool->free_aiocb = acb->next;
2741     } else {
2742         acb = g_malloc0(pool->aiocb_size);
2743         acb->pool = pool;
2744     }
2745     acb->bs = bs;
2746     acb->cb = cb;
2747     acb->opaque = opaque;
2748     return acb;
2749 }
2750 
2751 void qemu_aio_release(void *p)
2752 {
2753     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2754     AIOPool *pool = acb->pool;
2755     acb->next = pool->free_aiocb;
2756     pool->free_aiocb = acb;
2757 }
2758 
2759 /**************************************************************/
2760 /* Coroutine block device emulation */
2761 
/* Links an AIO completion callback to the coroutine waiting for its result. */
2762 typedef struct CoroutineIOCompletion {
2763     Coroutine *coroutine;   /* coroutine re-entered by bdrv_co_io_em_complete() */
2764     int ret;                /* result stored by bdrv_co_io_em_complete() */
2765 } CoroutineIOCompletion;
2766 
2767 static void bdrv_co_io_em_complete(void *opaque, int ret)
2768 {
2769     CoroutineIOCompletion *co = opaque;
2770 
2771     co->ret = ret;
2772     qemu_coroutine_enter(co->coroutine, NULL);
2773 }
2774 
2775 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2776                                       int nb_sectors, QEMUIOVector *iov,
2777                                       bool is_write)
2778 {
2779     CoroutineIOCompletion co = {
2780         .coroutine = qemu_coroutine_self(),
2781     };
2782     BlockDriverAIOCB *acb;
2783 
2784     if (is_write) {
2785         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2786                                        bdrv_co_io_em_complete, &co);
2787     } else {
2788         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2789                                       bdrv_co_io_em_complete, &co);
2790     }
2791 
2792     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2793     if (!acb) {
2794         return -EIO;
2795     }
2796     qemu_coroutine_yield();
2797 
2798     return co.ret;
2799 }
2800 
2801 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2802                                          int64_t sector_num, int nb_sectors,
2803                                          QEMUIOVector *iov)
2804 {
2805     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2806 }
2807 
2808 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2809                                          int64_t sector_num, int nb_sectors,
2810                                          QEMUIOVector *iov)
2811 {
2812     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2813 }
2814 
2815 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2816 {
2817     RwCo *rwco = opaque;
2818 
2819     rwco->ret = bdrv_co_flush(rwco->bs);
2820 }
2821 
2822 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2823 {
2824     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2825         return 0;
2826     } else if (!bs->drv) {
2827         return 0;
2828     } else if (bs->drv->bdrv_co_flush) {
2829         return bs->drv->bdrv_co_flush(bs);
2830     } else if (bs->drv->bdrv_aio_flush) {
2831         BlockDriverAIOCB *acb;
2832         CoroutineIOCompletion co = {
2833             .coroutine = qemu_coroutine_self(),
2834         };
2835 
2836         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2837         if (acb == NULL) {
2838             return -EIO;
2839         } else {
2840             qemu_coroutine_yield();
2841             return co.ret;
2842         }
2843     } else {
2844         /*
2845          * Some block drivers always operate in either writethrough or unsafe
2846          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
2847          * know how the server works (because the behaviour is hardcoded or
2848          * depends on server-side configuration), so we can't ensure that
2849          * everything is safe on disk. Returning an error doesn't work because
2850          * that would break guests even if the server operates in writethrough
2851          * mode.
2852          *
2853          * Let's hope the user knows what he's doing.
2854          */
2855         return 0;
2856     }
2857 }
2858 
2859 int bdrv_flush(BlockDriverState *bs)
2860 {
2861     Coroutine *co;
2862     RwCo rwco = {
2863         .bs = bs,
2864         .ret = NOT_DONE,
2865     };
2866 
2867     if (qemu_in_coroutine()) {
2868         /* Fast-path if already in coroutine context */
2869         bdrv_flush_co_entry(&rwco);
2870     } else {
2871         co = qemu_coroutine_create(bdrv_flush_co_entry);
2872         qemu_coroutine_enter(co, &rwco);
2873         while (rwco.ret == NOT_DONE) {
2874             qemu_aio_wait();
2875         }
2876     }
2877 
2878     return rwco.ret;
2879 }
2880 
2881 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2882 {
2883     RwCo *rwco = opaque;
2884 
2885     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2886 }
2887 
2888 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2889                                  int nb_sectors)
2890 {
2891     if (!bs->drv) {
2892         return -ENOMEDIUM;
2893     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2894         return -EIO;
2895     } else if (bs->read_only) {
2896         return -EROFS;
2897     } else if (bs->drv->bdrv_co_discard) {
2898         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
2899     } else if (bs->drv->bdrv_aio_discard) {
2900         BlockDriverAIOCB *acb;
2901         CoroutineIOCompletion co = {
2902             .coroutine = qemu_coroutine_self(),
2903         };
2904 
2905         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
2906                                         bdrv_co_io_em_complete, &co);
2907         if (acb == NULL) {
2908             return -EIO;
2909         } else {
2910             qemu_coroutine_yield();
2911             return co.ret;
2912         }
2913     } else {
2914         return 0;
2915     }
2916 }
2917 
2918 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2919 {
2920     Coroutine *co;
2921     RwCo rwco = {
2922         .bs = bs,
2923         .sector_num = sector_num,
2924         .nb_sectors = nb_sectors,
2925         .ret = NOT_DONE,
2926     };
2927 
2928     if (qemu_in_coroutine()) {
2929         /* Fast-path if already in coroutine context */
2930         bdrv_discard_co_entry(&rwco);
2931     } else {
2932         co = qemu_coroutine_create(bdrv_discard_co_entry);
2933         qemu_coroutine_enter(co, &rwco);
2934         while (rwco.ret == NOT_DONE) {
2935             qemu_aio_wait();
2936         }
2937     }
2938 
2939     return rwco.ret;
2940 }
2941 
2942 /**************************************************************/
2943 /* removable device support */
2944 
2945 /**
2946  * Return TRUE if the media is present
2947  */
2948 int bdrv_is_inserted(BlockDriverState *bs)
2949 {
2950     BlockDriver *drv = bs->drv;
2951 
2952     if (!drv)
2953         return 0;
2954     if (!drv->bdrv_is_inserted)
2955         return 1;
2956     return drv->bdrv_is_inserted(bs);
2957 }
2958 
2959 /**
2960  * Return whether the media changed since the last call to this
2961  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
2962  */
2963 int bdrv_media_changed(BlockDriverState *bs)
2964 {
2965     BlockDriver *drv = bs->drv;
2966 
2967     if (drv && drv->bdrv_media_changed) {
2968         return drv->bdrv_media_changed(bs);
2969     }
2970     return -ENOTSUP;
2971 }
2972 
2973 /**
2974  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
2975  */
2976 void bdrv_eject(BlockDriverState *bs, int eject_flag)
2977 {
2978     BlockDriver *drv = bs->drv;
2979 
2980     if (drv && drv->bdrv_eject) {
2981         drv->bdrv_eject(bs, eject_flag);
2982     }
2983 }
2984 
2985 /**
2986  * Lock or unlock the media (if it is locked, the user won't be able
2987  * to eject it manually).
2988  */
2989 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
2990 {
2991     BlockDriver *drv = bs->drv;
2992 
2993     trace_bdrv_lock_medium(bs, locked);
2994 
2995     if (drv && drv->bdrv_lock_medium) {
2996         drv->bdrv_lock_medium(bs, locked);
2997     }
2998 }
2999 
3000 /* needed for generic scsi interface */
3001 
3002 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3003 {
3004     BlockDriver *drv = bs->drv;
3005 
3006     if (drv && drv->bdrv_ioctl)
3007         return drv->bdrv_ioctl(bs, req, buf);
3008     return -ENOTSUP;
3009 }
3010 
3011 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3012         unsigned long int req, void *buf,
3013         BlockDriverCompletionFunc *cb, void *opaque)
3014 {
3015     BlockDriver *drv = bs->drv;
3016 
3017     if (drv && drv->bdrv_aio_ioctl)
3018         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3019     return NULL;
3020 }
3021 
3022 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3023 {
3024     bs->buffer_alignment = align;
3025 }
3026 
3027 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3028 {
3029     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3030 }
3031 
3032 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3033 {
3034     int64_t bitmap_size;
3035 
3036     bs->dirty_count = 0;
3037     if (enable) {
3038         if (!bs->dirty_bitmap) {
3039             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3040                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3041             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3042 
3043             bs->dirty_bitmap = g_malloc0(bitmap_size);
3044         }
3045     } else {
3046         if (bs->dirty_bitmap) {
3047             g_free(bs->dirty_bitmap);
3048             bs->dirty_bitmap = NULL;
3049         }
3050     }
3051 }
3052 
3053 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3054 {
3055     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3056 
3057     if (bs->dirty_bitmap &&
3058         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3059         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3060             (1UL << (chunk % (sizeof(unsigned long) * 8))));
3061     } else {
3062         return 0;
3063     }
3064 }
3065 
3066 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3067                       int nr_sectors)
3068 {
3069     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3070 }
3071 
3072 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3073 {
3074     return bs->dirty_count;
3075 }
3076 
3077 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3078 {
3079     assert(bs->in_use != in_use);
3080     bs->in_use = in_use;
3081 }
3082 
3083 int bdrv_in_use(BlockDriverState *bs)
3084 {
3085     return bs->in_use;
3086 }
3087 
3088 void bdrv_iostatus_enable(BlockDriverState *bs)
3089 {
3090     bs->iostatus_enabled = true;
3091     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3092 }
3093 
3094 /* The I/O status is only enabled if the drive explicitly
3095  * enables it _and_ the VM is configured to stop on errors */
3096 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3097 {
3098     return (bs->iostatus_enabled &&
3099            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3100             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3101             bs->on_read_error == BLOCK_ERR_STOP_ANY));
3102 }
3103 
3104 void bdrv_iostatus_disable(BlockDriverState *bs)
3105 {
3106     bs->iostatus_enabled = false;
3107 }
3108 
3109 void bdrv_iostatus_reset(BlockDriverState *bs)
3110 {
3111     if (bdrv_iostatus_is_enabled(bs)) {
3112         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3113     }
3114 }
3115 
3116 /* XXX: Today this is set by device models because it makes the implementation
3117    quite simple. However, the block layer knows about the error, so it's
3118    possible to implement this without device models being involved */
3119 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3120 {
3121     if (bdrv_iostatus_is_enabled(bs) &&
3122         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3123         assert(error >= 0);
3124         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3125                                          BLOCK_DEVICE_IO_STATUS_FAILED;
3126     }
3127 }
3128 
3129 void
3130 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3131         enum BlockAcctType type)
3132 {
3133     assert(type < BDRV_MAX_IOTYPE);
3134 
3135     cookie->bytes = bytes;
3136     cookie->start_time_ns = get_clock();
3137     cookie->type = type;
3138 }
3139 
3140 void
3141 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3142 {
3143     assert(cookie->type < BDRV_MAX_IOTYPE);
3144 
3145     bs->nr_bytes[cookie->type] += cookie->bytes;
3146     bs->nr_ops[cookie->type]++;
3147     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3148 }
3149 
3150 int bdrv_img_create(const char *filename, const char *fmt,
3151                     const char *base_filename, const char *base_fmt,
3152                     char *options, uint64_t img_size, int flags)
3153 {
3154     QEMUOptionParameter *param = NULL, *create_options = NULL;
3155     QEMUOptionParameter *backing_fmt, *backing_file, *size;
3156     BlockDriverState *bs = NULL;
3157     BlockDriver *drv, *proto_drv;
3158     BlockDriver *backing_drv = NULL;
3159     int ret = 0;
3160 
3161     /* Find driver and parse its options */
3162     drv = bdrv_find_format(fmt);
3163     if (!drv) {
3164         error_report("Unknown file format '%s'", fmt);
3165         ret = -EINVAL;
3166         goto out;
3167     }
3168 
3169     proto_drv = bdrv_find_protocol(filename);
3170     if (!proto_drv) {
3171         error_report("Unknown protocol '%s'", filename);
3172         ret = -EINVAL;
3173         goto out;
3174     }
3175 
3176     create_options = append_option_parameters(create_options,
3177                                               drv->create_options);
3178     create_options = append_option_parameters(create_options,
3179                                               proto_drv->create_options);
3180 
3181     /* Create parameter list with default values */
3182     param = parse_option_parameters("", create_options, param);
3183 
3184     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3185 
3186     /* Parse -o options */
3187     if (options) {
3188         param = parse_option_parameters(options, create_options, param);
3189         if (param == NULL) {
3190             error_report("Invalid options for file format '%s'.", fmt);
3191             ret = -EINVAL;
3192             goto out;
3193         }
3194     }
3195 
3196     if (base_filename) {
3197         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3198                                  base_filename)) {
3199             error_report("Backing file not supported for file format '%s'",
3200                          fmt);
3201             ret = -EINVAL;
3202             goto out;
3203         }
3204     }
3205 
3206     if (base_fmt) {
3207         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3208             error_report("Backing file format not supported for file "
3209                          "format '%s'", fmt);
3210             ret = -EINVAL;
3211             goto out;
3212         }
3213     }
3214 
3215     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3216     if (backing_file && backing_file->value.s) {
3217         if (!strcmp(filename, backing_file->value.s)) {
3218             error_report("Error: Trying to create an image with the "
3219                          "same filename as the backing file");
3220             ret = -EINVAL;
3221             goto out;
3222         }
3223     }
3224 
3225     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3226     if (backing_fmt && backing_fmt->value.s) {
3227         backing_drv = bdrv_find_format(backing_fmt->value.s);
3228         if (!backing_drv) {
3229             error_report("Unknown backing file format '%s'",
3230                          backing_fmt->value.s);
3231             ret = -EINVAL;
3232             goto out;
3233         }
3234     }
3235 
3236     // The size for the image must always be specified, with one exception:
3237     // If we are using a backing file, we can obtain the size from there
3238     size = get_option_parameter(param, BLOCK_OPT_SIZE);
3239     if (size && size->value.n == -1) {
3240         if (backing_file && backing_file->value.s) {
3241             uint64_t size;
3242             char buf[32];
3243 
3244             bs = bdrv_new("");
3245 
3246             ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3247             if (ret < 0) {
3248                 error_report("Could not open '%s'", backing_file->value.s);
3249                 goto out;
3250             }
3251             bdrv_get_geometry(bs, &size);
3252             size *= 512;
3253 
3254             snprintf(buf, sizeof(buf), "%" PRId64, size);
3255             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3256         } else {
3257             error_report("Image creation needs a size parameter");
3258             ret = -EINVAL;
3259             goto out;
3260         }
3261     }
3262 
3263     printf("Formatting '%s', fmt=%s ", filename, fmt);
3264     print_option_parameters(param);
3265     puts("");
3266 
3267     ret = bdrv_create(drv, filename, param);
3268 
3269     if (ret < 0) {
3270         if (ret == -ENOTSUP) {
3271             error_report("Formatting or formatting option not supported for "
3272                          "file format '%s'", fmt);
3273         } else if (ret == -EFBIG) {
3274             error_report("The image size is too large for file format '%s'",
3275                          fmt);
3276         } else {
3277             error_report("%s: error while creating %s: %s", filename, fmt,
3278                          strerror(-ret));
3279         }
3280     }
3281 
3282 out:
3283     free_option_parameters(create_options);
3284     free_option_parameters(param);
3285 
3286     if (bs) {
3287         bdrv_delete(bs);
3288     }
3289 
3290     return ret;
3291 }
3292