xref: /openbmc/qemu/block.c (revision 8a4266144ebffceb8fda2a5feb03d23c535923d4)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "monitor.h"
27 #include "block_int.h"
28 #include "module.h"
29 #include "qemu-objects.h"
30 
31 #ifdef CONFIG_BSD
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <sys/ioctl.h>
35 #include <sys/queue.h>
36 #ifndef __DragonFly__
37 #include <sys/disk.h>
38 #endif
39 #endif
40 
41 #ifdef _WIN32
42 #include <windows.h>
43 #endif
44 
45 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
46         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
47         BlockDriverCompletionFunc *cb, void *opaque);
48 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
49         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
50         BlockDriverCompletionFunc *cb, void *opaque);
51 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
52         BlockDriverCompletionFunc *cb, void *opaque);
53 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
54         BlockDriverCompletionFunc *cb, void *opaque);
55 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
56                         uint8_t *buf, int nb_sectors);
57 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
58                          const uint8_t *buf, int nb_sectors);
59 
60 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
61     QTAILQ_HEAD_INITIALIZER(bdrv_states);
62 
63 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
64     QLIST_HEAD_INITIALIZER(bdrv_drivers);
65 
66 /* The device to use for VM snapshots */
67 static BlockDriverState *bs_snapshots;
68 
69 /* If non-zero, use only whitelisted block drivers */
70 static int use_bdrv_whitelist;
71 
/*
 * Tell whether 'path' is absolute.
 *
 * Everything up to and including the first ':' (if there is one) is skipped;
 * the path is absolute when the remainder starts with '/' (or, on Windows,
 * with '/' or '\').  On Windows a path that itself starts with '/' or '\' is
 * always absolute.
 *
 * Returns 1 if absolute, 0 otherwise.
 */
int path_is_absolute(const char *path)
{
    const char *colon;
    const char *rest;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (path[0] == '/' || path[0] == '\\') {
        return 1;
    }
#endif

    colon = strchr(path, ':');
    rest = colon ? colon + 1 : path;

#ifdef _WIN32
    return (rest[0] == '/' || rest[0] == '\\');
#else
    return (rest[0] == '/');
#endif
}
91 
/*
 * Build the path of 'filename' into 'dest' (at most 'dest_size' bytes).
 *
 * An absolute 'filename' is copied unchanged.  Otherwise the directory part
 * of 'base_path' (everything up to the last path separator, or up to the
 * first ':' if that comes later) is used as prefix and 'filename' is
 * appended to it.  URLs are supported.  Nothing is written when
 * 'dest_size' is not positive.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *after_colon;
    const char *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* end of a leading "protocol:" style prefix, if any */
    after_colon = strchr(base_path, ':');
    after_colon = after_colon ? after_colon + 1 : base_path;

    /* end of the directory part, if any */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *backslash;
        backslash = strrchr(base_path, '\\');
        if (!after_sep || backslash > after_sep) {
            after_sep = backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    /* keep whichever prefix reaches further into base_path */
    if (after_sep > after_colon) {
        after_colon = after_sep;
    }

    prefix_len = after_colon - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
135 
136 void bdrv_register(BlockDriver *bdrv)
137 {
138     if (!bdrv->bdrv_aio_readv) {
139         /* add AIO emulation layer */
140         bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
141         bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
142     } else if (!bdrv->bdrv_read) {
143         /* add synchronous IO emulation layer */
144         bdrv->bdrv_read = bdrv_read_em;
145         bdrv->bdrv_write = bdrv_write_em;
146     }
147 
148     if (!bdrv->bdrv_aio_flush)
149         bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
150 
151     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
152 }
153 
154 /* create a new block device (by default it is empty) */
155 BlockDriverState *bdrv_new(const char *device_name)
156 {
157     BlockDriverState *bs;
158 
159     bs = qemu_mallocz(sizeof(BlockDriverState));
160     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
161     if (device_name[0] != '\0') {
162         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
163     }
164     return bs;
165 }
166 
167 BlockDriver *bdrv_find_format(const char *format_name)
168 {
169     BlockDriver *drv1;
170     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
171         if (!strcmp(drv1->format_name, format_name)) {
172             return drv1;
173         }
174     }
175     return NULL;
176 }
177 
178 static int bdrv_is_whitelisted(BlockDriver *drv)
179 {
180     static const char *whitelist[] = {
181         CONFIG_BDRV_WHITELIST
182     };
183     const char **p;
184 
185     if (!whitelist[0])
186         return 1;               /* no whitelist, anything goes */
187 
188     for (p = whitelist; *p; p++) {
189         if (!strcmp(drv->format_name, *p)) {
190             return 1;
191         }
192     }
193     return 0;
194 }
195 
196 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
197 {
198     BlockDriver *drv = bdrv_find_format(format_name);
199     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
200 }
201 
202 int bdrv_create(BlockDriver *drv, const char* filename,
203     QEMUOptionParameter *options)
204 {
205     if (!drv->bdrv_create)
206         return -ENOTSUP;
207 
208     return drv->bdrv_create(filename, options);
209 }
210 
211 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
212 {
213     BlockDriver *drv;
214 
215     drv = bdrv_find_protocol(filename);
216     if (drv == NULL) {
217         drv = bdrv_find_format("file");
218     }
219 
220     return bdrv_create(drv, filename, options);
221 }
222 
#ifdef _WIN32
/*
 * Store the name of a temporary file in 'filename'.  The name is obtained
 * from GetTempFileName() with the prefix "qem" inside the directory reported
 * by GetTempPath().  'size' is not used by this variant.
 */
void get_tmp_filename(char *filename, int size)
{
    char tmp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, tmp_dir);
    GetTempFileName(tmp_dir, "qem", 0, filename);
}
#else
/*
 * Store the name of a temporary file in 'filename' (at most 'size' bytes).
 * The file is created by mkstemp() from the template "<dir>/vl.XXXXXX",
 * where <dir> is $TMPDIR or "/tmp" when TMPDIR is unset; the descriptor is
 * closed again right away.
 */
void get_tmp_filename(char *filename, int size)
{
    const char *dir = getenv("TMPDIR");
    int fd;

    /* XXX: race condition possible */
    if (dir == NULL) {
        dir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", dir);
    fd = mkstemp(filename);
    close(fd);
}
#endif
245 
246 #ifdef _WIN32
/*
 * Return 1 when 'filename' starts with an ASCII letter followed by ':'
 * (for example "c:"), 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only looked at once filename[0] is known to be a letter */
    return is_letter && filename[1] == ':';
}
253 
/*
 * Return 1 when 'filename' is exactly a drive prefix such as "d:", or when
 * it starts with "\\.\" or "//./"; return 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }

    if (strstart(filename, "\\\\.\\", NULL)) {
        return 1;
    }
    if (strstart(filename, "//./", NULL)) {
        return 1;
    }

    return 0;
}
264 #endif
265 
266 /*
267  * Detect host devices. By convention, /dev/cdrom[N] is always
268  * recognized as a host CDROM.
269  */
270 static BlockDriver *find_hdev_driver(const char *filename)
271 {
272     int score_max = 0, score;
273     BlockDriver *drv = NULL, *d;
274 
275     QLIST_FOREACH(d, &bdrv_drivers, list) {
276         if (d->bdrv_probe_device) {
277             score = d->bdrv_probe_device(filename);
278             if (score > score_max) {
279                 score_max = score;
280                 drv = d;
281             }
282         }
283     }
284 
285     return drv;
286 }
287 
288 BlockDriver *bdrv_find_protocol(const char *filename)
289 {
290     BlockDriver *drv1;
291     char protocol[128];
292     int len;
293     const char *p;
294 
295     /* TODO Drivers without bdrv_file_open must be specified explicitly */
296 
297     /*
298      * XXX(hch): we really should not let host device detection
299      * override an explicit protocol specification, but moving this
300      * later breaks access to device names with colons in them.
301      * Thanks to the brain-dead persistent naming schemes on udev-
302      * based Linux systems those actually are quite common.
303      */
304     drv1 = find_hdev_driver(filename);
305     if (drv1) {
306         return drv1;
307     }
308 
309 #ifdef _WIN32
310      if (is_windows_drive(filename) ||
311          is_windows_drive_prefix(filename))
312          return bdrv_find_format("file");
313 #endif
314 
315     p = strchr(filename, ':');
316     if (!p) {
317         return bdrv_find_format("file");
318     }
319     len = p - filename;
320     if (len > sizeof(protocol) - 1)
321         len = sizeof(protocol) - 1;
322     memcpy(protocol, filename, len);
323     protocol[len] = '\0';
324     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
325         if (drv1->protocol_name &&
326             !strcmp(drv1->protocol_name, protocol)) {
327             return drv1;
328         }
329     }
330     return NULL;
331 }
332 
333 static int find_image_format(const char *filename, BlockDriver **pdrv)
334 {
335     int ret, score, score_max;
336     BlockDriver *drv1, *drv;
337     uint8_t buf[2048];
338     BlockDriverState *bs;
339 
340     ret = bdrv_file_open(&bs, filename, 0);
341     if (ret < 0) {
342         *pdrv = NULL;
343         return ret;
344     }
345 
346     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
347     if (bs->sg || !bdrv_is_inserted(bs)) {
348         bdrv_delete(bs);
349         drv = bdrv_find_format("raw");
350         if (!drv) {
351             ret = -ENOENT;
352         }
353         *pdrv = drv;
354         return ret;
355     }
356 
357     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
358     bdrv_delete(bs);
359     if (ret < 0) {
360         *pdrv = NULL;
361         return ret;
362     }
363 
364     score_max = 0;
365     drv = NULL;
366     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
367         if (drv1->bdrv_probe) {
368             score = drv1->bdrv_probe(buf, ret, filename);
369             if (score > score_max) {
370                 score_max = score;
371                 drv = drv1;
372             }
373         }
374     }
375     if (!drv) {
376         ret = -ENOENT;
377     }
378     *pdrv = drv;
379     return ret;
380 }
381 
382 /**
383  * Set the current 'total_sectors' value
384  */
385 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
386 {
387     BlockDriver *drv = bs->drv;
388 
389     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
390     if (bs->sg)
391         return 0;
392 
393     /* query actual device if possible, otherwise just trust the hint */
394     if (drv->bdrv_getlength) {
395         int64_t length = drv->bdrv_getlength(bs);
396         if (length < 0) {
397             return length;
398         }
399         hint = length >> BDRV_SECTOR_BITS;
400     }
401 
402     bs->total_sectors = hint;
403     return 0;
404 }
405 
406 /*
407  * Common part for opening disk images and files
408  */
409 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
410     int flags, BlockDriver *drv)
411 {
412     int ret, open_flags;
413 
414     assert(drv != NULL);
415 
416     bs->file = NULL;
417     bs->total_sectors = 0;
418     bs->encrypted = 0;
419     bs->valid_key = 0;
420     bs->open_flags = flags;
421     /* buffer_alignment defaulted to 512, drivers can change this value */
422     bs->buffer_alignment = 512;
423 
424     pstrcpy(bs->filename, sizeof(bs->filename), filename);
425 
426     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
427         return -ENOTSUP;
428     }
429 
430     bs->drv = drv;
431     bs->opaque = qemu_mallocz(drv->instance_size);
432 
433     /*
434      * Yes, BDRV_O_NOCACHE aka O_DIRECT means we have to present a
435      * write cache to the guest.  We do need the fdatasync to flush
436      * out transactions for block allocations, and we maybe have a
437      * volatile write cache in our backing device to deal with.
438      */
439     if (flags & (BDRV_O_CACHE_WB|BDRV_O_NOCACHE))
440         bs->enable_write_cache = 1;
441 
442     /*
443      * Clear flags that are internal to the block layer before opening the
444      * image.
445      */
446     open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
447 
448     /*
449      * Snapshots should be writeable.
450      */
451     if (bs->is_temporary) {
452         open_flags |= BDRV_O_RDWR;
453     }
454 
455     /* Open the image, either directly or using a protocol */
456     if (drv->bdrv_file_open) {
457         ret = drv->bdrv_file_open(bs, filename, open_flags);
458     } else {
459         ret = bdrv_file_open(&bs->file, filename, open_flags);
460         if (ret >= 0) {
461             ret = drv->bdrv_open(bs, open_flags);
462         }
463     }
464 
465     if (ret < 0) {
466         goto free_and_fail;
467     }
468 
469     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
470 
471     ret = refresh_total_sectors(bs, bs->total_sectors);
472     if (ret < 0) {
473         goto free_and_fail;
474     }
475 
476 #ifndef _WIN32
477     if (bs->is_temporary) {
478         unlink(filename);
479     }
480 #endif
481     return 0;
482 
483 free_and_fail:
484     if (bs->file) {
485         bdrv_delete(bs->file);
486         bs->file = NULL;
487     }
488     qemu_free(bs->opaque);
489     bs->opaque = NULL;
490     bs->drv = NULL;
491     return ret;
492 }
493 
494 /*
495  * Opens a file using a protocol (file, host_device, nbd, ...)
496  */
497 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
498 {
499     BlockDriverState *bs;
500     BlockDriver *drv;
501     int ret;
502 
503     drv = bdrv_find_protocol(filename);
504     if (!drv) {
505         return -ENOENT;
506     }
507 
508     bs = bdrv_new("");
509     ret = bdrv_open_common(bs, filename, flags, drv);
510     if (ret < 0) {
511         bdrv_delete(bs);
512         return ret;
513     }
514     bs->growable = 1;
515     *pbs = bs;
516     return 0;
517 }
518 
519 /*
520  * Opens a disk image (raw, qcow2, vmdk, ...)
521  */
522 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
523               BlockDriver *drv)
524 {
525     int ret;
526     int probed = 0;
527 
528     if (flags & BDRV_O_SNAPSHOT) {
529         BlockDriverState *bs1;
530         int64_t total_size;
531         int is_protocol = 0;
532         BlockDriver *bdrv_qcow2;
533         QEMUOptionParameter *options;
534         char tmp_filename[PATH_MAX];
535         char backing_filename[PATH_MAX];
536 
537         /* if snapshot, we create a temporary backing file and open it
538            instead of opening 'filename' directly */
539 
540         /* if there is a backing file, use it */
541         bs1 = bdrv_new("");
542         ret = bdrv_open(bs1, filename, 0, drv);
543         if (ret < 0) {
544             bdrv_delete(bs1);
545             return ret;
546         }
547         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
548 
549         if (bs1->drv && bs1->drv->protocol_name)
550             is_protocol = 1;
551 
552         bdrv_delete(bs1);
553 
554         get_tmp_filename(tmp_filename, sizeof(tmp_filename));
555 
556         /* Real path is meaningless for protocols */
557         if (is_protocol)
558             snprintf(backing_filename, sizeof(backing_filename),
559                      "%s", filename);
560         else if (!realpath(filename, backing_filename))
561             return -errno;
562 
563         bdrv_qcow2 = bdrv_find_format("qcow2");
564         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
565 
566         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
567         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
568         if (drv) {
569             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
570                 drv->format_name);
571         }
572 
573         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
574         free_option_parameters(options);
575         if (ret < 0) {
576             return ret;
577         }
578 
579         filename = tmp_filename;
580         drv = bdrv_qcow2;
581         bs->is_temporary = 1;
582     }
583 
584     /* Find the right image format driver */
585     if (!drv) {
586         ret = find_image_format(filename, &drv);
587         probed = 1;
588     }
589 
590     if (!drv) {
591         goto unlink_and_fail;
592     }
593 
594     /* Open the image */
595     ret = bdrv_open_common(bs, filename, flags, drv);
596     if (ret < 0) {
597         goto unlink_and_fail;
598     }
599 
600     bs->probed = probed;
601 
602     /* If there is a backing file, use it */
603     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
604         char backing_filename[PATH_MAX];
605         int back_flags;
606         BlockDriver *back_drv = NULL;
607 
608         bs->backing_hd = bdrv_new("");
609         path_combine(backing_filename, sizeof(backing_filename),
610                      filename, bs->backing_file);
611         if (bs->backing_format[0] != '\0')
612             back_drv = bdrv_find_format(bs->backing_format);
613 
614         /* backing files always opened read-only */
615         back_flags =
616             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
617 
618         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
619         if (ret < 0) {
620             bdrv_close(bs);
621             return ret;
622         }
623         if (bs->is_temporary) {
624             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
625         } else {
626             /* base image inherits from "parent" */
627             bs->backing_hd->keep_read_only = bs->keep_read_only;
628         }
629     }
630 
631     if (!bdrv_key_required(bs)) {
632         /* call the change callback */
633         bs->media_changed = 1;
634         if (bs->change_cb)
635             bs->change_cb(bs->change_opaque);
636     }
637 
638     return 0;
639 
640 unlink_and_fail:
641     if (bs->is_temporary) {
642         unlink(filename);
643     }
644     return ret;
645 }
646 
647 void bdrv_close(BlockDriverState *bs)
648 {
649     if (bs->drv) {
650         if (bs == bs_snapshots) {
651             bs_snapshots = NULL;
652         }
653         if (bs->backing_hd) {
654             bdrv_delete(bs->backing_hd);
655             bs->backing_hd = NULL;
656         }
657         bs->drv->bdrv_close(bs);
658         qemu_free(bs->opaque);
659 #ifdef _WIN32
660         if (bs->is_temporary) {
661             unlink(bs->filename);
662         }
663 #endif
664         bs->opaque = NULL;
665         bs->drv = NULL;
666 
667         if (bs->file != NULL) {
668             bdrv_close(bs->file);
669         }
670 
671         /* call the change callback */
672         bs->media_changed = 1;
673         if (bs->change_cb)
674             bs->change_cb(bs->change_opaque);
675     }
676 }
677 
678 void bdrv_close_all(void)
679 {
680     BlockDriverState *bs;
681 
682     QTAILQ_FOREACH(bs, &bdrv_states, list) {
683         bdrv_close(bs);
684     }
685 }
686 
687 void bdrv_delete(BlockDriverState *bs)
688 {
689     assert(!bs->peer);
690 
691     /* remove from list, if necessary */
692     if (bs->device_name[0] != '\0') {
693         QTAILQ_REMOVE(&bdrv_states, bs, list);
694     }
695 
696     bdrv_close(bs);
697     if (bs->file != NULL) {
698         bdrv_delete(bs->file);
699     }
700 
701     assert(bs != bs_snapshots);
702     qemu_free(bs);
703 }
704 
705 int bdrv_attach(BlockDriverState *bs, DeviceState *qdev)
706 {
707     if (bs->peer) {
708         return -EBUSY;
709     }
710     bs->peer = qdev;
711     return 0;
712 }
713 
714 void bdrv_detach(BlockDriverState *bs, DeviceState *qdev)
715 {
716     assert(bs->peer == qdev);
717     bs->peer = NULL;
718 }
719 
720 DeviceState *bdrv_get_attached(BlockDriverState *bs)
721 {
722     return bs->peer;
723 }
724 
725 /*
726  * Run consistency checks on an image
727  *
728  * Returns 0 if the check could be completed (it doesn't mean that the image is
729  * free of errors) or -errno when an internal error occured. The results of the
730  * check are stored in res.
731  */
732 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
733 {
734     if (bs->drv->bdrv_check == NULL) {
735         return -ENOTSUP;
736     }
737 
738     memset(res, 0, sizeof(*res));
739     return bs->drv->bdrv_check(bs, res);
740 }
741 
742 #define COMMIT_BUF_SECTORS 2048
743 
744 /* commit COW file into the raw image */
745 int bdrv_commit(BlockDriverState *bs)
746 {
747     BlockDriver *drv = bs->drv;
748     int64_t sector, total_sectors;
749     int n, ro, open_flags;
750     int ret = 0, rw_ret = 0;
751     uint8_t *buf;
752     char filename[1024];
753     BlockDriverState *bs_rw, *bs_ro;
754 
755     if (!drv)
756         return -ENOMEDIUM;
757 
758     if (!bs->backing_hd) {
759         return -ENOTSUP;
760     }
761 
762     if (bs->backing_hd->keep_read_only) {
763         return -EACCES;
764     }
765 
766     ro = bs->backing_hd->read_only;
767     strncpy(filename, bs->backing_hd->filename, sizeof(filename));
768     open_flags =  bs->backing_hd->open_flags;
769 
770     if (ro) {
771         /* re-open as RW */
772         bdrv_delete(bs->backing_hd);
773         bs->backing_hd = NULL;
774         bs_rw = bdrv_new("");
775         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR, drv);
776         if (rw_ret < 0) {
777             bdrv_delete(bs_rw);
778             /* try to re-open read-only */
779             bs_ro = bdrv_new("");
780             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, drv);
781             if (ret < 0) {
782                 bdrv_delete(bs_ro);
783                 /* drive not functional anymore */
784                 bs->drv = NULL;
785                 return ret;
786             }
787             bs->backing_hd = bs_ro;
788             return rw_ret;
789         }
790         bs->backing_hd = bs_rw;
791     }
792 
793     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
794     buf = qemu_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
795 
796     for (sector = 0; sector < total_sectors; sector += n) {
797         if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
798 
799             if (bdrv_read(bs, sector, buf, n) != 0) {
800                 ret = -EIO;
801                 goto ro_cleanup;
802             }
803 
804             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
805                 ret = -EIO;
806                 goto ro_cleanup;
807             }
808         }
809     }
810 
811     if (drv->bdrv_make_empty) {
812         ret = drv->bdrv_make_empty(bs);
813         bdrv_flush(bs);
814     }
815 
816     /*
817      * Make sure all data we wrote to the backing device is actually
818      * stable on disk.
819      */
820     if (bs->backing_hd)
821         bdrv_flush(bs->backing_hd);
822 
823 ro_cleanup:
824     qemu_free(buf);
825 
826     if (ro) {
827         /* re-open as RO */
828         bdrv_delete(bs->backing_hd);
829         bs->backing_hd = NULL;
830         bs_ro = bdrv_new("");
831         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR, drv);
832         if (ret < 0) {
833             bdrv_delete(bs_ro);
834             /* drive not functional anymore */
835             bs->drv = NULL;
836             return ret;
837         }
838         bs->backing_hd = bs_ro;
839         bs->backing_hd->keep_read_only = 0;
840     }
841 
842     return ret;
843 }
844 
845 void bdrv_commit_all(void)
846 {
847     BlockDriverState *bs;
848 
849     QTAILQ_FOREACH(bs, &bdrv_states, list) {
850         bdrv_commit(bs);
851     }
852 }
853 
854 /*
855  * Return values:
856  * 0        - success
857  * -EINVAL  - backing format specified, but no file
858  * -ENOSPC  - can't update the backing file because no space is left in the
859  *            image file header
860  * -ENOTSUP - format driver doesn't support changing the backing file
861  */
862 int bdrv_change_backing_file(BlockDriverState *bs,
863     const char *backing_file, const char *backing_fmt)
864 {
865     BlockDriver *drv = bs->drv;
866 
867     if (drv->bdrv_change_backing_file != NULL) {
868         return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
869     } else {
870         return -ENOTSUP;
871     }
872 }
873 
874 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
875                                    size_t size)
876 {
877     int64_t len;
878 
879     if (!bdrv_is_inserted(bs))
880         return -ENOMEDIUM;
881 
882     if (bs->growable)
883         return 0;
884 
885     len = bdrv_getlength(bs);
886 
887     if (offset < 0)
888         return -EIO;
889 
890     if ((offset > len) || (len - offset < size))
891         return -EIO;
892 
893     return 0;
894 }
895 
896 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
897                               int nb_sectors)
898 {
899     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
900                                    nb_sectors * BDRV_SECTOR_SIZE);
901 }
902 
903 /* return < 0 if error. See bdrv_write() for the return codes */
904 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
905               uint8_t *buf, int nb_sectors)
906 {
907     BlockDriver *drv = bs->drv;
908 
909     if (!drv)
910         return -ENOMEDIUM;
911     if (bdrv_check_request(bs, sector_num, nb_sectors))
912         return -EIO;
913 
914     return drv->bdrv_read(bs, sector_num, buf, nb_sectors);
915 }
916 
917 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
918                              int nb_sectors, int dirty)
919 {
920     int64_t start, end;
921     unsigned long val, idx, bit;
922 
923     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
924     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
925 
926     for (; start <= end; start++) {
927         idx = start / (sizeof(unsigned long) * 8);
928         bit = start % (sizeof(unsigned long) * 8);
929         val = bs->dirty_bitmap[idx];
930         if (dirty) {
931             if (!(val & (1 << bit))) {
932                 bs->dirty_count++;
933                 val |= 1 << bit;
934             }
935         } else {
936             if (val & (1 << bit)) {
937                 bs->dirty_count--;
938                 val &= ~(1 << bit);
939             }
940         }
941         bs->dirty_bitmap[idx] = val;
942     }
943 }
944 
945 /* Return < 0 if error. Important errors are:
946   -EIO         generic I/O error (may happen for all errors)
947   -ENOMEDIUM   No media inserted.
948   -EINVAL      Invalid sector number or nb_sectors
949   -EACCES      Trying to write a read-only device
950 */
951 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
952                const uint8_t *buf, int nb_sectors)
953 {
954     BlockDriver *drv = bs->drv;
955     if (!bs->drv)
956         return -ENOMEDIUM;
957     if (bs->read_only)
958         return -EACCES;
959     if (bdrv_check_request(bs, sector_num, nb_sectors))
960         return -EIO;
961 
962     if (bs->dirty_bitmap) {
963         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
964     }
965 
966     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
967         bs->wr_highest_sector = sector_num + nb_sectors - 1;
968     }
969 
970     return drv->bdrv_write(bs, sector_num, buf, nb_sectors);
971 }
972 
973 int bdrv_pread(BlockDriverState *bs, int64_t offset,
974                void *buf, int count1)
975 {
976     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
977     int len, nb_sectors, count;
978     int64_t sector_num;
979     int ret;
980 
981     count = count1;
982     /* first read to align to sector start */
983     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
984     if (len > count)
985         len = count;
986     sector_num = offset >> BDRV_SECTOR_BITS;
987     if (len > 0) {
988         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
989             return ret;
990         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
991         count -= len;
992         if (count == 0)
993             return count1;
994         sector_num++;
995         buf += len;
996     }
997 
998     /* read the sectors "in place" */
999     nb_sectors = count >> BDRV_SECTOR_BITS;
1000     if (nb_sectors > 0) {
1001         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1002             return ret;
1003         sector_num += nb_sectors;
1004         len = nb_sectors << BDRV_SECTOR_BITS;
1005         buf += len;
1006         count -= len;
1007     }
1008 
1009     /* add data from the last sector */
1010     if (count > 0) {
1011         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1012             return ret;
1013         memcpy(buf, tmp_buf, count);
1014     }
1015     return count1;
1016 }
1017 
1018 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1019                 const void *buf, int count1)
1020 {
1021     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1022     int len, nb_sectors, count;
1023     int64_t sector_num;
1024     int ret;
1025 
1026     count = count1;
1027     /* first write to align to sector start */
1028     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1029     if (len > count)
1030         len = count;
1031     sector_num = offset >> BDRV_SECTOR_BITS;
1032     if (len > 0) {
1033         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1034             return ret;
1035         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1036         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1037             return ret;
1038         count -= len;
1039         if (count == 0)
1040             return count1;
1041         sector_num++;
1042         buf += len;
1043     }
1044 
1045     /* write the sectors "in place" */
1046     nb_sectors = count >> BDRV_SECTOR_BITS;
1047     if (nb_sectors > 0) {
1048         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1049             return ret;
1050         sector_num += nb_sectors;
1051         len = nb_sectors << BDRV_SECTOR_BITS;
1052         buf += len;
1053         count -= len;
1054     }
1055 
1056     /* add data from the last sector */
1057     if (count > 0) {
1058         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1059             return ret;
1060         memcpy(tmp_buf, buf, count);
1061         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1062             return ret;
1063     }
1064     return count1;
1065 }
1066 
1067 /*
1068  * Writes to the file and ensures that no writes are reordered across this
1069  * request (acts as a barrier)
1070  *
1071  * Returns 0 on success, -errno in error cases.
1072  */
1073 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1074     const void *buf, int count)
1075 {
1076     int ret;
1077 
1078     ret = bdrv_pwrite(bs, offset, buf, count);
1079     if (ret < 0) {
1080         return ret;
1081     }
1082 
1083     /* No flush needed for cache=writethrough, it uses O_DSYNC */
1084     if ((bs->open_flags & BDRV_O_CACHE_MASK) != 0) {
1085         bdrv_flush(bs);
1086     }
1087 
1088     return 0;
1089 }
1090 
1091 /*
1092  * Writes to the file and ensures that no writes are reordered across this
1093  * request (acts as a barrier)
1094  *
1095  * Returns 0 on success, -errno in error cases.
1096  */
1097 int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
1098     const uint8_t *buf, int nb_sectors)
1099 {
1100     return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num,
1101         buf, BDRV_SECTOR_SIZE * nb_sectors);
1102 }
1103 
1104 /**
1105  * Truncate file to 'offset' bytes (needed only for file protocols)
1106  */
1107 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1108 {
1109     BlockDriver *drv = bs->drv;
1110     int ret;
1111     if (!drv)
1112         return -ENOMEDIUM;
1113     if (!drv->bdrv_truncate)
1114         return -ENOTSUP;
1115     if (bs->read_only)
1116         return -EACCES;
1117     ret = drv->bdrv_truncate(bs, offset);
1118     if (ret == 0) {
1119         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1120     }
1121     return ret;
1122 }
1123 
1124 /**
1125  * Length of a file in bytes. Return < 0 if error or unknown.
1126  */
1127 int64_t bdrv_getlength(BlockDriverState *bs)
1128 {
1129     BlockDriver *drv = bs->drv;
1130     if (!drv)
1131         return -ENOMEDIUM;
1132 
1133     /* Fixed size devices use the total_sectors value for speed instead of
1134        issuing a length query (like lseek) on each call.  Also, legacy block
1135        drivers don't provide a bdrv_getlength function and must use
1136        total_sectors. */
1137     if (!bs->growable || !drv->bdrv_getlength) {
1138         return bs->total_sectors * BDRV_SECTOR_SIZE;
1139     }
1140     return drv->bdrv_getlength(bs);
1141 }
1142 
1143 /* return 0 as number of sectors if no device present or error */
1144 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1145 {
1146     int64_t length;
1147     length = bdrv_getlength(bs);
1148     if (length < 0)
1149         length = 0;
1150     else
1151         length = length >> BDRV_SECTOR_BITS;
1152     *nb_sectors_ptr = length;
1153 }
1154 
/* One 16-byte entry of the MSDOS partition table (packed, no padding). */
struct partition {
    uint8_t boot_ind;       /* 0x80 - active */
    uint8_t head;           /* starting head */
    uint8_t sector;         /* starting sector */
    uint8_t cyl;            /* starting cylinder */
    uint8_t sys_ind;        /* partition type */
    uint8_t end_head;       /* end head */
    uint8_t end_sector;     /* end sector */
    uint8_t end_cyl;        /* end cylinder */
    uint32_t start_sect;    /* starting sector counting from 0 */
    uint32_t nr_sects;      /* number of sectors in partition */
} __attribute__((packed));
1167 
1168 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1169 static int guess_disk_lchs(BlockDriverState *bs,
1170                            int *pcylinders, int *pheads, int *psectors)
1171 {
1172     uint8_t buf[BDRV_SECTOR_SIZE];
1173     int ret, i, heads, sectors, cylinders;
1174     struct partition *p;
1175     uint32_t nr_sects;
1176     uint64_t nb_sectors;
1177 
1178     bdrv_get_geometry(bs, &nb_sectors);
1179 
1180     ret = bdrv_read(bs, 0, buf, 1);
1181     if (ret < 0)
1182         return -1;
1183     /* test msdos magic */
1184     if (buf[510] != 0x55 || buf[511] != 0xaa)
1185         return -1;
1186     for(i = 0; i < 4; i++) {
1187         p = ((struct partition *)(buf + 0x1be)) + i;
1188         nr_sects = le32_to_cpu(p->nr_sects);
1189         if (nr_sects && p->end_head) {
1190             /* We make the assumption that the partition terminates on
1191                a cylinder boundary */
1192             heads = p->end_head + 1;
1193             sectors = p->end_sector & 63;
1194             if (sectors == 0)
1195                 continue;
1196             cylinders = nb_sectors / (heads * sectors);
1197             if (cylinders < 1 || cylinders > 16383)
1198                 continue;
1199             *pheads = heads;
1200             *psectors = sectors;
1201             *pcylinders = cylinders;
1202 #if 0
1203             printf("guessed geometry: LCHS=%d %d %d\n",
1204                    cylinders, heads, sectors);
1205 #endif
1206             return 0;
1207         }
1208     }
1209     return -1;
1210 }
1211 
1212 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1213 {
1214     int translation, lba_detected = 0;
1215     int cylinders, heads, secs;
1216     uint64_t nb_sectors;
1217 
1218     /* if a geometry hint is available, use it */
1219     bdrv_get_geometry(bs, &nb_sectors);
1220     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1221     translation = bdrv_get_translation_hint(bs);
1222     if (cylinders != 0) {
1223         *pcyls = cylinders;
1224         *pheads = heads;
1225         *psecs = secs;
1226     } else {
1227         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1228             if (heads > 16) {
1229                 /* if heads > 16, it means that a BIOS LBA
1230                    translation was active, so the default
1231                    hardware geometry is OK */
1232                 lba_detected = 1;
1233                 goto default_geometry;
1234             } else {
1235                 *pcyls = cylinders;
1236                 *pheads = heads;
1237                 *psecs = secs;
1238                 /* disable any translation to be in sync with
1239                    the logical geometry */
1240                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1241                     bdrv_set_translation_hint(bs,
1242                                               BIOS_ATA_TRANSLATION_NONE);
1243                 }
1244             }
1245         } else {
1246         default_geometry:
1247             /* if no geometry, use a standard physical disk geometry */
1248             cylinders = nb_sectors / (16 * 63);
1249 
1250             if (cylinders > 16383)
1251                 cylinders = 16383;
1252             else if (cylinders < 2)
1253                 cylinders = 2;
1254             *pcyls = cylinders;
1255             *pheads = 16;
1256             *psecs = 63;
1257             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1258                 if ((*pcyls * *pheads) <= 131072) {
1259                     bdrv_set_translation_hint(bs,
1260                                               BIOS_ATA_TRANSLATION_LARGE);
1261                 } else {
1262                     bdrv_set_translation_hint(bs,
1263                                               BIOS_ATA_TRANSLATION_LBA);
1264                 }
1265             }
1266         }
1267         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1268     }
1269 }
1270 
1271 void bdrv_set_geometry_hint(BlockDriverState *bs,
1272                             int cyls, int heads, int secs)
1273 {
1274     bs->cyls = cyls;
1275     bs->heads = heads;
1276     bs->secs = secs;
1277 }
1278 
1279 void bdrv_set_type_hint(BlockDriverState *bs, int type)
1280 {
1281     bs->type = type;
1282     bs->removable = ((type == BDRV_TYPE_CDROM ||
1283                       type == BDRV_TYPE_FLOPPY));
1284 }
1285 
1286 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1287 {
1288     bs->translation = translation;
1289 }
1290 
1291 void bdrv_get_geometry_hint(BlockDriverState *bs,
1292                             int *pcyls, int *pheads, int *psecs)
1293 {
1294     *pcyls = bs->cyls;
1295     *pheads = bs->heads;
1296     *psecs = bs->secs;
1297 }
1298 
1299 int bdrv_get_type_hint(BlockDriverState *bs)
1300 {
1301     return bs->type;
1302 }
1303 
1304 int bdrv_get_translation_hint(BlockDriverState *bs)
1305 {
1306     return bs->translation;
1307 }
1308 
1309 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1310                        BlockErrorAction on_write_error)
1311 {
1312     bs->on_read_error = on_read_error;
1313     bs->on_write_error = on_write_error;
1314 }
1315 
1316 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1317 {
1318     return is_read ? bs->on_read_error : bs->on_write_error;
1319 }
1320 
1321 void bdrv_set_removable(BlockDriverState *bs, int removable)
1322 {
1323     bs->removable = removable;
1324     if (removable && bs == bs_snapshots) {
1325         bs_snapshots = NULL;
1326     }
1327 }
1328 
1329 int bdrv_is_removable(BlockDriverState *bs)
1330 {
1331     return bs->removable;
1332 }
1333 
1334 int bdrv_is_read_only(BlockDriverState *bs)
1335 {
1336     return bs->read_only;
1337 }
1338 
1339 int bdrv_is_sg(BlockDriverState *bs)
1340 {
1341     return bs->sg;
1342 }
1343 
1344 int bdrv_enable_write_cache(BlockDriverState *bs)
1345 {
1346     return bs->enable_write_cache;
1347 }
1348 
1349 /* XXX: no longer used */
1350 void bdrv_set_change_cb(BlockDriverState *bs,
1351                         void (*change_cb)(void *opaque), void *opaque)
1352 {
1353     bs->change_cb = change_cb;
1354     bs->change_opaque = opaque;
1355 }
1356 
1357 int bdrv_is_encrypted(BlockDriverState *bs)
1358 {
1359     if (bs->backing_hd && bs->backing_hd->encrypted)
1360         return 1;
1361     return bs->encrypted;
1362 }
1363 
1364 int bdrv_key_required(BlockDriverState *bs)
1365 {
1366     BlockDriverState *backing_hd = bs->backing_hd;
1367 
1368     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1369         return 1;
1370     return (bs->encrypted && !bs->valid_key);
1371 }
1372 
1373 int bdrv_set_key(BlockDriverState *bs, const char *key)
1374 {
1375     int ret;
1376     if (bs->backing_hd && bs->backing_hd->encrypted) {
1377         ret = bdrv_set_key(bs->backing_hd, key);
1378         if (ret < 0)
1379             return ret;
1380         if (!bs->encrypted)
1381             return 0;
1382     }
1383     if (!bs->encrypted) {
1384         return -EINVAL;
1385     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1386         return -ENOMEDIUM;
1387     }
1388     ret = bs->drv->bdrv_set_key(bs, key);
1389     if (ret < 0) {
1390         bs->valid_key = 0;
1391     } else if (!bs->valid_key) {
1392         bs->valid_key = 1;
1393         /* call the change callback now, we skipped it on open */
1394         bs->media_changed = 1;
1395         if (bs->change_cb)
1396             bs->change_cb(bs->change_opaque);
1397     }
1398     return ret;
1399 }
1400 
1401 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1402 {
1403     if (!bs->drv) {
1404         buf[0] = '\0';
1405     } else {
1406         pstrcpy(buf, buf_size, bs->drv->format_name);
1407     }
1408 }
1409 
1410 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1411                          void *opaque)
1412 {
1413     BlockDriver *drv;
1414 
1415     QLIST_FOREACH(drv, &bdrv_drivers, list) {
1416         it(opaque, drv->format_name);
1417     }
1418 }
1419 
1420 BlockDriverState *bdrv_find(const char *name)
1421 {
1422     BlockDriverState *bs;
1423 
1424     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1425         if (!strcmp(name, bs->device_name)) {
1426             return bs;
1427         }
1428     }
1429     return NULL;
1430 }
1431 
1432 BlockDriverState *bdrv_next(BlockDriverState *bs)
1433 {
1434     if (!bs) {
1435         return QTAILQ_FIRST(&bdrv_states);
1436     }
1437     return QTAILQ_NEXT(bs, list);
1438 }
1439 
1440 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1441 {
1442     BlockDriverState *bs;
1443 
1444     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1445         it(opaque, bs);
1446     }
1447 }
1448 
1449 const char *bdrv_get_device_name(BlockDriverState *bs)
1450 {
1451     return bs->device_name;
1452 }
1453 
1454 void bdrv_flush(BlockDriverState *bs)
1455 {
1456     if (bs->open_flags & BDRV_O_NO_FLUSH) {
1457         return;
1458     }
1459 
1460     if (bs->drv && bs->drv->bdrv_flush)
1461         bs->drv->bdrv_flush(bs);
1462 }
1463 
1464 void bdrv_flush_all(void)
1465 {
1466     BlockDriverState *bs;
1467 
1468     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1469         if (bs->drv && !bdrv_is_read_only(bs) &&
1470             (!bdrv_is_removable(bs) || bdrv_is_inserted(bs))) {
1471             bdrv_flush(bs);
1472         }
1473     }
1474 }
1475 
1476 int bdrv_has_zero_init(BlockDriverState *bs)
1477 {
1478     assert(bs->drv);
1479 
1480     if (bs->drv->no_zero_init) {
1481         return 0;
1482     } else if (bs->file) {
1483         return bdrv_has_zero_init(bs->file);
1484     }
1485 
1486     return 1;
1487 }
1488 
1489 /*
1490  * Returns true iff the specified sector is present in the disk image. Drivers
1491  * not implementing the functionality are assumed to not support backing files,
1492  * hence all their sectors are reported as allocated.
1493  *
1494  * 'pnum' is set to the number of sectors (including and immediately following
1495  * the specified sector) that are known to be in the same
1496  * allocated/unallocated state.
1497  *
1498  * 'nb_sectors' is the max value 'pnum' should be set to.
1499  */
1500 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1501 	int *pnum)
1502 {
1503     int64_t n;
1504     if (!bs->drv->bdrv_is_allocated) {
1505         if (sector_num >= bs->total_sectors) {
1506             *pnum = 0;
1507             return 0;
1508         }
1509         n = bs->total_sectors - sector_num;
1510         *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1511         return 1;
1512     }
1513     return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1514 }
1515 
1516 void bdrv_mon_event(const BlockDriverState *bdrv,
1517                     BlockMonEventAction action, int is_read)
1518 {
1519     QObject *data;
1520     const char *action_str;
1521 
1522     switch (action) {
1523     case BDRV_ACTION_REPORT:
1524         action_str = "report";
1525         break;
1526     case BDRV_ACTION_IGNORE:
1527         action_str = "ignore";
1528         break;
1529     case BDRV_ACTION_STOP:
1530         action_str = "stop";
1531         break;
1532     default:
1533         abort();
1534     }
1535 
1536     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1537                               bdrv->device_name,
1538                               action_str,
1539                               is_read ? "read" : "write");
1540     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1541 
1542     qobject_decref(data);
1543 }
1544 
1545 static void bdrv_print_dict(QObject *obj, void *opaque)
1546 {
1547     QDict *bs_dict;
1548     Monitor *mon = opaque;
1549 
1550     bs_dict = qobject_to_qdict(obj);
1551 
1552     monitor_printf(mon, "%s: type=%s removable=%d",
1553                         qdict_get_str(bs_dict, "device"),
1554                         qdict_get_str(bs_dict, "type"),
1555                         qdict_get_bool(bs_dict, "removable"));
1556 
1557     if (qdict_get_bool(bs_dict, "removable")) {
1558         monitor_printf(mon, " locked=%d", qdict_get_bool(bs_dict, "locked"));
1559     }
1560 
1561     if (qdict_haskey(bs_dict, "inserted")) {
1562         QDict *qdict = qobject_to_qdict(qdict_get(bs_dict, "inserted"));
1563 
1564         monitor_printf(mon, " file=");
1565         monitor_print_filename(mon, qdict_get_str(qdict, "file"));
1566         if (qdict_haskey(qdict, "backing_file")) {
1567             monitor_printf(mon, " backing_file=");
1568             monitor_print_filename(mon, qdict_get_str(qdict, "backing_file"));
1569         }
1570         monitor_printf(mon, " ro=%d drv=%s encrypted=%d",
1571                             qdict_get_bool(qdict, "ro"),
1572                             qdict_get_str(qdict, "drv"),
1573                             qdict_get_bool(qdict, "encrypted"));
1574     } else {
1575         monitor_printf(mon, " [not inserted]");
1576     }
1577 
1578     monitor_printf(mon, "\n");
1579 }
1580 
1581 void bdrv_info_print(Monitor *mon, const QObject *data)
1582 {
1583     qlist_iter(qobject_to_qlist(data), bdrv_print_dict, mon);
1584 }
1585 
1586 void bdrv_info(Monitor *mon, QObject **ret_data)
1587 {
1588     QList *bs_list;
1589     BlockDriverState *bs;
1590 
1591     bs_list = qlist_new();
1592 
1593     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1594         QObject *bs_obj;
1595         const char *type = "unknown";
1596 
1597         switch(bs->type) {
1598         case BDRV_TYPE_HD:
1599             type = "hd";
1600             break;
1601         case BDRV_TYPE_CDROM:
1602             type = "cdrom";
1603             break;
1604         case BDRV_TYPE_FLOPPY:
1605             type = "floppy";
1606             break;
1607         }
1608 
1609         bs_obj = qobject_from_jsonf("{ 'device': %s, 'type': %s, "
1610                                     "'removable': %i, 'locked': %i }",
1611                                     bs->device_name, type, bs->removable,
1612                                     bs->locked);
1613 
1614         if (bs->drv) {
1615             QObject *obj;
1616             QDict *bs_dict = qobject_to_qdict(bs_obj);
1617 
1618             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
1619                                      "'encrypted': %i }",
1620                                      bs->filename, bs->read_only,
1621                                      bs->drv->format_name,
1622                                      bdrv_is_encrypted(bs));
1623             if (bs->backing_file[0] != '\0') {
1624                 QDict *qdict = qobject_to_qdict(obj);
1625                 qdict_put(qdict, "backing_file",
1626                           qstring_from_str(bs->backing_file));
1627             }
1628 
1629             qdict_put_obj(bs_dict, "inserted", obj);
1630         }
1631         qlist_append_obj(bs_list, bs_obj);
1632     }
1633 
1634     *ret_data = QOBJECT(bs_list);
1635 }
1636 
1637 static void bdrv_stats_iter(QObject *data, void *opaque)
1638 {
1639     QDict *qdict;
1640     Monitor *mon = opaque;
1641 
1642     qdict = qobject_to_qdict(data);
1643     monitor_printf(mon, "%s:", qdict_get_str(qdict, "device"));
1644 
1645     qdict = qobject_to_qdict(qdict_get(qdict, "stats"));
1646     monitor_printf(mon, " rd_bytes=%" PRId64
1647                         " wr_bytes=%" PRId64
1648                         " rd_operations=%" PRId64
1649                         " wr_operations=%" PRId64
1650                         "\n",
1651                         qdict_get_int(qdict, "rd_bytes"),
1652                         qdict_get_int(qdict, "wr_bytes"),
1653                         qdict_get_int(qdict, "rd_operations"),
1654                         qdict_get_int(qdict, "wr_operations"));
1655 }
1656 
1657 void bdrv_stats_print(Monitor *mon, const QObject *data)
1658 {
1659     qlist_iter(qobject_to_qlist(data), bdrv_stats_iter, mon);
1660 }
1661 
1662 static QObject* bdrv_info_stats_bs(BlockDriverState *bs)
1663 {
1664     QObject *res;
1665     QDict *dict;
1666 
1667     res = qobject_from_jsonf("{ 'stats': {"
1668                              "'rd_bytes': %" PRId64 ","
1669                              "'wr_bytes': %" PRId64 ","
1670                              "'rd_operations': %" PRId64 ","
1671                              "'wr_operations': %" PRId64 ","
1672                              "'wr_highest_offset': %" PRId64
1673                              "} }",
1674                              bs->rd_bytes, bs->wr_bytes,
1675                              bs->rd_ops, bs->wr_ops,
1676                              bs->wr_highest_sector *
1677                              (uint64_t)BDRV_SECTOR_SIZE);
1678     dict  = qobject_to_qdict(res);
1679 
1680     if (*bs->device_name) {
1681         qdict_put(dict, "device", qstring_from_str(bs->device_name));
1682     }
1683 
1684     if (bs->file) {
1685         QObject *parent = bdrv_info_stats_bs(bs->file);
1686         qdict_put_obj(dict, "parent", parent);
1687     }
1688 
1689     return res;
1690 }
1691 
1692 void bdrv_info_stats(Monitor *mon, QObject **ret_data)
1693 {
1694     QObject *obj;
1695     QList *devices;
1696     BlockDriverState *bs;
1697 
1698     devices = qlist_new();
1699 
1700     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1701         obj = bdrv_info_stats_bs(bs);
1702         qlist_append_obj(devices, obj);
1703     }
1704 
1705     *ret_data = QOBJECT(devices);
1706 }
1707 
1708 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
1709 {
1710     if (bs->backing_hd && bs->backing_hd->encrypted)
1711         return bs->backing_file;
1712     else if (bs->encrypted)
1713         return bs->filename;
1714     else
1715         return NULL;
1716 }
1717 
1718 void bdrv_get_backing_filename(BlockDriverState *bs,
1719                                char *filename, int filename_size)
1720 {
1721     if (!bs->backing_file) {
1722         pstrcpy(filename, filename_size, "");
1723     } else {
1724         pstrcpy(filename, filename_size, bs->backing_file);
1725     }
1726 }
1727 
1728 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1729                           const uint8_t *buf, int nb_sectors)
1730 {
1731     BlockDriver *drv = bs->drv;
1732     if (!drv)
1733         return -ENOMEDIUM;
1734     if (!drv->bdrv_write_compressed)
1735         return -ENOTSUP;
1736     if (bdrv_check_request(bs, sector_num, nb_sectors))
1737         return -EIO;
1738 
1739     if (bs->dirty_bitmap) {
1740         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1741     }
1742 
1743     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1744 }
1745 
1746 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1747 {
1748     BlockDriver *drv = bs->drv;
1749     if (!drv)
1750         return -ENOMEDIUM;
1751     if (!drv->bdrv_get_info)
1752         return -ENOTSUP;
1753     memset(bdi, 0, sizeof(*bdi));
1754     return drv->bdrv_get_info(bs, bdi);
1755 }
1756 
1757 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1758                       int64_t pos, int size)
1759 {
1760     BlockDriver *drv = bs->drv;
1761     if (!drv)
1762         return -ENOMEDIUM;
1763     if (drv->bdrv_save_vmstate)
1764         return drv->bdrv_save_vmstate(bs, buf, pos, size);
1765     if (bs->file)
1766         return bdrv_save_vmstate(bs->file, buf, pos, size);
1767     return -ENOTSUP;
1768 }
1769 
1770 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1771                       int64_t pos, int size)
1772 {
1773     BlockDriver *drv = bs->drv;
1774     if (!drv)
1775         return -ENOMEDIUM;
1776     if (drv->bdrv_load_vmstate)
1777         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1778     if (bs->file)
1779         return bdrv_load_vmstate(bs->file, buf, pos, size);
1780     return -ENOTSUP;
1781 }
1782 
1783 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
1784 {
1785     BlockDriver *drv = bs->drv;
1786 
1787     if (!drv || !drv->bdrv_debug_event) {
1788         return;
1789     }
1790 
1791     return drv->bdrv_debug_event(bs, event);
1792 
1793 }
1794 
1795 /**************************************************************/
1796 /* handling of snapshots */
1797 
1798 int bdrv_can_snapshot(BlockDriverState *bs)
1799 {
1800     BlockDriver *drv = bs->drv;
1801     if (!drv || bdrv_is_removable(bs) || bdrv_is_read_only(bs)) {
1802         return 0;
1803     }
1804 
1805     if (!drv->bdrv_snapshot_create) {
1806         if (bs->file != NULL) {
1807             return bdrv_can_snapshot(bs->file);
1808         }
1809         return 0;
1810     }
1811 
1812     return 1;
1813 }
1814 
1815 int bdrv_is_snapshot(BlockDriverState *bs)
1816 {
1817     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
1818 }
1819 
1820 BlockDriverState *bdrv_snapshots(void)
1821 {
1822     BlockDriverState *bs;
1823 
1824     if (bs_snapshots) {
1825         return bs_snapshots;
1826     }
1827 
1828     bs = NULL;
1829     while ((bs = bdrv_next(bs))) {
1830         if (bdrv_can_snapshot(bs)) {
1831             bs_snapshots = bs;
1832             return bs;
1833         }
1834     }
1835     return NULL;
1836 }
1837 
1838 int bdrv_snapshot_create(BlockDriverState *bs,
1839                          QEMUSnapshotInfo *sn_info)
1840 {
1841     BlockDriver *drv = bs->drv;
1842     if (!drv)
1843         return -ENOMEDIUM;
1844     if (drv->bdrv_snapshot_create)
1845         return drv->bdrv_snapshot_create(bs, sn_info);
1846     if (bs->file)
1847         return bdrv_snapshot_create(bs->file, sn_info);
1848     return -ENOTSUP;
1849 }
1850 
1851 int bdrv_snapshot_goto(BlockDriverState *bs,
1852                        const char *snapshot_id)
1853 {
1854     BlockDriver *drv = bs->drv;
1855     int ret, open_ret;
1856 
1857     if (!drv)
1858         return -ENOMEDIUM;
1859     if (drv->bdrv_snapshot_goto)
1860         return drv->bdrv_snapshot_goto(bs, snapshot_id);
1861 
1862     if (bs->file) {
1863         drv->bdrv_close(bs);
1864         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
1865         open_ret = drv->bdrv_open(bs, bs->open_flags);
1866         if (open_ret < 0) {
1867             bdrv_delete(bs->file);
1868             bs->drv = NULL;
1869             return open_ret;
1870         }
1871         return ret;
1872     }
1873 
1874     return -ENOTSUP;
1875 }
1876 
1877 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1878 {
1879     BlockDriver *drv = bs->drv;
1880     if (!drv)
1881         return -ENOMEDIUM;
1882     if (drv->bdrv_snapshot_delete)
1883         return drv->bdrv_snapshot_delete(bs, snapshot_id);
1884     if (bs->file)
1885         return bdrv_snapshot_delete(bs->file, snapshot_id);
1886     return -ENOTSUP;
1887 }
1888 
1889 int bdrv_snapshot_list(BlockDriverState *bs,
1890                        QEMUSnapshotInfo **psn_info)
1891 {
1892     BlockDriver *drv = bs->drv;
1893     if (!drv)
1894         return -ENOMEDIUM;
1895     if (drv->bdrv_snapshot_list)
1896         return drv->bdrv_snapshot_list(bs, psn_info);
1897     if (bs->file)
1898         return bdrv_snapshot_list(bs->file, psn_info);
1899     return -ENOTSUP;
1900 }
1901 
#define NB_SUFFIXES 4

/*
 * Format 'size' into 'buf' (at most 'buf_size' bytes) in a compact form
 * and return 'buf'.
 *
 * Values up to 999 are printed as plain integers. Larger values use the
 * 1024-based suffixes K, M, G, T: one decimal digit while the value is
 * below 10 units, a rounded integer while it is below 1000 units; anything
 * beyond the T range is still printed in T.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx = 0;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (;;) {
        if (size < (10 * unit)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit,
                     suffixes[idx]);
            return buf;
        }
        /* the last suffix absorbs every remaining magnitude */
        if (size < (1000 * unit) || idx == (NB_SUFFIXES - 1)) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (unit >> 1)) / unit),
                     suffixes[idx]);
            return buf;
        }
        unit *= 1024;
        idx++;
    }
}
1931 
1932 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
1933 {
1934     char buf1[128], date_buf[128], clock_buf[128];
1935 #ifdef _WIN32
1936     struct tm *ptm;
1937 #else
1938     struct tm tm;
1939 #endif
1940     time_t ti;
1941     int64_t secs;
1942 
1943     if (!sn) {
1944         snprintf(buf, buf_size,
1945                  "%-10s%-20s%7s%20s%15s",
1946                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
1947     } else {
1948         ti = sn->date_sec;
1949 #ifdef _WIN32
1950         ptm = localtime(&ti);
1951         strftime(date_buf, sizeof(date_buf),
1952                  "%Y-%m-%d %H:%M:%S", ptm);
1953 #else
1954         localtime_r(&ti, &tm);
1955         strftime(date_buf, sizeof(date_buf),
1956                  "%Y-%m-%d %H:%M:%S", &tm);
1957 #endif
1958         secs = sn->vm_clock_nsec / 1000000000;
1959         snprintf(clock_buf, sizeof(clock_buf),
1960                  "%02d:%02d:%02d.%03d",
1961                  (int)(secs / 3600),
1962                  (int)((secs / 60) % 60),
1963                  (int)(secs % 60),
1964                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
1965         snprintf(buf, buf_size,
1966                  "%-10s%-20s%7s%20s%15s",
1967                  sn->id_str, sn->name,
1968                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
1969                  date_buf,
1970                  clock_buf);
1971     }
1972     return buf;
1973 }
1974 
1975 
1976 /**************************************************************/
1977 /* async I/Os */
1978 
1979 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1980                                  QEMUIOVector *qiov, int nb_sectors,
1981                                  BlockDriverCompletionFunc *cb, void *opaque)
1982 {
1983     BlockDriver *drv = bs->drv;
1984     BlockDriverAIOCB *ret;
1985 
1986     if (!drv)
1987         return NULL;
1988     if (bdrv_check_request(bs, sector_num, nb_sectors))
1989         return NULL;
1990 
1991     ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
1992                               cb, opaque);
1993 
1994     if (ret) {
1995 	/* Update stats even though technically transfer has not happened. */
1996 	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
1997 	bs->rd_ops ++;
1998     }
1999 
2000     return ret;
2001 }
2002 
2003 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2004                                   QEMUIOVector *qiov, int nb_sectors,
2005                                   BlockDriverCompletionFunc *cb, void *opaque)
2006 {
2007     BlockDriver *drv = bs->drv;
2008     BlockDriverAIOCB *ret;
2009 
2010     if (!drv)
2011         return NULL;
2012     if (bs->read_only)
2013         return NULL;
2014     if (bdrv_check_request(bs, sector_num, nb_sectors))
2015         return NULL;
2016 
2017     if (bs->dirty_bitmap) {
2018         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2019     }
2020 
2021     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
2022                                cb, opaque);
2023 
2024     if (ret) {
2025         /* Update stats even though technically transfer has not happened. */
2026         bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2027         bs->wr_ops ++;
2028         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2029             bs->wr_highest_sector = sector_num + nb_sectors - 1;
2030         }
2031     }
2032 
2033     return ret;
2034 }
2035 
2036 
2037 typedef struct MultiwriteCB {
2038     int error;
2039     int num_requests;
2040     int num_callbacks;
2041     struct {
2042         BlockDriverCompletionFunc *cb;
2043         void *opaque;
2044         QEMUIOVector *free_qiov;
2045         void *free_buf;
2046     } callbacks[];
2047 } MultiwriteCB;
2048 
2049 static void multiwrite_user_cb(MultiwriteCB *mcb)
2050 {
2051     int i;
2052 
2053     for (i = 0; i < mcb->num_callbacks; i++) {
2054         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2055         if (mcb->callbacks[i].free_qiov) {
2056             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2057         }
2058         qemu_free(mcb->callbacks[i].free_qiov);
2059         qemu_vfree(mcb->callbacks[i].free_buf);
2060     }
2061 }
2062 
2063 static void multiwrite_cb(void *opaque, int ret)
2064 {
2065     MultiwriteCB *mcb = opaque;
2066 
2067     if (ret < 0 && !mcb->error) {
2068         mcb->error = ret;
2069     }
2070 
2071     mcb->num_requests--;
2072     if (mcb->num_requests == 0) {
2073         multiwrite_user_cb(mcb);
2074         qemu_free(mcb);
2075     }
2076 }
2077 
2078 static int multiwrite_req_compare(const void *a, const void *b)
2079 {
2080     const BlockRequest *req1 = a, *req2 = b;
2081 
2082     /*
2083      * Note that we can't simply subtract req2->sector from req1->sector
2084      * here as that could overflow the return value.
2085      */
2086     if (req1->sector > req2->sector) {
2087         return 1;
2088     } else if (req1->sector < req2->sector) {
2089         return -1;
2090     } else {
2091         return 0;
2092     }
2093 }
2094 
2095 /*
2096  * Takes a bunch of requests and tries to merge them. Returns the number of
2097  * requests that remain after merging.
2098  */
2099 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2100     int num_reqs, MultiwriteCB *mcb)
2101 {
2102     int i, outidx;
2103 
2104     // Sort requests by start sector
2105     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2106 
2107     // Check if adjacent requests touch the same clusters. If so, combine them,
2108     // filling up gaps with zero sectors.
2109     outidx = 0;
2110     for (i = 1; i < num_reqs; i++) {
2111         int merge = 0;
2112         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2113 
2114         // This handles the cases that are valid for all block drivers, namely
2115         // exactly sequential writes and overlapping writes.
2116         if (reqs[i].sector <= oldreq_last) {
2117             merge = 1;
2118         }
2119 
2120         // The block driver may decide that it makes sense to combine requests
2121         // even if there is a gap of some sectors between them. In this case,
2122         // the gap is filled with zeros (therefore only applicable for yet
2123         // unused space in format like qcow2).
2124         if (!merge && bs->drv->bdrv_merge_requests) {
2125             merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2126         }
2127 
2128         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2129             merge = 0;
2130         }
2131 
2132         if (merge) {
2133             size_t size;
2134             QEMUIOVector *qiov = qemu_mallocz(sizeof(*qiov));
2135             qemu_iovec_init(qiov,
2136                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2137 
2138             // Add the first request to the merged one. If the requests are
2139             // overlapping, drop the last sectors of the first request.
2140             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2141             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2142 
2143             // We might need to add some zeros between the two requests
2144             if (reqs[i].sector > oldreq_last) {
2145                 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2146                 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2147                 memset(buf, 0, zero_bytes);
2148                 qemu_iovec_add(qiov, buf, zero_bytes);
2149                 mcb->callbacks[i].free_buf = buf;
2150             }
2151 
2152             // Add the second request
2153             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2154 
2155             reqs[outidx].nb_sectors = qiov->size >> 9;
2156             reqs[outidx].qiov = qiov;
2157 
2158             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2159         } else {
2160             outidx++;
2161             reqs[outidx].sector     = reqs[i].sector;
2162             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2163             reqs[outidx].qiov       = reqs[i].qiov;
2164         }
2165     }
2166 
2167     return outidx + 1;
2168 }
2169 
2170 /*
2171  * Submit multiple AIO write requests at once.
2172  *
2173  * On success, the function returns 0 and all requests in the reqs array have
2174  * been submitted. In error case this function returns -1, and any of the
2175  * requests may or may not be submitted yet. In particular, this means that the
2176  * callback will be called for some of the requests, for others it won't. The
2177  * caller must check the error field of the BlockRequest to wait for the right
2178  * callbacks (if error != 0, no callback will be called).
2179  *
2180  * The implementation may modify the contents of the reqs array, e.g. to merge
2181  * requests. However, the fields opaque and error are left unmodified as they
2182  * are used to signal failure for a single request to the caller.
2183  */
2184 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2185 {
2186     BlockDriverAIOCB *acb;
2187     MultiwriteCB *mcb;
2188     int i;
2189 
2190     if (num_reqs == 0) {
2191         return 0;
2192     }
2193 
2194     // Create MultiwriteCB structure
2195     mcb = qemu_mallocz(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2196     mcb->num_requests = 0;
2197     mcb->num_callbacks = num_reqs;
2198 
2199     for (i = 0; i < num_reqs; i++) {
2200         mcb->callbacks[i].cb = reqs[i].cb;
2201         mcb->callbacks[i].opaque = reqs[i].opaque;
2202     }
2203 
2204     // Check for mergable requests
2205     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2206 
2207     /*
2208      * Run the aio requests. As soon as one request can't be submitted
2209      * successfully, fail all requests that are not yet submitted (we must
2210      * return failure for all requests anyway)
2211      *
2212      * num_requests cannot be set to the right value immediately: If
2213      * bdrv_aio_writev fails for some request, num_requests would be too high
2214      * and therefore multiwrite_cb() would never recognize the multiwrite
2215      * request as completed. We also cannot use the loop variable i to set it
2216      * when the first request fails because the callback may already have been
2217      * called for previously submitted requests. Thus, num_requests must be
2218      * incremented for each request that is submitted.
2219      *
2220      * The problem that callbacks may be called early also means that we need
2221      * to take care that num_requests doesn't become 0 before all requests are
2222      * submitted - multiwrite_cb() would consider the multiwrite request
2223      * completed. A dummy request that is "completed" by a manual call to
2224      * multiwrite_cb() takes care of this.
2225      */
2226     mcb->num_requests = 1;
2227 
2228     for (i = 0; i < num_reqs; i++) {
2229         mcb->num_requests++;
2230         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2231             reqs[i].nb_sectors, multiwrite_cb, mcb);
2232 
2233         if (acb == NULL) {
2234             // We can only fail the whole thing if no request has been
2235             // submitted yet. Otherwise we'll wait for the submitted AIOs to
2236             // complete and report the error in the callback.
2237             if (i == 0) {
2238                 goto fail;
2239             } else {
2240                 multiwrite_cb(mcb, -EIO);
2241                 break;
2242             }
2243         }
2244     }
2245 
2246     /* Complete the dummy request */
2247     multiwrite_cb(mcb, 0);
2248 
2249     return 0;
2250 
2251 fail:
2252     for (i = 0; i < mcb->num_callbacks; i++) {
2253         reqs[i].error = -EIO;
2254     }
2255     qemu_free(mcb);
2256     return -1;
2257 }
2258 
2259 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2260         BlockDriverCompletionFunc *cb, void *opaque)
2261 {
2262     BlockDriver *drv = bs->drv;
2263 
2264     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2265         return bdrv_aio_noop_em(bs, cb, opaque);
2266     }
2267 
2268     if (!drv)
2269         return NULL;
2270     return drv->bdrv_aio_flush(bs, cb, opaque);
2271 }
2272 
2273 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2274 {
2275     acb->pool->cancel(acb);
2276 }
2277 
2278 
2279 /**************************************************************/
2280 /* async block device emulation */
2281 
/*
 * AIOCB used by the emulation helpers bdrv_aio_rw_vector(),
 * bdrv_aio_flush_em() and bdrv_aio_noop_em(): the work is done up front and
 * only the completion callback is deferred to a bottom half.
 */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;    /* generic part; container_of() maps it back */
    QEMUBH *bh;                 /* bottom half that runs bdrv_aio_bh_cb() */
    int ret;                    /* result passed to the completion callback */
    /* vector translation state */
    QEMUIOVector *qiov;         /* caller's vector; NULL for flush / no-op */
    uint8_t *bounce;            /* linear bounce buffer; NULL for flush / no-op */
    int is_write;               /* 0: copy bounce back into qiov on completion */
} BlockDriverAIOCBSync;
2291 
2292 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2293 {
2294     BlockDriverAIOCBSync *acb =
2295         container_of(blockacb, BlockDriverAIOCBSync, common);
2296     qemu_bh_delete(acb->bh);
2297     acb->bh = NULL;
2298     qemu_aio_release(acb);
2299 }
2300 
/*
 * Pool of BlockDriverAIOCBSync objects used by the emulated AIO helpers;
 * requests taken from it are cancelled through bdrv_aio_cancel_em().
 */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
2305 
2306 static void bdrv_aio_bh_cb(void *opaque)
2307 {
2308     BlockDriverAIOCBSync *acb = opaque;
2309 
2310     if (!acb->is_write)
2311         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2312     qemu_vfree(acb->bounce);
2313     acb->common.cb(acb->common.opaque, acb->ret);
2314     qemu_bh_delete(acb->bh);
2315     acb->bh = NULL;
2316     qemu_aio_release(acb);
2317 }
2318 
2319 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2320                                             int64_t sector_num,
2321                                             QEMUIOVector *qiov,
2322                                             int nb_sectors,
2323                                             BlockDriverCompletionFunc *cb,
2324                                             void *opaque,
2325                                             int is_write)
2326 
2327 {
2328     BlockDriverAIOCBSync *acb;
2329 
2330     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2331     acb->is_write = is_write;
2332     acb->qiov = qiov;
2333     acb->bounce = qemu_blockalign(bs, qiov->size);
2334 
2335     if (!acb->bh)
2336         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2337 
2338     if (is_write) {
2339         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2340         acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2341     } else {
2342         acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2343     }
2344 
2345     qemu_bh_schedule(acb->bh);
2346 
2347     return &acb->common;
2348 }
2349 
2350 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2351         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2352         BlockDriverCompletionFunc *cb, void *opaque)
2353 {
2354     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2355 }
2356 
2357 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2358         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2359         BlockDriverCompletionFunc *cb, void *opaque)
2360 {
2361     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2362 }
2363 
2364 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
2365         BlockDriverCompletionFunc *cb, void *opaque)
2366 {
2367     BlockDriverAIOCBSync *acb;
2368 
2369     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2370     acb->is_write = 1; /* don't bounce in the completion hadler */
2371     acb->qiov = NULL;
2372     acb->bounce = NULL;
2373     acb->ret = 0;
2374 
2375     if (!acb->bh)
2376         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2377 
2378     bdrv_flush(bs);
2379     qemu_bh_schedule(acb->bh);
2380     return &acb->common;
2381 }
2382 
2383 static BlockDriverAIOCB *bdrv_aio_noop_em(BlockDriverState *bs,
2384         BlockDriverCompletionFunc *cb, void *opaque)
2385 {
2386     BlockDriverAIOCBSync *acb;
2387 
2388     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2389     acb->is_write = 1; /* don't bounce in the completion handler */
2390     acb->qiov = NULL;
2391     acb->bounce = NULL;
2392     acb->ret = 0;
2393 
2394     if (!acb->bh) {
2395         acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2396     }
2397 
2398     qemu_bh_schedule(acb->bh);
2399     return &acb->common;
2400 }
2401 
2402 /**************************************************************/
2403 /* sync block device emulation */
2404 
/* Completion callback that stores ret in the int that opaque points to. */
static void bdrv_rw_em_cb(void *opaque, int ret)
{
    int *result_slot = opaque;

    *result_slot = ret;
}
2409 
/*
 * Sentinel kept in async_ret by bdrv_read_em()/bdrv_write_em() until
 * bdrv_rw_em_cb() overwrites it with the request's result.
 */
#define NOT_DONE 0x7fffffff
2411 
2412 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
2413                         uint8_t *buf, int nb_sectors)
2414 {
2415     int async_ret;
2416     BlockDriverAIOCB *acb;
2417     struct iovec iov;
2418     QEMUIOVector qiov;
2419 
2420     async_context_push();
2421 
2422     async_ret = NOT_DONE;
2423     iov.iov_base = (void *)buf;
2424     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2425     qemu_iovec_init_external(&qiov, &iov, 1);
2426     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
2427         bdrv_rw_em_cb, &async_ret);
2428     if (acb == NULL) {
2429         async_ret = -1;
2430         goto fail;
2431     }
2432 
2433     while (async_ret == NOT_DONE) {
2434         qemu_aio_wait();
2435     }
2436 
2437 
2438 fail:
2439     async_context_pop();
2440     return async_ret;
2441 }
2442 
2443 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
2444                          const uint8_t *buf, int nb_sectors)
2445 {
2446     int async_ret;
2447     BlockDriverAIOCB *acb;
2448     struct iovec iov;
2449     QEMUIOVector qiov;
2450 
2451     async_context_push();
2452 
2453     async_ret = NOT_DONE;
2454     iov.iov_base = (void *)buf;
2455     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2456     qemu_iovec_init_external(&qiov, &iov, 1);
2457     acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
2458         bdrv_rw_em_cb, &async_ret);
2459     if (acb == NULL) {
2460         async_ret = -1;
2461         goto fail;
2462     }
2463     while (async_ret == NOT_DONE) {
2464         qemu_aio_wait();
2465     }
2466 
2467 fail:
2468     async_context_pop();
2469     return async_ret;
2470 }
2471 
/*
 * Initialise the block layer by calling module_call_init() for
 * MODULE_INIT_BLOCK (presumably this runs the block drivers' registration
 * hooks - confirm against the module code).
 */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
2476 
/*
 * Same as bdrv_init(), but sets use_bdrv_whitelist to 1 first.
 */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
2482 
2483 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2484                    BlockDriverCompletionFunc *cb, void *opaque)
2485 {
2486     BlockDriverAIOCB *acb;
2487 
2488     if (pool->free_aiocb) {
2489         acb = pool->free_aiocb;
2490         pool->free_aiocb = acb->next;
2491     } else {
2492         acb = qemu_mallocz(pool->aiocb_size);
2493         acb->pool = pool;
2494     }
2495     acb->bs = bs;
2496     acb->cb = cb;
2497     acb->opaque = opaque;
2498     return acb;
2499 }
2500 
2501 void qemu_aio_release(void *p)
2502 {
2503     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2504     AIOPool *pool = acb->pool;
2505     acb->next = pool->free_aiocb;
2506     pool->free_aiocb = acb;
2507 }
2508 
2509 /**************************************************************/
2510 /* removable device support */
2511 
2512 /**
2513  * Return TRUE if the media is present
2514  */
2515 int bdrv_is_inserted(BlockDriverState *bs)
2516 {
2517     BlockDriver *drv = bs->drv;
2518     int ret;
2519     if (!drv)
2520         return 0;
2521     if (!drv->bdrv_is_inserted)
2522         return 1;
2523     ret = drv->bdrv_is_inserted(bs);
2524     return ret;
2525 }
2526 
2527 /**
2528  * Return TRUE if the media changed since the last call to this
2529  * function. It is currently only used for floppy disks
2530  */
2531 int bdrv_media_changed(BlockDriverState *bs)
2532 {
2533     BlockDriver *drv = bs->drv;
2534     int ret;
2535 
2536     if (!drv || !drv->bdrv_media_changed)
2537         ret = -ENOTSUP;
2538     else
2539         ret = drv->bdrv_media_changed(bs);
2540     if (ret == -ENOTSUP)
2541         ret = bs->media_changed;
2542     bs->media_changed = 0;
2543     return ret;
2544 }
2545 
2546 /**
2547  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
2548  */
2549 int bdrv_eject(BlockDriverState *bs, int eject_flag)
2550 {
2551     BlockDriver *drv = bs->drv;
2552     int ret;
2553 
2554     if (bs->locked) {
2555         return -EBUSY;
2556     }
2557 
2558     if (!drv || !drv->bdrv_eject) {
2559         ret = -ENOTSUP;
2560     } else {
2561         ret = drv->bdrv_eject(bs, eject_flag);
2562     }
2563     if (ret == -ENOTSUP) {
2564         if (eject_flag)
2565             bdrv_close(bs);
2566         ret = 0;
2567     }
2568 
2569     return ret;
2570 }
2571 
/* Return the lock state last stored by bdrv_set_locked(). */
int bdrv_is_locked(BlockDriverState *bs)
{
    return bs->locked;
}
2576 
2577 /**
2578  * Lock or unlock the media (if it is locked, the user won't be able
2579  * to eject it manually).
2580  */
2581 void bdrv_set_locked(BlockDriverState *bs, int locked)
2582 {
2583     BlockDriver *drv = bs->drv;
2584 
2585     bs->locked = locked;
2586     if (drv && drv->bdrv_set_locked) {
2587         drv->bdrv_set_locked(bs, locked);
2588     }
2589 }
2590 
2591 /* needed for generic scsi interface */
2592 
2593 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2594 {
2595     BlockDriver *drv = bs->drv;
2596 
2597     if (drv && drv->bdrv_ioctl)
2598         return drv->bdrv_ioctl(bs, req, buf);
2599     return -ENOTSUP;
2600 }
2601 
2602 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2603         unsigned long int req, void *buf,
2604         BlockDriverCompletionFunc *cb, void *opaque)
2605 {
2606     BlockDriver *drv = bs->drv;
2607 
2608     if (drv && drv->bdrv_aio_ioctl)
2609         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
2610     return NULL;
2611 }
2612 
2613 
2614 
2615 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2616 {
2617     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
2618 }
2619 
2620 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
2621 {
2622     int64_t bitmap_size;
2623 
2624     bs->dirty_count = 0;
2625     if (enable) {
2626         if (!bs->dirty_bitmap) {
2627             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
2628                     BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
2629             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
2630 
2631             bs->dirty_bitmap = qemu_mallocz(bitmap_size);
2632         }
2633     } else {
2634         if (bs->dirty_bitmap) {
2635             qemu_free(bs->dirty_bitmap);
2636             bs->dirty_bitmap = NULL;
2637         }
2638     }
2639 }
2640 
2641 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
2642 {
2643     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
2644 
2645     if (bs->dirty_bitmap &&
2646         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
2647         return bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
2648             (1 << (chunk % (sizeof(unsigned long) * 8)));
2649     } else {
2650         return 0;
2651     }
2652 }
2653 
/*
 * Clear the dirty state of nr_sectors sectors starting at cur_sector by
 * calling set_dirty_bitmap() with a dirty value of 0.
 */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
2659 
/* Return the current value of bs->dirty_count. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
2664