/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif
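
/* Illustrative sketch (not part of the original file): is_windows_drive()
 * accepts bare drive names and Win32 device namespace paths, e.g.
 *
 *   is_windows_drive("d:")                    -> 1
 *   is_windows_drive("\\\\.\\PhysicalDrive0") -> 1
 *   is_windows_drive("d:\\image.qcow2")       -> 0  (drive prefix, not a drive)
 */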

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end   = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are kept in FIFO order: the next throttled request is not
     * dequeued until the current request has been allowed to proceed.  If the
     * current request still exceeds the limits, it is re-inserted at the head
     * of the queue, so all requests behind it stay in throttled_reqs.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
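
/* Illustrative sketch (not part of the original file): a colon appearing
 * before any path separator marks a protocol prefix, e.g.
 *
 *   path_has_protocol("nbd:localhost:10809") -> 1
 *   path_has_protocol("/images/disk.qcow2")  -> 0
 *   path_has_protocol("c:\\disk.img")        -> 0  (Windows drive, not a protocol)
 */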

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
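
/* Illustrative sketch (not part of the original file):
 *
 *   char dest[PATH_MAX];
 *   path_combine(dest, sizeof(dest), "/images/base.qcow2", "snap.qcow2");
 *   // dest == "/images/snap.qcow2"
 *   path_combine(dest, sizeof(dest), "nbd:host:1234", "/abs/file.raw");
 *   // dest == "/abs/file.raw"  (an absolute filename wins)
 */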

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        /* free the duplicated filename before bailing out */
        g_free(cco.filename);
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}
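
/* Illustrative sketch (not part of the original file): creating a 1 GiB
 * qcow2 image through this interface might look like
 *
 *   BlockDriver *drv = bdrv_find_format("qcow2");
 *   QEMUOptionParameter *opts =
 *       parse_option_parameters("", drv->create_options, NULL);
 *   set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024 * 1024);
 *   int ret = bdrv_create(drv, "test.qcow2", opts);
 *   free_option_parameters(opts);
 */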

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0 || close(fd)) {
        return -errno;
    }
    return 0;
#endif
}
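
/* Illustrative sketch (not part of the original file):
 *
 *   char tmp[PATH_MAX];
 *   if (get_tmp_filename(tmp, sizeof(tmp)) == 0) {
 *       // tmp now names an empty file, e.g. "/tmp/vl.Ab12Cd" on POSIX
 *   }
 */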

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
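
/* Illustrative sketch (not part of the original file):
 *
 *   bdrv_find_protocol("nbd:localhost:10809") -> driver whose protocol_name
 *                                                is "nbd"
 *   bdrv_find_protocol("/images/disk.qcow2")  -> the "file" driver
 *   bdrv_find_protocol("/dev/cdrom")          -> a host device driver, if one
 *                                                probes successfully
 */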

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
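
/* Illustrative sketch (not part of the original file): mapping of cache
 * modes to open flags, starting from flags = 0:
 *
 *   "none"/"off"   -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   "directsync"   -> BDRV_O_NOCACHE
 *   "writeback"    -> BDRV_O_CACHE_WB
 *   "unsafe"       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   "writethrough" -> 0 (the default)
 */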

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until every user that enabled it has disabled
 * it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices; for example, a coroutine
 * can be arbitrarily complex, and a constant flow of I/O can continue until
 * the coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, clear device_name so the state cannot be removed from the list twice. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

/*
 * Add new bs contents at the top of an image chain while the chain is live,
 * keeping the required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */
    tmp.open_flags        = bs_top->open_flags;

    /* dev info */
    tmp.dev_ops           = bs_top->dev_ops;
    tmp.dev_opaque        = bs_top->dev_opaque;
    tmp.dev               = bs_top->dev;
    tmp.buffer_alignment  = bs_top->buffer_alignment;
    tmp.copy_on_read      = bs_top->copy_on_read;

    tmp.enable_write_cache = bs_top->enable_write_cache;

    /* i/o timing parameters */
    tmp.slice_time        = bs_top->slice_time;
    tmp.slice_start       = bs_top->slice_start;
    tmp.slice_end         = bs_top->slice_end;
    tmp.io_limits         = bs_top->io_limits;
    tmp.io_base           = bs_top->io_base;
    tmp.throttled_reqs    = bs_top->throttled_reqs;
    tmp.block_timer       = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls              = bs_top->cyls;
    tmp.heads             = bs_top->heads;
    tmp.secs              = bs_top->secs;
    tmp.translation       = bs_top->translation;

    /* r/w error */
    tmp.on_read_error     = bs_top->on_read_error;
    tmp.on_write_error    = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled  = bs_top->iostatus_enabled;
    tmp.iostatus          = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top.  bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base,   0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer       = NULL;
    bs_new->slice_time        = 0;
    bs_new->slice_start       = 0;
    bs_new->slice_end         = 0;

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_top);
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (this does not mean that the
 * image is free of errors) or -errno when an internal error occurred. The
 * results of the check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the backing image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* use pstrcpy rather than strncpy to guarantee NUL termination */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags =  bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
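
/* Illustrative sketch (not part of the original file): with a 64 KiB cluster
 * size (128 sectors of 512 bytes), a request for sectors [100, 160) rounds
 * out to the cluster-aligned range [0, 256):
 *
 *   round_to_clusters(bs, 100, 60, &num, &nb);
 *   // num == 0, nb == 256
 */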

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
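
/* Illustrative sketch (not part of the original file): two sector ranges
 * overlap unless one ends at or before the start of the other, e.g. a req
 * covering [8, 16) overlaps [0, 9) and [15, 20), but not [0, 8) or [16, 24).
 */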

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In a synchronous call context the throttling timer cannot fire while
     * the vCPU is blocked, so I/O throttling has to be disabled here if it
     * has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
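
/* Illustrative sketch (not part of the original file): synchronous
 * sector-granularity I/O on an opened BlockDriverState:
 *
 *   uint8_t buf[BDRV_SECTOR_SIZE];
 *   if (bdrv_read(bs, 0, buf, 1) < 0) {
 *       // handle error
 *   }
 */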

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
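
/* Illustrative sketch (not part of the original file): if
 * BDRV_SECTORS_PER_DIRTY_CHUNK were 2048, marking sectors [4096, 8192) dirty
 * would touch chunks 4096/2048 == 2 through (8192 - 1)/2048 == 3, i.e. set
 * bits 2 and 3 of dirty_bitmap[0] on a 64-bit host.
 */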

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write to a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
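
/* Illustrative sketch (not part of the original file): a 10-byte read at
 * offset 510 spans a sector boundary, so bdrv_pread() issues one aligned
 * read of sector 0 for the 2 bytes at [510, 512) and one of sector 1 for
 * the 8 bytes at [512, 520), assembling the result in the caller's buffer.
 */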
1698 
1699 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1700                 const void *buf, int count1)
1701 {
1702     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1703     int len, nb_sectors, count;
1704     int64_t sector_num;
1705     int ret;
1706 
1707     count = count1;
1708     /* first write to align to sector start */
1709     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1710     if (len > count)
1711         len = count;
1712     sector_num = offset >> BDRV_SECTOR_BITS;
1713     if (len > 0) {
1714         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1715             return ret;
1716         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1717         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1718             return ret;
1719         count -= len;
1720         if (count == 0)
1721             return count1;
1722         sector_num++;
1723         buf += len;
1724     }
1725 
1726     /* write the sectors "in place" */
1727     nb_sectors = count >> BDRV_SECTOR_BITS;
1728     if (nb_sectors > 0) {
1729         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1730             return ret;
1731         sector_num += nb_sectors;
1732         len = nb_sectors << BDRV_SECTOR_BITS;
1733         buf += len;
1734         count -= len;
1735     }
1736 
1737     /* add data from the last sector */
1738     if (count > 0) {
1739         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1740             return ret;
1741         memcpy(tmp_buf, buf, count);
1742         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1743             return ret;
1744     }
1745     return count1;
1746 }
1747 
1748 /*
1749  * Writes to the file and ensures that no writes are reordered across this
1750  * request (acts as a barrier)
1751  *
1752  * Returns 0 on success, -errno in error cases.
1753  */
1754 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1755     const void *buf, int count)
1756 {
1757     int ret;
1758 
1759     ret = bdrv_pwrite(bs, offset, buf, count);
1760     if (ret < 0) {
1761         return ret;
1762     }
1763 
1764     /* No flush needed for cache modes that already do it */
1765     if (bs->enable_write_cache) {
1766         bdrv_flush(bs);
1767     }
1768 
1769     return 0;
1770 }
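/* A minimal caller sketch (hypothetical, for illustration only): callers
 * such as format drivers use bdrv_pwrite_sync() for metadata that must be
 * durable before dependent writes are issued, e.g.:
 *
 *     uint8_t header[BDRV_SECTOR_SIZE];
 *     // ... fill in the header bytes ...
 *     ret = bdrv_pwrite_sync(bs, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // header update did not reach stable storage
 *     }
 */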
1771 
1772 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1773         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1774 {
1775     /* Perform I/O through a temporary buffer so that users who scribble over
1776      * their read buffer while the operation is in progress do not end up
1777      * modifying the image file.  This is critical for zero-copy guest I/O
1778      * where anything might happen inside guest memory.
1779      */
1780     void *bounce_buffer;
1781 
1782     BlockDriver *drv = bs->drv;
1783     struct iovec iov;
1784     QEMUIOVector bounce_qiov;
1785     int64_t cluster_sector_num;
1786     int cluster_nb_sectors;
1787     size_t skip_bytes;
1788     int ret;
1789 
1790     /* Cover entire cluster so no additional backing file I/O is required when
1791      * allocating a cluster in the image file.
1792      */
1793     round_to_clusters(bs, sector_num, nb_sectors,
1794                       &cluster_sector_num, &cluster_nb_sectors);
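    /* Illustrative example (assumed values, not from this code): with
     * 64 KiB clusters (128 sectors), a request for sectors [130, 134)
     * is widened to [128, 256) so the whole cluster is populated by a
     * single copy-on-read operation.
     */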
1795 
1796     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1797                                    cluster_sector_num, cluster_nb_sectors);
1798 
1799     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1800     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1801     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1802 
1803     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1804                              &bounce_qiov);
1805     if (ret < 0) {
1806         goto err;
1807     }
1808 
1809     if (drv->bdrv_co_write_zeroes &&
1810         buffer_is_zero(bounce_buffer, iov.iov_len)) {
1811         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1812                                       cluster_nb_sectors);
1813     } else {
1814         /* This does not change the data on the disk, so it is not necessary
1815          * to flush even in cache=writethrough mode.
1816          */
1817         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1818                                   &bounce_qiov);
1819     }
1820 
1821     if (ret < 0) {
1822         /* It might be okay to ignore write errors for guest requests.  If this
1823          * is a deliberate copy-on-read then we don't want to ignore the error.
1824          * Simply report it in all cases.
1825          */
1826         goto err;
1827     }
1828 
1829     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1830     qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1831                            nb_sectors * BDRV_SECTOR_SIZE);
1832 
1833 err:
1834     qemu_vfree(bounce_buffer);
1835     return ret;
1836 }
1837 
1838 /*
1839  * Handle a read request in coroutine context
1840  */
1841 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1842     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1843     BdrvRequestFlags flags)
1844 {
1845     BlockDriver *drv = bs->drv;
1846     BdrvTrackedRequest req;
1847     int ret;
1848 
1849     if (!drv) {
1850         return -ENOMEDIUM;
1851     }
1852     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1853         return -EIO;
1854     }
1855 
1856     /* throttling disk read I/O */
1857     if (bs->io_limits_enabled) {
1858         bdrv_io_limits_intercept(bs, false, nb_sectors);
1859     }
1860 
1861     if (bs->copy_on_read) {
1862         flags |= BDRV_REQ_COPY_ON_READ;
1863     }
1864     if (flags & BDRV_REQ_COPY_ON_READ) {
1865         bs->copy_on_read_in_flight++;
1866     }
1867 
1868     if (bs->copy_on_read_in_flight) {
1869         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1870     }
1871 
1872     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1873 
1874     if (flags & BDRV_REQ_COPY_ON_READ) {
1875         int pnum;
1876 
1877         ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1878         if (ret < 0) {
1879             goto out;
1880         }
1881 
1882         if (!ret || pnum != nb_sectors) {
1883             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1884             goto out;
1885         }
1886     }
1887 
1888     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1889 
1890 out:
1891     tracked_request_end(&req);
1892 
1893     if (flags & BDRV_REQ_COPY_ON_READ) {
1894         bs->copy_on_read_in_flight--;
1895     }
1896 
1897     return ret;
1898 }
1899 
1900 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1901     int nb_sectors, QEMUIOVector *qiov)
1902 {
1903     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1904 
1905     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1906 }
1907 
1908 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1909     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1910 {
1911     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1912 
1913     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1914                             BDRV_REQ_COPY_ON_READ);
1915 }
1916 
1917 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1918     int64_t sector_num, int nb_sectors)
1919 {
1920     BlockDriver *drv = bs->drv;
1921     QEMUIOVector qiov;
1922     struct iovec iov;
1923     int ret;
1924 
1925     /* TODO Emulate only part of misaligned requests instead of letting block
1926      * drivers return -ENOTSUP and emulate everything */
1927 
1928     /* First try the efficient write zeroes operation */
1929     if (drv->bdrv_co_write_zeroes) {
1930         ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1931         if (ret != -ENOTSUP) {
1932             return ret;
1933         }
1934     }
1935 
1936     /* Fall back to bounce buffer if write zeroes is unsupported */
1937     iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1938     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1939     memset(iov.iov_base, 0, iov.iov_len);
1940     qemu_iovec_init_external(&qiov, &iov, 1);
1941 
1942     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1943 
1944     qemu_vfree(iov.iov_base);
1945     return ret;
1946 }
1947 
1948 /*
1949  * Handle a write request in coroutine context
1950  */
1951 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1952     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1953     BdrvRequestFlags flags)
1954 {
1955     BlockDriver *drv = bs->drv;
1956     BdrvTrackedRequest req;
1957     int ret;
1958 
1959     if (!bs->drv) {
1960         return -ENOMEDIUM;
1961     }
1962     if (bs->read_only) {
1963         return -EACCES;
1964     }
1965     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1966         return -EIO;
1967     }
1968 
1969     /* throttling disk write I/O */
1970     if (bs->io_limits_enabled) {
1971         bdrv_io_limits_intercept(bs, true, nb_sectors);
1972     }
1973 
1974     if (bs->copy_on_read_in_flight) {
1975         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1976     }
1977 
1978     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1979 
1980     if (flags & BDRV_REQ_ZERO_WRITE) {
1981         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1982     } else {
1983         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1984     }
1985 
1986     if (ret == 0 && !bs->enable_write_cache) {
1987         ret = bdrv_co_flush(bs);
1988     }
1989 
1990     if (bs->dirty_bitmap) {
1991         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1992     }
1993 
1994     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1995         bs->wr_highest_sector = sector_num + nb_sectors - 1;
1996     }
1997 
1998     tracked_request_end(&req);
1999 
2000     return ret;
2001 }
2002 
2003 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2004     int nb_sectors, QEMUIOVector *qiov)
2005 {
2006     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2007 
2008     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2009 }
2010 
2011 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2012                                       int64_t sector_num, int nb_sectors)
2013 {
2014     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2015 
2016     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2017                              BDRV_REQ_ZERO_WRITE);
2018 }
2019 
2020 /**
2021  * Truncate file to 'offset' bytes (needed only for file protocols)
2022  */
2023 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2024 {
2025     BlockDriver *drv = bs->drv;
2026     int ret;
2027     if (!drv)
2028         return -ENOMEDIUM;
2029     if (!drv->bdrv_truncate)
2030         return -ENOTSUP;
2031     if (bs->read_only)
2032         return -EACCES;
2033     if (bdrv_in_use(bs))
2034         return -EBUSY;
2035     ret = drv->bdrv_truncate(bs, offset);
2036     if (ret == 0) {
2037         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2038         bdrv_dev_resize_cb(bs);
2039     }
2040     return ret;
2041 }
2042 
2043 /**
2044  * Length of an allocated file in bytes. Sparse files are counted by actual
2045  * allocated space. Return < 0 on error or if unknown.
2046  */
2047 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2048 {
2049     BlockDriver *drv = bs->drv;
2050     if (!drv) {
2051         return -ENOMEDIUM;
2052     }
2053     if (drv->bdrv_get_allocated_file_size) {
2054         return drv->bdrv_get_allocated_file_size(bs);
2055     }
2056     if (bs->file) {
2057         return bdrv_get_allocated_file_size(bs->file);
2058     }
2059     return -ENOTSUP;
2060 }
2061 
2062 /**
2063  * Length of a file in bytes. Return < 0 if error or unknown.
2064  */
2065 int64_t bdrv_getlength(BlockDriverState *bs)
2066 {
2067     BlockDriver *drv = bs->drv;
2068     if (!drv)
2069         return -ENOMEDIUM;
2070 
2071     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2072         if (drv->bdrv_getlength) {
2073             return drv->bdrv_getlength(bs);
2074         }
2075     }
2076     return bs->total_sectors * BDRV_SECTOR_SIZE;
2077 }
2078 
2079 /* return 0 as the number of sectors if no device is present or on error */
2080 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2081 {
2082     int64_t length;
2083     length = bdrv_getlength(bs);
2084     if (length < 0)
2085         length = 0;
2086     else
2087         length = length >> BDRV_SECTOR_BITS;
2088     *nb_sectors_ptr = length;
2089 }
2090 
2091 struct partition {
2092         uint8_t boot_ind;           /* 0x80 - active */
2093         uint8_t head;               /* starting head */
2094         uint8_t sector;             /* starting sector */
2095         uint8_t cyl;                /* starting cylinder */
2096         uint8_t sys_ind;            /* What partition type */
2097         uint8_t end_head;           /* end head */
2098         uint8_t end_sector;         /* end sector */
2099         uint8_t end_cyl;            /* end cylinder */
2100         uint32_t start_sect;        /* starting sector counting from 0 */
2101         uint32_t nr_sects;          /* nr of sectors in partition */
2102 } QEMU_PACKED;
2103 
2104 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if it could not be guessed */
2105 static int guess_disk_lchs(BlockDriverState *bs,
2106                            int *pcylinders, int *pheads, int *psectors)
2107 {
2108     uint8_t buf[BDRV_SECTOR_SIZE];
2109     int ret, i, heads, sectors, cylinders;
2110     struct partition *p;
2111     uint32_t nr_sects;
2112     uint64_t nb_sectors;
2113     bool enabled;
2114 
2115     bdrv_get_geometry(bs, &nb_sectors);
2116 
2117     /**
2118      * This function is invoked during startup in both sync and async I/O
2119      * modes, so the I/O throttling has to be disabled temporarily here
2120      * rather than permanently.
2121      */
2122     enabled = bs->io_limits_enabled;
2123     bs->io_limits_enabled = false;
2124     ret = bdrv_read(bs, 0, buf, 1);
2125     bs->io_limits_enabled = enabled;
2126     if (ret < 0)
2127         return -1;
2128     /* test msdos magic */
2129     if (buf[510] != 0x55 || buf[511] != 0xaa)
2130         return -1;
2131     for(i = 0; i < 4; i++) {
2132         p = ((struct partition *)(buf + 0x1be)) + i;
2133         nr_sects = le32_to_cpu(p->nr_sects);
2134         if (nr_sects && p->end_head) {
2135             /* We make the assumption that the partition terminates on
2136                a cylinder boundary */
2137             heads = p->end_head + 1;
2138             sectors = p->end_sector & 63;
2139             if (sectors == 0)
2140                 continue;
2141             cylinders = nb_sectors / (heads * sectors);
2142             if (cylinders < 1 || cylinders > 16383)
2143                 continue;
2144             *pheads = heads;
2145             *psectors = sectors;
2146             *pcylinders = cylinders;
2147 #if 0
2148             printf("guessed geometry: LCHS=%d %d %d\n",
2149                    cylinders, heads, sectors);
2150 #endif
2151             return 0;
2152         }
2153     }
2154     return -1;
2155 }
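/* Worked example (illustrative partition values): an entry with
 * end_head = 15 and end_sector = 63 yields heads = 16 and sectors = 63;
 * for a 1 GiB image (2097152 sectors) this gives
 * cylinders = 2097152 / (16 * 63) = 2080, which falls inside the accepted
 * [1, 16383] range, so the guess succeeds.
 */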
2156 
2157 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2158 {
2159     int translation, lba_detected = 0;
2160     int cylinders, heads, secs;
2161     uint64_t nb_sectors;
2162 
2163     /* if a geometry hint is available, use it */
2164     bdrv_get_geometry(bs, &nb_sectors);
2165     bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2166     translation = bdrv_get_translation_hint(bs);
2167     if (cylinders != 0) {
2168         *pcyls = cylinders;
2169         *pheads = heads;
2170         *psecs = secs;
2171     } else {
2172         if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2173             if (heads > 16) {
2174                 /* if heads > 16, it means that a BIOS LBA
2175                    translation was active, so the default
2176                    hardware geometry is OK */
2177                 lba_detected = 1;
2178                 goto default_geometry;
2179             } else {
2180                 *pcyls = cylinders;
2181                 *pheads = heads;
2182                 *psecs = secs;
2183                 /* disable any translation to be in sync with
2184                    the logical geometry */
2185                 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2186                     bdrv_set_translation_hint(bs,
2187                                               BIOS_ATA_TRANSLATION_NONE);
2188                 }
2189             }
2190         } else {
2191         default_geometry:
2192             /* if no geometry, use a standard physical disk geometry */
2193             cylinders = nb_sectors / (16 * 63);
2194 
2195             if (cylinders > 16383)
2196                 cylinders = 16383;
2197             else if (cylinders < 2)
2198                 cylinders = 2;
2199             *pcyls = cylinders;
2200             *pheads = 16;
2201             *psecs = 63;
2202             if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2203                 if ((*pcyls * *pheads) <= 131072) {
2204                     bdrv_set_translation_hint(bs,
2205                                               BIOS_ATA_TRANSLATION_LARGE);
2206                 } else {
2207                     bdrv_set_translation_hint(bs,
2208                                               BIOS_ATA_TRANSLATION_LBA);
2209                 }
2210             }
2211         }
2212         bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2213     }
2214 }
2215 
2216 void bdrv_set_geometry_hint(BlockDriverState *bs,
2217                             int cyls, int heads, int secs)
2218 {
2219     bs->cyls = cyls;
2220     bs->heads = heads;
2221     bs->secs = secs;
2222 }
2223 
2224 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2225 {
2226     bs->translation = translation;
2227 }
2228 
2229 void bdrv_get_geometry_hint(BlockDriverState *bs,
2230                             int *pcyls, int *pheads, int *psecs)
2231 {
2232     *pcyls = bs->cyls;
2233     *pheads = bs->heads;
2234     *psecs = bs->secs;
2235 }
2236 
2237 /* throttling disk io limits */
2238 void bdrv_set_io_limits(BlockDriverState *bs,
2239                         BlockIOLimit *io_limits)
2240 {
2241     bs->io_limits = *io_limits;
2242     bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2243 }
2244 
2245 /* Recognize floppy formats */
2246 typedef struct FDFormat {
2247     FDriveType drive;
2248     uint8_t last_sect;
2249     uint8_t max_track;
2250     uint8_t max_head;
2251     FDriveRate rate;
2252 } FDFormat;
2253 
2254 static const FDFormat fd_formats[] = {
2255     /* First entry is default format */
2256     /* 1.44 MB 3"1/2 floppy disks */
2257     { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2258     { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2259     { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2260     { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2261     { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2262     { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2263     { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2264     { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2265     /* 2.88 MB 3"1/2 floppy disks */
2266     { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2267     { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2268     { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2269     { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2270     { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2271     /* 720 kB 3"1/2 floppy disks */
2272     { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
2273     { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2274     { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2275     { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2276     { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2277     { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2278     /* 1.2 MB 5"1/4 floppy disks */
2279     { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2280     { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2281     { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2282     { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2283     { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2284     /* 720 kB 5"1/4 floppy disks */
2285     { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
2286     { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2287     /* 360 kB 5"1/4 floppy disks */
2288     { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
2289     { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
2290     { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2291     { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2292     /* 320 kB 5"1/4 floppy disks */
2293     { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
2294     { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
2295     /* 360 kB must match 5"1/4 better than 3"1/2... */
2296     { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
2297     /* end */
2298     { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2299 };
2300 
2301 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2302                                    int *max_track, int *last_sect,
2303                                    FDriveType drive_in, FDriveType *drive,
2304                                    FDriveRate *rate)
2305 {
2306     const FDFormat *parse;
2307     uint64_t nb_sectors, size;
2308     int i, first_match, match;
2309 
2310     bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2311     if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2312         /* User defined disk */
2313         *rate = FDRIVE_RATE_500K;
2314     } else {
2315         bdrv_get_geometry(bs, &nb_sectors);
2316         match = -1;
2317         first_match = -1;
2318         for (i = 0; ; i++) {
2319             parse = &fd_formats[i];
2320             if (parse->drive == FDRIVE_DRV_NONE) {
2321                 break;
2322             }
2323             if (drive_in == parse->drive ||
2324                 drive_in == FDRIVE_DRV_NONE) {
2325                 size = (parse->max_head + 1) * parse->max_track *
2326                     parse->last_sect;
2327                 if (nb_sectors == size) {
2328                     match = i;
2329                     break;
2330                 }
2331                 if (first_match == -1) {
2332                     first_match = i;
2333                 }
2334             }
2335         }
2336         if (match == -1) {
2337             if (first_match == -1) {
2338                 match = 1;
2339             } else {
2340                 match = first_match;
2341             }
2342             parse = &fd_formats[match];
2343         }
2344         *nb_heads = parse->max_head + 1;
2345         *max_track = parse->max_track;
2346         *last_sect = parse->last_sect;
2347         *drive = parse->drive;
2348         *rate = parse->rate;
2349     }
2350 }
2351 
2352 int bdrv_get_translation_hint(BlockDriverState *bs)
2353 {
2354     return bs->translation;
2355 }
2356 
2357 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2358                        BlockErrorAction on_write_error)
2359 {
2360     bs->on_read_error = on_read_error;
2361     bs->on_write_error = on_write_error;
2362 }
2363 
2364 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2365 {
2366     return is_read ? bs->on_read_error : bs->on_write_error;
2367 }
2368 
2369 int bdrv_is_read_only(BlockDriverState *bs)
2370 {
2371     return bs->read_only;
2372 }
2373 
2374 int bdrv_is_sg(BlockDriverState *bs)
2375 {
2376     return bs->sg;
2377 }
2378 
2379 int bdrv_enable_write_cache(BlockDriverState *bs)
2380 {
2381     return bs->enable_write_cache;
2382 }
2383 
2384 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
2385 {
2386     bs->enable_write_cache = wce;
2387 }
2388 
2389 int bdrv_is_encrypted(BlockDriverState *bs)
2390 {
2391     if (bs->backing_hd && bs->backing_hd->encrypted)
2392         return 1;
2393     return bs->encrypted;
2394 }
2395 
2396 int bdrv_key_required(BlockDriverState *bs)
2397 {
2398     BlockDriverState *backing_hd = bs->backing_hd;
2399 
2400     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2401         return 1;
2402     return (bs->encrypted && !bs->valid_key);
2403 }
2404 
2405 int bdrv_set_key(BlockDriverState *bs, const char *key)
2406 {
2407     int ret;
2408     if (bs->backing_hd && bs->backing_hd->encrypted) {
2409         ret = bdrv_set_key(bs->backing_hd, key);
2410         if (ret < 0)
2411             return ret;
2412         if (!bs->encrypted)
2413             return 0;
2414     }
2415     if (!bs->encrypted) {
2416         return -EINVAL;
2417     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2418         return -ENOMEDIUM;
2419     }
2420     ret = bs->drv->bdrv_set_key(bs, key);
2421     if (ret < 0) {
2422         bs->valid_key = 0;
2423     } else if (!bs->valid_key) {
2424         bs->valid_key = 1;
2425         /* call the change callback now, we skipped it on open */
2426         bdrv_dev_change_media_cb(bs, true);
2427     }
2428     return ret;
2429 }
2430 
2431 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2432 {
2433     if (!bs->drv) {
2434         buf[0] = '\0';
2435     } else {
2436         pstrcpy(buf, buf_size, bs->drv->format_name);
2437     }
2438 }
2439 
2440 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2441                          void *opaque)
2442 {
2443     BlockDriver *drv;
2444 
2445     QLIST_FOREACH(drv, &bdrv_drivers, list) {
2446         it(opaque, drv->format_name);
2447     }
2448 }
2449 
2450 BlockDriverState *bdrv_find(const char *name)
2451 {
2452     BlockDriverState *bs;
2453 
2454     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2455         if (!strcmp(name, bs->device_name)) {
2456             return bs;
2457         }
2458     }
2459     return NULL;
2460 }
2461 
2462 BlockDriverState *bdrv_next(BlockDriverState *bs)
2463 {
2464     if (!bs) {
2465         return QTAILQ_FIRST(&bdrv_states);
2466     }
2467     return QTAILQ_NEXT(bs, list);
2468 }
2469 
2470 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2471 {
2472     BlockDriverState *bs;
2473 
2474     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2475         it(opaque, bs);
2476     }
2477 }
2478 
2479 const char *bdrv_get_device_name(BlockDriverState *bs)
2480 {
2481     return bs->device_name;
2482 }
2483 
2484 int bdrv_get_flags(BlockDriverState *bs)
2485 {
2486     return bs->open_flags;
2487 }
2488 
2489 void bdrv_flush_all(void)
2490 {
2491     BlockDriverState *bs;
2492 
2493     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2494         bdrv_flush(bs);
2495     }
2496 }
2497 
2498 int bdrv_has_zero_init(BlockDriverState *bs)
2499 {
2500     assert(bs->drv);
2501 
2502     if (bs->drv->bdrv_has_zero_init) {
2503         return bs->drv->bdrv_has_zero_init(bs);
2504     }
2505 
2506     return 1;
2507 }
2508 
2509 typedef struct BdrvCoIsAllocatedData {
2510     BlockDriverState *bs;
2511     int64_t sector_num;
2512     int nb_sectors;
2513     int *pnum;
2514     int ret;
2515     bool done;
2516 } BdrvCoIsAllocatedData;
2517 
2518 /*
2519  * Returns true iff the specified sector is present in the disk image. Drivers
2520  * not implementing the functionality are assumed to not support backing files,
2521  * hence all their sectors are reported as allocated.
2522  *
2523  * If 'sector_num' is beyond the end of the disk image the return value is 0
2524  * and 'pnum' is set to 0.
2525  *
2526  * 'pnum' is set to the number of sectors (including and immediately following
2527  * the specified sector) that are known to be in the same
2528  * allocated/unallocated state.
2529  *
2530  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2531  * beyond the end of the disk image it will be clamped.
2532  */
2533 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2534                                       int nb_sectors, int *pnum)
2535 {
2536     int64_t n;
2537 
2538     if (sector_num >= bs->total_sectors) {
2539         *pnum = 0;
2540         return 0;
2541     }
2542 
2543     n = bs->total_sectors - sector_num;
2544     if (n < nb_sectors) {
2545         nb_sectors = n;
2546     }
2547 
2548     if (!bs->drv->bdrv_co_is_allocated) {
2549         *pnum = nb_sectors;
2550         return 1;
2551     }
2552 
2553     return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2554 }
2555 
2556 /* Coroutine wrapper for bdrv_is_allocated() */
2557 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2558 {
2559     BdrvCoIsAllocatedData *data = opaque;
2560     BlockDriverState *bs = data->bs;
2561 
2562     data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2563                                      data->pnum);
2564     data->done = true;
2565 }
2566 
2567 /*
2568  * Synchronous wrapper around bdrv_co_is_allocated().
2569  *
2570  * See bdrv_co_is_allocated() for details.
2571  */
2572 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2573                       int *pnum)
2574 {
2575     Coroutine *co;
2576     BdrvCoIsAllocatedData data = {
2577         .bs = bs,
2578         .sector_num = sector_num,
2579         .nb_sectors = nb_sectors,
2580         .pnum = pnum,
2581         .done = false,
2582     };
2583 
2584     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2585     qemu_coroutine_enter(co, &data);
2586     while (!data.done) {
2587         qemu_aio_wait();
2588     }
2589     return data.ret;
2590 }
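/* A usage sketch (hypothetical caller; 'total_sectors' is an assumed
 * variable): walking an image extent by extent with the synchronous
 * wrapper:
 *
 *     int64_t sector = 0;
 *     while (sector < total_sectors) {
 *         int num;
 *         int ret = bdrv_is_allocated(bs, sector,
 *                                     MIN(total_sectors - sector, INT_MAX),
 *                                     &num);
 *         if (ret < 0) {
 *             break;
 *         }
 *         // [sector, sector + num) is allocated iff ret is non-zero
 *         sector += num;
 *     }
 */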
2591 
2592 /*
2593  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2594  *
2595  * Return true if the given sector is allocated in any image between
2596  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
2597  * sector is allocated in any image of the chain.  Return false otherwise.
2598  *
2599  * 'pnum' is set to the number of sectors (including and immediately following
2600  *  the specified sector) that are known to be in the same
2601  *  allocated/unallocated state.
2602  *
2603  */
2604 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2605                                             BlockDriverState *base,
2606                                             int64_t sector_num,
2607                                             int nb_sectors, int *pnum)
2608 {
2609     BlockDriverState *intermediate;
2610     int ret, n = nb_sectors;
2611 
2612     intermediate = top;
2613     while (intermediate && intermediate != base) {
2614         int pnum_inter;
2615         ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2616                                    &pnum_inter);
2617         if (ret < 0) {
2618             return ret;
2619         } else if (ret) {
2620             *pnum = pnum_inter;
2621             return 1;
2622         }
2623 
2624         /*
2625          * [sector_num, nb_sectors] is unallocated on top but intermediate
2626          * might have
2627          *
2628          * [sector_num+x, nb_sectors] allocated.
2629          */
2630         if (n > pnum_inter) {
2631             n = pnum_inter;
2632         }
2633 
2634         intermediate = intermediate->backing_hd;
2635     }
2636 
2637     *pnum = n;
2638     return 0;
2639 }
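/* Illustrative chain walk (assumed layout): given base <- inter <- top,
 * with the sector unallocated in top but allocated in inter, the loop
 * above returns 1 when it reaches inter; if the sector is unallocated all
 * the way down to base, it returns 0 with *pnum clamped to the smallest
 * pnum_inter seen along the chain.
 */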
2640 
2641 BlockInfoList *qmp_query_block(Error **errp)
2642 {
2643     BlockInfoList *head = NULL, *cur_item = NULL;
2644     BlockDriverState *bs;
2645 
2646     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2647         BlockInfoList *info = g_malloc0(sizeof(*info));
2648 
2649         info->value = g_malloc0(sizeof(*info->value));
2650         info->value->device = g_strdup(bs->device_name);
2651         info->value->type = g_strdup("unknown");
2652         info->value->locked = bdrv_dev_is_medium_locked(bs);
2653         info->value->removable = bdrv_dev_has_removable_media(bs);
2654 
2655         if (bdrv_dev_has_removable_media(bs)) {
2656             info->value->has_tray_open = true;
2657             info->value->tray_open = bdrv_dev_is_tray_open(bs);
2658         }
2659 
2660         if (bdrv_iostatus_is_enabled(bs)) {
2661             info->value->has_io_status = true;
2662             info->value->io_status = bs->iostatus;
2663         }
2664 
2665         if (bs->drv) {
2666             info->value->has_inserted = true;
2667             info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2668             info->value->inserted->file = g_strdup(bs->filename);
2669             info->value->inserted->ro = bs->read_only;
2670             info->value->inserted->drv = g_strdup(bs->drv->format_name);
2671             info->value->inserted->encrypted = bs->encrypted;
2672             if (bs->backing_file[0]) {
2673                 info->value->inserted->has_backing_file = true;
2674                 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2675             }
2676 
2677             if (bs->io_limits_enabled) {
2678                 info->value->inserted->bps =
2679                                bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2680                 info->value->inserted->bps_rd =
2681                                bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2682                 info->value->inserted->bps_wr =
2683                                bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2684                 info->value->inserted->iops =
2685                                bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2686                 info->value->inserted->iops_rd =
2687                                bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2688                 info->value->inserted->iops_wr =
2689                                bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2690             }
2691         }
2692 
2693         /* XXX: waiting for the qapi to support GSList */
2694         if (!cur_item) {
2695             head = cur_item = info;
2696         } else {
2697             cur_item->next = info;
2698             cur_item = info;
2699         }
2700     }
2701 
2702     return head;
2703 }
2704 
2705 /* Consider exposing this as a full fledged QMP command */
2706 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2707 {
2708     BlockStats *s;
2709 
2710     s = g_malloc0(sizeof(*s));
2711 
2712     if (bs->device_name[0]) {
2713         s->has_device = true;
2714         s->device = g_strdup(bs->device_name);
2715     }
2716 
2717     s->stats = g_malloc0(sizeof(*s->stats));
2718     s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2719     s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2720     s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2721     s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2722     s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2723     s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2724     s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2725     s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2726     s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2727 
2728     if (bs->file) {
2729         s->has_parent = true;
2730         s->parent = qmp_query_blockstat(bs->file, NULL);
2731     }
2732 
2733     return s;
2734 }
2735 
2736 BlockStatsList *qmp_query_blockstats(Error **errp)
2737 {
2738     BlockStatsList *head = NULL, *cur_item = NULL;
2739     BlockDriverState *bs;
2740 
2741     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2742         BlockStatsList *info = g_malloc0(sizeof(*info));
2743         info->value = qmp_query_blockstat(bs, NULL);
2744 
2745         /* XXX: waiting for the qapi to support GSList */
2746         if (!cur_item) {
2747             head = cur_item = info;
2748         } else {
2749             cur_item->next = info;
2750             cur_item = info;
2751         }
2752     }
2753 
2754     return head;
2755 }
2756 
2757 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2758 {
2759     if (bs->backing_hd && bs->backing_hd->encrypted)
2760         return bs->backing_file;
2761     else if (bs->encrypted)
2762         return bs->filename;
2763     else
2764         return NULL;
2765 }
2766 
2767 void bdrv_get_backing_filename(BlockDriverState *bs,
2768                                char *filename, int filename_size)
2769 {
2770     pstrcpy(filename, filename_size, bs->backing_file);
2771 }
2772 
2773 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2774                           const uint8_t *buf, int nb_sectors)
2775 {
2776     BlockDriver *drv = bs->drv;
2777     if (!drv)
2778         return -ENOMEDIUM;
2779     if (!drv->bdrv_write_compressed)
2780         return -ENOTSUP;
2781     if (bdrv_check_request(bs, sector_num, nb_sectors))
2782         return -EIO;
2783 
2784     if (bs->dirty_bitmap) {
2785         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2786     }
2787 
2788     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2789 }
2790 
2791 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2792 {
2793     BlockDriver *drv = bs->drv;
2794     if (!drv)
2795         return -ENOMEDIUM;
2796     if (!drv->bdrv_get_info)
2797         return -ENOTSUP;
2798     memset(bdi, 0, sizeof(*bdi));
2799     return drv->bdrv_get_info(bs, bdi);
2800 }
2801 
2802 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2803                       int64_t pos, int size)
2804 {
2805     BlockDriver *drv = bs->drv;
2806     if (!drv)
2807         return -ENOMEDIUM;
2808     if (drv->bdrv_save_vmstate)
2809         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2810     if (bs->file)
2811         return bdrv_save_vmstate(bs->file, buf, pos, size);
2812     return -ENOTSUP;
2813 }
2814 
2815 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2816                       int64_t pos, int size)
2817 {
2818     BlockDriver *drv = bs->drv;
2819     if (!drv)
2820         return -ENOMEDIUM;
2821     if (drv->bdrv_load_vmstate)
2822         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2823     if (bs->file)
2824         return bdrv_load_vmstate(bs->file, buf, pos, size);
2825     return -ENOTSUP;
2826 }
2827 
2828 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2829 {
2830     BlockDriver *drv = bs->drv;
2831 
2832     if (!drv || !drv->bdrv_debug_event) {
2833         return;
2834     }
2835 
2836     drv->bdrv_debug_event(bs, event);
2838 }
2839 
2840 /**************************************************************/
2841 /* handling of snapshots */
2842 
2843 int bdrv_can_snapshot(BlockDriverState *bs)
2844 {
2845     BlockDriver *drv = bs->drv;
2846     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2847         return 0;
2848     }
2849 
2850     if (!drv->bdrv_snapshot_create) {
2851         if (bs->file != NULL) {
2852             return bdrv_can_snapshot(bs->file);
2853         }
2854         return 0;
2855     }
2856 
2857     return 1;
2858 }
2859 
2860 int bdrv_is_snapshot(BlockDriverState *bs)
2861 {
2862     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2863 }
2864 
2865 BlockDriverState *bdrv_snapshots(void)
2866 {
2867     BlockDriverState *bs;
2868 
2869     if (bs_snapshots) {
2870         return bs_snapshots;
2871     }
2872 
2873     bs = NULL;
2874     while ((bs = bdrv_next(bs))) {
2875         if (bdrv_can_snapshot(bs)) {
2876             bs_snapshots = bs;
2877             return bs;
2878         }
2879     }
2880     return NULL;
2881 }
2882 
2883 int bdrv_snapshot_create(BlockDriverState *bs,
2884                          QEMUSnapshotInfo *sn_info)
2885 {
2886     BlockDriver *drv = bs->drv;
2887     if (!drv)
2888         return -ENOMEDIUM;
2889     if (drv->bdrv_snapshot_create)
2890         return drv->bdrv_snapshot_create(bs, sn_info);
2891     if (bs->file)
2892         return bdrv_snapshot_create(bs->file, sn_info);
2893     return -ENOTSUP;
2894 }
2895 
2896 int bdrv_snapshot_goto(BlockDriverState *bs,
2897                        const char *snapshot_id)
2898 {
2899     BlockDriver *drv = bs->drv;
2900     int ret, open_ret;
2901 
2902     if (!drv)
2903         return -ENOMEDIUM;
2904     if (drv->bdrv_snapshot_goto)
2905         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2906 
2907     if (bs->file) {
2908         drv->bdrv_close(bs);
2909         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2910         open_ret = drv->bdrv_open(bs, bs->open_flags);
2911         if (open_ret < 0) {
2912             bdrv_delete(bs->file);
2913             bs->drv = NULL;
2914             return open_ret;
2915         }
2916         return ret;
2917     }
2918 
2919     return -ENOTSUP;
2920 }
2921 
2922 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2923 {
2924     BlockDriver *drv = bs->drv;
2925     if (!drv)
2926         return -ENOMEDIUM;
2927     if (drv->bdrv_snapshot_delete)
2928         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2929     if (bs->file)
2930         return bdrv_snapshot_delete(bs->file, snapshot_id);
2931     return -ENOTSUP;
2932 }
2933 
2934 int bdrv_snapshot_list(BlockDriverState *bs,
2935                        QEMUSnapshotInfo **psn_info)
2936 {
2937     BlockDriver *drv = bs->drv;
2938     if (!drv)
2939         return -ENOMEDIUM;
2940     if (drv->bdrv_snapshot_list)
2941         return drv->bdrv_snapshot_list(bs, psn_info);
2942     if (bs->file)
2943         return bdrv_snapshot_list(bs->file, psn_info);
2944     return -ENOTSUP;
2945 }
2946 
2947 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2948         const char *snapshot_name)
2949 {
2950     BlockDriver *drv = bs->drv;
2951     if (!drv) {
2952         return -ENOMEDIUM;
2953     }
2954     if (!bs->read_only) {
2955         return -EINVAL;
2956     }
2957     if (drv->bdrv_snapshot_load_tmp) {
2958         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2959     }
2960     return -ENOTSUP;
2961 }
2962 
2963 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2964         const char *backing_file)
2965 {
2966     if (!bs->drv) {
2967         return NULL;
2968     }
2969 
2970     if (bs->backing_hd) {
2971         if (strcmp(bs->backing_file, backing_file) == 0) {
2972             return bs->backing_hd;
2973         } else {
2974             return bdrv_find_backing_image(bs->backing_hd, backing_file);
2975         }
2976     }
2977 
2978     return NULL;
2979 }
2980 
2981 #define NB_SUFFIXES 4
2982 
2983 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2984 {
2985     static const char suffixes[NB_SUFFIXES] = "KMGT";
2986     int64_t base;
2987     int i;
2988 
2989     if (size <= 999) {
2990         snprintf(buf, buf_size, "%" PRId64, size);
2991     } else {
2992         base = 1024;
2993         for(i = 0; i < NB_SUFFIXES; i++) {
2994             if (size < (10 * base)) {
2995                 snprintf(buf, buf_size, "%0.1f%c",
2996                          (double)size / base,
2997                          suffixes[i]);
2998                 break;
2999             } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
3000                 snprintf(buf, buf_size, "%" PRId64 "%c",
3001                          ((size + (base >> 1)) / base),
3002                          suffixes[i]);
3003                 break;
3004             }
3005             base = base * 1024;
3006         }
3007     }
3008     return buf;
3009 }
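/* Illustrative outputs, computed from the logic above: 999 -> "999",
 * 1536 -> "1.5K", 1048576 -> "1.0M" and 10485760 -> "10M" (the integer
 * branch rounds by adding base/2 before dividing).
 */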
3010 
3011 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3012 {
3013     char buf1[128], date_buf[128], clock_buf[128];
3014 #ifdef _WIN32
3015     struct tm *ptm;
3016 #else
3017     struct tm tm;
3018 #endif
3019     time_t ti;
3020     int64_t secs;
3021 
3022     if (!sn) {
3023         snprintf(buf, buf_size,
3024                  "%-10s%-20s%7s%20s%15s",
3025                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
3026     } else {
3027         ti = sn->date_sec;
3028 #ifdef _WIN32
3029         ptm = localtime(&ti);
3030         strftime(date_buf, sizeof(date_buf),
3031                  "%Y-%m-%d %H:%M:%S", ptm);
3032 #else
3033         localtime_r(&ti, &tm);
3034         strftime(date_buf, sizeof(date_buf),
3035                  "%Y-%m-%d %H:%M:%S", &tm);
3036 #endif
3037         secs = sn->vm_clock_nsec / 1000000000;
3038         snprintf(clock_buf, sizeof(clock_buf),
3039                  "%02d:%02d:%02d.%03d",
3040                  (int)(secs / 3600),
3041                  (int)((secs / 60) % 60),
3042                  (int)(secs % 60),
3043                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
3044         snprintf(buf, buf_size,
3045                  "%-10s%-20s%7s%20s%15s",
3046                  sn->id_str, sn->name,
3047                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3048                  date_buf,
3049                  clock_buf);
3050     }
3051     return buf;
3052 }
3053 
3054 /**************************************************************/
3055 /* async I/Os */
3056 
3057 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3058                                  QEMUIOVector *qiov, int nb_sectors,
3059                                  BlockDriverCompletionFunc *cb, void *opaque)
3060 {
3061     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3062 
3063     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3064                                  cb, opaque, false);
3065 }
3066 
3067 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3068                                   QEMUIOVector *qiov, int nb_sectors,
3069                                   BlockDriverCompletionFunc *cb, void *opaque)
3070 {
3071     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3072 
3073     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3074                                  cb, opaque, true);
3075 }
3076 
3077 
3078 typedef struct MultiwriteCB {
3079     int error;
3080     int num_requests;
3081     int num_callbacks;
3082     struct {
3083         BlockDriverCompletionFunc *cb;
3084         void *opaque;
3085         QEMUIOVector *free_qiov;
3086     } callbacks[];
3087 } MultiwriteCB;
3088 
3089 static void multiwrite_user_cb(MultiwriteCB *mcb)
3090 {
3091     int i;
3092 
3093     for (i = 0; i < mcb->num_callbacks; i++) {
3094         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3095         if (mcb->callbacks[i].free_qiov) {
3096             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3097         }
3098         g_free(mcb->callbacks[i].free_qiov);
3099     }
3100 }
3101 
3102 static void multiwrite_cb(void *opaque, int ret)
3103 {
3104     MultiwriteCB *mcb = opaque;
3105 
3106     trace_multiwrite_cb(mcb, ret);
3107 
3108     if (ret < 0 && !mcb->error) {
3109         mcb->error = ret;
3110     }
3111 
3112     mcb->num_requests--;
3113     if (mcb->num_requests == 0) {
3114         multiwrite_user_cb(mcb);
3115         g_free(mcb);
3116     }
3117 }
3118 
3119 static int multiwrite_req_compare(const void *a, const void *b)
3120 {
3121     const BlockRequest *req1 = a, *req2 = b;
3122 
3123     /*
3124      * Note that we can't simply subtract req2->sector from req1->sector
3125      * here as that could overflow the return value.
3126      */
3127     if (req1->sector > req2->sector) {
3128         return 1;
3129     } else if (req1->sector < req2->sector) {
3130         return -1;
3131     } else {
3132         return 0;
3133     }
3134 }
3135 
3136 /*
3137  * Takes a bunch of requests and tries to merge them. Returns the number of
3138  * requests that remain after merging.
3139  */
3140 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3141     int num_reqs, MultiwriteCB *mcb)
3142 {
3143     int i, outidx;
3144 
3145     // Sort requests by start sector
3146     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3147 
3148     // Check if adjacent requests touch the same clusters. If so, combine them,
3149     // filling up gaps with zero sectors.
3150     outidx = 0;
3151     for (i = 1; i < num_reqs; i++) {
3152         int merge = 0;
3153         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3154 
3155         // Handle exactly sequential writes and overlapping writes.
3156         if (reqs[i].sector <= oldreq_last) {
3157             merge = 1;
3158         }
3159 
3160         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3161             merge = 0;
3162         }
3163 
3164         if (merge) {
3165             size_t size;
3166             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3167             qemu_iovec_init(qiov,
3168                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3169 
3170             // Add the first request to the merged one. If the requests are
3171             // overlapping, drop the last sectors of the first request.
3172             size = (reqs[i].sector - reqs[outidx].sector) << 9;
3173             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3174 
3175             // We should not need to add any zeros between the two requests
3176             assert(reqs[i].sector <= oldreq_last);
3177 
3178             // Add the second request
3179             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3180 
3181             reqs[outidx].nb_sectors = qiov->size >> 9;
3182             reqs[outidx].qiov = qiov;
3183 
3184             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3185         } else {
3186             outidx++;
3187             reqs[outidx].sector     = reqs[i].sector;
3188             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3189             reqs[outidx].qiov       = reqs[i].qiov;
3190         }
3191     }
3192 
3193     return outidx + 1;
3194 }
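// Merge example (illustrative request values): after sorting, requests
// covering sectors [0, 8) and [8, 16) satisfy reqs[i].sector <= oldreq_last
// and are concatenated into one 16-sector request, while a request starting
// at sector 20 leaves a gap and therefore stays separate.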
3195 
3196 /*
3197  * Submit multiple AIO write requests at once.
3198  *
3199  * On success, the function returns 0 and all requests in the reqs array have
3200  * been submitted. On error, this function returns -1, and the requests may
3201  * or may not have been submitted yet; the callback will be called for some
3202  * of the requests and not for others. The
3203  * caller must check the error field of the BlockRequest to wait for the right
3204  * callbacks (if error != 0, no callback will be called).
3205  *
3206  * The implementation may modify the contents of the reqs array, e.g. to merge
3207  * requests. However, the fields opaque and error are left unmodified as they
3208  * are used to signal failure for a single request to the caller.
3209  */
3210 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3211 {
3212     MultiwriteCB *mcb;
3213     int i;
3214 
3215     /* don't submit writes if we don't have a medium */
3216     if (bs->drv == NULL) {
3217         for (i = 0; i < num_reqs; i++) {
3218             reqs[i].error = -ENOMEDIUM;
3219         }
3220         return -1;
3221     }
3222 
3223     if (num_reqs == 0) {
3224         return 0;
3225     }
3226 
3227     // Create MultiwriteCB structure
3228     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3229     mcb->num_requests = 0;
3230     mcb->num_callbacks = num_reqs;
3231 
3232     for (i = 0; i < num_reqs; i++) {
3233         mcb->callbacks[i].cb = reqs[i].cb;
3234         mcb->callbacks[i].opaque = reqs[i].opaque;
3235     }
3236 
3237     // Check for mergable requests
3238     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3239 
3240     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3241 
3242     /* Run the aio requests. */
3243     mcb->num_requests = num_reqs;
3244     for (i = 0; i < num_reqs; i++) {
3245         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3246             reqs[i].nb_sectors, multiwrite_cb, mcb);
3247     }
3248 
3249     return 0;
3250 }
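/* A minimal caller sketch (hypothetical; 'qiov0' and 'done_cb' are assumed
 * to exist and the qiov to be sector-aligned):
 *
 *     BlockRequest reqs[1] = {{
 *         .sector     = 0,
 *         .nb_sectors = qiov0->size >> 9,
 *         .qiov       = qiov0,
 *         .cb         = done_cb,
 *         .opaque     = NULL,
 *     }};
 *     if (bdrv_aio_multiwrite(bs, reqs, 1) < 0) {
 *         // reqs[0].error tells whether done_cb will still be invoked
 *     }
 */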
3251 
3252 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3253 {
3254     acb->pool->cancel(acb);
3255 }
3256 
3257 /* block I/O throttling */
3258 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3259                  bool is_write, double elapsed_time, uint64_t *wait)
3260 {
3261     uint64_t bps_limit = 0;
3262     double   bytes_limit, bytes_base, bytes_res;
3263     double   slice_time, wait_time;
3264 
3265     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3266         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3267     } else if (bs->io_limits.bps[is_write]) {
3268         bps_limit = bs->io_limits.bps[is_write];
3269     } else {
3270         if (wait) {
3271             *wait = 0;
3272         }
3273 
3274         return false;
3275     }
3276 
3277     slice_time = bs->slice_end - bs->slice_start;
3278     slice_time /= (NANOSECONDS_PER_SECOND);
3279     bytes_limit = bps_limit * slice_time;
3280     bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3281     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3282         bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3283     }
3284 
3285     /* bytes_base: the bytes of data that have already been read/written,
3286      *             obtained from the historical statistics.
3287      * bytes_res: the remaining bytes of data that need to be read/written.
3288      * (bytes_base + bytes_res) / bps_limit: used to calculate the total
3289      *             time for completing the reading/writing of all data.
3290      */
3291     bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3292 
3293     if (bytes_base + bytes_res <= bytes_limit) {
3294         if (wait) {
3295             *wait = 0;
3296         }
3297 
3298         return false;
3299     }
3300 
3301     /* Calc approx time to dispatch */
3302     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
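    /* Worked example (illustrative numbers only): with bps_limit = 1 MiB/s,
     * bytes_base = 900 KiB already accounted in this slice, a 200 KiB
     * request and elapsed_time = 0.9 s, this yields
     * (900 + 200) KiB / 1024 KiB/s - 0.9 s ~= 0.17 s of additional wait.
     */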
3303 
3304     /* When the I/O rate at runtime exceeds the limits, bs->slice_end needs
3305      * to be extended so that the current statistics can be kept until the
3306      * timer fires; the extension is tuned based on experimental results.
3308      */
3309     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3310     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3311     if (wait) {
3312         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3313     }
3314 
3315     return true;
3316 }
3317 
3318 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3319                              double elapsed_time, uint64_t *wait)
3320 {
3321     uint64_t iops_limit = 0;
3322     double   ios_limit, ios_base;
3323     double   slice_time, wait_time;
3324 
3325     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3326         iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3327     } else if (bs->io_limits.iops[is_write]) {
3328         iops_limit = bs->io_limits.iops[is_write];
3329     } else {
3330         if (wait) {
3331             *wait = 0;
3332         }
3333 
3334         return false;
3335     }
3336 
3337     slice_time = bs->slice_end - bs->slice_start;
3338     slice_time /= (NANOSECONDS_PER_SECOND);
3339     ios_limit  = iops_limit * slice_time;
3340     ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3341     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3342         ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3343     }
3344 
3345     if (ios_base + 1 <= ios_limit) {
3346         if (wait) {
3347             *wait = 0;
3348         }
3349 
3350         return false;
3351     }
3352 
3353     /* Calc approx time to dispatch */
3354     wait_time = (ios_base + 1) / iops_limit;
3355     if (wait_time > elapsed_time) {
3356         wait_time = wait_time - elapsed_time;
3357     } else {
3358         wait_time = 0;
3359     }
3360 
3361     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3362     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3363     if (wait) {
3364         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3365     }
3366 
3367     return true;
3368 }
3369 
3370 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3371                            bool is_write, int64_t *wait)
3372 {
3373     int64_t  now, max_wait;
3374     uint64_t bps_wait = 0, iops_wait = 0;
3375     double   elapsed_time;
3376     int      bps_ret, iops_ret;
3377 
3378     now = qemu_get_clock_ns(vm_clock);
3379     if ((bs->slice_start < now)
3380         && (bs->slice_end > now)) {
3381         bs->slice_end = now + bs->slice_time;
3382     } else {
3383         bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
3384         bs->slice_start = now;
3385         bs->slice_end   = now + bs->slice_time;
3386 
3387         bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
3388         bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3389 
3390         bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
3391         bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
3392     }
3393 
3394     elapsed_time  = now - bs->slice_start;
3395     elapsed_time  /= (NANOSECONDS_PER_SECOND);
3396 
3397     bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
3398                                       is_write, elapsed_time, &bps_wait);
3399     iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3400                                       elapsed_time, &iops_wait);
3401     if (bps_ret || iops_ret) {
3402         max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3403         if (wait) {
3404             *wait = max_wait;
3405         }
3406 
3407         now = qemu_get_clock_ns(vm_clock);
3408         if (bs->slice_end < now + max_wait) {
3409             bs->slice_end = now + max_wait;
3410         }
3411 
3412         return true;
3413     }
3414 
3415     if (wait) {
3416         *wait = 0;
3417     }
3418 
3419     return false;
3420 }
3421 
3422 /**************************************************************/
3423 /* async block device emulation */
3424 
3425 typedef struct BlockDriverAIOCBSync {
3426     BlockDriverAIOCB common;
3427     QEMUBH *bh;
3428     int ret;
3429     /* vector translation state */
3430     QEMUIOVector *qiov;
3431     uint8_t *bounce;
3432     int is_write;
3433 } BlockDriverAIOCBSync;
3434 
3435 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3436 {
3437     BlockDriverAIOCBSync *acb =
3438         container_of(blockacb, BlockDriverAIOCBSync, common);
3439     qemu_bh_delete(acb->bh);
3440     acb->bh = NULL;
3441     qemu_aio_release(acb);
3442 }
3443 
3444 static AIOPool bdrv_em_aio_pool = {
3445     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3446     .cancel             = bdrv_aio_cancel_em,
3447 };
3448 
3449 static void bdrv_aio_bh_cb(void *opaque)
3450 {
3451     BlockDriverAIOCBSync *acb = opaque;
3452 
3453     if (!acb->is_write)
3454         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3455     qemu_vfree(acb->bounce);
3456     acb->common.cb(acb->common.opaque, acb->ret);
3457     qemu_bh_delete(acb->bh);
3458     acb->bh = NULL;
3459     qemu_aio_release(acb);
3460 }
3461 
3462 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3463                                             int64_t sector_num,
3464                                             QEMUIOVector *qiov,
3465                                             int nb_sectors,
3466                                             BlockDriverCompletionFunc *cb,
3467                                             void *opaque,
3468                                             int is_write)
3470 {
3471     BlockDriverAIOCBSync *acb;
3472 
3473     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3474     acb->is_write = is_write;
3475     acb->qiov = qiov;
3476     acb->bounce = qemu_blockalign(bs, qiov->size);
3477     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3478 
3479     if (is_write) {
3480         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3481         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3482     } else {
3483         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3484     }
3485 
3486     qemu_bh_schedule(acb->bh);
3487 
3488     return &acb->common;
3489 }
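/* Note: bdrv_aio_rw_vector() performs the transfer synchronously, but the
 * result is still delivered from a bottom half.  AIO callers may only be
 * notified after the aio_* function has returned, so completing directly
 * from this stack frame would break them. */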
3490 
3491 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3492         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3493         BlockDriverCompletionFunc *cb, void *opaque)
3494 {
3495     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3496 }
3497 
3498 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3499         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3500         BlockDriverCompletionFunc *cb, void *opaque)
3501 {
3502     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3503 }
3504 
3506 typedef struct BlockDriverAIOCBCoroutine {
3507     BlockDriverAIOCB common;
3508     BlockRequest req;
3509     bool is_write;
3510     QEMUBH* bh;
3511 } BlockDriverAIOCBCoroutine;
3512 
3513 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3514 {
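    /* There is no fine-grained cancellation here; draining all outstanding
     * AIO simply lets the request run to completion first. */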
3515     qemu_aio_flush();
3516 }
3517 
3518 static AIOPool bdrv_em_co_aio_pool = {
3519     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3520     .cancel             = bdrv_aio_co_cancel_em,
3521 };
3522 
3523 static void bdrv_co_em_bh(void *opaque)
3524 {
3525     BlockDriverAIOCBCoroutine *acb = opaque;
3526 
3527     acb->common.cb(acb->common.opaque, acb->req.error);
3528     qemu_bh_delete(acb->bh);
3529     qemu_aio_release(acb);
3530 }
3531 
3532 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3533 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3534 {
3535     BlockDriverAIOCBCoroutine *acb = opaque;
3536     BlockDriverState *bs = acb->common.bs;
3537 
3538     if (!acb->is_write) {
3539         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3540             acb->req.nb_sectors, acb->req.qiov, 0);
3541     } else {
3542         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3543             acb->req.nb_sectors, acb->req.qiov, 0);
3544     }
3545 
3546     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3547     qemu_bh_schedule(acb->bh);
3548 }
3549 
3550 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3551                                                int64_t sector_num,
3552                                                QEMUIOVector *qiov,
3553                                                int nb_sectors,
3554                                                BlockDriverCompletionFunc *cb,
3555                                                void *opaque,
3556                                                bool is_write)
3557 {
3558     Coroutine *co;
3559     BlockDriverAIOCBCoroutine *acb;
3560 
3561     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3562     acb->req.sector = sector_num;
3563     acb->req.nb_sectors = nb_sectors;
3564     acb->req.qiov = qiov;
3565     acb->is_write = is_write;
3566 
3567     co = qemu_coroutine_create(bdrv_co_do_rw);
3568     qemu_coroutine_enter(co, acb);
3569 
3570     return &acb->common;
3571 }
3572 
3573 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3574 {
3575     BlockDriverAIOCBCoroutine *acb = opaque;
3576     BlockDriverState *bs = acb->common.bs;
3577 
3578     acb->req.error = bdrv_co_flush(bs);
3579     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3580     qemu_bh_schedule(acb->bh);
3581 }
3582 
3583 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3584         BlockDriverCompletionFunc *cb, void *opaque)
3585 {
3586     Coroutine *co;
3587     BlockDriverAIOCBCoroutine *acb;
3588 
3589     trace_bdrv_aio_flush(bs, opaque);
3590 
3591     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3592     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3593     qemu_coroutine_enter(co, acb);
3594 
3595     return &acb->common;
3596 }
3597 
3598 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3599 {
3600     BlockDriverAIOCBCoroutine *acb = opaque;
3601     BlockDriverState *bs = acb->common.bs;
3602 
3603     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3604     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3605     qemu_bh_schedule(acb->bh);
3606 }
3607 
3608 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3609         int64_t sector_num, int nb_sectors,
3610         BlockDriverCompletionFunc *cb, void *opaque)
3611 {
3612     Coroutine *co;
3613     BlockDriverAIOCBCoroutine *acb;
3614 
3615     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3616 
3617     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3618     acb->req.sector = sector_num;
3619     acb->req.nb_sectors = nb_sectors;
3620     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3621     qemu_coroutine_enter(co, acb);
3622 
3623     return &acb->common;
3624 }
3625 
3626 void bdrv_init(void)
3627 {
3628     module_call_init(MODULE_INIT_BLOCK);
3629 }
3630 
3631 void bdrv_init_with_whitelist(void)
3632 {
3633     use_bdrv_whitelist = 1;
3634     bdrv_init();
3635 }
3636 
3637 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3638                    BlockDriverCompletionFunc *cb, void *opaque)
3639 {
3640     BlockDriverAIOCB *acb;
3641 
3642     if (pool->free_aiocb) {
3643         acb = pool->free_aiocb;
3644         pool->free_aiocb = acb->next;
3645     } else {
3646         acb = g_malloc0(pool->aiocb_size);
3647         acb->pool = pool;
3648     }
3649     acb->bs = bs;
3650     acb->cb = cb;
3651     acb->opaque = opaque;
3652     return acb;
3653 }
3654 
3655 void qemu_aio_release(void *p)
3656 {
3657     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3658     AIOPool *pool = acb->pool;
3659     acb->next = pool->free_aiocb;
3660     pool->free_aiocb = acb;
3661 }
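/* Example (sketch): AIOCBs are recycled through a per-pool free list rather
 * than freed.  A driver pairs qemu_aio_get() with qemu_aio_release() roughly
 * as below (MyAIOCB and my_aio_cancel are hypothetical names):
 *
 *     typedef struct MyAIOCB {
 *         BlockDriverAIOCB common;    // must come first: the pool code
 *         int ret;                    // casts the allocation to it
 *     } MyAIOCB;
 *
 *     static AIOPool my_aio_pool = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *         .cancel     = my_aio_cancel,
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aio_pool, bs, cb, opaque);
 *     // ...start the request; once it finishes:
 *     acb->common.cb(acb->common.opaque, acb->ret);
 *     qemu_aio_release(acb);
 */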
3662 
3663 /**************************************************************/
3664 /* Coroutine block device emulation */
3665 
3666 typedef struct CoroutineIOCompletion {
3667     Coroutine *coroutine;
3668     int ret;
3669 } CoroutineIOCompletion;
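/* A coroutine parks itself in .coroutine, submits a callback-based request
 * with bdrv_co_io_em_complete() as the completion function, yields, and is
 * re-entered with the result stored in .ret.  bdrv_co_io_em(),
 * bdrv_co_flush() and bdrv_co_discard() below all use this pattern. */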
3670 
3671 static void bdrv_co_io_em_complete(void *opaque, int ret)
3672 {
3673     CoroutineIOCompletion *co = opaque;
3674 
3675     co->ret = ret;
3676     qemu_coroutine_enter(co->coroutine, NULL);
3677 }
3678 
3679 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3680                                       int nb_sectors, QEMUIOVector *iov,
3681                                       bool is_write)
3682 {
3683     CoroutineIOCompletion co = {
3684         .coroutine = qemu_coroutine_self(),
3685     };
3686     BlockDriverAIOCB *acb;
3687 
3688     if (is_write) {
3689         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3690                                        bdrv_co_io_em_complete, &co);
3691     } else {
3692         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3693                                       bdrv_co_io_em_complete, &co);
3694     }
3695 
3696     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3697     if (!acb) {
3698         return -EIO;
3699     }
3700     qemu_coroutine_yield();
3701 
3702     return co.ret;
3703 }
3704 
3705 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3706                                          int64_t sector_num, int nb_sectors,
3707                                          QEMUIOVector *iov)
3708 {
3709     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3710 }
3711 
3712 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3713                                          int64_t sector_num, int nb_sectors,
3714                                          QEMUIOVector *iov)
3715 {
3716     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3717 }
3718 
3719 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3720 {
3721     RwCo *rwco = opaque;
3722 
3723     rwco->ret = bdrv_co_flush(rwco->bs);
3724 }
3725 
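/* Flush in up to three stages: write cached data back to the OS (always,
 * even for cache=unsafe), force it to stable storage unless BDRV_O_NO_FLUSH
 * is set, then recurse to flush the underlying protocol (bs->file). */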
3726 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3727 {
3728     int ret;
3729 
3730     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3731         return 0;
3732     }
3733 
3734     /* Write back cached data to the OS even with cache=unsafe */
3735     if (bs->drv->bdrv_co_flush_to_os) {
3736         ret = bs->drv->bdrv_co_flush_to_os(bs);
3737         if (ret < 0) {
3738             return ret;
3739         }
3740     }
3741 
3742     /* But don't actually force it to the disk with cache=unsafe */
3743     if (bs->open_flags & BDRV_O_NO_FLUSH) {
3744         return 0;
3745     }
3746 
3747     if (bs->drv->bdrv_co_flush_to_disk) {
3748         ret = bs->drv->bdrv_co_flush_to_disk(bs);
3749     } else if (bs->drv->bdrv_aio_flush) {
3750         BlockDriverAIOCB *acb;
3751         CoroutineIOCompletion co = {
3752             .coroutine = qemu_coroutine_self(),
3753         };
3754 
3755         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3756         if (acb == NULL) {
3757             ret = -EIO;
3758         } else {
3759             qemu_coroutine_yield();
3760             ret = co.ret;
3761         }
3762     } else {
3763         /*
3764          * Some block drivers always operate in either writethrough or unsafe
3765          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3766          * know how the server works (because the behaviour is hardcoded or
3767          * depends on server-side configuration), so we can't ensure that
3768          * everything is safe on disk. Returning an error doesn't work because
3769          * that would break guests even if the server operates in writethrough
3770          * mode.
3771          *
3772          * Let's hope the user knows what they're doing.
3773          */
3774         ret = 0;
3775     }
3776     if (ret < 0) {
3777         return ret;
3778     }
3779 
3780     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
3781      * in the case of cache=unsafe, so there are no useless flushes.
3782      */
3783     return bdrv_co_flush(bs->file);
3784 }
3785 
3786 void bdrv_invalidate_cache(BlockDriverState *bs)
3787 {
3788     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3789         bs->drv->bdrv_invalidate_cache(bs);
3790     }
3791 }
3792 
3793 void bdrv_invalidate_cache_all(void)
3794 {
3795     BlockDriverState *bs;
3796 
3797     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3798         bdrv_invalidate_cache(bs);
3799     }
3800 }
3801 
3802 void bdrv_clear_incoming_migration_all(void)
3803 {
3804     BlockDriverState *bs;
3805 
3806     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3807         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3808     }
3809 }
3810 
3811 int bdrv_flush(BlockDriverState *bs)
3812 {
3813     Coroutine *co;
3814     RwCo rwco = {
3815         .bs = bs,
3816         .ret = NOT_DONE,
3817     };
3818 
3819     if (qemu_in_coroutine()) {
3820         /* Fast-path if already in coroutine context */
3821         bdrv_flush_co_entry(&rwco);
3822     } else {
3823         co = qemu_coroutine_create(bdrv_flush_co_entry);
3824         qemu_coroutine_enter(co, &rwco);
3825         while (rwco.ret == NOT_DONE) {
3826             qemu_aio_wait();
3827         }
3828     }
3829 
3830     return rwco.ret;
3831 }
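/* bdrv_flush() above and bdrv_discard() below share the same synchronous
 * wrapper idiom: run the coroutine_fn directly when already in coroutine
 * context, otherwise spawn a coroutine and pump qemu_aio_wait() until
 * RwCo.ret changes from NOT_DONE. */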
3832 
3833 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3834 {
3835     RwCo *rwco = opaque;
3836 
3837     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3838 }
3839 
3840 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3841                                  int nb_sectors)
3842 {
3843     if (!bs->drv) {
3844         return -ENOMEDIUM;
3845     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3846         return -EIO;
3847     } else if (bs->read_only) {
3848         return -EROFS;
3849     } else if (bs->drv->bdrv_co_discard) {
3850         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3851     } else if (bs->drv->bdrv_aio_discard) {
3852         BlockDriverAIOCB *acb;
3853         CoroutineIOCompletion co = {
3854             .coroutine = qemu_coroutine_self(),
3855         };
3856 
3857         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3858                                         bdrv_co_io_em_complete, &co);
3859         if (acb == NULL) {
3860             return -EIO;
3861         } else {
3862             qemu_coroutine_yield();
3863             return co.ret;
3864         }
3865     } else {
3866         return 0;
3867     }
3868 }
3869 
3870 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3871 {
3872     Coroutine *co;
3873     RwCo rwco = {
3874         .bs = bs,
3875         .sector_num = sector_num,
3876         .nb_sectors = nb_sectors,
3877         .ret = NOT_DONE,
3878     };
3879 
3880     if (qemu_in_coroutine()) {
3881         /* Fast-path if already in coroutine context */
3882         bdrv_discard_co_entry(&rwco);
3883     } else {
3884         co = qemu_coroutine_create(bdrv_discard_co_entry);
3885         qemu_coroutine_enter(co, &rwco);
3886         while (rwco.ret == NOT_DONE) {
3887             qemu_aio_wait();
3888         }
3889     }
3890 
3891     return rwco.ret;
3892 }
3893 
3894 /**************************************************************/
3895 /* removable device support */
3896 
3897 /**
3898  * Return TRUE if the media is present
3899  */
3900 int bdrv_is_inserted(BlockDriverState *bs)
3901 {
3902     BlockDriver *drv = bs->drv;
3903 
3904     if (!drv)
3905         return 0;
3906     if (!drv->bdrv_is_inserted)
3907         return 1;
3908     return drv->bdrv_is_inserted(bs);
3909 }
3910 
3911 /**
3912  * Return whether the media changed since the last call to this
3913  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
3914  */
3915 int bdrv_media_changed(BlockDriverState *bs)
3916 {
3917     BlockDriver *drv = bs->drv;
3918 
3919     if (drv && drv->bdrv_media_changed) {
3920         return drv->bdrv_media_changed(bs);
3921     }
3922     return -ENOTSUP;
3923 }
3924 
3925 /**
3926  * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
3927  */
3928 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3929 {
3930     BlockDriver *drv = bs->drv;
3931 
3932     if (drv && drv->bdrv_eject) {
3933         drv->bdrv_eject(bs, eject_flag);
3934     }
3935 
3936     if (bs->device_name[0] != '\0') {
3937         bdrv_emit_qmp_eject_event(bs, eject_flag);
3938     }
3939 }
3940 
3941 /**
3942  * Lock or unlock the media (if it is locked, the user won't be able
3943  * to eject it manually).
3944  */
3945 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3946 {
3947     BlockDriver *drv = bs->drv;
3948 
3949     trace_bdrv_lock_medium(bs, locked);
3950 
3951     if (drv && drv->bdrv_lock_medium) {
3952         drv->bdrv_lock_medium(bs, locked);
3953     }
3954 }
3955 
3956 /* needed for the generic SCSI interface */
3957 
3958 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3959 {
3960     BlockDriver *drv = bs->drv;
3961 
3962     if (drv && drv->bdrv_ioctl)
3963         return drv->bdrv_ioctl(bs, req, buf);
3964     return -ENOTSUP;
3965 }
3966 
3967 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3968         unsigned long int req, void *buf,
3969         BlockDriverCompletionFunc *cb, void *opaque)
3970 {
3971     BlockDriver *drv = bs->drv;
3972 
3973     if (drv && drv->bdrv_aio_ioctl)
3974         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3975     return NULL;
3976 }
3977 
3978 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3979 {
3980     bs->buffer_alignment = align;
3981 }
3982 
3983 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3984 {
3985     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3986 }
3987 
3988 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3989 {
3990     int64_t bitmap_size;
3991 
3992     bs->dirty_count = 0;
3993     if (enable) {
3994         if (!bs->dirty_bitmap) {
3995             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3996                     BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
3997             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
3998 
3999             bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
4000         }
4001     } else {
4002         if (bs->dirty_bitmap) {
4003             g_free(bs->dirty_bitmap);
4004             bs->dirty_bitmap = NULL;
4005         }
4006     }
4007 }
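/* Worked example for the sizing above, assuming BDRV_SECTORS_PER_DIRTY_CHUNK
 * is 2048 (1 MiB chunks of 512-byte sectors) and 64-bit longs: a 1 TiB image
 * has 2^31 sectors, i.e. 2^20 chunks, so the bitmap needs 2^20 / 64 = 16384
 * longs -- 128 KiB to track one dirty bit per MiB of image. */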
4008 
4009 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4010 {
4011     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
4012 
4013     if (bs->dirty_bitmap &&
4014         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
4015         return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] &
4016             (1UL << (chunk % BITS_PER_LONG)));
4017     } else {
4018         return 0;
4019     }
4020 }
4021 
4022 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4023                       int nr_sectors)
4024 {
4025     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
4026 }
4027 
4028 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4029 {
4030     return bs->dirty_count;
4031 }
4032 
4033 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4034 {
4035     assert(bs->in_use != in_use);
4036     bs->in_use = in_use;
4037 }
4038 
4039 int bdrv_in_use(BlockDriverState *bs)
4040 {
4041     return bs->in_use;
4042 }
4043 
4044 void bdrv_iostatus_enable(BlockDriverState *bs)
4045 {
4046     bs->iostatus_enabled = true;
4047     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4048 }
4049 
4050 /* The I/O status is only enabled if the drive explicitly
4051  * enables it _and_ the VM is configured to stop on errors */
4052 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4053 {
4054     return (bs->iostatus_enabled &&
4055            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4056             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
4057             bs->on_read_error == BLOCK_ERR_STOP_ANY));
4058 }
4059 
4060 void bdrv_iostatus_disable(BlockDriverState *bs)
4061 {
4062     bs->iostatus_enabled = false;
4063 }
4064 
4065 void bdrv_iostatus_reset(BlockDriverState *bs)
4066 {
4067     if (bdrv_iostatus_is_enabled(bs)) {
4068         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4069     }
4070 }
4071 
4072 /* XXX: Today this is set by device models because it makes the implementation
4073    quite simple. However, the block layer knows about the error, so it's
4074    possible to implement this without device models being involved. */
4075 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4076 {
4077     if (bdrv_iostatus_is_enabled(bs) &&
4078         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4079         assert(error >= 0);
4080         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4081                                          BLOCK_DEVICE_IO_STATUS_FAILED;
4082     }
4083 }
4084 
4085 void
4086 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4087         enum BlockAcctType type)
4088 {
4089     assert(type < BDRV_MAX_IOTYPE);
4090 
4091     cookie->bytes = bytes;
4092     cookie->start_time_ns = get_clock();
4093     cookie->type = type;
4094 }
4095 
4096 void
4097 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4098 {
4099     assert(cookie->type < BDRV_MAX_IOTYPE);
4100 
4101     bs->nr_bytes[cookie->type] += cookie->bytes;
4102     bs->nr_ops[cookie->type]++;
4103     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4104 }
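/* Example (sketch): device models bracket each guest request with an
 * accounting cookie so the byte/op/latency counters stay consistent.
 * Illustrative only; 'acct' would normally live in the request state:
 *
 *     BlockAcctCookie acct;
 *
 *     bdrv_acct_start(bs, &acct, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     // ...issue the read; then, in the completion path:
 *     bdrv_acct_done(bs, &acct);
 */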
4105 
4106 int bdrv_img_create(const char *filename, const char *fmt,
4107                     const char *base_filename, const char *base_fmt,
4108                     char *options, uint64_t img_size, int flags)
4109 {
4110     QEMUOptionParameter *param = NULL, *create_options = NULL;
4111     QEMUOptionParameter *backing_fmt, *backing_file, *size;
4112     BlockDriverState *bs = NULL;
4113     BlockDriver *drv, *proto_drv;
4114     BlockDriver *backing_drv = NULL;
4115     int ret = 0;
4116 
4117     /* Find driver and parse its options */
4118     drv = bdrv_find_format(fmt);
4119     if (!drv) {
4120         error_report("Unknown file format '%s'", fmt);
4121         ret = -EINVAL;
4122         goto out;
4123     }
4124 
4125     proto_drv = bdrv_find_protocol(filename);
4126     if (!proto_drv) {
4127         error_report("Unknown protocol '%s'", filename);
4128         ret = -EINVAL;
4129         goto out;
4130     }
4131 
4132     create_options = append_option_parameters(create_options,
4133                                               drv->create_options);
4134     create_options = append_option_parameters(create_options,
4135                                               proto_drv->create_options);
4136 
4137     /* Create parameter list with default values */
4138     param = parse_option_parameters("", create_options, param);
4139 
4140     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4141 
4142     /* Parse -o options */
4143     if (options) {
4144         param = parse_option_parameters(options, create_options, param);
4145         if (param == NULL) {
4146             error_report("Invalid options for file format '%s'.", fmt);
4147             ret = -EINVAL;
4148             goto out;
4149         }
4150     }
4151 
4152     if (base_filename) {
4153         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4154                                  base_filename)) {
4155             error_report("Backing file not supported for file format '%s'",
4156                          fmt);
4157             ret = -EINVAL;
4158             goto out;
4159         }
4160     }
4161 
4162     if (base_fmt) {
4163         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4164             error_report("Backing file format not supported for file "
4165                          "format '%s'", fmt);
4166             ret = -EINVAL;
4167             goto out;
4168         }
4169     }
4170 
4171     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4172     if (backing_file && backing_file->value.s) {
4173         if (!strcmp(filename, backing_file->value.s)) {
4174             error_report("Trying to create an image with the "
4175                          "same filename as the backing file");
4176             ret = -EINVAL;
4177             goto out;
4178         }
4179     }
4180 
4181     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4182     if (backing_fmt && backing_fmt->value.s) {
4183         backing_drv = bdrv_find_format(backing_fmt->value.s);
4184         if (!backing_drv) {
4185             error_report("Unknown backing file format '%s'",
4186                          backing_fmt->value.s);
4187             ret = -EINVAL;
4188             goto out;
4189         }
4190     }
4191 
4192     /* The size for the image must always be specified, with one exception:
4193      * if we are using a backing file, we can obtain the size from there. */
4194     size = get_option_parameter(param, BLOCK_OPT_SIZE);
4195     if (size && size->value.n == -1) {
4196         if (backing_file && backing_file->value.s) {
4197             uint64_t size;
4198             char buf[32];
4199             int back_flags;
4200 
4201             /* backing files always opened read-only */
4202             back_flags =
4203                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4204 
4205             bs = bdrv_new("");
4206 
4207             ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4208             if (ret < 0) {
4209                 error_report("Could not open '%s'", backing_file->value.s);
4210                 goto out;
4211             }
4212             bdrv_get_geometry(bs, &size);
4213             size *= 512;
4214 
4215             snprintf(buf, sizeof(buf), "%" PRId64, size);
4216             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4217         } else {
4218             error_report("Image creation needs a size parameter");
4219             ret = -EINVAL;
4220             goto out;
4221         }
4222     }
4223 
4224     printf("Formatting '%s', fmt=%s ", filename, fmt);
4225     print_option_parameters(param);
4226     puts("");
4227 
4228     ret = bdrv_create(drv, filename, param);
4229 
4230     if (ret < 0) {
4231         if (ret == -ENOTSUP) {
4232             error_report("Formatting or formatting option not supported for "
4233                          "file format '%s'", fmt);
4234         } else if (ret == -EFBIG) {
4235             error_report("The image size is too large for file format '%s'",
4236                          fmt);
4237         } else {
4238             error_report("%s: error while creating %s: %s", filename, fmt,
4239                          strerror(-ret));
4240         }
4241     }
4242 
4243 out:
4244     free_option_parameters(create_options);
4245     free_option_parameters(param);
4246 
4247     if (bs) {
4248         bdrv_delete(bs);
4249     }
4250 
4251     return ret;
4252 }
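/* Example (sketch): creating an 8 MiB qcow2 image with default options
 * (illustrative values only):
 *
 *     ret = bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                           8 * 1024 * 1024, 0);
 *     // on failure the error has already been reported via error_report()
 */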
4253 
4254 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4255                        int64_t speed, BlockDriverCompletionFunc *cb,
4256                        void *opaque, Error **errp)
4257 {
4258     BlockJob *job;
4259 
4260     if (bs->job || bdrv_in_use(bs)) {
4261         error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4262         return NULL;
4263     }
4264     bdrv_set_in_use(bs, 1);
4265 
4266     job = g_malloc0(job_type->instance_size);
4267     job->job_type      = job_type;
4268     job->bs            = bs;
4269     job->cb            = cb;
4270     job->opaque        = opaque;
4271     job->busy          = true;
4272     bs->job = job;
4273 
4274     /* Only set speed when necessary, to avoid a NotSupported error */
4275     if (speed != 0) {
4276         Error *local_err = NULL;
4277 
4278         block_job_set_speed(job, speed, &local_err);
4279         if (error_is_set(&local_err)) {
4280             bs->job = NULL;
4281             g_free(job);
4282             bdrv_set_in_use(bs, 0);
4283             error_propagate(errp, local_err);
4284             return NULL;
4285         }
4286     }
4287     return job;
4288 }
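/* Example (sketch): a job implementation embeds BlockJob at the start of its
 * state and describes itself with a BlockJobType (hypothetical names):
 *
 *     typedef struct MyBlockJob {
 *         BlockJob common;           // must come first
 *         // ...job-specific state...
 *     } MyBlockJob;
 *
 *     static BlockJobType my_job_type = {
 *         .instance_size = sizeof(MyBlockJob),
 *         .job_type      = "my-job",
 *         .set_speed     = my_set_speed,
 *     };
 *
 *     MyBlockJob *s = block_job_create(&my_job_type, bs, speed, cb,
 *                                      opaque, errp);
 *     if (!s) {
 *         // bs was busy or the initial speed was rejected
 *     }
 */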
4289 
4290 void block_job_complete(BlockJob *job, int ret)
4291 {
4292     BlockDriverState *bs = job->bs;
4293 
4294     assert(bs->job == job);
4295     job->cb(job->opaque, ret);
4296     bs->job = NULL;
4297     g_free(job);
4298     bdrv_set_in_use(bs, 0);
4299 }
4300 
4301 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4302 {
4303     Error *local_err = NULL;
4304 
4305     if (!job->job_type->set_speed) {
4306         error_set(errp, QERR_NOT_SUPPORTED);
4307         return;
4308     }
4309     job->job_type->set_speed(job, speed, &local_err);
4310     if (error_is_set(&local_err)) {
4311         error_propagate(errp, local_err);
4312         return;
4313     }
4314 
4315     job->speed = speed;
4316 }
4317 
4318 void block_job_cancel(BlockJob *job)
4319 {
4320     job->cancelled = true;
4321     if (job->co && !job->busy) {
4322         qemu_coroutine_enter(job->co, NULL);
4323     }
4324 }
4325 
4326 bool block_job_is_cancelled(BlockJob *job)
4327 {
4328     return job->cancelled;
4329 }
4330 
4331 struct BlockCancelData {
4332     BlockJob *job;
4333     BlockDriverCompletionFunc *cb;
4334     void *opaque;
4335     bool cancelled;
4336     int ret;
4337 };
4338 
4339 static void block_job_cancel_cb(void *opaque, int ret)
4340 {
4341     struct BlockCancelData *data = opaque;
4342 
4343     data->cancelled = block_job_is_cancelled(data->job);
4344     data->ret = ret;
4345     data->cb(data->opaque, ret);
4346 }
4347 
4348 int block_job_cancel_sync(BlockJob *job)
4349 {
4350     struct BlockCancelData data;
4351     BlockDriverState *bs = job->bs;
4352 
4353     assert(bs->job == job);
4354 
4355     /* Set up our own callback to store the result and chain to
4356      * the original callback.
4357      */
4358     data.job = job;
4359     data.cb = job->cb;
4360     data.opaque = job->opaque;
4361     data.ret = -EINPROGRESS;
4362     job->cb = block_job_cancel_cb;
4363     job->opaque = &data;
4364     block_job_cancel(job);
4365     while (data.ret == -EINPROGRESS) {
4366         qemu_aio_wait();
4367     }
4368     return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
4369 }
4370 
4371 void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
4372 {
4373     /* Check cancellation *before* setting busy = false, too!  */
4374     if (!block_job_is_cancelled(job)) {
4375         job->busy = false;
4376         co_sleep_ns(clock, ns);
4377         job->busy = true;
4378     }
4379 }
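/* Example (sketch): a job's coroutine typically alternates work with
 * self-throttling (my_job_iterate() and delay_ns are hypothetical):
 *
 *     while (!block_job_is_cancelled(job)) {
 *         my_job_iterate(job);
 *         block_job_sleep_ns(job, rt_clock, delay_ns);
 *     }
 */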
4380