xref: /openbmc/qemu/block.c (revision d8f8a860f2403533fc73f541122c65a34b21e42f)
1 /*
2  * QEMU System Emulator block driver
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34 
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44 
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48 
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50 
51 typedef enum {
52     BDRV_REQ_COPY_ON_READ = 0x1,
53     BDRV_REQ_ZERO_WRITE   = 0x2,
54 } BdrvRequestFlags;
55 
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59         BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62         BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64                                          int64_t sector_num, int nb_sectors,
65                                          QEMUIOVector *iov);
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67                                          int64_t sector_num, int nb_sectors,
68                                          QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71     BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74     BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76                                                int64_t sector_num,
77                                                QEMUIOVector *qiov,
78                                                int nb_sectors,
79                                                BlockDriverCompletionFunc *cb,
80                                                void *opaque,
81                                                bool is_write);
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84     int64_t sector_num, int nb_sectors);
85 
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87         bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89         double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91         bool is_write, int64_t *wait);
92 
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94     QTAILQ_HEAD_INITIALIZER(bdrv_states);
95 
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97     QLIST_HEAD_INITIALIZER(bdrv_drivers);
98 
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
101 
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
104 
105 #ifdef _WIN32
106 static int is_windows_drive_prefix(const char *filename)
107 {
108     return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109              (filename[0] >= 'A' && filename[0] <= 'Z')) &&
110             filename[1] == ':');
111 }
112 
113 int is_windows_drive(const char *filename)
114 {
115     if (is_windows_drive_prefix(filename) &&
116         filename[2] == '\0')
117         return 1;
118     if (strstart(filename, "\\\\.\\", NULL) ||
119         strstart(filename, "//./", NULL))
120         return 1;
121     return 0;
122 }
123 #endif
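
/*
 * Illustrative sketch (hypothetical paths, excluded from the build): which
 * names the two checks above accept.
 */
#if 0
    assert(is_windows_drive("c:"));                    /* bare drive letter */
    assert(is_windows_drive("\\\\.\\PhysicalDrive0")); /* device namespace */
    assert(is_windows_drive_prefix("d:\\disk.img"));   /* prefix only */
    assert(!is_windows_drive("d:\\disk.img"));         /* path after colon */
#endif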
124 
125 /* throttling disk I/O limits */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
127 {
128     bs->io_limits_enabled = false;
129 
130     while (qemu_co_queue_next(&bs->throttled_reqs));
131 
132     if (bs->block_timer) {
133         qemu_del_timer(bs->block_timer);
134         qemu_free_timer(bs->block_timer);
135         bs->block_timer = NULL;
136     }
137 
138     bs->slice_start = 0;
139     bs->slice_end   = 0;
140     bs->slice_time  = 0;
141     memset(&bs->io_base, 0, sizeof(bs->io_base));
142 }
143 
144 static void bdrv_block_timer(void *opaque)
145 {
146     BlockDriverState *bs = opaque;
147 
148     qemu_co_queue_next(&bs->throttled_reqs);
149 }
150 
151 void bdrv_io_limits_enable(BlockDriverState *bs)
152 {
153     qemu_co_queue_init(&bs->throttled_reqs);
154     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155     bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
156     bs->slice_start = qemu_get_clock_ns(vm_clock);
157     bs->slice_end   = bs->slice_start + bs->slice_time;
158     memset(&bs->io_base, 0, sizeof(bs->io_base));
159     bs->io_limits_enabled = true;
160 }
161 
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
163 {
164     BlockIOLimit *io_limits = &bs->io_limits;
165     return io_limits->bps[BLOCK_IO_LIMIT_READ]
166          || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167          || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168          || io_limits->iops[BLOCK_IO_LIMIT_READ]
169          || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
171 }
172 
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174                                      bool is_write, int nb_sectors)
175 {
176     int64_t wait_time = -1;
177 
178     if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179         qemu_co_queue_wait(&bs->throttled_reqs);
180     }
181 
182     /* Requests are serviced in FIFO order so that each request's timing is
183      * preserved. The next throttled request is not dequeued until the
184      * current request has been allowed to proceed, so if the current request
185      * still exceeds the limits it is re-inserted at the head of the queue,
186      * and all requests behind it remain in the throttled_reqs queue.
187      */
188 
189     while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
190         qemu_mod_timer(bs->block_timer,
191                        wait_time + qemu_get_clock_ns(vm_clock));
192         qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
193     }
194 
195     qemu_co_queue_next(&bs->throttled_reqs);
196 }
197 
198 /* check if the path starts with "<protocol>:" */
199 static int path_has_protocol(const char *path)
200 {
201     const char *p;
202 
203 #ifdef _WIN32
204     if (is_windows_drive(path) ||
205         is_windows_drive_prefix(path)) {
206         return 0;
207     }
208     p = path + strcspn(path, ":/\\");
209 #else
210     p = path + strcspn(path, ":/");
211 #endif
212 
213     return *p == ':';
214 }
215 
216 int path_is_absolute(const char *path)
217 {
218 #ifdef _WIN32
219     /* specific case for names like: "\\.\d:" */
220     if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
221         return 1;
222     }
223     return (*path == '/' || *path == '\\');
224 #else
225     return (*path == '/');
226 #endif
227 }
228 
229 /* If filename is absolute, just copy it to dest. Otherwise, build a
230    path to it by treating it as relative to base_path. URLs are
231    supported. */
232 void path_combine(char *dest, int dest_size,
233                   const char *base_path,
234                   const char *filename)
235 {
236     const char *p, *p1;
237     int len;
238 
239     if (dest_size <= 0)
240         return;
241     if (path_is_absolute(filename)) {
242         pstrcpy(dest, dest_size, filename);
243     } else {
244         p = strchr(base_path, ':');
245         if (p)
246             p++;
247         else
248             p = base_path;
249         p1 = strrchr(base_path, '/');
250 #ifdef _WIN32
251         {
252             const char *p2;
253             p2 = strrchr(base_path, '\\');
254             if (!p1 || p2 > p1)
255                 p1 = p2;
256         }
257 #endif
258         if (p1)
259             p1++;
260         else
261             p1 = base_path;
262         if (p1 > p)
263             p = p1;
264         len = p - base_path;
265         if (len > dest_size - 1)
266             len = dest_size - 1;
267         memcpy(dest, base_path, len);
268         dest[len] = '\0';
269         pstrcat(dest, dest_size, filename);
270     }
271 }
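
/*
 * A short sketch of the common case (hypothetical paths): a relative name
 * is resolved against the directory of base_path, while an absolute name
 * is copied through unchanged.
 */
#if 0
    char dest[PATH_MAX];

    path_combine(dest, sizeof(dest), "/images/vm.qcow2", "base.qcow2");
    /* dest is now "/images/base.qcow2" */

    path_combine(dest, sizeof(dest), "/images/vm.qcow2", "/srv/base.qcow2");
    /* dest is now "/srv/base.qcow2" (absolute, copied as-is) */
#endif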
272 
273 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
274 {
275     if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
276         pstrcpy(dest, sz, bs->backing_file);
277     } else {
278         path_combine(dest, sz, bs->filename, bs->backing_file);
279     }
280 }
281 
282 void bdrv_register(BlockDriver *bdrv)
283 {
284     /* Block drivers without coroutine functions need emulation */
285     if (!bdrv->bdrv_co_readv) {
286         bdrv->bdrv_co_readv = bdrv_co_readv_em;
287         bdrv->bdrv_co_writev = bdrv_co_writev_em;
288 
289         /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
290          * the block driver lacks aio we need to emulate that too.
291          */
292         if (!bdrv->bdrv_aio_readv) {
293             /* add AIO emulation layer */
294             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
295             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
296         }
297     }
298 
299     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
300 }
301 
302 /* create a new block device (by default it is empty) */
303 BlockDriverState *bdrv_new(const char *device_name)
304 {
305     BlockDriverState *bs;
306 
307     bs = g_malloc0(sizeof(BlockDriverState));
308     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
309     if (device_name[0] != '\0') {
310         QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
311     }
312     bdrv_iostatus_disable(bs);
313     return bs;
314 }
315 
316 BlockDriver *bdrv_find_format(const char *format_name)
317 {
318     BlockDriver *drv1;
319     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
320         if (!strcmp(drv1->format_name, format_name)) {
321             return drv1;
322         }
323     }
324     return NULL;
325 }
326 
327 static int bdrv_is_whitelisted(BlockDriver *drv)
328 {
329     static const char *whitelist[] = {
330         CONFIG_BDRV_WHITELIST
331     };
332     const char **p;
333 
334     if (!whitelist[0])
335         return 1;               /* no whitelist, anything goes */
336 
337     for (p = whitelist; *p; p++) {
338         if (!strcmp(drv->format_name, *p)) {
339             return 1;
340         }
341     }
342     return 0;
343 }
344 
345 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
346 {
347     BlockDriver *drv = bdrv_find_format(format_name);
348     return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
349 }
350 
351 typedef struct CreateCo {
352     BlockDriver *drv;
353     char *filename;
354     QEMUOptionParameter *options;
355     int ret;
356 } CreateCo;
357 
358 static void coroutine_fn bdrv_create_co_entry(void *opaque)
359 {
360     CreateCo *cco = opaque;
361     assert(cco->drv);
362 
363     cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
364 }
365 
366 int bdrv_create(BlockDriver *drv, const char* filename,
367     QEMUOptionParameter *options)
368 {
369     int ret;
370 
371     Coroutine *co;
372     CreateCo cco = {
373         .drv = drv,
374         .filename = g_strdup(filename),
375         .options = options,
376         .ret = NOT_DONE,
377     };
378 
379     if (!drv->bdrv_create) {
380         return -ENOTSUP;
381     }
382 
383     if (qemu_in_coroutine()) {
384         /* Fast-path if already in coroutine context */
385         bdrv_create_co_entry(&cco);
386     } else {
387         co = qemu_coroutine_create(bdrv_create_co_entry);
388         qemu_coroutine_enter(co, &cco);
389         while (cco.ret == NOT_DONE) {
390             qemu_aio_wait();
391         }
392     }
393 
394     ret = cco.ret;
395     g_free(cco.filename);
396 
397     return ret;
398 }
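
/*
 * A minimal usage sketch (hypothetical file name, excluded from the build):
 * creating a 1 MB qcow2 image with the same option-parameter helpers that
 * bdrv_open() uses below for temporary snapshot files.
 */
#if 0
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *options =
        parse_option_parameters("", drv->create_options, NULL);

    set_option_parameter_int(options, BLOCK_OPT_SIZE, 1024 * 1024);
    ret = bdrv_create(drv, "/tmp/test.qcow2", options);
    free_option_parameters(options);
#endif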
399 
400 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
401 {
402     BlockDriver *drv;
403 
404     drv = bdrv_find_protocol(filename);
405     if (drv == NULL) {
406         return -ENOENT;
407     }
408 
409     return bdrv_create(drv, filename, options);
410 }
411 
412 /*
413  * Create a uniquely-named empty temporary file.
414  * Return 0 upon success, otherwise a negative errno value.
415  */
416 int get_tmp_filename(char *filename, int size)
417 {
418 #ifdef _WIN32
419     char temp_dir[MAX_PATH];
420     /* GetTempFileName requires that its output buffer (4th param)
421        have length MAX_PATH or greater.  */
422     assert(size >= MAX_PATH);
423     return (GetTempPath(MAX_PATH, temp_dir)
424             && GetTempFileName(temp_dir, "qem", 0, filename)
425             ? 0 : -GetLastError());
426 #else
427     int fd;
428     const char *tmpdir;
429     tmpdir = getenv("TMPDIR");
430     if (!tmpdir)
431         tmpdir = "/tmp";
432     if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
433         return -EOVERFLOW;
434     }
435     fd = mkstemp(filename);
436     if (fd < 0) {
437         return -errno;
438     }
439     if (close(fd) != 0) {
440         unlink(filename);
441         return -errno;
442     }
443     return 0;
444 #endif
445 }
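
/*
 * Usage sketch (mirroring the call in bdrv_open() below): the caller owns
 * the buffer, and on success the named file already exists and is empty,
 * e.g. "/tmp/vl.Ab12Cd" on POSIX hosts.
 */
#if 0
    char tmp_filename[PATH_MAX];
    int ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
    if (ret < 0) {
        return ret; /* negative errno, e.g. -EOVERFLOW */
    }
#endif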
446 
447 /*
448  * Detect host devices. By convention, /dev/cdrom[N] is always
449  * recognized as a host CDROM.
450  */
451 static BlockDriver *find_hdev_driver(const char *filename)
452 {
453     int score_max = 0, score;
454     BlockDriver *drv = NULL, *d;
455 
456     QLIST_FOREACH(d, &bdrv_drivers, list) {
457         if (d->bdrv_probe_device) {
458             score = d->bdrv_probe_device(filename);
459             if (score > score_max) {
460                 score_max = score;
461                 drv = d;
462             }
463         }
464     }
465 
466     return drv;
467 }
468 
469 BlockDriver *bdrv_find_protocol(const char *filename)
470 {
471     BlockDriver *drv1;
472     char protocol[128];
473     int len;
474     const char *p;
475 
476     /* TODO Drivers without bdrv_file_open must be specified explicitly */
477 
478     /*
479      * XXX(hch): we really should not let host device detection
480      * override an explicit protocol specification, but moving this
481      * later breaks access to device names with colons in them.
482      * Thanks to the brain-dead persistent naming schemes on udev-
483      * based Linux systems those actually are quite common.
484      */
485     drv1 = find_hdev_driver(filename);
486     if (drv1) {
487         return drv1;
488     }
489 
490     if (!path_has_protocol(filename)) {
491         return bdrv_find_format("file");
492     }
493     p = strchr(filename, ':');
494     assert(p != NULL);
495     len = p - filename;
496     if (len > sizeof(protocol) - 1)
497         len = sizeof(protocol) - 1;
498     memcpy(protocol, filename, len);
499     protocol[len] = '\0';
500     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
501         if (drv1->protocol_name &&
502             !strcmp(drv1->protocol_name, protocol)) {
503             return drv1;
504         }
505     }
506     return NULL;
507 }
508 
509 static int find_image_format(const char *filename, BlockDriver **pdrv)
510 {
511     int ret, score, score_max;
512     BlockDriver *drv1, *drv;
513     uint8_t buf[2048];
514     BlockDriverState *bs;
515 
516     ret = bdrv_file_open(&bs, filename, 0);
517     if (ret < 0) {
518         *pdrv = NULL;
519         return ret;
520     }
521 
522     /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
523     if (bs->sg || !bdrv_is_inserted(bs)) {
524         bdrv_delete(bs);
525         drv = bdrv_find_format("raw");
526         if (!drv) {
527             ret = -ENOENT;
528         }
529         *pdrv = drv;
530         return ret;
531     }
532 
533     ret = bdrv_pread(bs, 0, buf, sizeof(buf));
534     bdrv_delete(bs);
535     if (ret < 0) {
536         *pdrv = NULL;
537         return ret;
538     }
539 
540     score_max = 0;
541     drv = NULL;
542     QLIST_FOREACH(drv1, &bdrv_drivers, list) {
543         if (drv1->bdrv_probe) {
544             score = drv1->bdrv_probe(buf, ret, filename);
545             if (score > score_max) {
546                 score_max = score;
547                 drv = drv1;
548             }
549         }
550     }
551     if (!drv) {
552         ret = -ENOENT;
553     }
554     *pdrv = drv;
555     return ret;
556 }
557 
558 /**
559  * Set the current 'total_sectors' value
560  */
561 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
562 {
563     BlockDriver *drv = bs->drv;
564 
565     /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
566     if (bs->sg)
567         return 0;
568 
569     /* query actual device if possible, otherwise just trust the hint */
570     if (drv->bdrv_getlength) {
571         int64_t length = drv->bdrv_getlength(bs);
572         if (length < 0) {
573             return length;
574         }
575         hint = length >> BDRV_SECTOR_BITS;
576     }
577 
578     bs->total_sectors = hint;
579     return 0;
580 }
581 
582 /**
583  * Set open flags for a given cache mode
584  *
585  * Return 0 on success, -1 if the cache mode was invalid.
586  */
587 int bdrv_parse_cache_flags(const char *mode, int *flags)
588 {
589     *flags &= ~BDRV_O_CACHE_MASK;
590 
591     if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
592         *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
593     } else if (!strcmp(mode, "directsync")) {
594         *flags |= BDRV_O_NOCACHE;
595     } else if (!strcmp(mode, "writeback")) {
596         *flags |= BDRV_O_CACHE_WB;
597     } else if (!strcmp(mode, "unsafe")) {
598         *flags |= BDRV_O_CACHE_WB;
599         *flags |= BDRV_O_NO_FLUSH;
600     } else if (!strcmp(mode, "writethrough")) {
601         /* this is the default */
602     } else {
603         return -1;
604     }
605 
606     return 0;
607 }
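
/*
 * The resulting flag combinations, summarized from the code above:
 *
 *   mode           BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *   off/none             x                x
 *   directsync           x
 *   writeback                             x
 *   unsafe                                x                x
 *   writethrough    (none set - this is the default)
 */
#if 0 /* usage sketch, hypothetical caller */
    int flags = 0;
    if (bdrv_parse_cache_flags("writeback", &flags) < 0) {
        fprintf(stderr, "invalid cache mode\n");
    }
#endif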
608 
609 /**
610  * The copy-on-read flag is actually a reference count so multiple users may
611  * use the feature without worrying about clobbering its previous state.
612  * Copy-on-read stays enabled until all users have called to disable it.
613  */
614 void bdrv_enable_copy_on_read(BlockDriverState *bs)
615 {
616     bs->copy_on_read++;
617 }
618 
619 void bdrv_disable_copy_on_read(BlockDriverState *bs)
620 {
621     assert(bs->copy_on_read > 0);
622     bs->copy_on_read--;
623 }
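
/*
 * Sketch of the reference-count discipline described above: every enable
 * must be paired with exactly one disable, and copy-on-read stays active
 * while any user still holds a reference.
 */
#if 0
    bdrv_enable_copy_on_read(bs);  /* 0 -> 1, CoR active */
    bdrv_enable_copy_on_read(bs);  /* 1 -> 2, still active */
    bdrv_disable_copy_on_read(bs); /* 2 -> 1, still active */
    bdrv_disable_copy_on_read(bs); /* 1 -> 0, CoR off again */
#endif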
624 
625 /*
626  * Common part for opening disk images and files
627  */
628 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
629     int flags, BlockDriver *drv)
630 {
631     int ret, open_flags;
632 
633     assert(drv != NULL);
634     assert(bs->file == NULL);
635 
636     trace_bdrv_open_common(bs, filename, flags, drv->format_name);
637 
638     bs->open_flags = flags;
639     bs->buffer_alignment = 512;
640 
641     assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
642     if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
643         bdrv_enable_copy_on_read(bs);
644     }
645 
646     pstrcpy(bs->filename, sizeof(bs->filename), filename);
647 
648     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
649         return -ENOTSUP;
650     }
651 
652     bs->drv = drv;
653     bs->opaque = g_malloc0(drv->instance_size);
654 
655     bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
656     open_flags = flags | BDRV_O_CACHE_WB;
657 
658     /*
659      * Clear flags that are internal to the block layer before opening the
660      * image.
661      */
662     open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
663 
664     /*
665      * Snapshots should be writable.
666      */
667     if (bs->is_temporary) {
668         open_flags |= BDRV_O_RDWR;
669     }
670 
671     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
672 
673     /* Open the image, either directly or using a protocol */
674     if (drv->bdrv_file_open) {
675         ret = drv->bdrv_file_open(bs, filename, open_flags);
676     } else {
677         ret = bdrv_file_open(&bs->file, filename, open_flags);
678         if (ret >= 0) {
679             ret = drv->bdrv_open(bs, open_flags);
680         }
681     }
682 
683     if (ret < 0) {
684         goto free_and_fail;
685     }
686 
687     ret = refresh_total_sectors(bs, bs->total_sectors);
688     if (ret < 0) {
689         goto free_and_fail;
690     }
691 
692 #ifndef _WIN32
693     if (bs->is_temporary) {
694         unlink(filename);
695     }
696 #endif
697     return 0;
698 
699 free_and_fail:
700     if (bs->file) {
701         bdrv_delete(bs->file);
702         bs->file = NULL;
703     }
704     g_free(bs->opaque);
705     bs->opaque = NULL;
706     bs->drv = NULL;
707     return ret;
708 }
709 
710 /*
711  * Opens a file using a protocol (file, host_device, nbd, ...)
712  */
713 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
714 {
715     BlockDriverState *bs;
716     BlockDriver *drv;
717     int ret;
718 
719     drv = bdrv_find_protocol(filename);
720     if (!drv) {
721         return -ENOENT;
722     }
723 
724     bs = bdrv_new("");
725     ret = bdrv_open_common(bs, filename, flags, drv);
726     if (ret < 0) {
727         bdrv_delete(bs);
728         return ret;
729     }
730     bs->growable = 1;
731     *pbs = bs;
732     return 0;
733 }
734 
735 /*
736  * Opens a disk image (raw, qcow2, vmdk, ...)
737  */
738 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
739               BlockDriver *drv)
740 {
741     int ret;
742     char tmp_filename[PATH_MAX];
743 
744     if (flags & BDRV_O_SNAPSHOT) {
745         BlockDriverState *bs1;
746         int64_t total_size;
747         int is_protocol = 0;
748         BlockDriver *bdrv_qcow2;
749         QEMUOptionParameter *options;
750         char backing_filename[PATH_MAX];
751 
752         /* if snapshot, we create a temporary backing file and open it
753            instead of opening 'filename' directly */
754 
755         /* if there is a backing file, use it */
756         bs1 = bdrv_new("");
757         ret = bdrv_open(bs1, filename, 0, drv);
758         if (ret < 0) {
759             bdrv_delete(bs1);
760             return ret;
761         }
762         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
763 
764         if (bs1->drv && bs1->drv->protocol_name)
765             is_protocol = 1;
766 
767         bdrv_delete(bs1);
768 
769         ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
770         if (ret < 0) {
771             return ret;
772         }
773 
774         /* Real path is meaningless for protocols */
775         if (is_protocol)
776             snprintf(backing_filename, sizeof(backing_filename),
777                      "%s", filename);
778         else if (!realpath(filename, backing_filename))
779             return -errno;
780 
781         bdrv_qcow2 = bdrv_find_format("qcow2");
782         options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
783 
784         set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
785         set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
786         if (drv) {
787             set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
788                 drv->format_name);
789         }
790 
791         ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
792         free_option_parameters(options);
793         if (ret < 0) {
794             return ret;
795         }
796 
797         filename = tmp_filename;
798         drv = bdrv_qcow2;
799         bs->is_temporary = 1;
800     }
801 
802     /* Find the right image format driver */
803     if (!drv) {
804         ret = find_image_format(filename, &drv);
805     }
806 
807     if (!drv) {
808         goto unlink_and_fail;
809     }
810 
811     /* Open the image */
812     ret = bdrv_open_common(bs, filename, flags, drv);
813     if (ret < 0) {
814         goto unlink_and_fail;
815     }
816 
817     /* If there is a backing file, use it */
818     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
819         char backing_filename[PATH_MAX];
820         int back_flags;
821         BlockDriver *back_drv = NULL;
822 
823         bs->backing_hd = bdrv_new("");
824         bdrv_get_full_backing_filename(bs, backing_filename,
825                                        sizeof(backing_filename));
826 
827         if (bs->backing_format[0] != '\0') {
828             back_drv = bdrv_find_format(bs->backing_format);
829         }
830 
831         /* backing files always opened read-only */
832         back_flags =
833             flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
834 
835         ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
836         if (ret < 0) {
837             bdrv_close(bs);
838             return ret;
839         }
840         if (bs->is_temporary) {
841             bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
842         } else {
843             /* base image inherits from "parent" */
844             bs->backing_hd->keep_read_only = bs->keep_read_only;
845         }
846     }
847 
848     if (!bdrv_key_required(bs)) {
849         bdrv_dev_change_media_cb(bs, true);
850     }
851 
852     /* throttling disk I/O limits */
853     if (bs->io_limits_enabled) {
854         bdrv_io_limits_enable(bs);
855     }
856 
857     return 0;
858 
859 unlink_and_fail:
860     if (bs->is_temporary) {
861         unlink(filename);
862     }
863     return ret;
864 }
865 
866 void bdrv_close(BlockDriverState *bs)
867 {
868     bdrv_flush(bs);
869     if (bs->drv) {
870         if (bs->job) {
871             block_job_cancel_sync(bs->job);
872         }
873         bdrv_drain_all();
874 
875         if (bs == bs_snapshots) {
876             bs_snapshots = NULL;
877         }
878         if (bs->backing_hd) {
879             bdrv_delete(bs->backing_hd);
880             bs->backing_hd = NULL;
881         }
882         bs->drv->bdrv_close(bs);
883         g_free(bs->opaque);
884 #ifdef _WIN32
885         if (bs->is_temporary) {
886             unlink(bs->filename);
887         }
888 #endif
889         bs->opaque = NULL;
890         bs->drv = NULL;
891         bs->copy_on_read = 0;
892         bs->backing_file[0] = '\0';
893         bs->backing_format[0] = '\0';
894         bs->total_sectors = 0;
895         bs->encrypted = 0;
896         bs->valid_key = 0;
897         bs->sg = 0;
898         bs->growable = 0;
899 
900         if (bs->file != NULL) {
901             bdrv_delete(bs->file);
902             bs->file = NULL;
903         }
904     }
905 
906     bdrv_dev_change_media_cb(bs, false);
907 
908     /* throttling disk I/O limits */
909     if (bs->io_limits_enabled) {
910         bdrv_io_limits_disable(bs);
911     }
912 }
913 
914 void bdrv_close_all(void)
915 {
916     BlockDriverState *bs;
917 
918     QTAILQ_FOREACH(bs, &bdrv_states, list) {
919         bdrv_close(bs);
920     }
921 }
922 
923 /*
924  * Wait for pending requests to complete across all BlockDriverStates
925  *
926  * This function does not flush data to disk, use bdrv_flush_all() for that
927  * after calling this function.
928  *
929  * Note that completion of an asynchronous I/O operation can trigger any
930  * number of other I/O operations on other devices---for example a coroutine
931  * can be arbitrarily complex and a constant flow of I/O may continue until the
932  * coroutine is complete.  Because of this, it is not possible to have a
933  * function to drain a single device's I/O queue.
934  */
935 void bdrv_drain_all(void)
936 {
937     BlockDriverState *bs;
938     bool busy;
939 
940     do {
941         busy = qemu_aio_wait();
942 
943         /* FIXME: We do not have timer support here, so this is effectively
944          * a busy wait.
945          */
946         QTAILQ_FOREACH(bs, &bdrv_states, list) {
947             if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
948                 qemu_co_queue_restart_all(&bs->throttled_reqs);
949                 busy = true;
950             }
951         }
952     } while (busy);
953 
954     /* If requests are still pending there is a bug somewhere */
955     QTAILQ_FOREACH(bs, &bdrv_states, list) {
956         assert(QLIST_EMPTY(&bs->tracked_requests));
957         assert(qemu_co_queue_empty(&bs->throttled_reqs));
958     }
959 }
960 
961 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
962    list. Also, NUL-terminate the device_name to prevent a double remove. */
963 void bdrv_make_anon(BlockDriverState *bs)
964 {
965     if (bs->device_name[0] != '\0') {
966         QTAILQ_REMOVE(&bdrv_states, bs, list);
967     }
968     bs->device_name[0] = '\0';
969 }
970 
971 static void bdrv_rebind(BlockDriverState *bs)
972 {
973     if (bs->drv && bs->drv->bdrv_rebind) {
974         bs->drv->bdrv_rebind(bs);
975     }
976 }
977 
978 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
979                                      BlockDriverState *bs_src)
980 {
981     /* move some fields that need to stay attached to the device */
982     bs_dest->open_flags         = bs_src->open_flags;
983 
984     /* dev info */
985     bs_dest->dev_ops            = bs_src->dev_ops;
986     bs_dest->dev_opaque         = bs_src->dev_opaque;
987     bs_dest->dev                = bs_src->dev;
988     bs_dest->buffer_alignment   = bs_src->buffer_alignment;
989     bs_dest->copy_on_read       = bs_src->copy_on_read;
990 
991     bs_dest->enable_write_cache = bs_src->enable_write_cache;
992 
993     /* i/o timing parameters */
994     bs_dest->slice_time         = bs_src->slice_time;
995     bs_dest->slice_start        = bs_src->slice_start;
996     bs_dest->slice_end          = bs_src->slice_end;
997     bs_dest->io_limits          = bs_src->io_limits;
998     bs_dest->io_base            = bs_src->io_base;
999     bs_dest->throttled_reqs     = bs_src->throttled_reqs;
1000     bs_dest->block_timer        = bs_src->block_timer;
1001     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1002 
1003     /* r/w error */
1004     bs_dest->on_read_error      = bs_src->on_read_error;
1005     bs_dest->on_write_error     = bs_src->on_write_error;
1006 
1007     /* i/o status */
1008     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1009     bs_dest->iostatus           = bs_src->iostatus;
1010 
1011     /* dirty bitmap */
1012     bs_dest->dirty_count        = bs_src->dirty_count;
1013     bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;
1014 
1015     /* job */
1016     bs_dest->in_use             = bs_src->in_use;
1017     bs_dest->job                = bs_src->job;
1018 
1019     /* keep the same entry in bdrv_states */
1020     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1021             bs_src->device_name);
1022     bs_dest->list = bs_src->list;
1023 }
1024 
1025 /*
1026  * Swap bs contents for two image chains while they are live,
1027  * while keeping required fields on the BlockDriverState that is
1028  * actually attached to a device.
1029  *
1030  * This will modify the BlockDriverState fields, and swap contents
1031  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1032  *
1033  * bs_new is required to be anonymous.
1034  *
1035  * This function does not create any image files.
1036  */
1037 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1038 {
1039     BlockDriverState tmp;
1040 
1041     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1042     assert(bs_new->device_name[0] == '\0');
1043     assert(bs_new->dirty_bitmap == NULL);
1044     assert(bs_new->job == NULL);
1045     assert(bs_new->dev == NULL);
1046     assert(bs_new->in_use == 0);
1047     assert(bs_new->io_limits_enabled == false);
1048     assert(bs_new->block_timer == NULL);
1049 
1050     tmp = *bs_new;
1051     *bs_new = *bs_old;
1052     *bs_old = tmp;
1053 
1054     /* there are some fields that should not be swapped, move them back */
1055     bdrv_move_feature_fields(&tmp, bs_old);
1056     bdrv_move_feature_fields(bs_old, bs_new);
1057     bdrv_move_feature_fields(bs_new, &tmp);
1058 
1059     /* bs_new shouldn't be in bdrv_states even after the swap!  */
1060     assert(bs_new->device_name[0] == '\0');
1061 
1062     /* Check a few fields that should remain attached to the device */
1063     assert(bs_new->dev == NULL);
1064     assert(bs_new->job == NULL);
1065     assert(bs_new->in_use == 0);
1066     assert(bs_new->io_limits_enabled == false);
1067     assert(bs_new->block_timer == NULL);
1068 
1069     bdrv_rebind(bs_new);
1070     bdrv_rebind(bs_old);
1071 }
1072 
1073 /*
1074  * Add new bs contents at the top of an image chain while the chain is
1075  * live, while keeping required fields on the top layer.
1076  *
1077  * This will modify the BlockDriverState fields, and swap contents
1078  * between bs_new and bs_top. Both bs_new and bs_top are modified.
1079  *
1080  * bs_new is required to be anonymous.
1081  *
1082  * This function does not create any image files.
1083  */
1084 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1085 {
1086     bdrv_swap(bs_new, bs_top);
1087 
1088     /* bdrv_swap() has exchanged the contents of bs_new and bs_top, so
1089      * bs_new now holds the old top image; link it in as the backing file. */
1090     bs_top->backing_hd = bs_new;
1091     bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1092     pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1093             bs_new->filename);
1094     pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1095             bs_new->drv ? bs_new->drv->format_name : "");
1096 }
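
/*
 * Usage sketch (hypothetical file names, excluded from the build): this is
 * the live-snapshot pattern.  The device keeps pointing at bs_top, whose
 * contents after the call are the new image backed by the old one:
 *
 *   before:  device -> bs_top ("base.qcow2")
 *   after:   device -> bs_top ("overlay.qcow2") -> backing ("base.qcow2")
 */
#if 0
    BlockDriverState *bs_new = bdrv_new(""); /* must stay anonymous */

    ret = bdrv_open(bs_new, "overlay.qcow2", BDRV_O_RDWR | BDRV_O_NO_BACKING,
                    bdrv_find_format("qcow2"));
    if (ret == 0) {
        bdrv_append(bs_new, bs_top);
    }
#endif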
1097 
1098 void bdrv_delete(BlockDriverState *bs)
1099 {
1100     assert(!bs->dev);
1101     assert(!bs->job);
1102     assert(!bs->in_use);
1103 
1104     /* remove from list, if necessary */
1105     bdrv_make_anon(bs);
1106 
1107     bdrv_close(bs);
1108 
1109     assert(bs != bs_snapshots);
1110     g_free(bs);
1111 }
1112 
1113 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1114 /* TODO change to DeviceState *dev when all users are qdevified */
1115 {
1116     if (bs->dev) {
1117         return -EBUSY;
1118     }
1119     bs->dev = dev;
1120     bdrv_iostatus_reset(bs);
1121     return 0;
1122 }
1123 
1124 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1125 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1126 {
1127     if (bdrv_attach_dev(bs, dev) < 0) {
1128         abort();
1129     }
1130 }
1131 
1132 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1133 /* TODO change to DeviceState *dev when all users are qdevified */
1134 {
1135     assert(bs->dev == dev);
1136     bs->dev = NULL;
1137     bs->dev_ops = NULL;
1138     bs->dev_opaque = NULL;
1139     bs->buffer_alignment = 512;
1140 }
1141 
1142 /* TODO change to return DeviceState * when all users are qdevified */
1143 void *bdrv_get_attached_dev(BlockDriverState *bs)
1144 {
1145     return bs->dev;
1146 }
1147 
1148 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1149                       void *opaque)
1150 {
1151     bs->dev_ops = ops;
1152     bs->dev_opaque = opaque;
1153     if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1154         bs_snapshots = NULL;
1155     }
1156 }
1157 
1158 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1159                                BlockQMPEventAction action, int is_read)
1160 {
1161     QObject *data;
1162     const char *action_str;
1163 
1164     switch (action) {
1165     case BDRV_ACTION_REPORT:
1166         action_str = "report";
1167         break;
1168     case BDRV_ACTION_IGNORE:
1169         action_str = "ignore";
1170         break;
1171     case BDRV_ACTION_STOP:
1172         action_str = "stop";
1173         break;
1174     default:
1175         abort();
1176     }
1177 
1178     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1179                               bdrv->device_name,
1180                               action_str,
1181                               is_read ? "read" : "write");
1182     monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1183 
1184     qobject_decref(data);
1185 }
1186 
1187 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1188 {
1189     QObject *data;
1190 
1191     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1192                               bdrv_get_device_name(bs), ejected);
1193     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1194 
1195     qobject_decref(data);
1196 }
1197 
1198 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1199 {
1200     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1201         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1202         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1203         if (tray_was_closed) {
1204             /* tray open */
1205             bdrv_emit_qmp_eject_event(bs, true);
1206         }
1207         if (load) {
1208             /* tray close */
1209             bdrv_emit_qmp_eject_event(bs, false);
1210         }
1211     }
1212 }
1213 
1214 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1215 {
1216     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1217 }
1218 
1219 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1220 {
1221     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1222         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1223     }
1224 }
1225 
1226 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1227 {
1228     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1229         return bs->dev_ops->is_tray_open(bs->dev_opaque);
1230     }
1231     return false;
1232 }
1233 
1234 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1235 {
1236     if (bs->dev_ops && bs->dev_ops->resize_cb) {
1237         bs->dev_ops->resize_cb(bs->dev_opaque);
1238     }
1239 }
1240 
1241 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1242 {
1243     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1244         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1245     }
1246     return false;
1247 }
1248 
1249 /*
1250  * Run consistency checks on an image
1251  *
1252  * Returns 0 if the check could be completed (it doesn't mean that the image is
1253  * free of errors) or -errno when an internal error occurred. The results of the
1254  * check are stored in res.
1255  */
1256 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1257 {
1258     if (bs->drv->bdrv_check == NULL) {
1259         return -ENOTSUP;
1260     }
1261 
1262     memset(res, 0, sizeof(*res));
1263     return bs->drv->bdrv_check(bs, res, fix);
1264 }
1265 
1266 #define COMMIT_BUF_SECTORS 2048
1267 
1268 /* commit COW file into the raw image */
1269 int bdrv_commit(BlockDriverState *bs)
1270 {
1271     BlockDriver *drv = bs->drv;
1272     BlockDriver *backing_drv;
1273     int64_t sector, total_sectors;
1274     int n, ro, open_flags;
1275     int ret = 0, rw_ret = 0;
1276     uint8_t *buf;
1277     char filename[1024];
1278     BlockDriverState *bs_rw, *bs_ro;
1279 
1280     if (!drv)
1281         return -ENOMEDIUM;
1282 
1283     if (!bs->backing_hd) {
1284         return -ENOTSUP;
1285     }
1286 
1287     if (bs->backing_hd->keep_read_only) {
1288         return -EACCES;
1289     }
1290 
1291     if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1292         return -EBUSY;
1293     }
1294 
1295     backing_drv = bs->backing_hd->drv;
1296     ro = bs->backing_hd->read_only;
1297     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1298     open_flags =  bs->backing_hd->open_flags;
1299 
1300     if (ro) {
1301         /* re-open as RW */
1302         bdrv_delete(bs->backing_hd);
1303         bs->backing_hd = NULL;
1304         bs_rw = bdrv_new("");
1305         rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1306             backing_drv);
1307         if (rw_ret < 0) {
1308             bdrv_delete(bs_rw);
1309             /* try to re-open read-only */
1310             bs_ro = bdrv_new("");
1311             ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1312                 backing_drv);
1313             if (ret < 0) {
1314                 bdrv_delete(bs_ro);
1315                 /* drive not functional anymore */
1316                 bs->drv = NULL;
1317                 return ret;
1318             }
1319             bs->backing_hd = bs_ro;
1320             return rw_ret;
1321         }
1322         bs->backing_hd = bs_rw;
1323     }
1324 
1325     total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1326     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1327 
1328     for (sector = 0; sector < total_sectors; sector += n) {
1329         if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1330 
1331             if (bdrv_read(bs, sector, buf, n) != 0) {
1332                 ret = -EIO;
1333                 goto ro_cleanup;
1334             }
1335 
1336             if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1337                 ret = -EIO;
1338                 goto ro_cleanup;
1339             }
1340         }
1341     }
1342 
1343     if (drv->bdrv_make_empty) {
1344         ret = drv->bdrv_make_empty(bs);
1345         bdrv_flush(bs);
1346     }
1347 
1348     /*
1349      * Make sure all data we wrote to the backing device is actually
1350      * stable on disk.
1351      */
1352     if (bs->backing_hd)
1353         bdrv_flush(bs->backing_hd);
1354 
1355 ro_cleanup:
1356     g_free(buf);
1357 
1358     if (ro) {
1359         /* re-open as RO */
1360         bdrv_delete(bs->backing_hd);
1361         bs->backing_hd = NULL;
1362         bs_ro = bdrv_new("");
1363         ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1364             backing_drv);
1365         if (ret < 0) {
1366             bdrv_delete(bs_ro);
1367             /* drive not functional anymore */
1368             bs->drv = NULL;
1369             return ret;
1370         }
1371         bs->backing_hd = bs_ro;
1372         bs->backing_hd->keep_read_only = 0;
1373     }
1374 
1375     return ret;
1376 }
1377 
1378 int bdrv_commit_all(void)
1379 {
1380     BlockDriverState *bs;
1381 
1382     QTAILQ_FOREACH(bs, &bdrv_states, list) {
1383         int ret = bdrv_commit(bs);
1384         if (ret < 0) {
1385             return ret;
1386         }
1387     }
1388     return 0;
1389 }
1390 
1391 struct BdrvTrackedRequest {
1392     BlockDriverState *bs;
1393     int64_t sector_num;
1394     int nb_sectors;
1395     bool is_write;
1396     QLIST_ENTRY(BdrvTrackedRequest) list;
1397     Coroutine *co; /* owner, used for deadlock detection */
1398     CoQueue wait_queue; /* coroutines blocked on this request */
1399 };
1400 
1401 /**
1402  * Remove an active request from the tracked requests list
1403  *
1404  * This function should be called when a tracked request is completing.
1405  */
1406 static void tracked_request_end(BdrvTrackedRequest *req)
1407 {
1408     QLIST_REMOVE(req, list);
1409     qemu_co_queue_restart_all(&req->wait_queue);
1410 }
1411 
1412 /**
1413  * Add an active request to the tracked requests list
1414  */
1415 static void tracked_request_begin(BdrvTrackedRequest *req,
1416                                   BlockDriverState *bs,
1417                                   int64_t sector_num,
1418                                   int nb_sectors, bool is_write)
1419 {
1420     *req = (BdrvTrackedRequest){
1421         .bs = bs,
1422         .sector_num = sector_num,
1423         .nb_sectors = nb_sectors,
1424         .is_write = is_write,
1425         .co = qemu_coroutine_self(),
1426     };
1427 
1428     qemu_co_queue_init(&req->wait_queue);
1429 
1430     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1431 }
1432 
1433 /**
1434  * Round a region to cluster boundaries
1435  */
1436 static void round_to_clusters(BlockDriverState *bs,
1437                               int64_t sector_num, int nb_sectors,
1438                               int64_t *cluster_sector_num,
1439                               int *cluster_nb_sectors)
1440 {
1441     BlockDriverInfo bdi;
1442 
1443     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1444         *cluster_sector_num = sector_num;
1445         *cluster_nb_sectors = nb_sectors;
1446     } else {
1447         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1448         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1449         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1450                                             nb_sectors, c);
1451     }
1452 }
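
/*
 * Worked example (assuming a 64 KB cluster size, i.e. 128 sectors): a
 * request for sectors [130, 140) is widened to the cluster-aligned range
 * [128, 256).
 */
#if 0
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    round_to_clusters(bs, 130, 10, &cluster_sector_num, &cluster_nb_sectors);
    /* cluster_sector_num == 128, cluster_nb_sectors == 128 */
#endif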
1453 
1454 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1455                                      int64_t sector_num, int nb_sectors) {
1456     /*        aaaa   bbbb */
1457     if (sector_num >= req->sector_num + req->nb_sectors) {
1458         return false;
1459     }
1460     /* bbbb   aaaa        */
1461     if (req->sector_num >= sector_num + nb_sectors) {
1462         return false;
1463     }
1464     return true;
1465 }
1466 
1467 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1468         int64_t sector_num, int nb_sectors)
1469 {
1470     BdrvTrackedRequest *req;
1471     int64_t cluster_sector_num;
1472     int cluster_nb_sectors;
1473     bool retry;
1474 
1475     /* If we touch the same cluster it counts as an overlap.  This guarantees
1476      * that allocating writes will be serialized and not race with each other
1477      * for the same cluster.  For example, in copy-on-read it ensures that the
1478      * CoR read and write operations are atomic and guest writes cannot
1479      * interleave between them.
1480      */
1481     round_to_clusters(bs, sector_num, nb_sectors,
1482                       &cluster_sector_num, &cluster_nb_sectors);
1483 
1484     do {
1485         retry = false;
1486         QLIST_FOREACH(req, &bs->tracked_requests, list) {
1487             if (tracked_request_overlaps(req, cluster_sector_num,
1488                                          cluster_nb_sectors)) {
1489                 /* Hitting this means there was a reentrant request, for
1490                  * example, a block driver issuing nested requests.  This must
1491                  * never happen since it means deadlock.
1492                  */
1493                 assert(qemu_coroutine_self() != req->co);
1494 
1495                 qemu_co_queue_wait(&req->wait_queue);
1496                 retry = true;
1497                 break;
1498             }
1499         }
1500     } while (retry);
1501 }
1502 
1503 /*
1504  * Return values:
1505  * 0        - success
1506  * -EINVAL  - backing format specified, but no file
1507  * -ENOSPC  - can't update the backing file because no space is left in the
1508  *            image file header
1509  * -ENOTSUP - format driver doesn't support changing the backing file
1510  */
1511 int bdrv_change_backing_file(BlockDriverState *bs,
1512     const char *backing_file, const char *backing_fmt)
1513 {
1514     BlockDriver *drv = bs->drv;
1515     int ret;
1516 
1517     /* Backing file format doesn't make sense without a backing file */
1518     if (backing_fmt && !backing_file) {
1519         return -EINVAL;
1520     }
1521 
1522     if (drv->bdrv_change_backing_file != NULL) {
1523         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1524     } else {
1525         ret = -ENOTSUP;
1526     }
1527 
1528     if (ret == 0) {
1529         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1530         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1531     }
1532     return ret;
1533 }
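
/*
 * Usage sketch (hypothetical names): rewrite the image header to point at
 * a new backing file, or drop the backing file entirely by passing NULL.
 */
#if 0
    ret = bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");
    ret = bdrv_change_backing_file(bs, NULL, NULL); /* remove it */
#endif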
1534 
1535 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1536                                    size_t size)
1537 {
1538     int64_t len;
1539 
1540     if (!bdrv_is_inserted(bs))
1541         return -ENOMEDIUM;
1542 
1543     if (bs->growable)
1544         return 0;
1545 
1546     len = bdrv_getlength(bs);
1547 
1548     if (offset < 0)
1549         return -EIO;
1550 
1551     if ((offset > len) || (len - offset < size))
1552         return -EIO;
1553 
1554     return 0;
1555 }
1556 
1557 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1558                               int nb_sectors)
1559 {
1560     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1561                                    nb_sectors * BDRV_SECTOR_SIZE);
1562 }
1563 
1564 typedef struct RwCo {
1565     BlockDriverState *bs;
1566     int64_t sector_num;
1567     int nb_sectors;
1568     QEMUIOVector *qiov;
1569     bool is_write;
1570     int ret;
1571 } RwCo;
1572 
1573 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1574 {
1575     RwCo *rwco = opaque;
1576 
1577     if (!rwco->is_write) {
1578         rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1579                                      rwco->nb_sectors, rwco->qiov, 0);
1580     } else {
1581         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1582                                       rwco->nb_sectors, rwco->qiov, 0);
1583     }
1584 }
1585 
1586 /*
1587  * Process a synchronous request using coroutines
1588  */
1589 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1590                       int nb_sectors, bool is_write)
1591 {
1592     QEMUIOVector qiov;
1593     struct iovec iov = {
1594         .iov_base = (void *)buf,
1595         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1596     };
1597     Coroutine *co;
1598     RwCo rwco = {
1599         .bs = bs,
1600         .sector_num = sector_num,
1601         .nb_sectors = nb_sectors,
1602         .qiov = &qiov,
1603         .is_write = is_write,
1604         .ret = NOT_DONE,
1605     };
1606 
1607     qemu_iovec_init_external(&qiov, &iov, 1);
1608 
1609     /**
1610      * In a synchronous call context the vcpu is blocked, so the throttling
1611      * timer cannot fire; therefore I/O throttling has to be disabled here
1612      * if it has been enabled.
1613      */
1614     if (bs->io_limits_enabled) {
1615         fprintf(stderr, "Disabling I/O throttling on '%s' due "
1616                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
1617         bdrv_io_limits_disable(bs);
1618     }
1619 
1620     if (qemu_in_coroutine()) {
1621         /* Fast-path if already in coroutine context */
1622         bdrv_rw_co_entry(&rwco);
1623     } else {
1624         co = qemu_coroutine_create(bdrv_rw_co_entry);
1625         qemu_coroutine_enter(co, &rwco);
1626         while (rwco.ret == NOT_DONE) {
1627             qemu_aio_wait();
1628         }
1629     }
1630     return rwco.ret;
1631 }
1632 
1633 /* return < 0 if error. See bdrv_write() for the return codes */
1634 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1635               uint8_t *buf, int nb_sectors)
1636 {
1637     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1638 }
1639 
1640 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
1641 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
1642                           uint8_t *buf, int nb_sectors)
1643 {
1644     bool enabled;
1645     int ret;
1646 
1647     enabled = bs->io_limits_enabled;
1648     bs->io_limits_enabled = false;
1649     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
1650     bs->io_limits_enabled = enabled;
1651     return ret;
1652 }
1653 
1654 #define BITS_PER_LONG  (sizeof(unsigned long) * 8)
1655 
1656 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1657                              int nb_sectors, int dirty)
1658 {
1659     int64_t start, end;
1660     unsigned long val, idx, bit;
1661 
1662     start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1663     end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1664 
1665     for (; start <= end; start++) {
1666         idx = start / BITS_PER_LONG;
1667         bit = start % BITS_PER_LONG;
1668         val = bs->dirty_bitmap[idx];
1669         if (dirty) {
1670             if (!(val & (1UL << bit))) {
1671                 bs->dirty_count++;
1672                 val |= 1UL << bit;
1673             }
1674         } else {
1675             if (val & (1UL << bit)) {
1676                 bs->dirty_count--;
1677                 val &= ~(1UL << bit);
1678             }
1679         }
1680         bs->dirty_bitmap[idx] = val;
1681     }
1682 }
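
/*
 * Worked example of the indexing above, assuming 64-bit longs and
 * BDRV_SECTORS_PER_DIRTY_CHUNK == 2048 (1 MB chunks):
 *
 *   sector 5000000 -> start = 5000000 / 2048 = 2441
 *                     idx   = 2441 / 64      = 38
 *                     bit   = 2441 % 64      = 9
 *
 * so bit 9 of bs->dirty_bitmap[38] tracks that sector's chunk.
 */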
1683 
1684 /* Return < 0 if error. Important errors are:
1685   -EIO         generic I/O error (may happen for all errors)
1686   -ENOMEDIUM   No media inserted.
1687   -EINVAL      Invalid sector number or nb_sectors
1688   -EACCES      Trying to write to a read-only device
1689 */
1690 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1691                const uint8_t *buf, int nb_sectors)
1692 {
1693     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1694 }
1695 
1696 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1697                void *buf, int count1)
1698 {
1699     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1700     int len, nb_sectors, count;
1701     int64_t sector_num;
1702     int ret;
1703 
1704     count = count1;
1705     /* first read to align to sector start */
1706     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1707     if (len > count)
1708         len = count;
1709     sector_num = offset >> BDRV_SECTOR_BITS;
1710     if (len > 0) {
1711         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1712             return ret;
1713         memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1714         count -= len;
1715         if (count == 0)
1716             return count1;
1717         sector_num++;
1718         buf += len;
1719     }
1720 
1721     /* read the sectors "in place" */
1722     nb_sectors = count >> BDRV_SECTOR_BITS;
1723     if (nb_sectors > 0) {
1724         if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1725             return ret;
1726         sector_num += nb_sectors;
1727         len = nb_sectors << BDRV_SECTOR_BITS;
1728         buf += len;
1729         count -= len;
1730     }
1731 
1732     /* add data from the last sector */
1733     if (count > 0) {
1734         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1735             return ret;
1736         memcpy(buf, tmp_buf, count);
1737     }
1738     return count1;
1739 }
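
/*
 * Worked example of the three phases above (512-byte sectors): a read of
 * 3000 bytes at offset 1000 is split into
 *
 *   head:  24 bytes of sector 1 via tmp_buf  (bytes 1000..1023)
 *   body:  5 whole sectors 2..6 in place     (bytes 1024..3583)
 *   tail: 416 bytes of sector 7 via tmp_buf  (bytes 3584..3999)
 */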
1740 
1741 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1742                 const void *buf, int count1)
1743 {
1744     uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1745     int len, nb_sectors, count;
1746     int64_t sector_num;
1747     int ret;
1748 
1749     count = count1;
1750     /* first write to align to sector start */
1751     len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1752     if (len > count)
1753         len = count;
1754     sector_num = offset >> BDRV_SECTOR_BITS;
1755     if (len > 0) {
1756         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1757             return ret;
1758         memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1759         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1760             return ret;
1761         count -= len;
1762         if (count == 0)
1763             return count1;
1764         sector_num++;
1765         buf += len;
1766     }
1767 
1768     /* write the sectors "in place" */
1769     nb_sectors = count >> BDRV_SECTOR_BITS;
1770     if (nb_sectors > 0) {
1771         if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1772             return ret;
1773         sector_num += nb_sectors;
1774         len = nb_sectors << BDRV_SECTOR_BITS;
1775         buf += len;
1776         count -= len;
1777     }
1778 
1779     /* add data from the last sector */
1780     if (count > 0) {
1781         if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1782             return ret;
1783         memcpy(tmp_buf, buf, count);
1784         if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1785             return ret;
1786     }
1787     return count1;
1788 }
1789 
1790 /*
1791  * Writes to the file and ensures that no writes are reordered across this
1792  * request (acts as a barrier)
1793  *
1794  * Returns 0 on success, -errno in error cases.
1795  */
1796 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1797     const void *buf, int count)
1798 {
1799     int ret;
1800 
1801     ret = bdrv_pwrite(bs, offset, buf, count);
1802     if (ret < 0) {
1803         return ret;
1804     }
1805 
1806     /* No flush needed for cache modes that already do it */
1807     if (bs->enable_write_cache) {
1808         bdrv_flush(bs);
1809     }
1810 
1811     return 0;
1812 }
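
/* Example (illustrative sketch): a typical use of bdrv_pwrite_sync() is an
 * in-place metadata update that must not be reordered with later writes.
 * 'table_offset' and 'new_offset' are hypothetical names for this sketch:
 *
 *     uint64_t entry = cpu_to_be64(new_offset);
 *     ret = bdrv_pwrite_sync(bs->file, table_offset, &entry, sizeof(entry));
 *     if (ret < 0) {
 *         return ret;   // the entry may or may not have reached the disk
 *     }
 */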
1813 
1814 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1815         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1816 {
1817     /* Perform I/O through a temporary buffer so that users who scribble over
1818      * their read buffer while the operation is in progress do not end up
1819      * modifying the image file.  This is critical for zero-copy guest I/O
1820      * where anything might happen inside guest memory.
1821      */
1822     void *bounce_buffer;
1823 
1824     BlockDriver *drv = bs->drv;
1825     struct iovec iov;
1826     QEMUIOVector bounce_qiov;
1827     int64_t cluster_sector_num;
1828     int cluster_nb_sectors;
1829     size_t skip_bytes;
1830     int ret;
1831 
1832     /* Cover the entire cluster so that no additional backing file I/O is
1833      * required when allocating a cluster in the image file.
1834      */
1835     round_to_clusters(bs, sector_num, nb_sectors,
1836                       &cluster_sector_num, &cluster_nb_sectors);
1837 
1838     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1839                                    cluster_sector_num, cluster_nb_sectors);
1840 
1841     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1842     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1843     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1844 
1845     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1846                              &bounce_qiov);
1847     if (ret < 0) {
1848         goto err;
1849     }
1850 
1851     if (drv->bdrv_co_write_zeroes &&
1852         buffer_is_zero(bounce_buffer, iov.iov_len)) {
1853         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1854                                       cluster_nb_sectors);
1855     } else {
1856         /* This does not change the data on the disk, so it is not necessary
1857          * to flush even in cache=writethrough mode.
1858          */
1859         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1860                                   &bounce_qiov);
1861     }
1862 
1863     if (ret < 0) {
1864         /* It might be okay to ignore write errors for guest requests.  If this
1865          * is a deliberate copy-on-read then we don't want to ignore the error.
1866          * Simply report it in all cases.
1867          */
1868         goto err;
1869     }
1870 
1871     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1872     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
1873                         nb_sectors * BDRV_SECTOR_SIZE);
1874 
1875 err:
1876     qemu_vfree(bounce_buffer);
1877     return ret;
1878 }
1879 
1880 /*
1881  * Handle a read request in coroutine context
1882  */
1883 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1884     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1885     BdrvRequestFlags flags)
1886 {
1887     BlockDriver *drv = bs->drv;
1888     BdrvTrackedRequest req;
1889     int ret;
1890 
1891     if (!drv) {
1892         return -ENOMEDIUM;
1893     }
1894     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1895         return -EIO;
1896     }
1897 
1898     /* throttling disk read I/O */
1899     if (bs->io_limits_enabled) {
1900         bdrv_io_limits_intercept(bs, false, nb_sectors);
1901     }
1902 
1903     if (bs->copy_on_read) {
1904         flags |= BDRV_REQ_COPY_ON_READ;
1905     }
1906     if (flags & BDRV_REQ_COPY_ON_READ) {
1907         bs->copy_on_read_in_flight++;
1908     }
1909 
1910     if (bs->copy_on_read_in_flight) {
1911         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1912     }
1913 
1914     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1915 
1916     if (flags & BDRV_REQ_COPY_ON_READ) {
1917         int pnum;
1918 
1919         ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1920         if (ret < 0) {
1921             goto out;
1922         }
1923 
1924         if (!ret || pnum != nb_sectors) {
1925             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1926             goto out;
1927         }
1928     }
1929 
1930     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1931 
1932 out:
1933     tracked_request_end(&req);
1934 
1935     if (flags & BDRV_REQ_COPY_ON_READ) {
1936         bs->copy_on_read_in_flight--;
1937     }
1938 
1939     return ret;
1940 }
1941 
1942 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1943     int nb_sectors, QEMUIOVector *qiov)
1944 {
1945     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1946 
1947     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1948 }
1949 
1950 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1951     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1952 {
1953     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1954 
1955     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1956                             BDRV_REQ_COPY_ON_READ);
1957 }
1958 
1959 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1960     int64_t sector_num, int nb_sectors)
1961 {
1962     BlockDriver *drv = bs->drv;
1963     QEMUIOVector qiov;
1964     struct iovec iov;
1965     int ret;
1966 
1967     /* TODO Emulate only part of misaligned requests instead of letting block
1968      * drivers return -ENOTSUP and emulate everything */
1969 
1970     /* First try the efficient write zeroes operation */
1971     if (drv->bdrv_co_write_zeroes) {
1972         ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1973         if (ret != -ENOTSUP) {
1974             return ret;
1975         }
1976     }
1977 
1978     /* Fall back to bounce buffer if write zeroes is unsupported */
1979     iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1980     iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1981     memset(iov.iov_base, 0, iov.iov_len);
1982     qemu_iovec_init_external(&qiov, &iov, 1);
1983 
1984     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1985 
1986     qemu_vfree(iov.iov_base);
1987     return ret;
1988 }
1989 
1990 /*
1991  * Handle a write request in coroutine context
1992  */
1993 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1994     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1995     BdrvRequestFlags flags)
1996 {
1997     BlockDriver *drv = bs->drv;
1998     BdrvTrackedRequest req;
1999     int ret;
2000 
2001     if (!bs->drv) {
2002         return -ENOMEDIUM;
2003     }
2004     if (bs->read_only) {
2005         return -EACCES;
2006     }
2007     if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2008         return -EIO;
2009     }
2010 
2011     /* throttling disk write I/O */
2012     if (bs->io_limits_enabled) {
2013         bdrv_io_limits_intercept(bs, true, nb_sectors);
2014     }
2015 
2016     if (bs->copy_on_read_in_flight) {
2017         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2018     }
2019 
2020     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2021 
2022     if (flags & BDRV_REQ_ZERO_WRITE) {
2023         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2024     } else {
2025         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2026     }
2027 
2028     if (ret == 0 && !bs->enable_write_cache) {
2029         ret = bdrv_co_flush(bs);
2030     }
2031 
2032     if (bs->dirty_bitmap) {
2033         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2034     }
2035 
2036     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2037         bs->wr_highest_sector = sector_num + nb_sectors - 1;
2038     }
2039 
2040     tracked_request_end(&req);
2041 
2042     return ret;
2043 }
2044 
2045 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2046     int nb_sectors, QEMUIOVector *qiov)
2047 {
2048     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2049 
2050     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2051 }
2052 
2053 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2054                                       int64_t sector_num, int nb_sectors)
2055 {
2056     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2057 
2058     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2059                              BDRV_REQ_ZERO_WRITE);
2060 }
2061 
2062 /**
2063  * Truncate file to 'offset' bytes (needed only for file protocols)
2064  */
2065 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2066 {
2067     BlockDriver *drv = bs->drv;
2068     int ret;
2069     if (!drv)
2070         return -ENOMEDIUM;
2071     if (!drv->bdrv_truncate)
2072         return -ENOTSUP;
2073     if (bs->read_only)
2074         return -EACCES;
2075     if (bdrv_in_use(bs))
2076         return -EBUSY;
2077     ret = drv->bdrv_truncate(bs, offset);
2078     if (ret == 0) {
2079         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2080         bdrv_dev_resize_cb(bs);
2081     }
2082     return ret;
2083 }
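
/* Example (illustrative sketch): growing an image to 1 GB; this succeeds only
 * when the driver implements bdrv_truncate (e.g. raw files on a host fs):
 *
 *     int ret = bdrv_truncate(bs, 1024 * 1024 * 1024LL);
 *     if (ret == -ENOTSUP) {
 *         // this format/protocol cannot be resized
 *     }
 */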
2084 
2085 /**
2086  * Length of an allocated file in bytes. Sparse files are counted by actual
2087  * allocated space. Return < 0 if error or unknown.
2088  */
2089 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2090 {
2091     BlockDriver *drv = bs->drv;
2092     if (!drv) {
2093         return -ENOMEDIUM;
2094     }
2095     if (drv->bdrv_get_allocated_file_size) {
2096         return drv->bdrv_get_allocated_file_size(bs);
2097     }
2098     if (bs->file) {
2099         return bdrv_get_allocated_file_size(bs->file);
2100     }
2101     return -ENOTSUP;
2102 }
2103 
2104 /**
2105  * Length of a file in bytes. Return < 0 if error or unknown.
2106  */
2107 int64_t bdrv_getlength(BlockDriverState *bs)
2108 {
2109     BlockDriver *drv = bs->drv;
2110     if (!drv)
2111         return -ENOMEDIUM;
2112 
2113     if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2114         if (drv->bdrv_getlength) {
2115             return drv->bdrv_getlength(bs);
2116         }
2117     }
2118     return bs->total_sectors * BDRV_SECTOR_SIZE;
2119 }
2120 
2121 /* return 0 as number of sectors if no device present or error */
2122 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2123 {
2124     int64_t length;
2125     length = bdrv_getlength(bs);
2126     if (length < 0)
2127         length = 0;
2128     else
2129         length = length >> BDRV_SECTOR_BITS;
2130     *nb_sectors_ptr = length;
2131 }
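
/* Example (illustrative): the two length queries differ only by the sector
 * granularity; for a 10 MB image with 512-byte sectors:
 *
 *     uint64_t nb_sectors;
 *     int64_t len = bdrv_getlength(bs);     // 10485760
 *     bdrv_get_geometry(bs, &nb_sectors);   // 20480, i.e. len >> BDRV_SECTOR_BITS
 */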
2132 
2133 /* throttling disk io limits */
2134 void bdrv_set_io_limits(BlockDriverState *bs,
2135                         BlockIOLimit *io_limits)
2136 {
2137     bs->io_limits = *io_limits;
2138     bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2139 }
2140 
2141 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2142                        BlockErrorAction on_write_error)
2143 {
2144     bs->on_read_error = on_read_error;
2145     bs->on_write_error = on_write_error;
2146 }
2147 
2148 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2149 {
2150     return is_read ? bs->on_read_error : bs->on_write_error;
2151 }
2152 
2153 int bdrv_is_read_only(BlockDriverState *bs)
2154 {
2155     return bs->read_only;
2156 }
2157 
2158 int bdrv_is_sg(BlockDriverState *bs)
2159 {
2160     return bs->sg;
2161 }
2162 
2163 int bdrv_enable_write_cache(BlockDriverState *bs)
2164 {
2165     return bs->enable_write_cache;
2166 }
2167 
2168 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
2169 {
2170     bs->enable_write_cache = wce;
2171 }
2172 
2173 int bdrv_is_encrypted(BlockDriverState *bs)
2174 {
2175     if (bs->backing_hd && bs->backing_hd->encrypted)
2176         return 1;
2177     return bs->encrypted;
2178 }
2179 
2180 int bdrv_key_required(BlockDriverState *bs)
2181 {
2182     BlockDriverState *backing_hd = bs->backing_hd;
2183 
2184     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2185         return 1;
2186     return (bs->encrypted && !bs->valid_key);
2187 }
2188 
2189 int bdrv_set_key(BlockDriverState *bs, const char *key)
2190 {
2191     int ret;
2192     if (bs->backing_hd && bs->backing_hd->encrypted) {
2193         ret = bdrv_set_key(bs->backing_hd, key);
2194         if (ret < 0)
2195             return ret;
2196         if (!bs->encrypted)
2197             return 0;
2198     }
2199     if (!bs->encrypted) {
2200         return -EINVAL;
2201     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2202         return -ENOMEDIUM;
2203     }
2204     ret = bs->drv->bdrv_set_key(bs, key);
2205     if (ret < 0) {
2206         bs->valid_key = 0;
2207     } else if (!bs->valid_key) {
2208         bs->valid_key = 1;
2209         /* call the change callback now, we skipped it on open */
2210         bdrv_dev_change_media_cb(bs, true);
2211     }
2212     return ret;
2213 }
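
/* Example (illustrative sketch): unlocking an encrypted image after open.
 * Note that bdrv_set_key() first recurses into an encrypted backing file;
 * 'passphrase' is a hypothetical user-supplied string:
 *
 *     if (bdrv_key_required(bs)) {
 *         int ret = bdrv_set_key(bs, passphrase);
 *         if (ret < 0) {
 *             // wrong key: valid_key stays 0 and guest I/O keeps failing
 *         }
 *     }
 */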
2214 
2215 const char *bdrv_get_format_name(BlockDriverState *bs)
2216 {
2217     return bs->drv ? bs->drv->format_name : NULL;
2218 }
2219 
2220 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2221                          void *opaque)
2222 {
2223     BlockDriver *drv;
2224 
2225     QLIST_FOREACH(drv, &bdrv_drivers, list) {
2226         it(opaque, drv->format_name);
2227     }
2228 }
2229 
2230 BlockDriverState *bdrv_find(const char *name)
2231 {
2232     BlockDriverState *bs;
2233 
2234     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2235         if (!strcmp(name, bs->device_name)) {
2236             return bs;
2237         }
2238     }
2239     return NULL;
2240 }
2241 
2242 BlockDriverState *bdrv_next(BlockDriverState *bs)
2243 {
2244     if (!bs) {
2245         return QTAILQ_FIRST(&bdrv_states);
2246     }
2247     return QTAILQ_NEXT(bs, list);
2248 }
2249 
2250 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2251 {
2252     BlockDriverState *bs;
2253 
2254     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2255         it(opaque, bs);
2256     }
2257 }
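
/* Example (illustrative sketch): printing every device name via the iterator;
 * the same loop could be written with bdrv_next():
 *
 *     static void print_name(void *opaque, BlockDriverState *bs)
 *     {
 *         printf("%s\n", bdrv_get_device_name(bs));
 *     }
 *
 *     bdrv_iterate(print_name, NULL);
 */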
2258 
2259 const char *bdrv_get_device_name(BlockDriverState *bs)
2260 {
2261     return bs->device_name;
2262 }
2263 
2264 int bdrv_get_flags(BlockDriverState *bs)
2265 {
2266     return bs->open_flags;
2267 }
2268 
2269 void bdrv_flush_all(void)
2270 {
2271     BlockDriverState *bs;
2272 
2273     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2274         bdrv_flush(bs);
2275     }
2276 }
2277 
2278 int bdrv_has_zero_init(BlockDriverState *bs)
2279 {
2280     assert(bs->drv);
2281 
2282     if (bs->drv->bdrv_has_zero_init) {
2283         return bs->drv->bdrv_has_zero_init(bs);
2284     }
2285 
2286     return 1;
2287 }
2288 
2289 typedef struct BdrvCoIsAllocatedData {
2290     BlockDriverState *bs;
2291     int64_t sector_num;
2292     int nb_sectors;
2293     int *pnum;
2294     int ret;
2295     bool done;
2296 } BdrvCoIsAllocatedData;
2297 
2298 /*
2299  * Returns true iff the specified sector is present in the disk image. Drivers
2300  * not implementing the functionality are assumed to not support backing files,
2301  * hence all their sectors are reported as allocated.
2302  *
2303  * If 'sector_num' is beyond the end of the disk image the return value is 0
2304  * and 'pnum' is set to 0.
2305  *
2306  * 'pnum' is set to the number of sectors (including and immediately following
2307  * the specified sector) that are known to be in the same
2308  * allocated/unallocated state.
2309  *
2310  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
2311  * beyond the end of the disk image it will be clamped.
2312  */
2313 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2314                                       int nb_sectors, int *pnum)
2315 {
2316     int64_t n;
2317 
2318     if (sector_num >= bs->total_sectors) {
2319         *pnum = 0;
2320         return 0;
2321     }
2322 
2323     n = bs->total_sectors - sector_num;
2324     if (n < nb_sectors) {
2325         nb_sectors = n;
2326     }
2327 
2328     if (!bs->drv->bdrv_co_is_allocated) {
2329         *pnum = nb_sectors;
2330         return 1;
2331     }
2332 
2333     return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2334 }
2335 
2336 /* Coroutine wrapper for bdrv_is_allocated() */
2337 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2338 {
2339     BdrvCoIsAllocatedData *data = opaque;
2340     BlockDriverState *bs = data->bs;
2341 
2342     data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2343                                      data->pnum);
2344     data->done = true;
2345 }
2346 
2347 /*
2348  * Synchronous wrapper around bdrv_co_is_allocated().
2349  *
2350  * See bdrv_co_is_allocated() for details.
2351  */
2352 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2353                       int *pnum)
2354 {
2355     Coroutine *co;
2356     BdrvCoIsAllocatedData data = {
2357         .bs = bs,
2358         .sector_num = sector_num,
2359         .nb_sectors = nb_sectors,
2360         .pnum = pnum,
2361         .done = false,
2362     };
2363 
2364     co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2365     qemu_coroutine_enter(co, &data);
2366     while (!data.done) {
2367         qemu_aio_wait();
2368     }
2369     return data.ret;
2370 }
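
/* Example (illustrative sketch): walking an image's allocation map in runs is
 * the usual way 'pnum' is consumed:
 *
 *     int64_t sector = 0;
 *     while (sector < bs->total_sectors) {
 *         int pnum;
 *         int ret = bdrv_is_allocated(bs, sector,
 *                                     MIN(bs->total_sectors - sector, INT_MAX),
 *                                     &pnum);
 *         if (ret < 0) {
 *             break;
 *         }
 *         printf("%" PRId64 "+%d: %s\n", sector, pnum,
 *                ret ? "allocated" : "unallocated");
 *         sector += pnum;
 *     }
 */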
2371 
2372 /*
2373  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2374  *
2375  * Return true if the given sector is allocated in any image between
2376  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
2377  * sector is allocated in any image of the chain.  Return false otherwise.
2378  *
2379  * 'pnum' is set to the number of sectors (including and immediately following
2380  *  the specified sector) that are known to be in the same
2381  *  allocated/unallocated state.
2382  *
2383  */
2384 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2385                                             BlockDriverState *base,
2386                                             int64_t sector_num,
2387                                             int nb_sectors, int *pnum)
2388 {
2389     BlockDriverState *intermediate;
2390     int ret, n = nb_sectors;
2391 
2392     intermediate = top;
2393     while (intermediate && intermediate != base) {
2394         int pnum_inter;
2395         ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2396                                    &pnum_inter);
2397         if (ret < 0) {
2398             return ret;
2399         } else if (ret) {
2400             *pnum = pnum_inter;
2401             return 1;
2402         }
2403 
2404         /*
2405          * [sector_num, sector_num + nb_sectors) is unallocated on top but an
2406          * intermediate image might have
2407          *
2408          * [sector_num + x, sector_num + nb_sectors) allocated.
2409          */
2410         if (n > pnum_inter) {
2411             n = pnum_inter;
2412         }
2413 
2414         intermediate = intermediate->backing_hd;
2415     }
2416 
2417     *pnum = n;
2418     return 0;
2419 }
2420 
2421 BlockInfoList *qmp_query_block(Error **errp)
2422 {
2423     BlockInfoList *head = NULL, *cur_item = NULL;
2424     BlockDriverState *bs;
2425 
2426     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2427         BlockInfoList *info = g_malloc0(sizeof(*info));
2428 
2429         info->value = g_malloc0(sizeof(*info->value));
2430         info->value->device = g_strdup(bs->device_name);
2431         info->value->type = g_strdup("unknown");
2432         info->value->locked = bdrv_dev_is_medium_locked(bs);
2433         info->value->removable = bdrv_dev_has_removable_media(bs);
2434 
2435         if (bdrv_dev_has_removable_media(bs)) {
2436             info->value->has_tray_open = true;
2437             info->value->tray_open = bdrv_dev_is_tray_open(bs);
2438         }
2439 
2440         if (bdrv_iostatus_is_enabled(bs)) {
2441             info->value->has_io_status = true;
2442             info->value->io_status = bs->iostatus;
2443         }
2444 
2445         if (bs->drv) {
2446             info->value->has_inserted = true;
2447             info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2448             info->value->inserted->file = g_strdup(bs->filename);
2449             info->value->inserted->ro = bs->read_only;
2450             info->value->inserted->drv = g_strdup(bs->drv->format_name);
2451             info->value->inserted->encrypted = bs->encrypted;
2452             info->value->inserted->encryption_key_missing = bdrv_key_required(bs);
2453             if (bs->backing_file[0]) {
2454                 info->value->inserted->has_backing_file = true;
2455                 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2456             }
2457 
2458             info->value->inserted->backing_file_depth =
2459                 bdrv_get_backing_file_depth(bs);
2460 
2461             if (bs->io_limits_enabled) {
2462                 info->value->inserted->bps =
2463                                bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2464                 info->value->inserted->bps_rd =
2465                                bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2466                 info->value->inserted->bps_wr =
2467                                bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2468                 info->value->inserted->iops =
2469                                bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2470                 info->value->inserted->iops_rd =
2471                                bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2472                 info->value->inserted->iops_wr =
2473                                bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2474             }
2475         }
2476 
2477         /* XXX: waiting for the qapi to support GSList */
2478         if (!cur_item) {
2479             head = cur_item = info;
2480         } else {
2481             cur_item->next = info;
2482             cur_item = info;
2483         }
2484     }
2485 
2486     return head;
2487 }
2488 
2489 /* Consider exposing this as a full-fledged QMP command */
2490 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2491 {
2492     BlockStats *s;
2493 
2494     s = g_malloc0(sizeof(*s));
2495 
2496     if (bs->device_name[0]) {
2497         s->has_device = true;
2498         s->device = g_strdup(bs->device_name);
2499     }
2500 
2501     s->stats = g_malloc0(sizeof(*s->stats));
2502     s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2503     s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2504     s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2505     s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2506     s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2507     s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2508     s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2509     s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2510     s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2511 
2512     if (bs->file) {
2513         s->has_parent = true;
2514         s->parent = qmp_query_blockstat(bs->file, NULL);
2515     }
2516 
2517     return s;
2518 }
2519 
2520 BlockStatsList *qmp_query_blockstats(Error **errp)
2521 {
2522     BlockStatsList *head = NULL, *cur_item = NULL;
2523     BlockDriverState *bs;
2524 
2525     QTAILQ_FOREACH(bs, &bdrv_states, list) {
2526         BlockStatsList *info = g_malloc0(sizeof(*info));
2527         info->value = qmp_query_blockstat(bs, NULL);
2528 
2529         /* XXX: waiting for the qapi to support GSList */
2530         if (!cur_item) {
2531             head = cur_item = info;
2532         } else {
2533             cur_item->next = info;
2534             cur_item = info;
2535         }
2536     }
2537 
2538     return head;
2539 }
2540 
2541 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2542 {
2543     if (bs->backing_hd && bs->backing_hd->encrypted)
2544         return bs->backing_file;
2545     else if (bs->encrypted)
2546         return bs->filename;
2547     else
2548         return NULL;
2549 }
2550 
2551 void bdrv_get_backing_filename(BlockDriverState *bs,
2552                                char *filename, int filename_size)
2553 {
2554     pstrcpy(filename, filename_size, bs->backing_file);
2555 }
2556 
2557 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2558                           const uint8_t *buf, int nb_sectors)
2559 {
2560     BlockDriver *drv = bs->drv;
2561     if (!drv)
2562         return -ENOMEDIUM;
2563     if (!drv->bdrv_write_compressed)
2564         return -ENOTSUP;
2565     if (bdrv_check_request(bs, sector_num, nb_sectors))
2566         return -EIO;
2567 
2568     if (bs->dirty_bitmap) {
2569         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2570     }
2571 
2572     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2573 }
2574 
2575 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2576 {
2577     BlockDriver *drv = bs->drv;
2578     if (!drv)
2579         return -ENOMEDIUM;
2580     if (!drv->bdrv_get_info)
2581         return -ENOTSUP;
2582     memset(bdi, 0, sizeof(*bdi));
2583     return drv->bdrv_get_info(bs, bdi);
2584 }
2585 
2586 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2587                       int64_t pos, int size)
2588 {
2589     BlockDriver *drv = bs->drv;
2590     if (!drv)
2591         return -ENOMEDIUM;
2592     if (drv->bdrv_save_vmstate)
2593         return drv->bdrv_save_vmstate(bs, buf, pos, size);
2594     if (bs->file)
2595         return bdrv_save_vmstate(bs->file, buf, pos, size);
2596     return -ENOTSUP;
2597 }
2598 
2599 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2600                       int64_t pos, int size)
2601 {
2602     BlockDriver *drv = bs->drv;
2603     if (!drv)
2604         return -ENOMEDIUM;
2605     if (drv->bdrv_load_vmstate)
2606         return drv->bdrv_load_vmstate(bs, buf, pos, size);
2607     if (bs->file)
2608         return bdrv_load_vmstate(bs->file, buf, pos, size);
2609     return -ENOTSUP;
2610 }
2611 
2612 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2613 {
2614     BlockDriver *drv = bs->drv;
2615 
2616     if (!drv || !drv->bdrv_debug_event) {
2617         return;
2618     }
2619 
2620     drv->bdrv_debug_event(bs, event);
2622 }
2623 
2624 /**************************************************************/
2625 /* handling of snapshots */
2626 
2627 int bdrv_can_snapshot(BlockDriverState *bs)
2628 {
2629     BlockDriver *drv = bs->drv;
2630     if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2631         return 0;
2632     }
2633 
2634     if (!drv->bdrv_snapshot_create) {
2635         if (bs->file != NULL) {
2636             return bdrv_can_snapshot(bs->file);
2637         }
2638         return 0;
2639     }
2640 
2641     return 1;
2642 }
2643 
2644 int bdrv_is_snapshot(BlockDriverState *bs)
2645 {
2646     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2647 }
2648 
2649 BlockDriverState *bdrv_snapshots(void)
2650 {
2651     BlockDriverState *bs;
2652 
2653     if (bs_snapshots) {
2654         return bs_snapshots;
2655     }
2656 
2657     bs = NULL;
2658     while ((bs = bdrv_next(bs))) {
2659         if (bdrv_can_snapshot(bs)) {
2660             bs_snapshots = bs;
2661             return bs;
2662         }
2663     }
2664     return NULL;
2665 }
2666 
2667 int bdrv_snapshot_create(BlockDriverState *bs,
2668                          QEMUSnapshotInfo *sn_info)
2669 {
2670     BlockDriver *drv = bs->drv;
2671     if (!drv)
2672         return -ENOMEDIUM;
2673     if (drv->bdrv_snapshot_create)
2674         return drv->bdrv_snapshot_create(bs, sn_info);
2675     if (bs->file)
2676         return bdrv_snapshot_create(bs->file, sn_info);
2677     return -ENOTSUP;
2678 }
2679 
2680 int bdrv_snapshot_goto(BlockDriverState *bs,
2681                        const char *snapshot_id)
2682 {
2683     BlockDriver *drv = bs->drv;
2684     int ret, open_ret;
2685 
2686     if (!drv)
2687         return -ENOMEDIUM;
2688     if (drv->bdrv_snapshot_goto)
2689         return drv->bdrv_snapshot_goto(bs, snapshot_id);
2690 
2691     if (bs->file) {
2692         drv->bdrv_close(bs);
2693         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2694         open_ret = drv->bdrv_open(bs, bs->open_flags);
2695         if (open_ret < 0) {
2696             bdrv_delete(bs->file);
2697             bs->drv = NULL;
2698             return open_ret;
2699         }
2700         return ret;
2701     }
2702 
2703     return -ENOTSUP;
2704 }
2705 
2706 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2707 {
2708     BlockDriver *drv = bs->drv;
2709     if (!drv)
2710         return -ENOMEDIUM;
2711     if (drv->bdrv_snapshot_delete)
2712         return drv->bdrv_snapshot_delete(bs, snapshot_id);
2713     if (bs->file)
2714         return bdrv_snapshot_delete(bs->file, snapshot_id);
2715     return -ENOTSUP;
2716 }
2717 
2718 int bdrv_snapshot_list(BlockDriverState *bs,
2719                        QEMUSnapshotInfo **psn_info)
2720 {
2721     BlockDriver *drv = bs->drv;
2722     if (!drv)
2723         return -ENOMEDIUM;
2724     if (drv->bdrv_snapshot_list)
2725         return drv->bdrv_snapshot_list(bs, psn_info);
2726     if (bs->file)
2727         return bdrv_snapshot_list(bs->file, psn_info);
2728     return -ENOTSUP;
2729 }
2730 
2731 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2732         const char *snapshot_name)
2733 {
2734     BlockDriver *drv = bs->drv;
2735     if (!drv) {
2736         return -ENOMEDIUM;
2737     }
2738     if (!bs->read_only) {
2739         return -EINVAL;
2740     }
2741     if (drv->bdrv_snapshot_load_tmp) {
2742         return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2743     }
2744     return -ENOTSUP;
2745 }
2746 
2747 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2748         const char *backing_file)
2749 {
2750     if (!bs->drv) {
2751         return NULL;
2752     }
2753 
2754     if (bs->backing_hd) {
2755         if (strcmp(bs->backing_file, backing_file) == 0) {
2756             return bs->backing_hd;
2757         } else {
2758             return bdrv_find_backing_image(bs->backing_hd, backing_file);
2759         }
2760     }
2761 
2762     return NULL;
2763 }
2764 
2765 int bdrv_get_backing_file_depth(BlockDriverState *bs)
2766 {
2767     if (!bs->drv) {
2768         return 0;
2769     }
2770 
2771     if (!bs->backing_hd) {
2772         return 0;
2773     }
2774 
2775     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
2776 }
2777 
2778 #define NB_SUFFIXES 4
2779 
2780 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2781 {
2782     static const char suffixes[NB_SUFFIXES] = "KMGT";
2783     int64_t base;
2784     int i;
2785 
2786     if (size <= 999) {
2787         snprintf(buf, buf_size, "%" PRId64, size);
2788     } else {
2789         base = 1024;
2790         for(i = 0; i < NB_SUFFIXES; i++) {
2791             if (size < (10 * base)) {
2792                 snprintf(buf, buf_size, "%0.1f%c",
2793                          (double)size / base,
2794                          suffixes[i]);
2795                 break;
2796             } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2797                 snprintf(buf, buf_size, "%" PRId64 "%c",
2798                          ((size + (base >> 1)) / base),
2799                          suffixes[i]);
2800                 break;
2801             }
2802             base = base * 1024;
2803         }
2804     }
2805     return buf;
2806 }
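
/* Example (illustrative): sample conversions produced by the function above:
 *
 *     get_human_readable_size(buf, sizeof(buf), 999);         // "999"
 *     get_human_readable_size(buf, sizeof(buf), 1536);        // "1.5K"
 *     get_human_readable_size(buf, sizeof(buf), 1073741824);  // "1.0G"
 */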
2807 
2808 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2809 {
2810     char buf1[128], date_buf[128], clock_buf[128];
2811 #ifdef _WIN32
2812     struct tm *ptm;
2813 #else
2814     struct tm tm;
2815 #endif
2816     time_t ti;
2817     int64_t secs;
2818 
2819     if (!sn) {
2820         snprintf(buf, buf_size,
2821                  "%-10s%-20s%7s%20s%15s",
2822                  "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2823     } else {
2824         ti = sn->date_sec;
2825 #ifdef _WIN32
2826         ptm = localtime(&ti);
2827         strftime(date_buf, sizeof(date_buf),
2828                  "%Y-%m-%d %H:%M:%S", ptm);
2829 #else
2830         localtime_r(&ti, &tm);
2831         strftime(date_buf, sizeof(date_buf),
2832                  "%Y-%m-%d %H:%M:%S", &tm);
2833 #endif
2834         secs = sn->vm_clock_nsec / 1000000000;
2835         snprintf(clock_buf, sizeof(clock_buf),
2836                  "%02d:%02d:%02d.%03d",
2837                  (int)(secs / 3600),
2838                  (int)((secs / 60) % 60),
2839                  (int)(secs % 60),
2840                  (int)((sn->vm_clock_nsec / 1000000) % 1000));
2841         snprintf(buf, buf_size,
2842                  "%-10s%-20s%7s%20s%15s",
2843                  sn->id_str, sn->name,
2844                  get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2845                  date_buf,
2846                  clock_buf);
2847     }
2848     return buf;
2849 }
2850 
2851 /**************************************************************/
2852 /* async I/Os */
2853 
2854 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2855                                  QEMUIOVector *qiov, int nb_sectors,
2856                                  BlockDriverCompletionFunc *cb, void *opaque)
2857 {
2858     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2859 
2860     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2861                                  cb, opaque, false);
2862 }
2863 
2864 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2865                                   QEMUIOVector *qiov, int nb_sectors,
2866                                   BlockDriverCompletionFunc *cb, void *opaque)
2867 {
2868     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2869 
2870     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2871                                  cb, opaque, true);
2872 }
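
/* Example (illustrative sketch): submitting an asynchronous read.  The
 * completion callback runs later from the main loop, with ret < 0 on error:
 *
 *     static void read_done(void *opaque, int ret)
 *     {
 *         if (ret < 0) {
 *             // handle -EIO etc.
 *         }
 *     }
 *
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buffer, nb_sectors * BDRV_SECTOR_SIZE);
 *     bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors, read_done, NULL);
 */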
2873 
2874 
2875 typedef struct MultiwriteCB {
2876     int error;
2877     int num_requests;
2878     int num_callbacks;
2879     struct {
2880         BlockDriverCompletionFunc *cb;
2881         void *opaque;
2882         QEMUIOVector *free_qiov;
2883     } callbacks[];
2884 } MultiwriteCB;
2885 
2886 static void multiwrite_user_cb(MultiwriteCB *mcb)
2887 {
2888     int i;
2889 
2890     for (i = 0; i < mcb->num_callbacks; i++) {
2891         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2892         if (mcb->callbacks[i].free_qiov) {
2893             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2894         }
2895         g_free(mcb->callbacks[i].free_qiov);
2896     }
2897 }
2898 
2899 static void multiwrite_cb(void *opaque, int ret)
2900 {
2901     MultiwriteCB *mcb = opaque;
2902 
2903     trace_multiwrite_cb(mcb, ret);
2904 
2905     if (ret < 0 && !mcb->error) {
2906         mcb->error = ret;
2907     }
2908 
2909     mcb->num_requests--;
2910     if (mcb->num_requests == 0) {
2911         multiwrite_user_cb(mcb);
2912         g_free(mcb);
2913     }
2914 }
2915 
2916 static int multiwrite_req_compare(const void *a, const void *b)
2917 {
2918     const BlockRequest *req1 = a, *req2 = b;
2919 
2920     /*
2921      * Note that we can't simply subtract req2->sector from req1->sector
2922      * here as that could overflow the return value.
2923      */
2924     if (req1->sector > req2->sector) {
2925         return 1;
2926     } else if (req1->sector < req2->sector) {
2927         return -1;
2928     } else {
2929         return 0;
2930     }
2931 }
2932 
2933 /*
2934  * Takes a bunch of requests and tries to merge them. Returns the number of
2935  * requests that remain after merging.
2936  */
2937 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2938     int num_reqs, MultiwriteCB *mcb)
2939 {
2940     int i, outidx;
2941 
2942     // Sort requests by start sector
2943     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2944 
2945     // Check if adjacent requests touch the same clusters. If so, combine them.
2946     // Only exactly sequential or overlapping requests are merged, leaving no gaps.
2947     outidx = 0;
2948     for (i = 1; i < num_reqs; i++) {
2949         int merge = 0;
2950         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2951 
2952         // Handle exactly sequential writes and overlapping writes.
2953         if (reqs[i].sector <= oldreq_last) {
2954             merge = 1;
2955         }
2956 
2957         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2958             merge = 0;
2959         }
2960 
2961         if (merge) {
2962             size_t size;
2963             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2964             qemu_iovec_init(qiov,
2965                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2966 
2967             // Add the first request to the merged one. If the requests are
2968             // overlapping, drop the last sectors of the first request.
2969             size = (reqs[i].sector - reqs[outidx].sector) << 9;
2970             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
2971 
2972             // We shouldn't need to add any zeros between the two requests
2973             assert (reqs[i].sector <= oldreq_last);
2974 
2975             // Add the second request
2976             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
2977 
2978             reqs[outidx].nb_sectors = qiov->size >> 9;
2979             reqs[outidx].qiov = qiov;
2980 
2981             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2982         } else {
2983             outidx++;
2984             reqs[outidx].sector     = reqs[i].sector;
2985             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2986             reqs[outidx].qiov       = reqs[i].qiov;
2987         }
2988     }
2989 
2990     return outidx + 1;
2991 }
2992 
2993 /*
2994  * Submit multiple AIO write requests at once.
2995  *
2996  * On success, the function returns 0 and all requests in the reqs array have
2997  * been submitted. In the error case this function returns -1, and any of the
2998  * requests may or may not be submitted yet. In particular, this means that the
2999  * callback will be called for some of the requests, for others it won't. The
3000  * caller must check the error field of the BlockRequest to wait for the right
3001  * callbacks (if error != 0, no callback will be called).
3002  *
3003  * The implementation may modify the contents of the reqs array, e.g. to merge
3004  * requests. However, the fields opaque and error are left unmodified as they
3005  * are used to signal failure for a single request to the caller.
3006  */
3007 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3008 {
3009     MultiwriteCB *mcb;
3010     int i;
3011 
3012     /* don't submit writes if we don't have a medium */
3013     if (bs->drv == NULL) {
3014         for (i = 0; i < num_reqs; i++) {
3015             reqs[i].error = -ENOMEDIUM;
3016         }
3017         return -1;
3018     }
3019 
3020     if (num_reqs == 0) {
3021         return 0;
3022     }
3023 
3024     // Create MultiwriteCB structure
3025     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3026     mcb->num_requests = 0;
3027     mcb->num_callbacks = num_reqs;
3028 
3029     for (i = 0; i < num_reqs; i++) {
3030         mcb->callbacks[i].cb = reqs[i].cb;
3031         mcb->callbacks[i].opaque = reqs[i].opaque;
3032     }
3033 
3034     // Check for mergeable requests
3035     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3036 
3037     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3038 
3039     /* Run the aio requests. */
3040     mcb->num_requests = num_reqs;
3041     for (i = 0; i < num_reqs; i++) {
3042         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3043             reqs[i].nb_sectors, multiwrite_cb, mcb);
3044     }
3045 
3046     return 0;
3047 }
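
/* Example (illustrative sketch): batching two writes; adjacent or overlapping
 * requests may be merged by multiwrite_merge() before submission:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8,  .qiov = &qiov1, .cb = cb1, .opaque = op1 },
 *         { .sector = 8, .nb_sectors = 16, .qiov = &qiov2, .cb = cb2, .opaque = op2 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // check reqs[i].error to see which callbacks will still fire
 *     }
 */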
3048 
3049 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3050 {
3051     acb->pool->cancel(acb);
3052 }
3053 
3054 /* block I/O throttling */
3055 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3056                  bool is_write, double elapsed_time, uint64_t *wait)
3057 {
3058     uint64_t bps_limit = 0;
3059     double   bytes_limit, bytes_base, bytes_res;
3060     double   slice_time, wait_time;
3061 
3062     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3063         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3064     } else if (bs->io_limits.bps[is_write]) {
3065         bps_limit = bs->io_limits.bps[is_write];
3066     } else {
3067         if (wait) {
3068             *wait = 0;
3069         }
3070 
3071         return false;
3072     }
3073 
3074     slice_time = bs->slice_end - bs->slice_start;
3075     slice_time /= (NANOSECONDS_PER_SECOND);
3076     bytes_limit = bps_limit * slice_time;
3077     bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3078     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3079         bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3080     }
3081 
3082     /* bytes_base: the number of bytes already read/written, obtained from
3083      *             the historical statistics.
3084      * bytes_res:  the remaining bytes of data which need to be read/written.
3085      * (bytes_base + bytes_res) / bps_limit: the total time needed to
3086      *             complete reading/writing all of the data.
3087      */
3088     bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3089 
3090     if (bytes_base + bytes_res <= bytes_limit) {
3091         if (wait) {
3092             *wait = 0;
3093         }
3094 
3095         return false;
3096     }
3097 
3098     /* Calc approx time to dispatch */
3099     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3100 
3101     /* When the runtime I/O rate exceeds the limits, bs->slice_end needs to
3102      * be extended so that the current statistics are kept until the timer
3103      * fires. The slice time is therefore increased; the scaling factor was
3104      * tuned experimentally.
3105      */
3106     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3107     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3108     if (wait) {
3109         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3110     }
3111 
3112     return true;
3113 }
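
/* Worked example (illustrative): with bps_limit == 1048576 (1 MB/s) and a
 * 0.1 s slice, bytes_limit == 104857.6.  If 90000 bytes were already
 * transferred in this slice (bytes_base) and a 32-sector request arrives
 * (bytes_res == 32 * 512 == 16384), then 90000 + 16384 > 104857.6, so the
 * request is throttled and wait_time == (90000 + 16384) / 1048576 -
 * elapsed_time seconds estimates when it may be dispatched.
 */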
3114 
3115 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3116                              double elapsed_time, uint64_t *wait)
3117 {
3118     uint64_t iops_limit = 0;
3119     double   ios_limit, ios_base;
3120     double   slice_time, wait_time;
3121 
3122     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3123         iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3124     } else if (bs->io_limits.iops[is_write]) {
3125         iops_limit = bs->io_limits.iops[is_write];
3126     } else {
3127         if (wait) {
3128             *wait = 0;
3129         }
3130 
3131         return false;
3132     }
3133 
3134     slice_time = bs->slice_end - bs->slice_start;
3135     slice_time /= (NANOSECONDS_PER_SECOND);
3136     ios_limit  = iops_limit * slice_time;
3137     ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3138     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3139         ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3140     }
3141 
3142     if (ios_base + 1 <= ios_limit) {
3143         if (wait) {
3144             *wait = 0;
3145         }
3146 
3147         return false;
3148     }
3149 
3150     /* Calc approx time to dispatch */
3151     wait_time = (ios_base + 1) / iops_limit;
3152     if (wait_time > elapsed_time) {
3153         wait_time = wait_time - elapsed_time;
3154     } else {
3155         wait_time = 0;
3156     }
3157 
3158     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3159     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3160     if (wait) {
3161         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3162     }
3163 
3164     return true;
3165 }
3166 
3167 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3168                            bool is_write, int64_t *wait)
3169 {
3170     int64_t  now, max_wait;
3171     uint64_t bps_wait = 0, iops_wait = 0;
3172     double   elapsed_time;
3173     int      bps_ret, iops_ret;
3174 
3175     now = qemu_get_clock_ns(vm_clock);
3176     if ((bs->slice_start < now)
3177         && (bs->slice_end > now)) {
3178         bs->slice_end = now + bs->slice_time;
3179     } else {
3180         bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
3181         bs->slice_start = now;
3182         bs->slice_end   = now + bs->slice_time;
3183 
3184         bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
3185         bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3186 
3187         bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
3188         bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
3189     }
3190 
3191     elapsed_time  = now - bs->slice_start;
3192     elapsed_time  /= (NANOSECONDS_PER_SECOND);
3193 
3194     bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
3195                                       is_write, elapsed_time, &bps_wait);
3196     iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3197                                       elapsed_time, &iops_wait);
3198     if (bps_ret || iops_ret) {
3199         max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3200         if (wait) {
3201             *wait = max_wait;
3202         }
3203 
3204         now = qemu_get_clock_ns(vm_clock);
3205         if (bs->slice_end < now + max_wait) {
3206             bs->slice_end = now + max_wait;
3207         }
3208 
3209         return true;
3210     }
3211 
3212     if (wait) {
3213         *wait = 0;
3214     }
3215 
3216     return false;
3217 }
3218 
3219 /**************************************************************/
3220 /* async block device emulation */
3221 
3222 typedef struct BlockDriverAIOCBSync {
3223     BlockDriverAIOCB common;
3224     QEMUBH *bh;
3225     int ret;
3226     /* vector translation state */
3227     QEMUIOVector *qiov;
3228     uint8_t *bounce;
3229     int is_write;
3230 } BlockDriverAIOCBSync;
3231 
3232 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3233 {
3234     BlockDriverAIOCBSync *acb =
3235         container_of(blockacb, BlockDriverAIOCBSync, common);
3236     qemu_bh_delete(acb->bh);
3237     acb->bh = NULL;
3238     qemu_aio_release(acb);
3239 }
3240 
3241 static AIOPool bdrv_em_aio_pool = {
3242     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3243     .cancel             = bdrv_aio_cancel_em,
3244 };
3245 
3246 static void bdrv_aio_bh_cb(void *opaque)
3247 {
3248     BlockDriverAIOCBSync *acb = opaque;
3249 
3250     if (!acb->is_write)
3251         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
3252     qemu_vfree(acb->bounce);
3253     acb->common.cb(acb->common.opaque, acb->ret);
3254     qemu_bh_delete(acb->bh);
3255     acb->bh = NULL;
3256     qemu_aio_release(acb);
3257 }
3258 
3259 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3260                                             int64_t sector_num,
3261                                             QEMUIOVector *qiov,
3262                                             int nb_sectors,
3263                                             BlockDriverCompletionFunc *cb,
3264                                             void *opaque,
3265                                             int is_write)
3266 
3267 {
3268     BlockDriverAIOCBSync *acb;
3269 
3270     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3271     acb->is_write = is_write;
3272     acb->qiov = qiov;
3273     acb->bounce = qemu_blockalign(bs, qiov->size);
3274     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3275 
3276     if (is_write) {
3277         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
3278         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3279     } else {
3280         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3281     }
3282 
3283     qemu_bh_schedule(acb->bh);
3284 
3285     return &acb->common;
3286 }
3287 
3288 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3289         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3290         BlockDriverCompletionFunc *cb, void *opaque)
3291 {
3292     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3293 }
3294 
3295 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3296         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3297         BlockDriverCompletionFunc *cb, void *opaque)
3298 {
3299     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3300 }
3301 
3302 
3303 typedef struct BlockDriverAIOCBCoroutine {
3304     BlockDriverAIOCB common;
3305     BlockRequest req;
3306     bool is_write;
3307     QEMUBH* bh;
3308 } BlockDriverAIOCBCoroutine;
3309 
3310 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3311 {
3312     qemu_aio_flush();
3313 }
3314 
3315 static AIOPool bdrv_em_co_aio_pool = {
3316     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3317     .cancel             = bdrv_aio_co_cancel_em,
3318 };
3319 
3320 static void bdrv_co_em_bh(void *opaque)
3321 {
3322     BlockDriverAIOCBCoroutine *acb = opaque;
3323 
3324     acb->common.cb(acb->common.opaque, acb->req.error);
3325     qemu_bh_delete(acb->bh);
3326     qemu_aio_release(acb);
3327 }
3328 
3329 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3330 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3331 {
3332     BlockDriverAIOCBCoroutine *acb = opaque;
3333     BlockDriverState *bs = acb->common.bs;
3334 
3335     if (!acb->is_write) {
3336         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3337             acb->req.nb_sectors, acb->req.qiov, 0);
3338     } else {
3339         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3340             acb->req.nb_sectors, acb->req.qiov, 0);
3341     }
3342 
3343     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3344     qemu_bh_schedule(acb->bh);
3345 }
3346 
3347 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3348                                                int64_t sector_num,
3349                                                QEMUIOVector *qiov,
3350                                                int nb_sectors,
3351                                                BlockDriverCompletionFunc *cb,
3352                                                void *opaque,
3353                                                bool is_write)
3354 {
3355     Coroutine *co;
3356     BlockDriverAIOCBCoroutine *acb;
3357 
3358     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3359     acb->req.sector = sector_num;
3360     acb->req.nb_sectors = nb_sectors;
3361     acb->req.qiov = qiov;
3362     acb->is_write = is_write;
3363 
3364     co = qemu_coroutine_create(bdrv_co_do_rw);
3365     qemu_coroutine_enter(co, acb);
3366 
3367     return &acb->common;
3368 }
3369 
3370 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3371 {
3372     BlockDriverAIOCBCoroutine *acb = opaque;
3373     BlockDriverState *bs = acb->common.bs;
3374 
3375     acb->req.error = bdrv_co_flush(bs);
3376     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3377     qemu_bh_schedule(acb->bh);
3378 }
3379 
3380 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3381         BlockDriverCompletionFunc *cb, void *opaque)
3382 {
3383     trace_bdrv_aio_flush(bs, opaque);
3384 
3385     Coroutine *co;
3386     BlockDriverAIOCBCoroutine *acb;
3387 
3388     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3389     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3390     qemu_coroutine_enter(co, acb);
3391 
3392     return &acb->common;
3393 }
3394 
3395 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3396 {
3397     BlockDriverAIOCBCoroutine *acb = opaque;
3398     BlockDriverState *bs = acb->common.bs;
3399 
3400     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3401     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3402     qemu_bh_schedule(acb->bh);
3403 }
3404 
3405 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3406         int64_t sector_num, int nb_sectors,
3407         BlockDriverCompletionFunc *cb, void *opaque)
3408 {
3409     Coroutine *co;
3410     BlockDriverAIOCBCoroutine *acb;
3411 
3412     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3413 
3414     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3415     acb->req.sector = sector_num;
3416     acb->req.nb_sectors = nb_sectors;
3417     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3418     qemu_coroutine_enter(co, acb);
3419 
3420     return &acb->common;
3421 }
3422 
3423 void bdrv_init(void)
3424 {
3425     module_call_init(MODULE_INIT_BLOCK);
3426 }
3427 
3428 void bdrv_init_with_whitelist(void)
3429 {
3430     use_bdrv_whitelist = 1;
3431     bdrv_init();
3432 }
3433 
3434 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3435                    BlockDriverCompletionFunc *cb, void *opaque)
3436 {
3437     BlockDriverAIOCB *acb;
3438 
3439     if (pool->free_aiocb) {
3440         acb = pool->free_aiocb;
3441         pool->free_aiocb = acb->next;
3442     } else {
3443         acb = g_malloc0(pool->aiocb_size);
3444         acb->pool = pool;
3445     }
3446     acb->bs = bs;
3447     acb->cb = cb;
3448     acb->opaque = opaque;
3449     return acb;
3450 }
3451 
3452 void qemu_aio_release(void *p)
3453 {
3454     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3455     AIOPool *pool = acb->pool;
3456     acb->next = pool->free_aiocb;
3457     pool->free_aiocb = acb;
3458 }
3459 
3460 /**************************************************************/
3461 /* Coroutine block device emulation */
3462 
3463 typedef struct CoroutineIOCompletion {
3464     Coroutine *coroutine;
3465     int ret;
3466 } CoroutineIOCompletion;
3467 
3468 static void bdrv_co_io_em_complete(void *opaque, int ret)
3469 {
3470     CoroutineIOCompletion *co = opaque;
3471 
3472     co->ret = ret;
3473     qemu_coroutine_enter(co->coroutine, NULL);
3474 }
3475 
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

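/* Flush in up to three stages: the format driver's internal caches to the OS
 * (even with cache=unsafe), then the OS caches to disk unless BDRV_O_NO_FLUSH
 * is set, and finally the underlying protocol (bs->file). */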
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error wouldn't help,
         * because that would break guests even when the server operates in
         * writethrough mode.
         *
         * Let's hope the user knows what they're doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags &= ~BDRV_O_INCOMING;
    }
}

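/* Synchronous wrapper around bdrv_co_flush(): runs the coroutine entry
 * directly when already in coroutine context, otherwise spawns a coroutine
 * and waits for completion with qemu_aio_wait(). */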
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

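/* Discard nb_sectors starting at sector_num, preferring the driver's native
 * coroutine implementation and falling back to its AIO implementation.
 * Drivers that implement neither callback simply return success, since
 * discard is advisory. */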
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

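/* Synchronous wrapper around bdrv_co_discard(), following the same pattern
 * as bdrv_flush() above. */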
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

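/* Enable or disable dirty tracking: allocate a bitmap with one bit per
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors (rounded up) on enable, and free it
 * on disable.  The dirty count is reset in both cases. */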
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

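/* Test the dirty bit for the chunk containing @sector.  Returns 0 when dirty
 * tracking is disabled or the sector lies beyond the end of the device. */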
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}

void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

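/* I/O accounting: bdrv_acct_start() records the size, start time and type of
 * a request in @cookie; bdrv_acct_done() folds the cookie back into the
 * per-device byte, operation and latency counters.  An illustrative sketch
 * of how a device model might use the pair:
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     ... issue the request and wait for its completion ...
 *     bdrv_acct_done(bs, &cookie);
 */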
void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
                     int64_t bytes, enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}

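/* Create an image file.  Resolves the format and protocol drivers, merges
 * their creation options with the -o string in @options, validates the
 * backing file settings, and derives the size from the backing file when
 * none was given explicitly. */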
int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there. */
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t backing_size;
            char buf[32];
            int back_flags;

            /* backing files are always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= BDRV_SECTOR_SIZE;

            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}

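/* Allocate and initialize a block job of @job_type on @bs.  Fails if the
 * device already has a job or is otherwise in use; on success the device is
 * marked in-use until block_job_complete() releases it.  An illustrative
 * (not prescriptive) shape for a job's coroutine:
 *
 *     while (!block_job_is_cancelled(job)) {
 *         ... perform one unit of work ...
 *         block_job_sleep_ns(job, rt_clock, delay_ns);
 *     }
 *     block_job_complete(job, ret);
 */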
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type      = job_type;
    job->bs            = bs;
    job->cb            = cb;
    job->opaque        = opaque;
    job->busy          = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;

        block_job_set_speed(job, speed, &local_err);
        if (error_is_set(&local_err)) {
            bs->job = NULL;
            g_free(job);
            bdrv_set_in_use(bs, 0);
            error_propagate(errp, local_err);
            return NULL;
        }
    }
    return job;
}

void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}

void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }
    job->job_type->set_speed(job, speed, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    job->speed = speed;
}

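/* Request cancellation.  If the job is sleeping in block_job_sleep_ns()
 * (busy == false), re-enter its coroutine so it notices the cancellation
 * promptly; a busy job is expected to poll block_job_is_cancelled(). */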
void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
    if (job->co && !job->busy) {
        qemu_coroutine_enter(job->co, NULL);
    }
}

bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}

struct BlockCancelData {
    BlockJob *job;
    BlockDriverCompletionFunc *cb;
    void *opaque;
    bool cancelled;
    int ret;
};

static void block_job_cancel_cb(void *opaque, int ret)
{
    struct BlockCancelData *data = opaque;

    data->cancelled = block_job_is_cancelled(data->job);
    data->ret = ret;
    data->cb(data->opaque, ret);
}

int block_job_cancel_sync(BlockJob *job)
{
    struct BlockCancelData data;
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);

    /* Set up our own callback to store the result and chain to
     * the original callback.
     */
    data.job = job;
    data.cb = job->cb;
    data.opaque = job->opaque;
    data.ret = -EINPROGRESS;
    job->cb = block_job_cancel_cb;
    job->opaque = &data;
    block_job_cancel(job);
    while (data.ret == -EINPROGRESS) {
        qemu_aio_wait();
    }
    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
}

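/* Yield the job's coroutine for @ns nanoseconds, marking the job as not busy
 * so block_job_cancel() can wake it early.  Skips sleeping entirely when the
 * job is already cancelled. */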
void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
{
    /* Check cancellation *before* setting busy = false, too!  */
    if (!block_job_is_cancelled(job)) {
        job->busy = false;
        co_sleep_ns(clock, ns);
        job->busy = true;
    }
}
