xref: /openbmc/qemu/block.c (revision 1d10b445)
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled I/Os */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}

/* This function makes an I/O wait if needed
 *
 * @bytes:      the number of bytes of the I/O
 * @is_write:   is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}
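
/* Illustrative usage sketch (not part of the original file): throttling is
 * enabled first and the limits installed afterwards, as required by the
 * comment above bdrv_io_limits_enable().  The ThrottleConfig field names
 * below follow util/throttle.c of this era and are an assumption here:
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;
 *     bdrv_io_limits_enable(bs);
 *     bdrv_set_io_limits(bs, &cfg);
 *
 * The read/write paths then call bdrv_io_limits_intercept(bs, bytes,
 * is_write) before each request; the coroutine blocks until the request
 * fits within the configured budget.
 */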

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}
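
/* For illustration: path_has_protocol("nbd:localhost:10809") returns non-zero
 * because ':' is reached before any path separator, while "/dev/sda" returns
 * 0, and on Windows a drive prefix such as "c:\\disk.img" is explicitly
 * excluded above. */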

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
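
/* Worked example: path_combine(dest, sz, "/img/base.qcow2", "backing.qcow2")
 * yields "/img/backing.qcow2", while an absolute filename such as
 * "/other/backing.qcow2" is copied through unchanged.  For protocol paths,
 * the prefix up to the last '/' (or through the ':' when there is no '/')
 * is kept. */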

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name, Error **errp)
{
    BlockDriverState *bs;
    int i;

    if (bdrv_find(device_name)) {
        error_setg(errp, "Device with id '%s' already exists",
                   device_name);
        return NULL;
    }
    if (bdrv_find_node(device_name)) {
        error_setg(errp, "Device with node-name '%s' already exists",
                   device_name);
        return NULL;
    }

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}
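
/* A minimal creation sketch, reusing the option helpers that
 * bdrv_append_temp_snapshot() below also relies on (assumes the qcow2
 * driver is compiled in):
 *
 *     Error *err = NULL;
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *opts =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1048576);
 *     int ret = bdrv_create(drv, "test.qcow2", opts, &err);
 *     free_option_parameters(opts);
 */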

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
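
/* Typical use, mirroring bdrv_append_temp_snapshot() below: allocate a
 * PATH_MAX sized buffer, let this function fill it, and unlink the file
 * once it is no longer needed:
 *
 *     char *tmp = g_malloc0(PATH_MAX + 1);
 *     int ret = get_tmp_filename(tmp, PATH_MAX + 1);
 *     ...
 *     g_free(tmp);
 */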

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems, those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
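
/* Resolution examples, following the order above: "/dev/cdrom" is claimed
 * first by a host-device driver via bdrv_probe_device(); "/tmp/disk.img" has
 * no protocol prefix and falls back to the "file" driver; and
 * "nbd:localhost:10809" selects the driver whose protocol_name is "nbd". */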

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}
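
/* E.g. parsing "unmap" (or "on") sets BDRV_O_UNMAP in *flags, "ignore" (or
 * "off") leaves it cleared, and any other string is rejected with -1. */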

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
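
/* The resulting flag mapping, for reference:
 *
 *     mode          NOCACHE   CACHE_WB   NO_FLUSH
 *     off/none        yes        yes        no
 *     directsync      yes        no         no
 *     writeback       no         yes        no
 *     unsafe          no         yes        yes
 *     writethrough    no         no         no
 */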

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol as drv. This layer is
     * already opened, so assign it to bs (while file becomes a closed
     * BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is an indirect pointer to a QDict of options to pass to the block
 * drivers, or pointer to NULL for an empty set of options. If this function
 * takes ownership of the QDict reference, it will set *options to NULL;
 * otherwise, it will contain unused/unrecognized options after this function
 * returns. Then, the caller is responsible for freeing it. If it intends to
 * reuse the QDict, QINCREF() should be called beforehand.
 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        parse_filename = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    bs->growable = 1;
    return 0;

fail:
    return ret;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bs->device_name);
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to the check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    backing_hd = bdrv_new("", errp);

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
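
/* As an illustration, with bdref_key = "file" a flattened options QDict such
 * as { "file.driver": "file", "file.filename": "disk.img" } has its "file."
 * entries extracted into image_options, so the image is opened with
 * { "driver": "file", "filename": "disk.img" }.  Alternatively,
 * { "file": "node0" } references an already existing block device by name. */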

void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QEMUOptionParameter *create_options;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err = NULL;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }
    total_size &= BDRV_SECTOR_MASK;

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                             NULL);

    set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

    ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
    free_option_parameters(create_options);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new("", &error_abort);

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
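
/* For example, the pseudo-filename
 *
 *     json:{"driver": "qcow2", "file.filename": "disk.qcow2"}
 *
 * parses into a flattened QDict equivalent to passing those keys as options
 * directly; the "json:" prefix is what bdrv_open() checks for below. */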

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            ret = -EINVAL;
            goto fail;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(options, json_options, false);
        QDECREF(json_options);
        filename = NULL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }
    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_flags = bdrv_temp_snapshot_flags(flags);
        flags = bdrv_backing_flags(flags);
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (!drv) {
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }

done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
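
/* A minimal open sketch (illustrative only), assuming a qcow2 image on a
 * local file; the reference to 'opts' is consumed even on failure, per the
 * contract above:
 *
 *     Error *err = NULL;
 *     BlockDriverState *bs = NULL;
 *     QDict *opts = qdict_new();
 *     qdict_put(opts, "driver", qstring_from_str("qcow2"));
 *     int ret = bdrv_open(&bs, "disk.qcow2", NULL, opts, BDRV_O_RDWR,
 *                         NULL, &err);
 */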

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had
 * QSIMPLEQ_INIT already performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}
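
/* The transactional pattern, as used by bdrv_reopen() below: build up one
 * queue for all devices that must change together, then reopen atomically;
 * bdrv_reopen_multiple() frees the queue:
 *
 *     BlockReopenQueue *q = NULL;
 *     q = bdrv_reopen_queue(q, bs0, flags0);
 *     q = bdrv_reopen_queue(q, bs1, flags1);
 *     int ret = bdrv_reopen_multiple(q, &err);
 */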
1620 
1621 /*
1622  * Reopen multiple BlockDriverStates atomically & transactionally.
1623  *
1624  * The queue passed in (bs_queue) must have been built up previous
1625  * via bdrv_reopen_queue().
1626  *
1627  * Reopens all BDS specified in the queue, with the appropriate
1628  * flags.  All devices are prepared for reopen, and failure of any
1629  * device will cause all device changes to be abandonded, and intermediate
1630  * data cleaned up.
1631  *
1632  * If all devices prepare successfully, then the changes are committed
1633  * to all devices.
1634  *
1635  */
1636 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1637 {
1638     int ret = -1;
1639     BlockReopenQueueEntry *bs_entry, *next;
1640     Error *local_err = NULL;
1641 
1642     assert(bs_queue != NULL);
1643 
1644     bdrv_drain_all();
1645 
1646     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1647         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1648             error_propagate(errp, local_err);
1649             goto cleanup;
1650         }
1651         bs_entry->prepared = true;
1652     }
1653 
1654     /* If we reach this point, we have success and just need to apply the
1655      * changes
1656      */
1657     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1658         bdrv_reopen_commit(&bs_entry->state);
1659     }
1660 
1661     ret = 0;
1662 
1663 cleanup:
1664     QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1665         if (ret && bs_entry->prepared) {
1666             bdrv_reopen_abort(&bs_entry->state);
1667         }
1668         g_free(bs_entry);
1669     }
1670     g_free(bs_queue);
1671     return ret;
1672 }
1673 
1674 
1675 /* Reopen a single BlockDriverState with the specified flags. */
1676 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1677 {
1678     int ret = -1;
1679     Error *local_err = NULL;
1680     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1681 
1682     ret = bdrv_reopen_multiple(queue, &local_err);
1683     if (local_err != NULL) {
1684         error_propagate(errp, local_err);
1685     }
1686     return ret;
1687 }
1688 
1689 
1690 /*
1691  * Prepares a BlockDriverState for reopen. All changes are staged in the
1692  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1693  * the block driver layer .bdrv_reopen_prepare()
1694  *
1695  * bs is the BlockDriverState to reopen
1696  * flags are the new open flags
1697  * queue is the reopen queue
1698  *
1699  * Returns 0 on success, non-zero on error.  On error errp will be set
1700  * as well.
1701  *
1702  * On failure, bdrv_reopen_abort() will be called to clean up any data.
1703  * It is the responsibility of the caller to then call the abort() or
1704  * commit() for any other BDS that have been left in a prepare() state
1705  *
1706  */
1707 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1708                         Error **errp)
1709 {
1710     int ret = -1;
1711     Error *local_err = NULL;
1712     BlockDriver *drv;
1713 
1714     assert(reopen_state != NULL);
1715     assert(reopen_state->bs->drv != NULL);
1716     drv = reopen_state->bs->drv;
1717 
1718     /* if we are to stay read-only, do not allow permission change
1719      * to r/w */
1720     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1721         reopen_state->flags & BDRV_O_RDWR) {
1722         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1723                   reopen_state->bs->device_name);
1724         goto error;
1725     }
1726 
1727 
1728     ret = bdrv_flush(reopen_state->bs);
1729     if (ret) {
1730         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1731                   strerror(-ret));
1732         goto error;
1733     }
1734 
1735     if (drv->bdrv_reopen_prepare) {
1736         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1737         if (ret) {
1738             if (local_err != NULL) {
1739                 error_propagate(errp, local_err);
1740             } else {
1741                 error_setg(errp, "failed while preparing to reopen image '%s'",
1742                            reopen_state->bs->filename);
1743             }
1744             goto error;
1745         }
1746     } else {
1747         /* It is currently mandatory to have a bdrv_reopen_prepare()
1748          * handler for each supported driver. */
1749         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1750                   drv->format_name, reopen_state->bs->device_name,
1751                  "reopening of file");
1752         ret = -1;
1753         goto error;
1754     }
1755 
1756     ret = 0;
1757 
1758 error:
1759     return ret;
1760 }
1761 
1762 /*
1763  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1764  * makes them final by swapping the staging BlockDriverState contents into
1765  * the active BlockDriverState contents.
1766  */
1767 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1768 {
1769     BlockDriver *drv;
1770 
1771     assert(reopen_state != NULL);
1772     drv = reopen_state->bs->drv;
1773     assert(drv != NULL);
1774 
1775     /* If there are any driver level actions to take */
1776     if (drv->bdrv_reopen_commit) {
1777         drv->bdrv_reopen_commit(reopen_state);
1778     }
1779 
1780     /* set BDS specific flags now */
1781     reopen_state->bs->open_flags         = reopen_state->flags;
1782     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1783                                               BDRV_O_CACHE_WB);
1784     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1785 
1786     bdrv_refresh_limits(reopen_state->bs);
1787 }
1788 
1789 /*
1790  * Abort the reopen, and delete and free the staged changes in
1791  * reopen_state
1792  */
1793 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1794 {
1795     BlockDriver *drv;
1796 
1797     assert(reopen_state != NULL);
1798     drv = reopen_state->bs->drv;
1799     assert(drv != NULL);
1800 
1801     if (drv->bdrv_reopen_abort) {
1802         drv->bdrv_reopen_abort(reopen_state);
1803     }
1804 }
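
/* Editor's sketch (hypothetical code, not part of the original file): the
 * shape of a format driver's three-phase reopen implementation.  All staged
 * state lives in reopen_state->opaque between prepare and commit/abort;
 * 'ExampleReopenState' and its field are invented names. */
typedef struct ExampleReopenState {
    int staged_flags;                       /* changes staged, not applied */
} ExampleReopenState;

static int example_reopen_prepare(BDRVReopenState *reopen_state,
                                  BlockReopenQueue *queue, Error **errp)
{
    ExampleReopenState *s = g_new0(ExampleReopenState, 1);

    s->staged_flags = reopen_state->flags;  /* stage only, do not apply */
    reopen_state->opaque = s;
    return 0;
}

static void example_reopen_commit(BDRVReopenState *reopen_state)
{
    /* apply the staged changes here, then free the staging area */
    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
}

static void example_reopen_abort(BDRVReopenState *reopen_state)
{
    /* discard the staged changes without applying them */
    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
}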
1805 
1806 
1807 void bdrv_close(BlockDriverState *bs)
1808 {
1809     if (bs->job) {
1810         block_job_cancel_sync(bs->job);
1811     }
1812     bdrv_drain_all(); /* complete I/O */
1813     bdrv_flush(bs);
1814     bdrv_drain_all(); /* in case flush left pending I/O */
1815     notifier_list_notify(&bs->close_notifiers, bs);
1816 
1817     if (bs->drv) {
1818         if (bs->backing_hd) {
1819             BlockDriverState *backing_hd = bs->backing_hd;
1820             bdrv_set_backing_hd(bs, NULL);
1821             bdrv_unref(backing_hd);
1822         }
1823         bs->drv->bdrv_close(bs);
1824         g_free(bs->opaque);
1825         bs->opaque = NULL;
1826         bs->drv = NULL;
1827         bs->copy_on_read = 0;
1828         bs->backing_file[0] = '\0';
1829         bs->backing_format[0] = '\0';
1830         bs->total_sectors = 0;
1831         bs->encrypted = 0;
1832         bs->valid_key = 0;
1833         bs->sg = 0;
1834         bs->growable = 0;
1835         bs->zero_beyond_eof = false;
1836         QDECREF(bs->options);
1837         bs->options = NULL;
1838 
1839         if (bs->file != NULL) {
1840             bdrv_unref(bs->file);
1841             bs->file = NULL;
1842         }
1843     }
1844 
1845     bdrv_dev_change_media_cb(bs, false);
1846 
1847     /*throttling disk I/O limits*/
1848     if (bs->io_limits_enabled) {
1849         bdrv_io_limits_disable(bs);
1850     }
1851 }
1852 
1853 void bdrv_close_all(void)
1854 {
1855     BlockDriverState *bs;
1856 
1857     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1858         bdrv_close(bs);
1859     }
1860 }
1861 
1862 /* Check if any requests are in-flight (including throttled requests) */
1863 static bool bdrv_requests_pending(BlockDriverState *bs)
1864 {
1865     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1866         return true;
1867     }
1868     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1869         return true;
1870     }
1871     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1872         return true;
1873     }
1874     if (bs->file && bdrv_requests_pending(bs->file)) {
1875         return true;
1876     }
1877     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1878         return true;
1879     }
1880     return false;
1881 }
1882 
1883 static bool bdrv_requests_pending_all(void)
1884 {
1885     BlockDriverState *bs;
1886     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1887         if (bdrv_requests_pending(bs)) {
1888             return true;
1889         }
1890     }
1891     return false;
1892 }
1893 
1894 /*
1895  * Wait for pending requests to complete across all BlockDriverStates
1896  *
1897  * This function does not flush data to disk, use bdrv_flush_all() for that
1898  * after calling this function.
1899  *
1900  * Note that completion of an asynchronous I/O operation can trigger any
1901  * number of other I/O operations on other devices---for example a coroutine
1902  * can be arbitrarily complex and a constant flow of I/O can come until the
1903  * coroutine is complete.  Because of this, it is not possible to have a
1904  * function to drain a single device's I/O queue.
1905  */
1906 void bdrv_drain_all(void)
1907 {
1908     /* Always run first iteration so any pending completion BHs run */
1909     bool busy = true;
1910     BlockDriverState *bs;
1911 
1912     while (busy) {
1913         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1914             bdrv_start_throttled_reqs(bs);
1915         }
1916 
1917         busy = bdrv_requests_pending_all();
1918         busy |= aio_poll(qemu_get_aio_context(), busy);
1919     }
1920 }
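
/* Editor's illustration (not part of the original file): the drain-then-
 * flush pairing suggested by the comment above, e.g. before taking a
 * snapshot of all devices. */
static void example_quiesce_all(void)
{
    bdrv_drain_all();  /* wait for in-flight and throttled requests */
    bdrv_flush_all();  /* then flush completed writes to stable storage */
}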
1921 
1922 /* Make a BlockDriverState anonymous by removing it from the bdrv_states and
1923  * graph_bdrv_states lists.  Also, clear the device_name and node_name to
1924  * prevent a double remove. */
1925 void bdrv_make_anon(BlockDriverState *bs)
1926 {
1927     if (bs->device_name[0] != '\0') {
1928         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
1929     }
1930     bs->device_name[0] = '\0';
1931     if (bs->node_name[0] != '\0') {
1932         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1933     }
1934     bs->node_name[0] = '\0';
1935 }
1936 
1937 static void bdrv_rebind(BlockDriverState *bs)
1938 {
1939     if (bs->drv && bs->drv->bdrv_rebind) {
1940         bs->drv->bdrv_rebind(bs);
1941     }
1942 }
1943 
1944 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1945                                      BlockDriverState *bs_src)
1946 {
1947     /* move some fields that need to stay attached to the device */
1948 
1949     /* dev info */
1950     bs_dest->dev_ops            = bs_src->dev_ops;
1951     bs_dest->dev_opaque         = bs_src->dev_opaque;
1952     bs_dest->dev                = bs_src->dev;
1953     bs_dest->guest_block_size   = bs_src->guest_block_size;
1954     bs_dest->copy_on_read       = bs_src->copy_on_read;
1955 
1956     bs_dest->enable_write_cache = bs_src->enable_write_cache;
1957 
1958     /* i/o throttled req */
1959     memcpy(&bs_dest->throttle_state,
1960            &bs_src->throttle_state,
1961            sizeof(ThrottleState));
1962     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
1963     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
1964     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
1965 
1966     /* r/w error */
1967     bs_dest->on_read_error      = bs_src->on_read_error;
1968     bs_dest->on_write_error     = bs_src->on_write_error;
1969 
1970     /* i/o status */
1971     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
1972     bs_dest->iostatus           = bs_src->iostatus;
1973 
1974     /* dirty bitmap */
1975     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
1976 
1977     /* reference count */
1978     bs_dest->refcnt             = bs_src->refcnt;
1979 
1980     /* job */
1981     bs_dest->job                = bs_src->job;
1982 
1983     /* keep the same entry in bdrv_states */
1984     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1985             bs_src->device_name);
1986     bs_dest->device_list = bs_src->device_list;
1987     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
1988            sizeof(bs_dest->op_blockers));
1989 }
1990 
1991 /*
1992  * Swap bs contents for two image chains while they are live,
1993  * while keeping required fields on the BlockDriverState that is
1994  * actually attached to a device.
1995  *
1996  * This will modify the BlockDriverState fields, and swap contents
1997  * between bs_new and bs_old. Both bs_new and bs_old are modified.
1998  *
1999  * bs_new is required to be anonymous.
2000  *
2001  * This function does not create any image files.
2002  */
2003 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2004 {
2005     BlockDriverState tmp;
2006 
2007     /* The code needs to swap the node_name but simply swapping node_list won't
2008      * work, so first remove the nodes from the graph list, do the swap, and
2009      * then insert them back if needed.
2010      */
2011     if (bs_new->node_name[0] != '\0') {
2012         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2013     }
2014     if (bs_old->node_name[0] != '\0') {
2015         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2016     }
2017 
2018     /* bs_new must be anonymous and shouldn't have anything fancy enabled */
2019     assert(bs_new->device_name[0] == '\0');
2020     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2021     assert(bs_new->job == NULL);
2022     assert(bs_new->dev == NULL);
2023     assert(bs_new->io_limits_enabled == false);
2024     assert(!throttle_have_timer(&bs_new->throttle_state));
2025 
2026     tmp = *bs_new;
2027     *bs_new = *bs_old;
2028     *bs_old = tmp;
2029 
2030     /* there are some fields that should not be swapped, move them back */
2031     bdrv_move_feature_fields(&tmp, bs_old);
2032     bdrv_move_feature_fields(bs_old, bs_new);
2033     bdrv_move_feature_fields(bs_new, &tmp);
2034 
2035     /* bs_new shouldn't be in bdrv_states even after the swap!  */
2036     assert(bs_new->device_name[0] == '\0');
2037 
2038     /* Check a few fields that should remain attached to the device */
2039     assert(bs_new->dev == NULL);
2040     assert(bs_new->job == NULL);
2041     assert(bs_new->io_limits_enabled == false);
2042     assert(!throttle_have_timer(&bs_new->throttle_state));
2043 
2044     /* insert the nodes back into the graph node list if needed */
2045     if (bs_new->node_name[0] != '\0') {
2046         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2047     }
2048     if (bs_old->node_name[0] != '\0') {
2049         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2050     }
2051 
2052     bdrv_rebind(bs_new);
2053     bdrv_rebind(bs_old);
2054 }
2055 
2056 /*
2057  * Add new bs contents at the top of an image chain while the chain is
2058  * live, while keeping required fields on the top layer.
2059  *
2060  * This will modify the BlockDriverState fields, and swap contents
2061  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2062  *
2063  * bs_new is required to be anonymous.
2064  *
2065  * This function does not create any image files.
2066  */
2067 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2068 {
2069     bdrv_swap(bs_new, bs_top);
2070 
2071     /* After the swap, bs_new contains the old bs_top contents, so chain
2072      * it below the new top as its backing file. */
2073     bdrv_set_backing_hd(bs_top, bs_new);
2074 }
2075 
2076 static void bdrv_delete(BlockDriverState *bs)
2077 {
2078     assert(!bs->dev);
2079     assert(!bs->job);
2080     assert(bdrv_op_blocker_is_empty(bs));
2081     assert(!bs->refcnt);
2082     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2083 
2084     bdrv_close(bs);
2085 
2086     /* remove from list, if necessary */
2087     bdrv_make_anon(bs);
2088 
2089     g_free(bs);
2090 }
2091 
2092 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2093 /* TODO change to DeviceState *dev when all users are qdevified */
2094 {
2095     if (bs->dev) {
2096         return -EBUSY;
2097     }
2098     bs->dev = dev;
2099     bdrv_iostatus_reset(bs);
2100     return 0;
2101 }
2102 
2103 /* TODO qdevified devices don't use this, remove when devices are qdevified */
2104 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
2105 {
2106     if (bdrv_attach_dev(bs, dev) < 0) {
2107         abort();
2108     }
2109 }
2110 
2111 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2112 /* TODO change to DeviceState *dev when all users are qdevified */
2113 {
2114     assert(bs->dev == dev);
2115     bs->dev = NULL;
2116     bs->dev_ops = NULL;
2117     bs->dev_opaque = NULL;
2118     bs->guest_block_size = 512;
2119 }
2120 
2121 /* TODO change to return DeviceState * when all users are qdevified */
2122 void *bdrv_get_attached_dev(BlockDriverState *bs)
2123 {
2124     return bs->dev;
2125 }
2126 
2127 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2128                       void *opaque)
2129 {
2130     bs->dev_ops = ops;
2131     bs->dev_opaque = opaque;
2132 }
2133 
2134 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2135                                enum MonitorEvent ev,
2136                                BlockErrorAction action, bool is_read)
2137 {
2138     QObject *data;
2139     const char *action_str;
2140 
2141     switch (action) {
2142     case BDRV_ACTION_REPORT:
2143         action_str = "report";
2144         break;
2145     case BDRV_ACTION_IGNORE:
2146         action_str = "ignore";
2147         break;
2148     case BDRV_ACTION_STOP:
2149         action_str = "stop";
2150         break;
2151     default:
2152         abort();
2153     }
2154 
2155     data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2156                               bdrv->device_name,
2157                               action_str,
2158                               is_read ? "read" : "write");
2159     monitor_protocol_event(ev, data);
2160 
2161     qobject_decref(data);
2162 }
2163 
2164 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2165 {
2166     QObject *data;
2167 
2168     data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2169                               bdrv_get_device_name(bs), ejected);
2170     monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2171 
2172     qobject_decref(data);
2173 }
2174 
2175 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
2176 {
2177     if (bs->dev_ops && bs->dev_ops->change_media_cb) {
2178         bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
2179         bs->dev_ops->change_media_cb(bs->dev_opaque, load);
2180         if (tray_was_closed) {
2181             /* tray open */
2182             bdrv_emit_qmp_eject_event(bs, true);
2183         }
2184         if (load) {
2185             /* tray close */
2186             bdrv_emit_qmp_eject_event(bs, false);
2187         }
2188     }
2189 }
2190 
2191 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2192 {
2193     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2194 }
2195 
2196 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2197 {
2198     if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2199         bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2200     }
2201 }
2202 
2203 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2204 {
2205     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2206         return bs->dev_ops->is_tray_open(bs->dev_opaque);
2207     }
2208     return false;
2209 }
2210 
2211 static void bdrv_dev_resize_cb(BlockDriverState *bs)
2212 {
2213     if (bs->dev_ops && bs->dev_ops->resize_cb) {
2214         bs->dev_ops->resize_cb(bs->dev_opaque);
2215     }
2216 }
2217 
2218 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2219 {
2220     if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2221         return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2222     }
2223     return false;
2224 }
2225 
2226 /*
2227  * Run consistency checks on an image
2228  *
2229  * Returns 0 if the check could be completed (it doesn't mean that the image is
2230  * free of errors) or -errno when an internal error occurred. The results of the
2231  * check are stored in res.
2232  */
2233 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2234 {
2235     if (bs->drv->bdrv_check == NULL) {
2236         return -ENOTSUP;
2237     }
2238 
2239     memset(res, 0, sizeof(*res));
2240     return bs->drv->bdrv_check(bs, res, fix);
2241 }
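
/* Editor's illustration (not part of the original file): invoking the
 * consistency check; BDRV_FIX_ERRORS and the result fields follow the
 * BdrvCheckMode/BdrvCheckResult definitions in the block headers. */
static int example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult result;
    int ret = bdrv_check(bs, &result, BDRV_FIX_ERRORS);

    if (ret < 0) {
        return ret;                 /* the check itself could not run */
    }
    return result.corruptions ? -EIO : 0;
}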
2242 
2243 #define COMMIT_BUF_SECTORS 2048
2244 
2245 /* commit COW file into the raw image */
2246 int bdrv_commit(BlockDriverState *bs)
2247 {
2248     BlockDriver *drv = bs->drv;
2249     int64_t sector, total_sectors, length, backing_length;
2250     int n, ro, open_flags;
2251     int ret = 0;
2252     uint8_t *buf = NULL;
2253     char filename[PATH_MAX];
2254 
2255     if (!drv) {
2256         return -ENOMEDIUM;
2257     }
2258     if (!bs->backing_hd) {
2259         return -ENOTSUP;
2260     }
2261 
2262     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2263         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2264         return -EBUSY;
2265     }
2266 
2267     ro = bs->backing_hd->read_only;
2268     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2269     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2270     open_flags =  bs->backing_hd->open_flags;
2271 
2272     if (ro) {
2273         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2274             return -EACCES;
2275         }
2276     }
2277 
2278     length = bdrv_getlength(bs);
2279     if (length < 0) {
2280         ret = length;
2281         goto ro_cleanup;
2282     }
2283 
2284     backing_length = bdrv_getlength(bs->backing_hd);
2285     if (backing_length < 0) {
2286         ret = backing_length;
2287         goto ro_cleanup;
2288     }
2289 
2290     /* If our top snapshot is larger than the backing file image,
2291      * grow the backing file image if possible.  If not possible,
2292      * we must return an error */
2293     if (length > backing_length) {
2294         ret = bdrv_truncate(bs->backing_hd, length);
2295         if (ret < 0) {
2296             goto ro_cleanup;
2297         }
2298     }
2299 
2300     total_sectors = length >> BDRV_SECTOR_BITS;
2301     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2302 
2303     for (sector = 0; sector < total_sectors; sector += n) {
2304         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2305         if (ret < 0) {
2306             goto ro_cleanup;
2307         }
2308         if (ret) {
2309             ret = bdrv_read(bs, sector, buf, n);
2310             if (ret < 0) {
2311                 goto ro_cleanup;
2312             }
2313 
2314             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2315             if (ret < 0) {
2316                 goto ro_cleanup;
2317             }
2318         }
2319     }
2320 
2321     if (drv->bdrv_make_empty) {
2322         ret = drv->bdrv_make_empty(bs);
2323         if (ret < 0) {
2324             goto ro_cleanup;
2325         }
2326         bdrv_flush(bs);
2327     }
2328 
2329     /*
2330      * Make sure all data we wrote to the backing device is actually
2331      * stable on disk.
2332      */
2333     if (bs->backing_hd) {
2334         bdrv_flush(bs->backing_hd);
2335     }
2336 
2337     ret = 0;
2338 ro_cleanup:
2339     g_free(buf);
2340 
2341     if (ro) {
2342         /* ignoring error return here */
2343         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2344     }
2345 
2346     return ret;
2347 }
2348 
2349 int bdrv_commit_all(void)
2350 {
2351     BlockDriverState *bs;
2352 
2353     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2354         if (bs->drv && bs->backing_hd) {
2355             int ret = bdrv_commit(bs);
2356             if (ret < 0) {
2357                 return ret;
2358             }
2359         }
2360     }
2361     return 0;
2362 }
2363 
2364 /**
2365  * Remove an active request from the tracked requests list
2366  *
2367  * This function should be called when a tracked request is completing.
2368  */
2369 static void tracked_request_end(BdrvTrackedRequest *req)
2370 {
2371     if (req->serialising) {
2372         req->bs->serialising_in_flight--;
2373     }
2374 
2375     QLIST_REMOVE(req, list);
2376     qemu_co_queue_restart_all(&req->wait_queue);
2377 }
2378 
2379 /**
2380  * Add an active request to the tracked requests list
2381  */
2382 static void tracked_request_begin(BdrvTrackedRequest *req,
2383                                   BlockDriverState *bs,
2384                                   int64_t offset,
2385                                   unsigned int bytes, bool is_write)
2386 {
2387     *req = (BdrvTrackedRequest){
2388         .bs = bs,
2389         .offset         = offset,
2390         .bytes          = bytes,
2391         .is_write       = is_write,
2392         .co             = qemu_coroutine_self(),
2393         .serialising    = false,
2394         .overlap_offset = offset,
2395         .overlap_bytes  = bytes,
2396     };
2397 
2398     qemu_co_queue_init(&req->wait_queue);
2399 
2400     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2401 }
2402 
2403 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2404 {
2405     int64_t overlap_offset = req->offset & ~(align - 1);
2406     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2407                                - overlap_offset;
2408 
2409     if (!req->serialising) {
2410         req->bs->serialising_in_flight++;
2411         req->serialising = true;
2412     }
2413 
2414     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2415     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2416 }
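
/* Worked example (editor's note): with req->offset == 1536, req->bytes ==
 * 1024 and align == 4096, overlap_offset is rounded down to 0 and
 * overlap_bytes becomes ROUND_UP(1536 + 1024, 4096) - 0 == 4096, i.e. the
 * request serialises against the whole first 4 KiB block. */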
2417 
2418 /**
2419  * Round a region to cluster boundaries
2420  */
2421 void bdrv_round_to_clusters(BlockDriverState *bs,
2422                             int64_t sector_num, int nb_sectors,
2423                             int64_t *cluster_sector_num,
2424                             int *cluster_nb_sectors)
2425 {
2426     BlockDriverInfo bdi;
2427 
2428     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2429         *cluster_sector_num = sector_num;
2430         *cluster_nb_sectors = nb_sectors;
2431     } else {
2432         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2433         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2434         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2435                                             nb_sectors, c);
2436     }
2437 }
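
/* Worked example (editor's note): with a 64 KiB cluster size, c ==
 * 65536 / 512 == 128 sectors; a request for sector_num == 5, nb_sectors ==
 * 2 is widened to cluster_sector_num == 0, cluster_nb_sectors == 128. */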
2438 
2439 static int bdrv_get_cluster_size(BlockDriverState *bs)
2440 {
2441     BlockDriverInfo bdi;
2442     int ret;
2443 
2444     ret = bdrv_get_info(bs, &bdi);
2445     if (ret < 0 || bdi.cluster_size == 0) {
2446         return bs->request_alignment;
2447     } else {
2448         return bdi.cluster_size;
2449     }
2450 }
2451 
2452 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2453                                      int64_t offset, unsigned int bytes)
2454 {
2455     /*        aaaa   bbbb */
2456     if (offset >= req->overlap_offset + req->overlap_bytes) {
2457         return false;
2458     }
2459     /* bbbb   aaaa        */
2460     if (req->overlap_offset >= offset + bytes) {
2461         return false;
2462     }
2463     return true;
2464 }
2465 
2466 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2467 {
2468     BlockDriverState *bs = self->bs;
2469     BdrvTrackedRequest *req;
2470     bool retry;
2471     bool waited = false;
2472 
2473     if (!bs->serialising_in_flight) {
2474         return false;
2475     }
2476 
2477     do {
2478         retry = false;
2479         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2480             if (req == self || (!req->serialising && !self->serialising)) {
2481                 continue;
2482             }
2483             if (tracked_request_overlaps(req, self->overlap_offset,
2484                                          self->overlap_bytes))
2485             {
2486                 /* Hitting this means there was a reentrant request, for
2487                  * example, a block driver issuing nested requests.  This must
2488                  * never happen since it means deadlock.
2489                  */
2490                 assert(qemu_coroutine_self() != req->co);
2491 
2492                 /* If the request is already (indirectly) waiting for us, or
2493                  * will wait for us as soon as it wakes up, then just go on
2494                  * (instead of producing a deadlock in the former case). */
2495                 if (!req->waiting_for) {
2496                     self->waiting_for = req;
2497                     qemu_co_queue_wait(&req->wait_queue);
2498                     self->waiting_for = NULL;
2499                     retry = true;
2500                     waited = true;
2501                     break;
2502                 }
2503             }
2504         }
2505     } while (retry);
2506 
2507     return waited;
2508 }
2509 
2510 /*
2511  * Return values:
2512  * 0        - success
2513  * -EINVAL  - backing format specified, but no file
2514  * -ENOSPC  - can't update the backing file because no space is left in the
2515  *            image file header
2516  * -ENOTSUP - format driver doesn't support changing the backing file
2517  */
2518 int bdrv_change_backing_file(BlockDriverState *bs,
2519     const char *backing_file, const char *backing_fmt)
2520 {
2521     BlockDriver *drv = bs->drv;
2522     int ret;
2523 
2524     /* Backing file format doesn't make sense without a backing file */
2525     if (backing_fmt && !backing_file) {
2526         return -EINVAL;
2527     }
2528 
2529     if (drv->bdrv_change_backing_file != NULL) {
2530         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2531     } else {
2532         ret = -ENOTSUP;
2533     }
2534 
2535     if (ret == 0) {
2536         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2537         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2538     }
2539     return ret;
2540 }
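
/* Editor's illustration (hypothetical, not part of the original file): a
 * caller re-pointing an image at a new backing file; "base.qcow2" and the
 * "qcow2" format name are made-up example values. */
static int example_retarget_backing(BlockDriverState *bs)
{
    int ret = bdrv_change_backing_file(bs, "base.qcow2", "qcow2");

    if (ret == -ENOTSUP) {
        /* the format driver cannot rewrite its backing file header */
    }
    return ret;
}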
2541 
2542 /*
2543  * Finds the image layer in the chain that has 'bs' as its backing file.
2544  *
2545  * active is the current topmost image.
2546  *
2547  * Returns NULL if bs is not found in active's image chain,
2548  * or if active == bs.
2549  */
2550 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2551                                     BlockDriverState *bs)
2552 {
2553     BlockDriverState *overlay = NULL;
2554     BlockDriverState *intermediate;
2555 
2556     assert(active != NULL);
2557     assert(bs != NULL);
2558 
2559     /* if bs is the same as active, then by definition it has no overlay
2560      */
2561     if (active == bs) {
2562         return NULL;
2563     }
2564 
2565     intermediate = active;
2566     while (intermediate->backing_hd) {
2567         if (intermediate->backing_hd == bs) {
2568             overlay = intermediate;
2569             break;
2570         }
2571         intermediate = intermediate->backing_hd;
2572     }
2573 
2574     return overlay;
2575 }
2576 
2577 typedef struct BlkIntermediateStates {
2578     BlockDriverState *bs;
2579     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2580 } BlkIntermediateStates;
2581 
2582 
2583 /*
2584  * Drops images above 'base' up to and including 'top', and sets the image
2585  * above 'top' to have base as its backing file.
2586  *
2587  * Requires that the overlay of 'top' is opened r/w, so that the backing file
2588  * information in 'bs' can be properly updated.
2589  *
2590  * E.g., this will convert the following chain:
2591  * bottom <- base <- intermediate <- top <- active
2592  *
2593  * to
2594  *
2595  * bottom <- base <- active
2596  *
2597  * It is allowed for bottom==base, in which case it converts:
2598  *
2599  * base <- intermediate <- top <- active
2600  *
2601  * to
2602  *
2603  * base <- active
2604  *
2605  * Error conditions:
2606  *  if active == top, that is considered an error
2607  *
2608  */
2609 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2610                            BlockDriverState *base)
2611 {
2612     BlockDriverState *intermediate;
2613     BlockDriverState *base_bs = NULL;
2614     BlockDriverState *new_top_bs = NULL;
2615     BlkIntermediateStates *intermediate_state, *next;
2616     int ret = -EIO;
2617 
2618     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2619     QSIMPLEQ_INIT(&states_to_delete);
2620 
2621     if (!top->drv || !base->drv) {
2622         goto exit;
2623     }
2624 
2625     new_top_bs = bdrv_find_overlay(active, top);
2626 
2627     if (new_top_bs == NULL) {
2628         /* we could not find the image above 'top', this is an error */
2629         goto exit;
2630     }
2631 
2632     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2633      * to do, no intermediate images */
2634     if (new_top_bs->backing_hd == base) {
2635         ret = 0;
2636         goto exit;
2637     }
2638 
2639     intermediate = top;
2640 
2641     /* now we will go down through the list, and add each BDS we find
2642      * into our deletion queue, until we hit the 'base'
2643      */
2644     while (intermediate) {
2645         intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2646         intermediate_state->bs = intermediate;
2647         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2648 
2649         if (intermediate->backing_hd == base) {
2650             base_bs = intermediate->backing_hd;
2651             break;
2652         }
2653         intermediate = intermediate->backing_hd;
2654     }
2655     if (base_bs == NULL) {
2656         /* something went wrong, we did not end at the base.  Safely
2657          * unravel everything, and exit with an error */
2658         goto exit;
2659     }
2660 
2661     /* success - we can delete the intermediate states, and link top->base */
2662     ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2663                                    base_bs->drv ? base_bs->drv->format_name : "");
2664     if (ret) {
2665         goto exit;
2666     }
2667     bdrv_set_backing_hd(new_top_bs, base_bs);
2668 
2669     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2670         /* so that bdrv_close() does not recursively close the chain */
2671         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2672         bdrv_unref(intermediate_state->bs);
2673     }
2674     ret = 0;
2675 
2676 exit:
2677     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2678         g_free(intermediate_state);
2679     }
2680     return ret;
2681 }
2682 
2683 
2684 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2685                                    size_t size)
2686 {
2687     int64_t len;
2688 
2689     if (size > INT_MAX) {
2690         return -EIO;
2691     }
2692 
2693     if (!bdrv_is_inserted(bs)) {
2694         return -ENOMEDIUM;
2695     }
2696     if (bs->growable) {
2697         return 0;
2698     }
2699     len = bdrv_getlength(bs);
2700     if (offset < 0) {
2701         return -EIO;
2702     }
2703     if ((offset > len) || (len - offset < size)) {
2704         return -EIO;
2705     }
2706 
2707     return 0;
2708 }
2709 
2710 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2711                               int nb_sectors)
2712 {
2713     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2714         return -EIO;
2715     }
2716 
2717     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2718                                    nb_sectors * BDRV_SECTOR_SIZE);
2719 }
2720 
2721 typedef struct RwCo {
2722     BlockDriverState *bs;
2723     int64_t offset;
2724     QEMUIOVector *qiov;
2725     bool is_write;
2726     int ret;
2727     BdrvRequestFlags flags;
2728 } RwCo;
2729 
2730 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2731 {
2732     RwCo *rwco = opaque;
2733 
2734     if (!rwco->is_write) {
2735         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2736                                       rwco->qiov->size, rwco->qiov,
2737                                       rwco->flags);
2738     } else {
2739         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2740                                        rwco->qiov->size, rwco->qiov,
2741                                        rwco->flags);
2742     }
2743 }
2744 
2745 /*
2746  * Process a vectored synchronous request using coroutines
2747  */
2748 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2749                         QEMUIOVector *qiov, bool is_write,
2750                         BdrvRequestFlags flags)
2751 {
2752     Coroutine *co;
2753     RwCo rwco = {
2754         .bs = bs,
2755         .offset = offset,
2756         .qiov = qiov,
2757         .is_write = is_write,
2758         .ret = NOT_DONE,
2759         .flags = flags,
2760     };
2761 
2762     /**
2763      * In a synchronous call context, while the vcpu is blocked, the
2764      * throttling timer will not fire, so I/O throttling has to be
2765      * disabled here if it has been enabled.
2766      */
2767     if (bs->io_limits_enabled) {
2768         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2769                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2770         bdrv_io_limits_disable(bs);
2771     }
2772 
2773     if (qemu_in_coroutine()) {
2774         /* Fast-path if already in coroutine context */
2775         bdrv_rw_co_entry(&rwco);
2776     } else {
2777         co = qemu_coroutine_create(bdrv_rw_co_entry);
2778         qemu_coroutine_enter(co, &rwco);
2779         while (rwco.ret == NOT_DONE) {
2780             qemu_aio_wait();
2781         }
2782     }
2783     return rwco.ret;
2784 }
2785 
2786 /*
2787  * Process a synchronous request using coroutines
2788  */
2789 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2790                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2791 {
2792     QEMUIOVector qiov;
2793     struct iovec iov = {
2794         .iov_base = (void *)buf,
2795         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2796     };
2797 
2798     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2799         return -EINVAL;
2800     }
2801 
2802     qemu_iovec_init_external(&qiov, &iov, 1);
2803     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2804                         &qiov, is_write, flags);
2805 }
2806 
2807 /* return < 0 if error. See bdrv_write() for the return codes */
2808 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2809               uint8_t *buf, int nb_sectors)
2810 {
2811     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2812 }
2813 
2814 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2815 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2816                           uint8_t *buf, int nb_sectors)
2817 {
2818     bool enabled;
2819     int ret;
2820 
2821     enabled = bs->io_limits_enabled;
2822     bs->io_limits_enabled = false;
2823     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2824     bs->io_limits_enabled = enabled;
2825     return ret;
2826 }
2827 
2828 /* Return < 0 if error. Important errors are:
2829   -EIO         generic I/O error (may happen for all errors)
2830   -ENOMEDIUM   No media inserted.
2831   -EINVAL      Invalid sector number or nb_sectors
2832   -EACCES      Trying to write a read-only device
2833 */
2834 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2835                const uint8_t *buf, int nb_sectors)
2836 {
2837     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2838 }
2839 
2840 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2841                       int nb_sectors, BdrvRequestFlags flags)
2842 {
2843     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2844                       BDRV_REQ_ZERO_WRITE | flags);
2845 }
2846 
2847 /*
2848  * Completely zero out a block device with the help of bdrv_write_zeroes.
2849  * The operation is sped up by checking the block status and only writing
2850  * zeroes to areas that do not already read back as zeroes. Optional
2851  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2852  *
2853  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2854  */
2855 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2856 {
2857     int64_t target_size;
2858     int64_t ret, nb_sectors, sector_num = 0;
2859     int n;
2860 
2861     target_size = bdrv_getlength(bs);
2862     if (target_size < 0) {
2863         return target_size;
2864     }
2865     target_size /= BDRV_SECTOR_SIZE;
2866 
2867     for (;;) {
2868         nb_sectors = target_size - sector_num;
2869         if (nb_sectors <= 0) {
2870             return 0;
2871         }
2872         if (nb_sectors > INT_MAX) {
2873             nb_sectors = INT_MAX;
2874         }
2875         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2876         if (ret < 0) {
2877             error_report("error getting block status at sector %" PRId64 ": %s",
2878                          sector_num, strerror(-ret));
2879             return ret;
2880         }
2881         if (ret & BDRV_BLOCK_ZERO) {
2882             sector_num += n;
2883             continue;
2884         }
2885         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2886         if (ret < 0) {
2887             error_report("error writing zeroes at sector %" PRId64 ": %s",
2888                          sector_num, strerror(-ret));
2889             return ret;
2890         }
2891         sector_num += n;
2892     }
2893 }
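
/* Editor's illustration (not part of the original file): typical invocation
 * of the helper above, discarding blocks where the driver allows it. */
static int example_wipe_device(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}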
2894 
2895 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2896 {
2897     QEMUIOVector qiov;
2898     struct iovec iov = {
2899         .iov_base = (void *)buf,
2900         .iov_len = bytes,
2901     };
2902     int ret;
2903 
2904     if (bytes < 0) {
2905         return -EINVAL;
2906     }
2907 
2908     qemu_iovec_init_external(&qiov, &iov, 1);
2909     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2910     if (ret < 0) {
2911         return ret;
2912     }
2913 
2914     return bytes;
2915 }
2916 
2917 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2918 {
2919     int ret;
2920 
2921     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2922     if (ret < 0) {
2923         return ret;
2924     }
2925 
2926     return qiov->size;
2927 }
2928 
2929 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2930                 const void *buf, int bytes)
2931 {
2932     QEMUIOVector qiov;
2933     struct iovec iov = {
2934         .iov_base   = (void *) buf,
2935         .iov_len    = bytes,
2936     };
2937 
2938     if (bytes < 0) {
2939         return -EINVAL;
2940     }
2941 
2942     qemu_iovec_init_external(&qiov, &iov, 1);
2943     return bdrv_pwritev(bs, offset, &qiov);
2944 }
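
/* Editor's illustration (hypothetical, not part of the original file):
 * copying a 512-byte header between two devices with the byte-based helpers
 * above.  Note that bdrv_pread()/bdrv_pwrite() return the byte count on
 * success, not 0. */
static int example_copy_header(BlockDriverState *src, BlockDriverState *dst)
{
    uint8_t header[512];
    int ret;

    ret = bdrv_pread(src, 0, header, sizeof(header));
    if (ret < 0) {
        return ret;
    }
    ret = bdrv_pwrite(dst, 0, header, sizeof(header));
    return ret < 0 ? ret : 0;
}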
2945 
2946 /*
2947  * Writes to the file and ensures that no writes are reordered across this
2948  * request (acts as a barrier)
2949  *
2950  * Returns 0 on success, -errno in error cases.
2951  */
2952 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2953     const void *buf, int count)
2954 {
2955     int ret;
2956 
2957     ret = bdrv_pwrite(bs, offset, buf, count);
2958     if (ret < 0) {
2959         return ret;
2960     }
2961 
2962     /* No flush needed for cache modes that already do it */
2963     if (bs->enable_write_cache) {
2964         bdrv_flush(bs);
2965     }
2966 
2967     return 0;
2968 }
2969 
2970 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2971         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2972 {
2973     /* Perform I/O through a temporary buffer so that users who scribble over
2974      * their read buffer while the operation is in progress do not end up
2975      * modifying the image file.  This is critical for zero-copy guest I/O
2976      * where anything might happen inside guest memory.
2977      */
2978     void *bounce_buffer;
2979 
2980     BlockDriver *drv = bs->drv;
2981     struct iovec iov;
2982     QEMUIOVector bounce_qiov;
2983     int64_t cluster_sector_num;
2984     int cluster_nb_sectors;
2985     size_t skip_bytes;
2986     int ret;
2987 
2988     /* Cover the entire cluster so no additional backing file I/O is required
2989      * when allocating a cluster in the image file.
2990      */
2991     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2992                            &cluster_sector_num, &cluster_nb_sectors);
2993 
2994     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2995                                    cluster_sector_num, cluster_nb_sectors);
2996 
2997     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2998     iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2999     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
3000 
3001     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
3002                              &bounce_qiov);
3003     if (ret < 0) {
3004         goto err;
3005     }
3006 
3007     if (drv->bdrv_co_write_zeroes &&
3008         buffer_is_zero(bounce_buffer, iov.iov_len)) {
3009         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
3010                                       cluster_nb_sectors, 0);
3011     } else {
3012         /* This does not change the data on the disk, so it is not necessary
3013          * to flush even in cache=writethrough mode.
3014          */
3015         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
3016                                   &bounce_qiov);
3017     }
3018 
3019     if (ret < 0) {
3020         /* It might be okay to ignore write errors for guest requests.  If this
3021          * is a deliberate copy-on-read then we don't want to ignore the error.
3022          * Simply report it in all cases.
3023          */
3024         goto err;
3025     }
3026 
3027     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
3028     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
3029                         nb_sectors * BDRV_SECTOR_SIZE);
3030 
3031 err:
3032     qemu_vfree(bounce_buffer);
3033     return ret;
3034 }
3035 
3036 /*
3037  * Forwards an already correctly aligned request to the BlockDriver. This
3038  * handles copy on read and zeroing after EOF; any other features must be
3039  * implemented by the caller.
3040  */
3041 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
3042     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3043     int64_t align, QEMUIOVector *qiov, int flags)
3044 {
3045     BlockDriver *drv = bs->drv;
3046     int ret;
3047 
3048     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3049     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3050 
3051     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3052     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3053 
3054     /* Handle Copy on Read and associated serialisation */
3055     if (flags & BDRV_REQ_COPY_ON_READ) {
3056         /* If we touch the same cluster it counts as an overlap.  This
3057          * guarantees that allocating writes will be serialized and not race
3058          * with each other for the same cluster.  For example, in copy-on-read
3059          * it ensures that the CoR read and write operations are atomic and
3060          * guest writes cannot interleave between them. */
3061         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3062     }
3063 
3064     wait_serialising_requests(req);
3065 
3066     if (flags & BDRV_REQ_COPY_ON_READ) {
3067         int pnum;
3068 
3069         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3070         if (ret < 0) {
3071             goto out;
3072         }
3073 
3074         if (!ret || pnum != nb_sectors) {
3075             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3076             goto out;
3077         }
3078     }
3079 
3080     /* Forward the request to the BlockDriver */
3081     if (!(bs->zero_beyond_eof && bs->growable)) {
3082         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3083     } else {
3084         /* Read zeroes after EOF of growable BDSes */
3085         int64_t len, total_sectors, max_nb_sectors;
3086 
3087         len = bdrv_getlength(bs);
3088         if (len < 0) {
3089             ret = len;
3090             goto out;
3091         }
3092 
3093         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
3094         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3095                                   align >> BDRV_SECTOR_BITS);
3096         if (max_nb_sectors > 0) {
3097             ret = drv->bdrv_co_readv(bs, sector_num,
3098                                      MIN(nb_sectors, max_nb_sectors), qiov);
3099         } else {
3100             ret = 0;
3101         }
3102 
3103         /* Reading beyond end of file is supposed to produce zeroes */
3104         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3105             uint64_t offset = MAX(0, total_sectors - sector_num);
3106             uint64_t bytes = (sector_num + nb_sectors - offset) *
3107                               BDRV_SECTOR_SIZE;
3108             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3109         }
3110     }
3111 
3112 out:
3113     return ret;
3114 }
3115 
3116 /*
3117  * Handle a read request in coroutine context
3118  */
3119 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3120     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3121     BdrvRequestFlags flags)
3122 {
3123     BlockDriver *drv = bs->drv;
3124     BdrvTrackedRequest req;
3125 
3126     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3127     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3128     uint8_t *head_buf = NULL;
3129     uint8_t *tail_buf = NULL;
3130     QEMUIOVector local_qiov;
3131     bool use_local_qiov = false;
3132     int ret;
3133 
3134     if (!drv) {
3135         return -ENOMEDIUM;
3136     }
3137     if (bdrv_check_byte_request(bs, offset, bytes)) {
3138         return -EIO;
3139     }
3140 
3141     if (bs->copy_on_read) {
3142         flags |= BDRV_REQ_COPY_ON_READ;
3143     }
3144 
3145     /* throttling disk I/O */
3146     if (bs->io_limits_enabled) {
3147         bdrv_io_limits_intercept(bs, bytes, false);
3148     }
3149 
3150     /* Align read if necessary by padding qiov */
3151     if (offset & (align - 1)) {
3152         head_buf = qemu_blockalign(bs, align);
3153         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3154         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3155         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3156         use_local_qiov = true;
3157 
3158         bytes += offset & (align - 1);
3159         offset = offset & ~(align - 1);
3160     }
3161 
3162     if ((offset + bytes) & (align - 1)) {
3163         if (!use_local_qiov) {
3164             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3165             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3166             use_local_qiov = true;
3167         }
3168         tail_buf = qemu_blockalign(bs, align);
3169         qemu_iovec_add(&local_qiov, tail_buf,
3170                        align - ((offset + bytes) & (align - 1)));
3171 
3172         bytes = ROUND_UP(bytes, align);
3173     }
3174 
3175     tracked_request_begin(&req, bs, offset, bytes, false);
3176     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3177                               use_local_qiov ? &local_qiov : qiov,
3178                               flags);
3179     tracked_request_end(&req);
3180 
3181     if (use_local_qiov) {
3182         qemu_iovec_destroy(&local_qiov);
3183         qemu_vfree(head_buf);
3184         qemu_vfree(tail_buf);
3185     }
3186 
3187     return ret;
3188 }
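
/* Worked example (editor's note): for offset == 1000, bytes == 100 and
 * align == 512, the head pad is 1000 & 511 == 488 bytes and the tail pad
 * is 512 - ((512 + 588) & 511) == 436 bytes, so the driver sees a single
 * aligned request covering [512, 1536). */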
3189 
3190 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3191     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3192     BdrvRequestFlags flags)
3193 {
3194     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3195         return -EINVAL;
3196     }
3197 
3198     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3199                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3200 }
3201 
3202 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3203     int nb_sectors, QEMUIOVector *qiov)
3204 {
3205     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3206 
3207     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3208 }
3209 
3210 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3211     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3212 {
3213     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3214 
3215     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3216                             BDRV_REQ_COPY_ON_READ);
3217 }
3218 
3219 /* If no limit is specified in the BlockLimits, use a default
3220  * of 32768 512-byte sectors (16 MiB) per request.
3221  */
3222 #define MAX_WRITE_ZEROES_DEFAULT 32768
3223 
3224 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3225     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3226 {
3227     BlockDriver *drv = bs->drv;
3228     QEMUIOVector qiov;
3229     struct iovec iov = {0};
3230     int ret = 0;
3231 
3232     int max_write_zeroes = bs->bl.max_write_zeroes ?
3233                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3234 
3235     while (nb_sectors > 0 && !ret) {
3236         int num = nb_sectors;
3237 
3238         /* Align request.  Block drivers can expect the "bulk" of the request
3239          * to be aligned.
3240          */
3241         if (bs->bl.write_zeroes_alignment
3242             && num > bs->bl.write_zeroes_alignment) {
3243             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3244                 /* Make a small request up to the first aligned sector.  */
3245                 num = bs->bl.write_zeroes_alignment;
3246                 num -= sector_num % bs->bl.write_zeroes_alignment;
3247             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3248                 /* Shorten the request to the last aligned sector.  num cannot
3249                  * underflow because num > bs->bl.write_zeroes_alignment.
3250                  */
3251                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3252             }
3253         }
3254 
3255         /* limit request size */
3256         if (num > max_write_zeroes) {
3257             num = max_write_zeroes;
3258         }
3259 
3260         ret = -ENOTSUP;
3261         /* First try the efficient write zeroes operation */
3262         if (drv->bdrv_co_write_zeroes) {
3263             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3264         }
3265 
3266         if (ret == -ENOTSUP) {
3267             /* Fall back to bounce buffer if write zeroes is unsupported */
3268             iov.iov_len = num * BDRV_SECTOR_SIZE;
3269             if (iov.iov_base == NULL) {
3270                 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3271                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3272             }
3273             qemu_iovec_init_external(&qiov, &iov, 1);
3274 
3275             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3276 
3277             /* Keep the bounce buffer around if it is big enough for all
3278              * future requests.
3279              */
3280             if (num < max_write_zeroes) {
3281                 qemu_vfree(iov.iov_base);
3282                 iov.iov_base = NULL;
3283             }
3284         }
3285 
3286         sector_num += num;
3287         nb_sectors -= num;
3288     }
3289 
3290     qemu_vfree(iov.iov_base);
3291     return ret;
3292 }
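
/* Worked example (editor's note): with bs->bl.write_zeroes_alignment == 8,
 * a request for sector_num == 5, nb_sectors == 20 is issued as three
 * pieces: [5, 8) to reach alignment, the aligned bulk [8, 24), and the
 * unaligned tail [24, 25). */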
3293 
3294 /*
3295  * Forwards an already correctly aligned write request to the BlockDriver.
3296  */
3297 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3298     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3299     QEMUIOVector *qiov, int flags)
3300 {
3301     BlockDriver *drv = bs->drv;
3302     bool waited;
3303     int ret;
3304 
3305     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3306     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3307 
3308     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3309     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3310 
3311     waited = wait_serialising_requests(req);
3312     assert(!waited || !req->serialising);
3313     assert(req->overlap_offset <= offset);
3314     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3315 
3316     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3317 
3318     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3319         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3320         qemu_iovec_is_zero(qiov)) {
3321         flags |= BDRV_REQ_ZERO_WRITE;
3322         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3323             flags |= BDRV_REQ_MAY_UNMAP;
3324         }
3325     }
3326 
3327     if (ret < 0) {
3328         /* Do nothing, write notifier decided to fail this request */
3329     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3330         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3331         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3332     } else {
3333         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3334         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3335     }
3336     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3337 
3338     if (ret == 0 && !bs->enable_write_cache) {
3339         ret = bdrv_co_flush(bs);
3340     }
3341 
3342     bdrv_set_dirty(bs, sector_num, nb_sectors);
3343 
3344     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3345         bs->wr_highest_sector = sector_num + nb_sectors - 1;
3346     }
3347     if (bs->growable && ret >= 0) {
3348         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3349     }
3350 
3351     return ret;
3352 }
3353 
3354 /*
3355  * Handle a write request in coroutine context
3356  */
3357 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3358     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3359     BdrvRequestFlags flags)
3360 {
3361     BdrvTrackedRequest req;
3362     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3363     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3364     uint8_t *head_buf = NULL;
3365     uint8_t *tail_buf = NULL;
3366     QEMUIOVector local_qiov;
3367     bool use_local_qiov = false;
3368     int ret;
3369 
3370     if (!bs->drv) {
3371         return -ENOMEDIUM;
3372     }
3373     if (bs->read_only) {
3374         return -EACCES;
3375     }
3376     if (bdrv_check_byte_request(bs, offset, bytes)) {
3377         return -EIO;
3378     }
3379 
3380     /* throttling disk I/O */
3381     if (bs->io_limits_enabled) {
3382         bdrv_io_limits_intercept(bs, bytes, true);
3383     }
3384 
3385     /*
3386      * Align write if necessary by performing a read-modify-write cycle.
3387      * Pad qiov with the read parts and be sure to have a tracked request not
3388      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3389      */
3390     tracked_request_begin(&req, bs, offset, bytes, true);
3391 
3392     if (offset & (align - 1)) {
3393         QEMUIOVector head_qiov;
3394         struct iovec head_iov;
3395 
3396         mark_request_serialising(&req, align);
3397         wait_serialising_requests(&req);
3398 
3399         head_buf = qemu_blockalign(bs, align);
3400         head_iov = (struct iovec) {
3401             .iov_base   = head_buf,
3402             .iov_len    = align,
3403         };
3404         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3405 
3406         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3407         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3408                                   align, &head_qiov, 0);
3409         if (ret < 0) {
3410             goto fail;
3411         }
3412         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3413 
3414         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3415         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3416         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3417         use_local_qiov = true;
3418 
3419         bytes += offset & (align - 1);
3420         offset = offset & ~(align - 1);
3421     }
3422 
3423     if ((offset + bytes) & (align - 1)) {
3424         QEMUIOVector tail_qiov;
3425         struct iovec tail_iov;
3426         size_t tail_bytes;
3427         bool waited;
3428 
3429         mark_request_serialising(&req, align);
3430         waited = wait_serialising_requests(&req);
3431         assert(!waited || !use_local_qiov);
3432 
3433         tail_buf = qemu_blockalign(bs, align);
3434         tail_iov = (struct iovec) {
3435             .iov_base   = tail_buf,
3436             .iov_len    = align,
3437         };
3438         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3439 
3440         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3441         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3442                                   align, &tail_qiov, 0);
3443         if (ret < 0) {
3444             goto fail;
3445         }
3446         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3447 
3448         if (!use_local_qiov) {
3449             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3450             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3451             use_local_qiov = true;
3452         }
3453 
3454         tail_bytes = (offset + bytes) & (align - 1);
3455         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3456 
3457         bytes = ROUND_UP(bytes, align);
3458     }
3459 
3460     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3461                                use_local_qiov ? &local_qiov : qiov,
3462                                flags);
3463 
3464 fail:
3465     tracked_request_end(&req);
3466 
3467     if (use_local_qiov) {
3468         qemu_iovec_destroy(&local_qiov);
3469     }
3470     qemu_vfree(head_buf);
3471     qemu_vfree(tail_buf);
3472 
3473     return ret;
3474 }
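
/*
 * Illustrative sketch (not part of the block layer API): the head and tail
 * padding above expands a byte range to alignment boundaries exactly as this
 * hypothetical helper would.
 */
#if 0 /* example only, not compiled */
static void example_expand_to_alignment(int64_t offset, unsigned int bytes,
                                        uint64_t align,
                                        int64_t *aligned_offset,
                                        unsigned int *aligned_bytes)
{
    /* Round the start down to the previous alignment boundary... */
    *aligned_offset = offset & ~(align - 1);
    /* ...grow by the head padding, then round the end up. E.g. offset 5,
     * bytes 10, align 8 yields the aligned range [0, 16). */
    *aligned_bytes = ROUND_UP(bytes + (offset & (align - 1)), align);
}
#endif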
3475 
3476 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3477     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3478     BdrvRequestFlags flags)
3479 {
3480     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3481         return -EINVAL;
3482     }
3483 
3484     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3485                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3486 }
3487 
3488 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3489     int nb_sectors, QEMUIOVector *qiov)
3490 {
3491     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3492 
3493     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3494 }
3495 
3496 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3497                                       int64_t sector_num, int nb_sectors,
3498                                       BdrvRequestFlags flags)
3499 {
3500     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3501 
3502     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3503         flags &= ~BDRV_REQ_MAY_UNMAP;
3504     }
3505 
3506     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3507                              BDRV_REQ_ZERO_WRITE | flags);
3508 }
3509 
3510 /**
3511  * Truncate file to 'offset' bytes (needed only for file protocols)
3512  */
3513 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3514 {
3515     BlockDriver *drv = bs->drv;
3516     int ret;
3517     if (!drv)
3518         return -ENOMEDIUM;
3519     if (!drv->bdrv_truncate)
3520         return -ENOTSUP;
3521     if (bs->read_only)
3522         return -EACCES;
3523     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {
3524         return -EBUSY;
3525     }
3526     ret = drv->bdrv_truncate(bs, offset);
3527     if (ret == 0) {
3528         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3529         bdrv_dev_resize_cb(bs);
3530     }
3531     return ret;
3532 }
3533 
3534 /**
3535  * Length of an allocated file in bytes. Sparse files are counted by their
3536  * actual allocated space. Returns < 0 on error or if unknown.
3537  */
3538 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3539 {
3540     BlockDriver *drv = bs->drv;
3541     if (!drv) {
3542         return -ENOMEDIUM;
3543     }
3544     if (drv->bdrv_get_allocated_file_size) {
3545         return drv->bdrv_get_allocated_file_size(bs);
3546     }
3547     if (bs->file) {
3548         return bdrv_get_allocated_file_size(bs->file);
3549     }
3550     return -ENOTSUP;
3551 }
3552 
3553 /**
3554  * Length of a file in bytes. Return < 0 if error or unknown.
3555  */
3556 int64_t bdrv_getlength(BlockDriverState *bs)
3557 {
3558     BlockDriver *drv = bs->drv;
3559     if (!drv)
3560         return -ENOMEDIUM;
3561 
3562     if (drv->has_variable_length) {
3563         int ret = refresh_total_sectors(bs, bs->total_sectors);
3564         if (ret < 0) {
3565             return ret;
3566         }
3567     }
3568     return bs->total_sectors * BDRV_SECTOR_SIZE;
3569 }
3570 
3571 /* Return 0 as the number of sectors if no device is present or on error */
3572 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3573 {
3574     int64_t length;
3575     length = bdrv_getlength(bs);
3576     if (length < 0)
3577         length = 0;
3578     else
3579         length = length >> BDRV_SECTOR_BITS;
3580     *nb_sectors_ptr = length;
3581 }
3582 
3583 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3584                        BlockdevOnError on_write_error)
3585 {
3586     bs->on_read_error = on_read_error;
3587     bs->on_write_error = on_write_error;
3588 }
3589 
3590 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3591 {
3592     return is_read ? bs->on_read_error : bs->on_write_error;
3593 }
3594 
3595 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3596 {
3597     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3598 
3599     switch (on_err) {
3600     case BLOCKDEV_ON_ERROR_ENOSPC:
3601         return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3602     case BLOCKDEV_ON_ERROR_STOP:
3603         return BDRV_ACTION_STOP;
3604     case BLOCKDEV_ON_ERROR_REPORT:
3605         return BDRV_ACTION_REPORT;
3606     case BLOCKDEV_ON_ERROR_IGNORE:
3607         return BDRV_ACTION_IGNORE;
3608     default:
3609         abort();
3610     }
3611 }
3612 
3613 /* This is done by device models because, while the block layer knows
3614  * about the error, it does not know whether an operation comes from
3615  * the device or the block layer (from a job, for example).
3616  */
3617 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3618                        bool is_read, int error)
3619 {
3620     assert(error >= 0);
3621     bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3622     if (action == BDRV_ACTION_STOP) {
3623         vm_stop(RUN_STATE_IO_ERROR);
3624         bdrv_iostatus_set_err(bs, error);
3625     }
3626 }
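
/*
 * Illustrative sketch of how a device model is expected to combine the two
 * helpers above when a request fails (hypothetical function, example only).
 */
#if 0 /* example only, not compiled */
static void example_handle_write_error(BlockDriverState *bs, int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, false, error);

    if (action == BDRV_ACTION_REPORT) {
        /* complete the guest request with an error status here */
    }
    bdrv_error_action(bs, action, false, error);
}
#endif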
3627 
3628 int bdrv_is_read_only(BlockDriverState *bs)
3629 {
3630     return bs->read_only;
3631 }
3632 
3633 int bdrv_is_sg(BlockDriverState *bs)
3634 {
3635     return bs->sg;
3636 }
3637 
3638 int bdrv_enable_write_cache(BlockDriverState *bs)
3639 {
3640     return bs->enable_write_cache;
3641 }
3642 
3643 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3644 {
3645     bs->enable_write_cache = wce;
3646 
3647     /* so a reopen() will preserve wce */
3648     if (wce) {
3649         bs->open_flags |= BDRV_O_CACHE_WB;
3650     } else {
3651         bs->open_flags &= ~BDRV_O_CACHE_WB;
3652     }
3653 }
3654 
3655 int bdrv_is_encrypted(BlockDriverState *bs)
3656 {
3657     if (bs->backing_hd && bs->backing_hd->encrypted)
3658         return 1;
3659     return bs->encrypted;
3660 }
3661 
3662 int bdrv_key_required(BlockDriverState *bs)
3663 {
3664     BlockDriverState *backing_hd = bs->backing_hd;
3665 
3666     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3667         return 1;
3668     return (bs->encrypted && !bs->valid_key);
3669 }
3670 
3671 int bdrv_set_key(BlockDriverState *bs, const char *key)
3672 {
3673     int ret;
3674     if (bs->backing_hd && bs->backing_hd->encrypted) {
3675         ret = bdrv_set_key(bs->backing_hd, key);
3676         if (ret < 0)
3677             return ret;
3678         if (!bs->encrypted)
3679             return 0;
3680     }
3681     if (!bs->encrypted) {
3682         return -EINVAL;
3683     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3684         return -ENOMEDIUM;
3685     }
3686     ret = bs->drv->bdrv_set_key(bs, key);
3687     if (ret < 0) {
3688         bs->valid_key = 0;
3689     } else if (!bs->valid_key) {
3690         bs->valid_key = 1;
3691         /* call the change callback now; we skipped it on open */
3692         bdrv_dev_change_media_cb(bs, true);
3693     }
3694     return ret;
3695 }
3696 
3697 const char *bdrv_get_format_name(BlockDriverState *bs)
3698 {
3699     return bs->drv ? bs->drv->format_name : NULL;
3700 }
3701 
3702 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3703                          void *opaque)
3704 {
3705     BlockDriver *drv;
3706     int count = 0;
3707     const char **formats = NULL;
3708 
3709     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3710         if (drv->format_name) {
3711             bool found = false;
3712             int i = count;
3713             while (formats && i && !found) {
3714                 found = !strcmp(formats[--i], drv->format_name);
3715             }
3716 
3717             if (!found) {
3718                 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3719                 formats[count++] = drv->format_name;
3720                 it(opaque, drv->format_name);
3721             }
3722         }
3723     }
3724     g_free(formats);
3725 }
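
/*
 * Example caller (illustrative only): printing each unique format name once.
 */
#if 0 /* example only, not compiled */
static void example_print_format(void *opaque, const char *name)
{
    printf("%s\n", name);
}

/* ... bdrv_iterate_format(example_print_format, NULL); ... */
#endif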
3726 
3727 /* Find a block backend (BlockDriverState) by its device name */
3728 BlockDriverState *bdrv_find(const char *name)
3729 {
3730     BlockDriverState *bs;
3731 
3732     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3733         if (!strcmp(name, bs->device_name)) {
3734             return bs;
3735         }
3736     }
3737     return NULL;
3738 }
3739 
3740 /* Find a named node in the graph of BlockDriverStates */
3741 BlockDriverState *bdrv_find_node(const char *node_name)
3742 {
3743     BlockDriverState *bs;
3744 
3745     assert(node_name);
3746 
3747     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3748         if (!strcmp(node_name, bs->node_name)) {
3749             return bs;
3750         }
3751     }
3752     return NULL;
3753 }
3754 
3755 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3756 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3757 {
3758     BlockDeviceInfoList *list, *entry;
3759     BlockDriverState *bs;
3760 
3761     list = NULL;
3762     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3763         entry = g_malloc0(sizeof(*entry));
3764         entry->value = bdrv_block_device_info(bs);
3765         entry->next = list;
3766         list = entry;
3767     }
3768 
3769     return list;
3770 }
3771 
3772 BlockDriverState *bdrv_lookup_bs(const char *device,
3773                                  const char *node_name,
3774                                  Error **errp)
3775 {
3776     BlockDriverState *bs = NULL;
3777 
3778     if (device) {
3779         bs = bdrv_find(device);
3780 
3781         if (bs) {
3782             return bs;
3783         }
3784     }
3785 
3786     if (node_name) {
3787         bs = bdrv_find_node(node_name);
3788 
3789         if (bs) {
3790             return bs;
3791         }
3792     }
3793 
3794     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3795                      device ? device : "",
3796                      node_name ? node_name : "");
3797     return NULL;
3798 }
3799 
3800 BlockDriverState *bdrv_next(BlockDriverState *bs)
3801 {
3802     if (!bs) {
3803         return QTAILQ_FIRST(&bdrv_states);
3804     }
3805     return QTAILQ_NEXT(bs, device_list);
3806 }
3807 
3808 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3809 {
3810     BlockDriverState *bs;
3811 
3812     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3813         it(opaque, bs);
3814     }
3815 }
3816 
3817 const char *bdrv_get_device_name(BlockDriverState *bs)
3818 {
3819     return bs->device_name;
3820 }
3821 
3822 int bdrv_get_flags(BlockDriverState *bs)
3823 {
3824     return bs->open_flags;
3825 }
3826 
3827 int bdrv_flush_all(void)
3828 {
3829     BlockDriverState *bs;
3830     int result = 0;
3831 
3832     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3833         int ret = bdrv_flush(bs);
3834         if (ret < 0 && !result) {
3835             result = ret;
3836         }
3837     }
3838 
3839     return result;
3840 }
3841 
3842 int bdrv_has_zero_init_1(BlockDriverState *bs)
3843 {
3844     return 1;
3845 }
3846 
3847 int bdrv_has_zero_init(BlockDriverState *bs)
3848 {
3849     assert(bs->drv);
3850 
3851     /* If BS is a copy-on-write image, it is initialized to
3852        the contents of the base image, which may not be zeroes.  */
3853     if (bs->backing_hd) {
3854         return 0;
3855     }
3856     if (bs->drv->bdrv_has_zero_init) {
3857         return bs->drv->bdrv_has_zero_init(bs);
3858     }
3859 
3860     /* safe default */
3861     return 0;
3862 }
3863 
3864 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3865 {
3866     BlockDriverInfo bdi;
3867 
3868     if (bs->backing_hd) {
3869         return false;
3870     }
3871 
3872     if (bdrv_get_info(bs, &bdi) == 0) {
3873         return bdi.unallocated_blocks_are_zero;
3874     }
3875 
3876     return false;
3877 }
3878 
3879 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3880 {
3881     BlockDriverInfo bdi;
3882 
3883     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3884         return false;
3885     }
3886 
3887     if (bdrv_get_info(bs, &bdi) == 0) {
3888         return bdi.can_write_zeroes_with_unmap;
3889     }
3890 
3891     return false;
3892 }
3893 
3894 typedef struct BdrvCoGetBlockStatusData {
3895     BlockDriverState *bs;
3896     BlockDriverState *base;
3897     int64_t sector_num;
3898     int nb_sectors;
3899     int *pnum;
3900     int64_t ret;
3901     bool done;
3902 } BdrvCoGetBlockStatusData;
3903 
3904 /*
3905  * Returns the allocation status of the specified sectors, as a bitmask of
3906  * BDRV_BLOCK_* flags. Drivers not implementing the functionality are assumed
3907  * to not support backing files, hence all their sectors are reported allocated.
3908  *
3909  * If 'sector_num' is beyond the end of the disk image the return value is 0
3910  * and 'pnum' is set to 0.
3911  *
3912  * 'pnum' is set to the number of sectors (including and immediately following
3913  * the specified sector) that are known to be in the same
3914  * allocated/unallocated state.
3915  *
3916  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3917  * beyond the end of the disk image it will be clamped.
3918  */
3919 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3920                                                      int64_t sector_num,
3921                                                      int nb_sectors, int *pnum)
3922 {
3923     int64_t length;
3924     int64_t n;
3925     int64_t ret, ret2;
3926 
3927     length = bdrv_getlength(bs);
3928     if (length < 0) {
3929         return length;
3930     }
3931 
3932     if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3933         *pnum = 0;
3934         return 0;
3935     }
3936 
3937     n = bs->total_sectors - sector_num;
3938     if (n < nb_sectors) {
3939         nb_sectors = n;
3940     }
3941 
3942     if (!bs->drv->bdrv_co_get_block_status) {
3943         *pnum = nb_sectors;
3944         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3945         if (bs->drv->protocol_name) {
3946             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3947         }
3948         return ret;
3949     }
3950 
3951     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3952     if (ret < 0) {
3953         *pnum = 0;
3954         return ret;
3955     }
3956 
3957     if (ret & BDRV_BLOCK_RAW) {
3958         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3959         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3960                                      *pnum, pnum);
3961     }
3962 
3963     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
3964         ret |= BDRV_BLOCK_ALLOCATED;
3965     }
3966 
3967     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3968         if (bdrv_unallocated_blocks_are_zero(bs)) {
3969             ret |= BDRV_BLOCK_ZERO;
3970         } else if (bs->backing_hd) {
3971             BlockDriverState *bs2 = bs->backing_hd;
3972             int64_t length2 = bdrv_getlength(bs2);
3973             if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3974                 ret |= BDRV_BLOCK_ZERO;
3975             }
3976         }
3977     }
3978 
3979     if (bs->file &&
3980         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3981         (ret & BDRV_BLOCK_OFFSET_VALID)) {
3982         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3983                                         *pnum, pnum);
3984         if (ret2 >= 0) {
3985             /* Ignore errors.  This is just providing extra information; it
3986              * is useful but not necessary.
3987              */
3988             ret |= (ret2 & BDRV_BLOCK_ZERO);
3989         }
3990     }
3991 
3992     return ret;
3993 }
3994 
3995 /* Coroutine wrapper for bdrv_get_block_status() */
3996 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3997 {
3998     BdrvCoGetBlockStatusData *data = opaque;
3999     BlockDriverState *bs = data->bs;
4000 
4001     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4002                                          data->pnum);
4003     data->done = true;
4004 }
4005 
4006 /*
4007  * Synchronous wrapper around bdrv_co_get_block_status().
4008  *
4009  * See bdrv_co_get_block_status() for details.
4010  */
4011 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4012                               int nb_sectors, int *pnum)
4013 {
4014     Coroutine *co;
4015     BdrvCoGetBlockStatusData data = {
4016         .bs = bs,
4017         .sector_num = sector_num,
4018         .nb_sectors = nb_sectors,
4019         .pnum = pnum,
4020         .done = false,
4021     };
4022 
4023     if (qemu_in_coroutine()) {
4024         /* Fast-path if already in coroutine context */
4025         bdrv_get_block_status_co_entry(&data);
4026     } else {
4027         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4028         qemu_coroutine_enter(co, &data);
4029         while (!data.done) {
4030             qemu_aio_wait();
4031         }
4032     }
4033     return data.ret;
4034 }
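
/*
 * Illustrative sketch: walking an image's allocation map with the synchronous
 * wrapper above, advancing by *pnum each iteration (hypothetical helper,
 * example only).
 */
#if 0 /* example only, not compiled */
static void example_dump_block_status(BlockDriverState *bs)
{
    int64_t len = bdrv_getlength(bs);
    int64_t total, sector = 0;

    if (len < 0) {
        return;
    }
    total = len >> BDRV_SECTOR_BITS;
    while (sector < total) {
        int num;
        int64_t status = bdrv_get_block_status(bs, sector,
                                               MIN(total - sector, INT_MAX),
                                               &num);
        if (status < 0 || num == 0) {
            break;
        }
        printf("%" PRId64 "+%d: %s\n", sector, num,
               (status & BDRV_BLOCK_ZERO) ? "zero" :
               (status & BDRV_BLOCK_DATA) ? "data" : "unallocated");
        sector += num;
    }
}
#endif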
4035 
4036 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4037                                    int nb_sectors, int *pnum)
4038 {
4039     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4040     if (ret < 0) {
4041         return ret;
4042     }
4043     return (ret & BDRV_BLOCK_ALLOCATED);
4044 }
4045 
4046 /*
4047  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4048  *
4049  * Return true if the given sector is allocated in any image between
4050  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
4051  * sector is allocated in any image of the chain.  Return false otherwise.
4052  *
4053  * 'pnum' is set to the number of sectors (including and immediately following
4054  *  the specified sector) that are known to be in the same
4055  *  allocated/unallocated state.
4056  *
4057  */
4058 int bdrv_is_allocated_above(BlockDriverState *top,
4059                             BlockDriverState *base,
4060                             int64_t sector_num,
4061                             int nb_sectors, int *pnum)
4062 {
4063     BlockDriverState *intermediate;
4064     int ret, n = nb_sectors;
4065 
4066     intermediate = top;
4067     while (intermediate && intermediate != base) {
4068         int pnum_inter;
4069         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4070                                 &pnum_inter);
4071         if (ret < 0) {
4072             return ret;
4073         } else if (ret) {
4074             *pnum = pnum_inter;
4075             return 1;
4076         }
4077 
4078         /*
4079          * [sector_num, sector_num + nb_sectors) is unallocated on top, but
4080          * an intermediate image might still have
4081          *
4082          * [sector_num + x, sector_num + nb_sectors) allocated.
4083          */
4084         if (n > pnum_inter &&
4085             (intermediate == top ||
4086              sector_num + pnum_inter < intermediate->total_sectors)) {
4087             n = pnum_inter;
4088         }
4089 
4090         intermediate = intermediate->backing_hd;
4091     }
4092 
4093     *pnum = n;
4094     return 0;
4095 }
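
/*
 * Worked example for the clamping above (illustrative numbers): querying
 * [100, +16) over TOP -> INTER -> BASE where only INTER has [108, +8)
 * allocated. TOP reports 16 unallocated sectors; INTER reports an
 * unallocated run of only 8 sectors, so n is clamped to 8 and the function
 * returns 0 with *pnum == 8, covering [100, +8) only.
 */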
4096 
4097 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4098 {
4099     if (bs->backing_hd && bs->backing_hd->encrypted)
4100         return bs->backing_file;
4101     else if (bs->encrypted)
4102         return bs->filename;
4103     else
4104         return NULL;
4105 }
4106 
4107 void bdrv_get_backing_filename(BlockDriverState *bs,
4108                                char *filename, int filename_size)
4109 {
4110     pstrcpy(filename, filename_size, bs->backing_file);
4111 }
4112 
4113 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4114                           const uint8_t *buf, int nb_sectors)
4115 {
4116     BlockDriver *drv = bs->drv;
4117     if (!drv)
4118         return -ENOMEDIUM;
4119     if (!drv->bdrv_write_compressed)
4120         return -ENOTSUP;
4121     if (bdrv_check_request(bs, sector_num, nb_sectors))
4122         return -EIO;
4123 
4124     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4125 
4126     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4127 }
4128 
4129 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4130 {
4131     BlockDriver *drv = bs->drv;
4132     if (!drv)
4133         return -ENOMEDIUM;
4134     if (!drv->bdrv_get_info)
4135         return -ENOTSUP;
4136     memset(bdi, 0, sizeof(*bdi));
4137     return drv->bdrv_get_info(bs, bdi);
4138 }
4139 
4140 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4141 {
4142     BlockDriver *drv = bs->drv;
4143     if (drv && drv->bdrv_get_specific_info) {
4144         return drv->bdrv_get_specific_info(bs);
4145     }
4146     return NULL;
4147 }
4148 
4149 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4150                       int64_t pos, int size)
4151 {
4152     QEMUIOVector qiov;
4153     struct iovec iov = {
4154         .iov_base   = (void *) buf,
4155         .iov_len    = size,
4156     };
4157 
4158     qemu_iovec_init_external(&qiov, &iov, 1);
4159     return bdrv_writev_vmstate(bs, &qiov, pos);
4160 }
4161 
4162 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4163 {
4164     BlockDriver *drv = bs->drv;
4165 
4166     if (!drv) {
4167         return -ENOMEDIUM;
4168     } else if (drv->bdrv_save_vmstate) {
4169         return drv->bdrv_save_vmstate(bs, qiov, pos);
4170     } else if (bs->file) {
4171         return bdrv_writev_vmstate(bs->file, qiov, pos);
4172     }
4173 
4174     return -ENOTSUP;
4175 }
4176 
4177 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4178                       int64_t pos, int size)
4179 {
4180     BlockDriver *drv = bs->drv;
4181     if (!drv)
4182         return -ENOMEDIUM;
4183     if (drv->bdrv_load_vmstate)
4184         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4185     if (bs->file)
4186         return bdrv_load_vmstate(bs->file, buf, pos, size);
4187     return -ENOTSUP;
4188 }
4189 
4190 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4191 {
4192     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4193         return;
4194     }
4195 
4196     bs->drv->bdrv_debug_event(bs, event);
4197 }
4198 
4199 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4200                           const char *tag)
4201 {
4202     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4203         bs = bs->file;
4204     }
4205 
4206     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4207         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4208     }
4209 
4210     return -ENOTSUP;
4211 }
4212 
4213 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4214 {
4215     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4216         bs = bs->file;
4217     }
4218 
4219     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4220         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4221     }
4222 
4223     return -ENOTSUP;
4224 }
4225 
4226 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4227 {
4228     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4229         bs = bs->file;
4230     }
4231 
4232     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4233         return bs->drv->bdrv_debug_resume(bs, tag);
4234     }
4235 
4236     return -ENOTSUP;
4237 }
4238 
4239 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4240 {
4241     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4242         bs = bs->file;
4243     }
4244 
4245     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4246         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4247     }
4248 
4249     return false;
4250 }
4251 
4252 int bdrv_is_snapshot(BlockDriverState *bs)
4253 {
4254     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4255 }
4256 
4257 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4258  * relative, it must be relative to the chain.  So, passing in bs->filename
4259  * from a BDS as backing_file should not be done, as that may be relative to
4260  * the CWD rather than the chain. */
4261 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4262         const char *backing_file)
4263 {
4264     char *filename_full = NULL;
4265     char *backing_file_full = NULL;
4266     char *filename_tmp = NULL;
4267     int is_protocol = 0;
4268     BlockDriverState *curr_bs = NULL;
4269     BlockDriverState *retval = NULL;
4270 
4271     if (!bs || !bs->drv || !backing_file) {
4272         return NULL;
4273     }
4274 
4275     filename_full     = g_malloc(PATH_MAX);
4276     backing_file_full = g_malloc(PATH_MAX);
4277     filename_tmp      = g_malloc(PATH_MAX);
4278 
4279     is_protocol = path_has_protocol(backing_file);
4280 
4281     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4282 
4283         /* If either of the filename paths is actually a protocol, then
4284          * compare unmodified paths; otherwise make paths relative */
4285         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4286             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4287                 retval = curr_bs->backing_hd;
4288                 break;
4289             }
4290         } else {
4291             /* If not an absolute filename path, make it relative to the current
4292              * image's filename path */
4293             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4294                          backing_file);
4295 
4296             /* We are going to compare absolute pathnames */
4297             if (!realpath(filename_tmp, filename_full)) {
4298                 continue;
4299             }
4300 
4301             /* We need to make sure the backing filename we are comparing against
4302              * is relative to the current image filename (or absolute) */
4303             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4304                          curr_bs->backing_file);
4305 
4306             if (!realpath(filename_tmp, backing_file_full)) {
4307                 continue;
4308             }
4309 
4310             if (strcmp(backing_file_full, filename_full) == 0) {
4311                 retval = curr_bs->backing_hd;
4312                 break;
4313             }
4314         }
4315     }
4316 
4317     g_free(filename_full);
4318     g_free(backing_file_full);
4319     g_free(filename_tmp);
4320     return retval;
4321 }
4322 
4323 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4324 {
4325     if (!bs->drv) {
4326         return 0;
4327     }
4328 
4329     if (!bs->backing_hd) {
4330         return 0;
4331     }
4332 
4333     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4334 }
4335 
4336 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4337 {
4338     BlockDriverState *curr_bs = NULL;
4339 
4340     if (!bs) {
4341         return NULL;
4342     }
4343 
4344     curr_bs = bs;
4345 
4346     while (curr_bs->backing_hd) {
4347         curr_bs = curr_bs->backing_hd;
4348     }
4349     return curr_bs;
4350 }
4351 
4352 /**************************************************************/
4353 /* async I/Os */
4354 
4355 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4356                                  QEMUIOVector *qiov, int nb_sectors,
4357                                  BlockDriverCompletionFunc *cb, void *opaque)
4358 {
4359     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4360 
4361     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4362                                  cb, opaque, false);
4363 }
4364 
4365 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4366                                   QEMUIOVector *qiov, int nb_sectors,
4367                                   BlockDriverCompletionFunc *cb, void *opaque)
4368 {
4369     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4370 
4371     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4372                                  cb, opaque, true);
4373 }
4374 
4375 BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4376         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4377         BlockDriverCompletionFunc *cb, void *opaque)
4378 {
4379     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4380 
4381     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4382                                  BDRV_REQ_ZERO_WRITE | flags,
4383                                  cb, opaque, true);
4384 }
4385 
4386 
4387 typedef struct MultiwriteCB {
4388     int error;
4389     int num_requests;
4390     int num_callbacks;
4391     struct {
4392         BlockDriverCompletionFunc *cb;
4393         void *opaque;
4394         QEMUIOVector *free_qiov;
4395     } callbacks[];
4396 } MultiwriteCB;
4397 
4398 static void multiwrite_user_cb(MultiwriteCB *mcb)
4399 {
4400     int i;
4401 
4402     for (i = 0; i < mcb->num_callbacks; i++) {
4403         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4404         if (mcb->callbacks[i].free_qiov) {
4405             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4406         }
4407         g_free(mcb->callbacks[i].free_qiov);
4408     }
4409 }
4410 
4411 static void multiwrite_cb(void *opaque, int ret)
4412 {
4413     MultiwriteCB *mcb = opaque;
4414 
4415     trace_multiwrite_cb(mcb, ret);
4416 
4417     if (ret < 0 && !mcb->error) {
4418         mcb->error = ret;
4419     }
4420 
4421     mcb->num_requests--;
4422     if (mcb->num_requests == 0) {
4423         multiwrite_user_cb(mcb);
4424         g_free(mcb);
4425     }
4426 }
4427 
4428 static int multiwrite_req_compare(const void *a, const void *b)
4429 {
4430     const BlockRequest *req1 = a, *req2 = b;
4431 
4432     /*
4433      * Note that we can't simply subtract req2->sector from req1->sector
4434      * here as that could overflow the return value.
4435      */
4436     if (req1->sector > req2->sector) {
4437         return 1;
4438     } else if (req1->sector < req2->sector) {
4439         return -1;
4440     } else {
4441         return 0;
4442     }
4443 }
4444 
4445 /*
4446  * Takes a bunch of requests and tries to merge them. Returns the number of
4447  * requests that remain after merging.
4448  */
4449 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4450     int num_reqs, MultiwriteCB *mcb)
4451 {
4452     int i, outidx;
4453 
4454     // Sort requests by start sector
4455     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4456 
4457     // Merge requests that turn out to be sequential or overlapping after
4458     // sorting; requests separated by a gap are left unmerged.
4459     outidx = 0;
4460     for (i = 1; i < num_reqs; i++) {
4461         int merge = 0;
4462         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4463 
4464         // Handle exactly sequential writes and overlapping writes.
4465         if (reqs[i].sector <= oldreq_last) {
4466             merge = 1;
4467         }
4468 
4469         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4470             merge = 0;
4471         }
4472 
4473         if (merge) {
4474             size_t size;
4475             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4476             qemu_iovec_init(qiov,
4477                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4478 
4479             // Add the first request to the merged one. If the requests are
4480             // overlapping, drop the last sectors of the first request.
4481             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4482             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4483 
4484             // We shouldn't need to add any zeros between the two requests
4485             assert(reqs[i].sector <= oldreq_last);
4486 
4487             // Add the second request
4488             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4489 
4490             reqs[outidx].nb_sectors = qiov->size >> 9;
4491             reqs[outidx].qiov = qiov;
4492 
4493             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4494         } else {
4495             outidx++;
4496             reqs[outidx].sector     = reqs[i].sector;
4497             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4498             reqs[outidx].qiov       = reqs[i].qiov;
4499         }
4500     }
4501 
4502     return outidx + 1;
4503 }
4504 
4505 /*
4506  * Submit multiple AIO write requests at once.
4507  *
4508  * On success, the function returns 0 and all requests in the reqs array have
4509  * been submitted. On error this function returns -1, and any of the
4510  * requests may or may not have been submitted yet. In particular, this
4511  * means that the callback will be called for some of the requests and not
4512  * for others. The caller must check the error field of each BlockRequest
4513  * to know which callbacks to wait for (if error != 0, no callback is called).
4514  *
4515  * The implementation may modify the contents of the reqs array, e.g. to merge
4516  * requests. However, the fields opaque and error are left unmodified as they
4517  * are used to signal failure for a single request to the caller.
4518  */
4519 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4520 {
4521     MultiwriteCB *mcb;
4522     int i;
4523 
4524     /* don't submit writes if we don't have a medium */
4525     if (bs->drv == NULL) {
4526         for (i = 0; i < num_reqs; i++) {
4527             reqs[i].error = -ENOMEDIUM;
4528         }
4529         return -1;
4530     }
4531 
4532     if (num_reqs == 0) {
4533         return 0;
4534     }
4535 
4536     // Create MultiwriteCB structure
4537     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4538     mcb->num_requests = 0;
4539     mcb->num_callbacks = num_reqs;
4540 
4541     for (i = 0; i < num_reqs; i++) {
4542         mcb->callbacks[i].cb = reqs[i].cb;
4543         mcb->callbacks[i].opaque = reqs[i].opaque;
4544     }
4545 
4546     // Check for mergeable requests
4547     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4548 
4549     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4550 
4551     /* Run the aio requests. */
4552     mcb->num_requests = num_reqs;
4553     for (i = 0; i < num_reqs; i++) {
4554         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4555                               reqs[i].nb_sectors, reqs[i].flags,
4556                               multiwrite_cb, mcb,
4557                               true);
4558     }
4559 
4560     return 0;
4561 }
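
/*
 * Illustrative caller (hypothetical, example only): submitting two adjacent
 * writes at once. Because the second request starts exactly where the first
 * one ends, multiwrite_merge() combines them into a single driver request.
 */
#if 0 /* example only, not compiled */
static void example_submit_pair(BlockDriverState *bs,
                                QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                                BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = qiov0->size >> 9, .qiov = qiov0,
          .cb = cb, .opaque = opaque },
        { .sector = qiov0->size >> 9, .nb_sectors = qiov1->size >> 9,
          .qiov = qiov1, .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* check reqs[i].error to see which callbacks will still run */
    }
}
#endif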
4562 
4563 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4564 {
4565     acb->aiocb_info->cancel(acb);
4566 }
4567 
4568 /**************************************************************/
4569 /* async block device emulation */
4570 
4571 typedef struct BlockDriverAIOCBSync {
4572     BlockDriverAIOCB common;
4573     QEMUBH *bh;
4574     int ret;
4575     /* vector translation state */
4576     QEMUIOVector *qiov;
4577     uint8_t *bounce;
4578     int is_write;
4579 } BlockDriverAIOCBSync;
4580 
4581 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4582 {
4583     BlockDriverAIOCBSync *acb =
4584         container_of(blockacb, BlockDriverAIOCBSync, common);
4585     qemu_bh_delete(acb->bh);
4586     acb->bh = NULL;
4587     qemu_aio_release(acb);
4588 }
4589 
4590 static const AIOCBInfo bdrv_em_aiocb_info = {
4591     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
4592     .cancel             = bdrv_aio_cancel_em,
4593 };
4594 
4595 static void bdrv_aio_bh_cb(void *opaque)
4596 {
4597     BlockDriverAIOCBSync *acb = opaque;
4598 
4599     if (!acb->is_write)
4600         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4601     qemu_vfree(acb->bounce);
4602     acb->common.cb(acb->common.opaque, acb->ret);
4603     qemu_bh_delete(acb->bh);
4604     acb->bh = NULL;
4605     qemu_aio_release(acb);
4606 }
4607 
4608 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4609                                             int64_t sector_num,
4610                                             QEMUIOVector *qiov,
4611                                             int nb_sectors,
4612                                             BlockDriverCompletionFunc *cb,
4613                                             void *opaque,
4614                                             int is_write)
4615 {
4617     BlockDriverAIOCBSync *acb;
4618 
4619     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4620     acb->is_write = is_write;
4621     acb->qiov = qiov;
4622     acb->bounce = qemu_blockalign(bs, qiov->size);
4623     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4624 
4625     if (is_write) {
4626         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4627         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4628     } else {
4629         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4630     }
4631 
4632     qemu_bh_schedule(acb->bh);
4633 
4634     return &acb->common;
4635 }
4636 
4637 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4638         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4639         BlockDriverCompletionFunc *cb, void *opaque)
4640 {
4641     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4642 }
4643 
4644 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4645         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4646         BlockDriverCompletionFunc *cb, void *opaque)
4647 {
4648     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4649 }
4650 
4651 
4652 typedef struct BlockDriverAIOCBCoroutine {
4653     BlockDriverAIOCB common;
4654     BlockRequest req;
4655     bool is_write;
4656     bool *done;
4657     QEMUBH *bh;
4658 } BlockDriverAIOCBCoroutine;
4659 
4660 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4661 {
4662     BlockDriverAIOCBCoroutine *acb =
4663         container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4664     bool done = false;
4665 
4666     acb->done = &done;
4667     while (!done) {
4668         qemu_aio_wait();
4669     }
4670 }
4671 
4672 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4673     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
4674     .cancel             = bdrv_aio_co_cancel_em,
4675 };
4676 
4677 static void bdrv_co_em_bh(void *opaque)
4678 {
4679     BlockDriverAIOCBCoroutine *acb = opaque;
4680 
4681     acb->common.cb(acb->common.opaque, acb->req.error);
4682 
4683     if (acb->done) {
4684         *acb->done = true;
4685     }
4686 
4687     qemu_bh_delete(acb->bh);
4688     qemu_aio_release(acb);
4689 }
4690 
4691 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4692 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4693 {
4694     BlockDriverAIOCBCoroutine *acb = opaque;
4695     BlockDriverState *bs = acb->common.bs;
4696 
4697     if (!acb->is_write) {
4698         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4699             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4700     } else {
4701         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4702             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4703     }
4704 
4705     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4706     qemu_bh_schedule(acb->bh);
4707 }
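
/*
 * Note on the bottom half above: completion is deferred to a BH so that the
 * user callback runs from the main loop rather than inside the coroutine,
 * matching the semantics callers expect from a true AIO implementation.
 */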
4708 
4709 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4710                                                int64_t sector_num,
4711                                                QEMUIOVector *qiov,
4712                                                int nb_sectors,
4713                                                BdrvRequestFlags flags,
4714                                                BlockDriverCompletionFunc *cb,
4715                                                void *opaque,
4716                                                bool is_write)
4717 {
4718     Coroutine *co;
4719     BlockDriverAIOCBCoroutine *acb;
4720 
4721     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4722     acb->req.sector = sector_num;
4723     acb->req.nb_sectors = nb_sectors;
4724     acb->req.qiov = qiov;
4725     acb->req.flags = flags;
4726     acb->is_write = is_write;
4727     acb->done = NULL;
4728 
4729     co = qemu_coroutine_create(bdrv_co_do_rw);
4730     qemu_coroutine_enter(co, acb);
4731 
4732     return &acb->common;
4733 }
4734 
4735 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4736 {
4737     BlockDriverAIOCBCoroutine *acb = opaque;
4738     BlockDriverState *bs = acb->common.bs;
4739 
4740     acb->req.error = bdrv_co_flush(bs);
4741     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4742     qemu_bh_schedule(acb->bh);
4743 }
4744 
4745 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4746         BlockDriverCompletionFunc *cb, void *opaque)
4747 {
4748     trace_bdrv_aio_flush(bs, opaque);
4749 
4750     Coroutine *co;
4751     BlockDriverAIOCBCoroutine *acb;
4752 
4753     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4754     acb->done = NULL;
4755 
4756     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4757     qemu_coroutine_enter(co, acb);
4758 
4759     return &acb->common;
4760 }
4761 
4762 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4763 {
4764     BlockDriverAIOCBCoroutine *acb = opaque;
4765     BlockDriverState *bs = acb->common.bs;
4766 
4767     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4768     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4769     qemu_bh_schedule(acb->bh);
4770 }
4771 
4772 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4773         int64_t sector_num, int nb_sectors,
4774         BlockDriverCompletionFunc *cb, void *opaque)
4775 {
4776     Coroutine *co;
4777     BlockDriverAIOCBCoroutine *acb;
4778 
4779     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4780 
4781     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4782     acb->req.sector = sector_num;
4783     acb->req.nb_sectors = nb_sectors;
4784     acb->done = NULL;
4785     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4786     qemu_coroutine_enter(co, acb);
4787 
4788     return &acb->common;
4789 }
4790 
4791 void bdrv_init(void)
4792 {
4793     module_call_init(MODULE_INIT_BLOCK);
4794 }
4795 
4796 void bdrv_init_with_whitelist(void)
4797 {
4798     use_bdrv_whitelist = 1;
4799     bdrv_init();
4800 }
4801 
4802 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4803                    BlockDriverCompletionFunc *cb, void *opaque)
4804 {
4805     BlockDriverAIOCB *acb;
4806 
4807     acb = g_slice_alloc(aiocb_info->aiocb_size);
4808     acb->aiocb_info = aiocb_info;
4809     acb->bs = bs;
4810     acb->cb = cb;
4811     acb->opaque = opaque;
4812     return acb;
4813 }
4814 
4815 void qemu_aio_release(void *p)
4816 {
4817     BlockDriverAIOCB *acb = p;
4818     g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4819 }
4820 
4821 /**************************************************************/
4822 /* Coroutine block device emulation */
4823 
4824 typedef struct CoroutineIOCompletion {
4825     Coroutine *coroutine;
4826     int ret;
4827 } CoroutineIOCompletion;
4828 
4829 static void bdrv_co_io_em_complete(void *opaque, int ret)
4830 {
4831     CoroutineIOCompletion *co = opaque;
4832 
4833     co->ret = ret;
4834     qemu_coroutine_enter(co->coroutine, NULL);
4835 }
4836 
4837 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4838                                       int nb_sectors, QEMUIOVector *iov,
4839                                       bool is_write)
4840 {
4841     CoroutineIOCompletion co = {
4842         .coroutine = qemu_coroutine_self(),
4843     };
4844     BlockDriverAIOCB *acb;
4845 
4846     if (is_write) {
4847         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4848                                        bdrv_co_io_em_complete, &co);
4849     } else {
4850         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4851                                       bdrv_co_io_em_complete, &co);
4852     }
4853 
4854     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4855     if (!acb) {
4856         return -EIO;
4857     }
4858     qemu_coroutine_yield();
4859 
4860     return co.ret;
4861 }
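
/*
 * Summary of the AIO-to-coroutine bridge above:
 *  1. the coroutine issues the driver's AIO request, passing
 *     bdrv_co_io_em_complete() as the completion callback;
 *  2. it yields, handing control back to the main loop;
 *  3. when the request finishes, the callback records the return value
 *     and re-enters the coroutine, which then returns co.ret.
 */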
4862 
4863 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4864                                          int64_t sector_num, int nb_sectors,
4865                                          QEMUIOVector *iov)
4866 {
4867     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4868 }
4869 
4870 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4871                                          int64_t sector_num, int nb_sectors,
4872                                          QEMUIOVector *iov)
4873 {
4874     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4875 }
4876 
4877 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4878 {
4879     RwCo *rwco = opaque;
4880 
4881     rwco->ret = bdrv_co_flush(rwco->bs);
4882 }
4883 
4884 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4885 {
4886     int ret;
4887 
4888     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4889         return 0;
4890     }
4891 
4892     /* Write back cached data to the OS even with cache=unsafe */
4893     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4894     if (bs->drv->bdrv_co_flush_to_os) {
4895         ret = bs->drv->bdrv_co_flush_to_os(bs);
4896         if (ret < 0) {
4897             return ret;
4898         }
4899     }
4900 
4901     /* But don't actually force it to the disk with cache=unsafe */
4902     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4903         goto flush_parent;
4904     }
4905 
4906     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4907     if (bs->drv->bdrv_co_flush_to_disk) {
4908         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4909     } else if (bs->drv->bdrv_aio_flush) {
4910         BlockDriverAIOCB *acb;
4911         CoroutineIOCompletion co = {
4912             .coroutine = qemu_coroutine_self(),
4913         };
4914 
4915         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4916         if (acb == NULL) {
4917             ret = -EIO;
4918         } else {
4919             qemu_coroutine_yield();
4920             ret = co.ret;
4921         }
4922     } else {
4923         /*
4924          * Some block drivers always operate in either writethrough or unsafe
4925          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4926          * know how the server works (because the behaviour is hardcoded or
4927          * depends on server-side configuration), so we can't ensure that
4928          * everything is safe on disk. Returning an error doesn't work because
4929          * that would break guests even if the server operates in writethrough
4930          * mode.
4931          *
4932          * Let's hope the user knows what they're doing.
4933          */
4934         ret = 0;
4935     }
4936     if (ret < 0) {
4937         return ret;
4938     }
4939 
4940     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4941      * in the case of cache=unsafe, so there are no useless flushes.
4942      */
4943 flush_parent:
4944     return bdrv_co_flush(bs->file);
4945 }
4946 
4947 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4948 {
4949     Error *local_err = NULL;
4950     int ret;
4951 
4952     if (!bs->drv)  {
4953         return;
4954     }
4955 
4956     if (bs->drv->bdrv_invalidate_cache) {
4957         bs->drv->bdrv_invalidate_cache(bs, &local_err);
4958     } else if (bs->file) {
4959         bdrv_invalidate_cache(bs->file, &local_err);
4960     }
4961     if (local_err) {
4962         error_propagate(errp, local_err);
4963         return;
4964     }
4965 
4966     ret = refresh_total_sectors(bs, bs->total_sectors);
4967     if (ret < 0) {
4968         error_setg_errno(errp, -ret, "Could not refresh total sector count");
4969         return;
4970     }
4971 }
4972 
4973 void bdrv_invalidate_cache_all(Error **errp)
4974 {
4975     BlockDriverState *bs;
4976     Error *local_err = NULL;
4977 
4978     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4979         bdrv_invalidate_cache(bs, &local_err);
4980         if (local_err) {
4981             error_propagate(errp, local_err);
4982             return;
4983         }
4984     }
4985 }
4986 
4987 void bdrv_clear_incoming_migration_all(void)
4988 {
4989     BlockDriverState *bs;
4990 
4991     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4992         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4993     }
4994 }
4995 
4996 int bdrv_flush(BlockDriverState *bs)
4997 {
4998     Coroutine *co;
4999     RwCo rwco = {
5000         .bs = bs,
5001         .ret = NOT_DONE,
5002     };
5003 
5004     if (qemu_in_coroutine()) {
5005         /* Fast-path if already in coroutine context */
5006         bdrv_flush_co_entry(&rwco);
5007     } else {
5008         co = qemu_coroutine_create(bdrv_flush_co_entry);
5009         qemu_coroutine_enter(co, &rwco);
5010         while (rwco.ret == NOT_DONE) {
5011             qemu_aio_wait();
5012         }
5013     }
5014 
5015     return rwco.ret;
5016 }
5017 
5018 typedef struct DiscardCo {
5019     BlockDriverState *bs;
5020     int64_t sector_num;
5021     int nb_sectors;
5022     int ret;
5023 } DiscardCo;
5024 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5025 {
5026     DiscardCo *rwco = opaque;
5027 
5028     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5029 }
5030 
5031 /* If no limit is specified in the BlockLimits, use a default
5032  * of 32768 512-byte sectors (16 MiB) per request.
5033  */
5034 #define MAX_DISCARD_DEFAULT 32768
5035 
5036 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5037                                  int nb_sectors)
5038 {
5039     int max_discard;
5040 
5041     if (!bs->drv) {
5042         return -ENOMEDIUM;
5043     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5044         return -EIO;
5045     } else if (bs->read_only) {
5046         return -EROFS;
5047     }
5048 
5049     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5050 
5051     /* Do nothing if disabled.  */
5052     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5053         return 0;
5054     }
5055 
5056     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5057         return 0;
5058     }
5059 
5060     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5061     while (nb_sectors > 0) {
5062         int ret;
5063         int num = nb_sectors;
5064 
5065         /* align request */
5066         if (bs->bl.discard_alignment &&
5067             num >= bs->bl.discard_alignment &&
5068             sector_num % bs->bl.discard_alignment) {
5069             if (num > bs->bl.discard_alignment) {
5070                 num = bs->bl.discard_alignment;
5071             }
5072             num -= sector_num % bs->bl.discard_alignment;
5073         }
5074 
5075         /* limit request size */
5076         if (num > max_discard) {
5077             num = max_discard;
5078         }
5079 
5080         if (bs->drv->bdrv_co_discard) {
5081             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5082         } else {
5083             BlockDriverAIOCB *acb;
5084             CoroutineIOCompletion co = {
5085                 .coroutine = qemu_coroutine_self(),
5086             };
5087 
5088             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5089                                             bdrv_co_io_em_complete, &co);
5090             if (acb == NULL) {
5091                 return -EIO;
5092             } else {
5093                 qemu_coroutine_yield();
5094                 ret = co.ret;
5095             }
5096         }
5097         if (ret && ret != -ENOTSUP) {
5098             return ret;
5099         }
5100 
5101         sector_num += num;
5102         nb_sectors -= num;
5103     }
5104     return 0;
5105 }
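
/*
 * Worked example for the alignment step above (illustrative numbers): with
 * bl.discard_alignment == 8, a request starting at sector 5 for 32 sectors
 * first trims num to 8 - (5 % 8) == 3 so that the next chunk starts on an
 * 8-sector boundary; the remaining chunks are then only capped by
 * max_discard.
 */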
5106 
5107 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5108 {
5109     Coroutine *co;
5110     DiscardCo rwco = {
5111         .bs = bs,
5112         .sector_num = sector_num,
5113         .nb_sectors = nb_sectors,
5114         .ret = NOT_DONE,
5115     };
5116 
5117     if (qemu_in_coroutine()) {
5118         /* Fast-path if already in coroutine context */
5119         bdrv_discard_co_entry(&rwco);
5120     } else {
5121         co = qemu_coroutine_create(bdrv_discard_co_entry);
5122         qemu_coroutine_enter(co, &rwco);
5123         while (rwco.ret == NOT_DONE) {
5124             qemu_aio_wait();
5125         }
5126     }
5127 
5128     return rwco.ret;
5129 }
5130 
5131 /**************************************************************/
5132 /* removable device support */
5133 
5134 /**
5135  * Return TRUE if the media is present
5136  */
5137 int bdrv_is_inserted(BlockDriverState *bs)
5138 {
5139     BlockDriver *drv = bs->drv;
5140 
    if (!drv) {
        return 0;
    }
    if (!drv->bdrv_is_inserted) {
        return 1;
    }
5145     return drv->bdrv_is_inserted(bs);
5146 }
5147 
5148 /**
5149  * Return whether the media changed since the last call to this
5150  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5151  */
5152 int bdrv_media_changed(BlockDriverState *bs)
5153 {
5154     BlockDriver *drv = bs->drv;
5155 
5156     if (drv && drv->bdrv_media_changed) {
5157         return drv->bdrv_media_changed(bs);
5158     }
5159     return -ENOTSUP;
5160 }
5161 
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray.
 */
5165 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5166 {
5167     BlockDriver *drv = bs->drv;
5168 
5169     if (drv && drv->bdrv_eject) {
5170         drv->bdrv_eject(bs, eject_flag);
5171     }
5172 
5173     if (bs->device_name[0] != '\0') {
5174         bdrv_emit_qmp_eject_event(bs, eject_flag);
5175     }
5176 }
5177 
5178 /**
5179  * Lock or unlock the media (if it is locked, the user won't be able
5180  * to eject it manually).
5181  */
5182 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5183 {
5184     BlockDriver *drv = bs->drv;
5185 
5186     trace_bdrv_lock_medium(bs, locked);
5187 
5188     if (drv && drv->bdrv_lock_medium) {
5189         drv->bdrv_lock_medium(bs, locked);
5190     }
5191 }
5192 
5193 /* needed for generic scsi interface */
5194 
5195 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5196 {
5197     BlockDriver *drv = bs->drv;
5198 
    if (drv && drv->bdrv_ioctl) {
        return drv->bdrv_ioctl(bs, req, buf);
    }
    return -ENOTSUP;
5202 }
5203 
5204 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5205         unsigned long int req, void *buf,
5206         BlockDriverCompletionFunc *cb, void *opaque)
5207 {
5208     BlockDriver *drv = bs->drv;
5209 
    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
5213 }
5214 
5215 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5216 {
5217     bs->guest_block_size = align;
5218 }
5219 
5220 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5221 {
5222     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5223 }
5224 
5225 /*
5226  * Check if all memory in this vector is sector aligned.
5227  */
5228 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5229 {
5230     int i;
5231     size_t alignment = bdrv_opt_mem_align(bs);
5232 
5233     for (i = 0; i < qiov->niov; i++) {
5234         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5235             return false;
5236         }
5237         if (qiov->iov[i].iov_len % alignment) {
5238             return false;
5239         }
5240     }
5241 
5242     return true;
5243 }
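
/*
 * Sketch (assuming bdrv_opt_mem_align(bs) divides 4096): buffers obtained
 * with qemu_blockalign() satisfy bdrv_qiov_is_aligned() by construction:
 *
 *   void *buf = qemu_blockalign(bs, 4096);
 *   QEMUIOVector qiov;
 *
 *   qemu_iovec_init(&qiov, 1);
 *   qemu_iovec_add(&qiov, buf, 4096);
 *   assert(bdrv_qiov_is_aligned(bs, &qiov));
 *   qemu_iovec_destroy(&qiov);
 *   qemu_vfree(buf);
 */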
5244 
5245 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5246                                           Error **errp)
5247 {
5248     int64_t bitmap_size;
5249     BdrvDirtyBitmap *bitmap;
5250 
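    /* Granularity must be a power of two, in bytes.  */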
5251     assert((granularity & (granularity - 1)) == 0);
5252 
5253     granularity >>= BDRV_SECTOR_BITS;
5254     assert(granularity);
5255     bitmap_size = bdrv_getlength(bs);
5256     if (bitmap_size < 0) {
5257         error_setg_errno(errp, -bitmap_size, "could not get length of device");
5258         errno = -bitmap_size;
5259         return NULL;
5260     }
5261     bitmap_size >>= BDRV_SECTOR_BITS;
5262     bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5263     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5264     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5265     return bitmap;
5266 }
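
/*
 * Example of the granularity arithmetic above (illustrative only): for
 * granularity == 65536 bytes, 65536 >> BDRV_SECTOR_BITS == 128 sectors,
 * and hbitmap_alloc() is called with ffs(128) - 1 == 7, so each bit of
 * the HBitmap covers 2^7 == 128 sectors, i.e. 64 KiB of guest data.
 */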
5267 
5268 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5269 {
5270     BdrvDirtyBitmap *bm, *next;
5271     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5272         if (bm == bitmap) {
5273             QLIST_REMOVE(bitmap, list);
5274             hbitmap_free(bitmap->bitmap);
5275             g_free(bitmap);
5276             return;
5277         }
5278     }
5279 }
5280 
5281 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5282 {
5283     BdrvDirtyBitmap *bm;
5284     BlockDirtyInfoList *list = NULL;
5285     BlockDirtyInfoList **plist = &list;
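    /* plist always points at the tail's 'next' link, so new entries are
     * appended in bitmap order without re-walking the list. */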
5286 
5287     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5288         BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5289         BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5290         info->count = bdrv_get_dirty_count(bs, bm);
5291         info->granularity =
5292             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5293         entry->value = info;
5294         *plist = entry;
5295         plist = &entry->next;
5296     }
5297 
5298     return list;
5299 }
5300 
5301 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5302 {
5303     if (bitmap) {
5304         return hbitmap_get(bitmap->bitmap, sector);
5305     } else {
5306         return 0;
5307     }
5308 }
5309 
5310 void bdrv_dirty_iter_init(BlockDriverState *bs,
5311                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5312 {
5313     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5314 }
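
/*
 * Iteration sketch (hypothetical, modelled on how the mirror block job
 * consumes a dirty bitmap): visit every dirty chunk recorded in a bitmap:
 *
 *   HBitmapIter hbi;
 *   int64_t sector;
 *
 *   bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *   while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *       // 'sector' is the first sector of a dirty granularity chunk
 *   }
 */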
5315 
5316 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5317                     int nr_sectors)
5318 {
5319     BdrvDirtyBitmap *bitmap;
5320     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5321         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5322     }
5323 }
5324 
5325 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5326 {
5327     BdrvDirtyBitmap *bitmap;
5328     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5329         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5330     }
5331 }
5332 
5333 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5334 {
5335     return hbitmap_count(bitmap->bitmap);
5336 }
5337 
5338 /* Get a reference to bs */
5339 void bdrv_ref(BlockDriverState *bs)
5340 {
5341     bs->refcnt++;
5342 }
5343 
/* Release a previously grabbed reference to bs.
 * If the reference count drops to zero, the BlockDriverState is
 * deleted. */
5347 void bdrv_unref(BlockDriverState *bs)
5348 {
5349     assert(bs->refcnt > 0);
5350     if (--bs->refcnt == 0) {
5351         bdrv_delete(bs);
5352     }
5353 }
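
/*
 * Typical pattern (sketch): pin bs while a long-running user such as a
 * block job holds on to it:
 *
 *   bdrv_ref(bs);      // bs cannot be deleted underneath us
 *   ...                // use bs
 *   bdrv_unref(bs);    // may delete bs if this was the last reference
 */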
5354 
5355 struct BdrvOpBlocker {
5356     Error *reason;
5357     QLIST_ENTRY(BdrvOpBlocker) list;
5358 };
5359 
5360 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5361 {
5362     BdrvOpBlocker *blocker;
5363     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5364     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5365         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5366         if (errp) {
5367             error_setg(errp, "Device '%s' is busy: %s",
5368                        bs->device_name, error_get_pretty(blocker->reason));
5369         }
5370         return true;
5371     }
5372     return false;
5373 }
5374 
5375 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5376 {
5377     BdrvOpBlocker *blocker;
5378     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5379 
5380     blocker = g_malloc0(sizeof(BdrvOpBlocker));
5381     blocker->reason = reason;
5382     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5383 }
5384 
5385 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5386 {
5387     BdrvOpBlocker *blocker, *next;
5388     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5389     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5390         if (blocker->reason == reason) {
5391             QLIST_REMOVE(blocker, list);
5392             g_free(blocker);
5393         }
5394     }
5395 }
5396 
5397 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5398 {
5399     int i;
5400     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5401         bdrv_op_block(bs, i, reason);
5402     }
5403 }
5404 
5405 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5406 {
5407     int i;
5408     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5409         bdrv_op_unblock(bs, i, reason);
5410     }
5411 }
5412 
5413 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5414 {
5415     int i;
5416 
5417     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5418         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5419             return false;
5420         }
5421     }
5422     return true;
5423 }
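
/*
 * Usage sketch (mirrors how block jobs protect their device; the message
 * is illustrative): block all operations while an owner is active, then
 * lift the blockers with the same Error object:
 *
 *   Error *blocker = NULL;
 *
 *   error_setg(&blocker, "device is in use by a block job");
 *   bdrv_op_block_all(bs, blocker);
 *   ...
 *   bdrv_op_unblock_all(bs, blocker);
 *   error_free(blocker);
 */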
5424 
5425 void bdrv_iostatus_enable(BlockDriverState *bs)
5426 {
5427     bs->iostatus_enabled = true;
5428     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5429 }
5430 
5431 /* The I/O status is only enabled if the drive explicitly
5432  * enables it _and_ the VM is configured to stop on errors */
5433 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5434 {
5435     return (bs->iostatus_enabled &&
5436            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5437             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5438             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5439 }
5440 
5441 void bdrv_iostatus_disable(BlockDriverState *bs)
5442 {
5443     bs->iostatus_enabled = false;
5444 }
5445 
5446 void bdrv_iostatus_reset(BlockDriverState *bs)
5447 {
5448     if (bdrv_iostatus_is_enabled(bs)) {
5449         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5450         if (bs->job) {
5451             block_job_iostatus_reset(bs->job);
5452         }
5453     }
5454 }
5455 
5456 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5457 {
5458     assert(bdrv_iostatus_is_enabled(bs));
5459     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5460         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5461                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5462     }
5463 }
5464 
5465 void
5466 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5467         enum BlockAcctType type)
5468 {
5469     assert(type < BDRV_MAX_IOTYPE);
5470 
5471     cookie->bytes = bytes;
5472     cookie->start_time_ns = get_clock();
5473     cookie->type = type;
5474 }
5475 
5476 void
5477 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5478 {
5479     assert(cookie->type < BDRV_MAX_IOTYPE);
5480 
5481     bs->nr_bytes[cookie->type] += cookie->bytes;
5482     bs->nr_ops[cookie->type]++;
5483     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5484 }
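
/*
 * Accounting sketch (how a device model would wrap a request; the 4 KiB
 * read is illustrative):
 *
 *   BlockAcctCookie cookie;
 *
 *   bdrv_acct_start(bs, &cookie, 4096, BDRV_ACCT_READ);
 *   ...                          // issue and complete the 4 KiB read
 *   bdrv_acct_done(bs, &cookie);
 */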
5485 
5486 void bdrv_img_create(const char *filename, const char *fmt,
5487                      const char *base_filename, const char *base_fmt,
5488                      char *options, uint64_t img_size, int flags,
5489                      Error **errp, bool quiet)
5490 {
5491     QEMUOptionParameter *param = NULL, *create_options = NULL;
5492     QEMUOptionParameter *backing_fmt, *backing_file, *size;
5493     BlockDriver *drv, *proto_drv;
5494     BlockDriver *backing_drv = NULL;
5495     Error *local_err = NULL;
5496     int ret = 0;
5497 
5498     /* Find driver and parse its options */
5499     drv = bdrv_find_format(fmt);
5500     if (!drv) {
5501         error_setg(errp, "Unknown file format '%s'", fmt);
5502         return;
5503     }
5504 
5505     proto_drv = bdrv_find_protocol(filename, true);
5506     if (!proto_drv) {
5507         error_setg(errp, "Unknown protocol '%s'", filename);
5508         return;
5509     }
5510 
5511     create_options = append_option_parameters(create_options,
5512                                               drv->create_options);
5513     create_options = append_option_parameters(create_options,
5514                                               proto_drv->create_options);
5515 
5516     /* Create parameter list with default values */
5517     param = parse_option_parameters("", create_options, param);
5518 
5519     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5520 
5521     /* Parse -o options */
5522     if (options) {
5523         param = parse_option_parameters(options, create_options, param);
5524         if (param == NULL) {
5525             error_setg(errp, "Invalid options for file format '%s'.", fmt);
5526             goto out;
5527         }
5528     }
5529 
5530     if (base_filename) {
5531         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5532                                  base_filename)) {
5533             error_setg(errp, "Backing file not supported for file format '%s'",
5534                        fmt);
5535             goto out;
5536         }
5537     }
5538 
5539     if (base_fmt) {
5540         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5541             error_setg(errp, "Backing file format not supported for file "
5542                              "format '%s'", fmt);
5543             goto out;
5544         }
5545     }
5546 
5547     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5548     if (backing_file && backing_file->value.s) {
5549         if (!strcmp(filename, backing_file->value.s)) {
5550             error_setg(errp, "Error: Trying to create an image with the "
5551                              "same filename as the backing file");
5552             goto out;
5553         }
5554     }
5555 
5556     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5557     if (backing_fmt && backing_fmt->value.s) {
5558         backing_drv = bdrv_find_format(backing_fmt->value.s);
5559         if (!backing_drv) {
5560             error_setg(errp, "Unknown backing file format '%s'",
5561                        backing_fmt->value.s);
5562             goto out;
5563         }
5564     }
5565 
    /* The size for the image must always be specified, with one exception:
     * if we are using a backing file, we can obtain the size from there.  */
5568     size = get_option_parameter(param, BLOCK_OPT_SIZE);
5569     if (size && size->value.n == -1) {
5570         if (backing_file && backing_file->value.s) {
5571             BlockDriverState *bs;
            uint64_t backing_size;
5573             char buf[32];
5574             int back_flags;
5575 
5576             /* backing files always opened read-only */
5577             back_flags =
5578                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5579 
5580             bs = NULL;
5581             ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
5582                             backing_drv, &local_err);
5583             if (ret < 0) {
5584                 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5585                                  backing_file->value.s,
5586                                  error_get_pretty(local_err));
5587                 error_free(local_err);
5588                 local_err = NULL;
5589                 goto out;
5590             }
            bdrv_get_geometry(bs, &backing_size);
            backing_size *= BDRV_SECTOR_SIZE;

            snprintf(buf, sizeof(buf), "%" PRIu64, backing_size);
5595             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5596 
5597             bdrv_unref(bs);
5598         } else {
5599             error_setg(errp, "Image creation needs a size parameter");
5600             goto out;
5601         }
5602     }
5603 
5604     if (!quiet) {
5605         printf("Formatting '%s', fmt=%s ", filename, fmt);
5606         print_option_parameters(param);
5607         puts("");
5608     }
5609     ret = bdrv_create(drv, filename, param, &local_err);
5610     if (ret == -EFBIG) {
5611         /* This is generally a better message than whatever the driver would
5612          * deliver (especially because of the cluster_size_hint), since that
5613          * is most probably not much different from "image too large". */
5614         const char *cluster_size_hint = "";
5615         if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5616             cluster_size_hint = " (try using a larger cluster size)";
5617         }
5618         error_setg(errp, "The image size is too large for file format '%s'"
5619                    "%s", fmt, cluster_size_hint);
5620         error_free(local_err);
5621         local_err = NULL;
5622     }
5623 
5624 out:
5625     free_option_parameters(create_options);
5626     free_option_parameters(param);
5627 
5628     if (local_err) {
5629         error_propagate(errp, local_err);
5630     }
5631 }
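
/*
 * Usage sketch (hypothetical, mirrors what qemu-img create does): create a
 * 1 GiB qcow2 image with default options and report any failure:
 *
 *   Error *local_err = NULL;
 *
 *   bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                   1024 * 1024 * 1024, 0, &local_err, false);
 *   if (local_err) {
 *       error_report("%s", error_get_pretty(local_err));
 *       error_free(local_err);
 *   }
 */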
5632 
5633 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5634 {
5635     /* Currently BlockDriverState always uses the main loop AioContext */
5636     return qemu_get_aio_context();
5637 }
5638 
5639 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5640                                     NotifierWithReturn *notifier)
5641 {
5642     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5643 }
5644 
5645 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5646 {
5647     if (bs->drv->bdrv_amend_options == NULL) {
5648         return -ENOTSUP;
5649     }
5650     return bs->drv->bdrv_amend_options(bs, options);
5651 }
5652 
/* This function will be called by the bdrv_recurse_is_first_non_filter method
 * of block filters and by bdrv_is_first_non_filter.
 * It is used to test whether the given bs is the candidate or to recurse
 * further into the node graph.
 */
5658 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5659                                       BlockDriverState *candidate)
5660 {
    /* return false if basic checks fail */
5662     if (!bs || !bs->drv) {
5663         return false;
5664     }
5665 
    /* We have reached a non-filter driver; check whether this bs is the
     * candidate. This is the recursion's termination condition.
     */
5669     if (!bs->drv->is_filter) {
5670         return bs == candidate;
5671     }
5672     /* Down this path the driver is a block filter driver */
5673 
5674     /* If the block filter recursion method is defined use it to recurse down
5675      * the node graph.
5676      */
5677     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5678         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5679     }
5680 
    /* The driver is a block filter but does not allow recursion;
     * return false.
     */
    return false;
5684 }
5685 
/* This function checks whether the candidate is the first non-filter bs down
 * its bs chain. Since we don't have pointers to parents, it explores all bs
 * chains from the top. Some filters can choose not to pass down the recursion.
 */
5690 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5691 {
5692     BlockDriverState *bs;
5693 
5694     /* walk down the bs forest recursively */
5695     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5696         bool perm;
5697 
5698         /* try to recurse in this top level bs */
5699         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5700 
5701         /* candidate is the first non filter */
5702         if (perm) {
5703             return true;
5704         }
5705     }
5706 
5707     return false;
5708 }
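
/*
 * Illustration (hypothetical graph): with a filter driver stacked on top of
 * a qcow2 node,
 *
 *   [filter] -> [qcow2] -> [file]
 *
 * bdrv_is_first_non_filter(qcow2_bs) returns true only if the filter
 * implements bdrv_recurse_is_first_non_filter and passes the recursion down
 * to the qcow2 node, where the termination check succeeds.
 */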
5709