/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/block-backend.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"
#include "qapi-event.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif
struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                         int64_t sector_num,
                                         QEMUIOVector *qiov,
                                         int nb_sectors,
                                         BdrvRequestFlags flags,
                                         BlockCompletionFunc *cb,
                                         void *opaque,
                                         bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* This function drains all throttled I/O requests */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* must be called before bdrv_set_io_limits() if a limit is to be set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  bdrv_get_aio_context(bs),
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
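
/*
 * Example (illustrative sketch, not part of the original file): enabling
 * throttling and then configuring a limit, in the order required above.
 * The ThrottleConfig bucket layout (buckets[], THROTTLE_BPS_TOTAL, .avg)
 * is assumed from qemu/throttle.h of this era; error handling is elided.
 *
 *     ThrottleConfig cfg;
 *     memset(&cfg, 0, sizeof(cfg));
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; // 10 MiB/s
 *
 *     bdrv_io_limits_enable(bs);     // initialize throttle state first
 *     bdrv_set_io_limits(bs, &cfg);  // then apply the actual limits
 */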

/* This function makes an I/O wait if needed
 *
 * @bytes:    the number of bytes of the I/O
 * @is_write: is the I/O a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this I/O have to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this I/O */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the I/O will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
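
/*
 * Example (illustrative, not from the original source): how path_combine()
 * resolves a backing file name relative to an image path.
 *
 *     char dest[PATH_MAX];
 *
 *     path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "base.qcow2");
 *     // dest == "/images/base.qcow2"
 *
 *     path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "/abs.qcow2");
 *     // absolute names are copied unchanged: dest == "/abs.qcow2"
 */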

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
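
/*
 * Example (hypothetical sketch, not a real QEMU driver): a minimal driver
 * registration. The names "example", BDRVExampleState and the
 * example_co_readv/writev callbacks are invented for illustration; a driver
 * that supplies coroutine functions needs none of the emulation above.
 *
 *     static BlockDriver bdrv_example = {
 *         .format_name    = "example",
 *         .instance_size  = sizeof(BDRVExampleState),
 *         .bdrv_co_readv  = example_co_readv,
 *         .bdrv_co_writev = example_co_writev,
 *     };
 *
 *     static void bdrv_example_init(void)
 *     {
 *         bdrv_register(&bdrv_example);
 *     }
 *     block_init(bdrv_example_init);
 */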

BlockDriverState *bdrv_new_root(void)
{
    BlockDriverState *bs = bdrv_new();

    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    return bs;
}

BlockDriverState *bdrv_new(void)
{
    BlockDriverState *bs;
    int i;

    bs = g_new0(BlockDriverState, 1);
    QLIST_INIT(&bs->dirty_bitmaps);
    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
        QLIST_INIT(&bs->op_blockers[i]);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;
    bs->aio_context = qemu_get_aio_context();

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QemuOpts *opts;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .opts = opts,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            aio_poll(qemu_get_aio_context(), true);
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, opts, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
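
/*
 * Example (illustrative sketch, not from the original source): creating a
 * 1 MiB qcow2 image with bdrv_create(), mirroring the pattern
 * bdrv_append_temp_snapshot() uses below. Error handling is elided; the
 * path "/tmp/test.qcow2" is a placeholder.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QemuOpts *opts = qemu_opts_create(drv->create_opts, NULL, 0,
 *                                       &error_abort);
 *     Error *local_err = NULL;
 *     int ret;
 *
 *     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, 1 * 1024 * 1024);
 *     ret = bdrv_create(drv, "/tmp/test.qcow2", opts, &local_err);
 *     qemu_opts_del(opts);
 */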

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.max_transfer_length =
            MIN_NON_ZERO(bs->bl.max_transfer_length,
                         bs->backing_hd->bl.max_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
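
/*
 * Example (illustrative, not from the original source): the typical caller
 * pattern, as used by bdrv_append_temp_snapshot() further down.
 *
 *     char *tmp_filename = g_malloc0(PATH_MAX + 1);
 *     int ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
 *     if (ret < 0) {
 *         // report -ret as an errno value
 *     }
 *     ...
 *     g_free(tmp_filename);
 */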

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

/*
 * Guess image format by probing its contents.
 * This is not a good idea when your image is raw (CVE-2008-2004), but
 * we do it anyway for backward compatibility.
 *
 * @buf         contains the image's first @buf_size bytes.
 * @buf_size    is the buffer size in bytes (generally 2048, but can be smaller
 *              if the image file is smaller)
 * @filename    is its filename.
 *
 * For all block drivers, call the bdrv_probe() method to get its
 * probing score.
 * Return the first block driver with the highest probing score.
 */
static BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                                   const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe) {
            score = d->bdrv_probe(buf, buf_size, filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}
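
/*
 * Example (hypothetical, not a real QEMU driver): the shape of a
 * .bdrv_probe implementation that the loop above would call. A driver
 * typically returns a high score (e.g. 100) when it recognizes its own
 * magic bytes, and 0 otherwise; "EXMP" is an invented magic here.
 *
 *     static int example_probe(const uint8_t *buf, int buf_size,
 *                              const char *filename)
 *     {
 *         if (buf_size >= 4 && !memcmp(buf, "EXMP", 4)) {
 *             return 100;  // confident match on the magic header
 *         }
 *         return 0;        // not our format
 *     }
 */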

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    BlockDriver *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    drv = bdrv_probe_all(buf, ret, filename);
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 * Return 0 on success, -errno on error.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
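
/*
 * Example (illustrative, not from the original source): translating the
 * user-visible cache and discard mode strings into open flags before an
 * open call.
 *
 *     int flags = 0;
 *
 *     if (bdrv_parse_cache_flags("none", &flags) != 0) {
 *         // unknown cache mode
 *     }
 *     if (bdrv_parse_discard_flags("unmap", &flags) != 0) {
 *         // unknown discard mode
 *     }
 *     // flags now contains BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_UNMAP
 */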

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
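
/*
 * Example (illustrative, not from the original source): the reference-count
 * semantics described above.
 *
 *     bdrv_enable_copy_on_read(bs);   // count: 1, feature on
 *     bdrv_enable_copy_on_read(bs);   // count: 2, still on
 *     bdrv_disable_copy_on_read(bs);  // count: 1, still on
 *     bdrv_disable_copy_on_read(bs);  // count: 0, feature off again
 */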

/*
 * Returns the flags that a temporary snapshot should get, based on the
 * originally requested flags (the originally requested image will have flags
 * like a backing file)
 */
static int bdrv_temp_snapshot_flags(int flags)
{
    return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
}

/*
 * Returns the flags that bs->file should get, based on the given flags for
 * the parent BDS
 */
static int bdrv_inherited_flags(int flags)
{
    /* Enable protocol handling, disable format probing for bs->file */
    flags |= BDRV_O_PROTOCOL;

    /* Our block drivers take care to send flushes and respect unmap policy,
     * so we can enable both unconditionally on lower layers. */
    flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;

    /* Clear flags that only apply to the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);

    return flags;
}

/*
 * Returns the flags that bs->backing_hd should get, based on the given flags
 * for the parent BDS
 */
static int bdrv_backing_flags(int flags)
{
    /* backing files are always opened read-only */
    flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ);

    /* snapshot=on is handled on the top layer */
    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY);

    return flags;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);

    /*
     * Snapshots should be writable.
     */
    if (flags & BDRV_O_TEMPORARY) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static void bdrv_assign_node_name(BlockDriverState *bs,
                                  const char *node_name,
                                  Error **errp)
{
    if (!node_name) {
        return;
    }

    /* Check for empty string or invalid characters */
    if (!id_wellformed(node_name)) {
        error_setg(errp, "Invalid node name");
        return;
    }

    /* takes care of avoiding namespace collisions */
    if (blk_by_name(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);
    bs->growable = !!(flags & BDRV_O_PROTOCOL);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto free_and_fail;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert((bs->request_alignment != 0) || bs->sg);
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

static QDict *parse_json_filename(const char *filename, Error **errp)
{
    QObject *options_obj;
    QDict *options;
    int ret;

    ret = strstart(filename, "json:", &filename);
    assert(ret);

    options_obj = qobject_from_json(filename);
    if (!options_obj) {
        error_setg(errp, "Could not parse the JSON options");
        return NULL;
    }

    if (qobject_type(options_obj) != QTYPE_QDICT) {
        qobject_decref(options_obj);
        error_setg(errp, "Invalid JSON object given");
        return NULL;
    }

    options = qobject_to_qdict(options_obj);
    qdict_flatten(options);

    return options;
}
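
/*
 * Example (illustrative, not from the original source): a "json:"
 * pseudo-protocol filename and the flattened QDict it turns into.
 *
 *     json:{"driver": "qcow2", "file": {"driver": "file",
 *                                       "filename": "disk.qcow2"}}
 *
 * After qdict_flatten() the options dictionary contains:
 *
 *     driver        = "qcow2"
 *     file.driver   = "file"
 *     file.filename = "disk.qcow2"
 */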

/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 */
static int bdrv_fill_options(QDict **options, const char **pfilename, int flags,
                             BlockDriver *drv, Error **errp)
{
    const char *filename = *pfilename;
    const char *drvname;
    bool protocol = flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    Error *local_err = NULL;

    /* Parse json: pseudo-protocol */
    if (filename && g_str_has_prefix(filename, "json:")) {
        QDict *json_options = parse_json_filename(filename, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Options given in the filename have lower priority than options
         * specified directly */
        qdict_join(*options, json_options, false);
        QDECREF(json_options);
        *pfilename = filename = NULL;
    }

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put(*options, "filename", qstring_from_str(filename));
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    filename = qdict_get_try_str(*options, "filename");
    drvname = qdict_get_try_str(*options, "driver");

    if (drv) {
        if (drvname) {
            error_setg(errp, "Driver specified twice");
            return -EINVAL;
        }
        drvname = drv->format_name;
        qdict_put(*options, "driver", qstring_from_str(drvname));
    } else {
        if (!drvname && protocol) {
            if (filename) {
                drv = bdrv_find_protocol(filename, parse_filename);
                if (!drv) {
                    error_setg(errp, "Unknown protocol");
                    return -EINVAL;
                }

                drvname = drv->format_name;
                qdict_put(*options, "driver", qstring_from_str(drvname));
            } else {
                error_setg(errp, "Must specify either driver or file");
                return -EINVAL;
            }
        } else if (drvname) {
            drv = bdrv_find_format(drvname);
            if (!drv) {
                error_setg(errp, "Unknown driver '%s'", drvname);
                return -ENOENT;
            }
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}

void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
{
    if (bs->backing_hd) {
        assert(bs->backing_blocker);
        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
    } else if (backing_hd) {
        error_setg(&bs->backing_blocker,
                   "device is used as backing hd of '%s'",
                   bdrv_get_device_name(bs));
    }

    bs->backing_hd = backing_hd;
    if (!backing_hd) {
        error_free(bs->backing_blocker);
        bs->backing_blocker = NULL;
        goto out;
    }
    bs->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_hd->filename);
    pstrcpy(bs->backing_format, sizeof(bs->backing_format),
            backing_hd->drv ? backing_hd->drv->format_name : "");

    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
    /* Otherwise we won't be able to commit due to check in bdrv_commit */
    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
                    bs->backing_blocker);
out:
    bdrv_refresh_limits(bs, NULL);
}

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling this function.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int ret = 0;
    BlockDriver *back_drv = NULL;
    BlockDriverState *backing_hd;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        QDECREF(options);
        goto free_exit;
    }

    backing_hd = bdrv_new();

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    bdrv_backing_flags(bs->open_flags), back_drv, &local_err);
    if (ret < 0) {
        bdrv_unref(backing_hd);
        backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }
    bdrv_set_backing_hd(bs, backing_hd);

free_exit:
    g_free(backing_filename);
    return ret;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        QDECREF(image_options);
        goto done;
    }

    ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);

done:
    qdict_del(options, bdref_key);
    return ret;
}
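
/*
 * Example (illustrative, not from the original source): a flattened
 * BlockdevRef as described above. With bdref_key = "backing" and a parent
 * options QDict containing
 *
 *     backing.driver        = "qcow2"
 *     backing.file.filename = "base.qcow2"
 *
 * a call like
 *
 *     BlockDriverState *backing = NULL;
 *     ret = bdrv_open_image(&backing, NULL, options, "backing",
 *                           flags, true, &local_err);
 *
 * extracts every "backing."-prefixed option into the new image's own QDict
 * and removes the BlockdevRef from the parent's options.
 */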

int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
{
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char *tmp_filename = g_malloc0(PATH_MAX + 1);
    int64_t total_size;
    BlockDriver *bdrv_qcow2;
    QemuOpts *opts = NULL;
    QDict *snapshot_options;
    BlockDriverState *bs_snapshot;
    Error *local_err;
    int ret;

    /* if snapshot, we create a temporary backing file and open it
       instead of opening 'filename' directly */

    /* Get the required size from the image */
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        error_setg_errno(errp, -total_size, "Could not get image size");
        goto out;
    }

    /* Create the temporary image */
    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not get temporary filename");
        goto out;
    }

    bdrv_qcow2 = bdrv_find_format("qcow2");
    opts = qemu_opts_create(bdrv_qcow2->create_opts, NULL, 0,
                            &error_abort);
    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size);
    ret = bdrv_create(bdrv_qcow2, tmp_filename, opts, &local_err);
    qemu_opts_del(opts);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not create temporary overlay "
                         "'%s': %s", tmp_filename,
                         error_get_pretty(local_err));
        error_free(local_err);
        goto out;
    }

    /* Prepare a new options QDict for the temporary file */
    snapshot_options = qdict_new();
    qdict_put(snapshot_options, "file.driver",
              qstring_from_str("file"));
    qdict_put(snapshot_options, "file.filename",
              qstring_from_str(tmp_filename));

    bs_snapshot = bdrv_new();

    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
                    flags, bdrv_qcow2, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto out;
    }

    bdrv_append(bs_snapshot, bs);

out:
    g_free(tmp_filename);
    return ret;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given,
 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
 */
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;
    int snapshot_flags = 0;

    assert(pbs);

    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new();
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    ret = bdrv_fill_options(&options, &filename, flags, drv, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Find the right image format driver */
    drv = NULL;
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Unknown driver: '%s'", drvname);
            ret = -EINVAL;
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));
    if (drv && !drv->bdrv_file_open) {
        /* If the user explicitly wants a format driver here, we'll need to add
         * another layer for the protocol in bs->file */
        flags &= ~BDRV_O_PROTOCOL;
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Open image file without format layer */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        if (flags & BDRV_O_RDWR) {
            flags |= BDRV_O_ALLOW_RDWR;
        }
        if (flags & BDRV_O_SNAPSHOT) {
            snapshot_flags = bdrv_temp_snapshot_flags(flags);
            flags = bdrv_backing_flags(flags);
        }

        assert(file == NULL);
        ret = bdrv_open_image(&file, filename, options, "file",
                              bdrv_inherited_flags(flags),
                              true, &local_err);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Image format probing */
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
    } else if (!drv) {
        error_setg(errp, "Must specify either driver or file");
        ret = -EINVAL;
        goto fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    bdrv_refresh_filename(bs);

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
        if (local_err) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bdrv_get_device_name(bs), entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        if (bs->blk) {
            blk_dev_change_media_cb(bs->blk, true);
        }
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
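
/*
 * Example (illustrative, not from the original source): opening an image
 * with an explicit format driver via the options QDict, following the same
 * pattern bdrv_append_temp_snapshot() uses above. "disk.qcow2" is a
 * placeholder path; error handling is elided.
 *
 *     BlockDriverState *bs = NULL;
 *     QDict *opts = qdict_new();
 *     Error *local_err = NULL;
 *     int ret;
 *
 *     qdict_put(opts, "driver", qstring_from_str("qcow2"));
 *     ret = bdrv_open(&bs, "disk.qcow2", NULL, opts,
 *                     BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL, &local_err);
 *     ...
 *     bdrv_unref(bs);
 */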

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue on which QSIMPLEQ_INIT
 * has already been performed, or it may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    /* bdrv_open() masks this flag out */
    flags &= ~BDRV_O_PROTOCOL;

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags));
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}
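
/*
 * Example (illustrative, not from the original source): reopening two
 * devices in one atomic set, as described above. bs_a and bs_b are
 * placeholders for existing BlockDriverStates.
 *
 *     BlockReopenQueue *queue;
 *     Error *local_err = NULL;
 *     int ret;
 *
 *     queue = bdrv_reopen_queue(NULL, bs_a, bs_a->open_flags | BDRV_O_RDWR);
 *     queue = bdrv_reopen_queue(queue, bs_b, bs_b->open_flags | BDRV_O_RDWR);
 *     ret = bdrv_reopen_multiple(queue, &local_err);
 *     // on failure, every prepared entry has already been aborted and the
 *     // queue freed; no partial reopens remain
 */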

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
1719 
1720 
1721 /*
1722  * Prepares a BlockDriverState for reopen. All changes are staged in the
1723  * 'opaque' field of the BDRVReopenState, which is used and allocated by
1724  * the block driver's .bdrv_reopen_prepare() callback.
1725  *
1726  * bs is the BlockDriverState to reopen
1727  * flags are the new open flags
1728  * queue is the reopen queue
1729  *
1730  * Returns 0 on success, non-zero on error.  On error errp will be set
1731  * as well.
1732  *
1733  * On failure, bdrv_reopen_abort() will be called to clean up any staged
1734  * data.  It is then the responsibility of the caller to call abort() or
1735  * commit() for any other BDS that has been left in a prepare() state.
1736  *
1737  */
1738 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1739                         Error **errp)
1740 {
1741     int ret = -1;
1742     Error *local_err = NULL;
1743     BlockDriver *drv;
1744 
1745     assert(reopen_state != NULL);
1746     assert(reopen_state->bs->drv != NULL);
1747     drv = reopen_state->bs->drv;
1748 
1749     /* if we are to stay read-only, do not allow permission change
1750      * to r/w */
1751     if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1752         reopen_state->flags & BDRV_O_RDWR) {
1753         error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1754                   bdrv_get_device_name(reopen_state->bs));
1755         goto error;
1756     }
1757 
1758 
1759     ret = bdrv_flush(reopen_state->bs);
1760     if (ret) {
1761         error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1762                   strerror(-ret));
1763         goto error;
1764     }
1765 
1766     if (drv->bdrv_reopen_prepare) {
1767         ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1768         if (ret) {
1769             if (local_err != NULL) {
1770                 error_propagate(errp, local_err);
1771             } else {
1772                 error_setg(errp, "failed while preparing to reopen image '%s'",
1773                            reopen_state->bs->filename);
1774             }
1775             goto error;
1776         }
1777     } else {
1778         /* It is currently mandatory to have a bdrv_reopen_prepare()
1779          * handler for each supported drv. */
1780         error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1781                   drv->format_name, bdrv_get_device_name(reopen_state->bs),
1782                  "reopening of file");
1783         ret = -1;
1784         goto error;
1785     }
1786 
1787     ret = 0;
1788 
1789 error:
1790     return ret;
1791 }
1792 
1793 /*
1794  * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1795  * makes them final by swapping the staging BlockDriverState contents into
1796  * the active BlockDriverState contents.
1797  */
1798 void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1799 {
1800     BlockDriver *drv;
1801 
1802     assert(reopen_state != NULL);
1803     drv = reopen_state->bs->drv;
1804     assert(drv != NULL);
1805 
1806     /* If there are any driver level actions to take */
1807     if (drv->bdrv_reopen_commit) {
1808         drv->bdrv_reopen_commit(reopen_state);
1809     }
1810 
1811     /* set BDS specific flags now */
1812     reopen_state->bs->open_flags         = reopen_state->flags;
1813     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1814                                               BDRV_O_CACHE_WB);
1815     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1816 
1817     bdrv_refresh_limits(reopen_state->bs, NULL);
1818 }
1819 
1820 /*
1821  * Abort the reopen, and delete and free the staged changes in
1822  * reopen_state
1823  */
1824 void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1825 {
1826     BlockDriver *drv;
1827 
1828     assert(reopen_state != NULL);
1829     drv = reopen_state->bs->drv;
1830     assert(drv != NULL);
1831 
1832     if (drv->bdrv_reopen_abort) {
1833         drv->bdrv_reopen_abort(reopen_state);
1834     }
1835 }
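
/*
 * An illustrative sketch of the driver side of the prepare/commit/abort
 * contract, for a hypothetical "fmt" driver (FmtReopenState is a made-up
 * staging structure):
 *
 *     static int fmt_reopen_prepare(BDRVReopenState *state,
 *                                   BlockReopenQueue *queue, Error **errp)
 *     {
 *         state->opaque = g_new0(FmtReopenState, 1);
 *         ... validate state->flags, stage new settings in state->opaque;
 *             on error, set errp and return -1 ...
 *         return 0;
 *     }
 *
 *     static void fmt_reopen_commit(BDRVReopenState *state)
 *     {
 *         ... apply the staged settings ...
 *         g_free(state->opaque);
 *         state->opaque = NULL;
 *     }
 *
 *     static void fmt_reopen_abort(BDRVReopenState *state)
 *     {
 *         g_free(state->opaque);
 *         state->opaque = NULL;
 *     }
 */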
1836 
1837 
1838 void bdrv_close(BlockDriverState *bs)
1839 {
1840     BdrvAioNotifier *ban, *ban_next;
1841 
1842     if (bs->job) {
1843         block_job_cancel_sync(bs->job);
1844     }
1845     bdrv_drain_all(); /* complete I/O */
1846     bdrv_flush(bs);
1847     bdrv_drain_all(); /* in case flush left pending I/O */
1848     notifier_list_notify(&bs->close_notifiers, bs);
1849 
1850     if (bs->drv) {
1851         if (bs->backing_hd) {
1852             BlockDriverState *backing_hd = bs->backing_hd;
1853             bdrv_set_backing_hd(bs, NULL);
1854             bdrv_unref(backing_hd);
1855         }
1856         bs->drv->bdrv_close(bs);
1857         g_free(bs->opaque);
1858         bs->opaque = NULL;
1859         bs->drv = NULL;
1860         bs->copy_on_read = 0;
1861         bs->backing_file[0] = '\0';
1862         bs->backing_format[0] = '\0';
1863         bs->total_sectors = 0;
1864         bs->encrypted = 0;
1865         bs->valid_key = 0;
1866         bs->sg = 0;
1867         bs->growable = 0;
1868         bs->zero_beyond_eof = false;
1869         QDECREF(bs->options);
1870         bs->options = NULL;
1871         QDECREF(bs->full_open_options);
1872         bs->full_open_options = NULL;
1873 
1874         if (bs->file != NULL) {
1875             bdrv_unref(bs->file);
1876             bs->file = NULL;
1877         }
1878     }
1879 
1880     if (bs->blk) {
1881         blk_dev_change_media_cb(bs->blk, false);
1882     }
1883 
1884     /*throttling disk I/O limits*/
1885     if (bs->io_limits_enabled) {
1886         bdrv_io_limits_disable(bs);
1887     }
1888 
1889     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
1890         g_free(ban);
1891     }
1892     QLIST_INIT(&bs->aio_notifiers);
1893 }
1894 
1895 void bdrv_close_all(void)
1896 {
1897     BlockDriverState *bs;
1898 
1899     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1900         AioContext *aio_context = bdrv_get_aio_context(bs);
1901 
1902         aio_context_acquire(aio_context);
1903         bdrv_close(bs);
1904         aio_context_release(aio_context);
1905     }
1906 }
1907 
1908 /* Check if any requests are in-flight (including throttled requests) */
1909 static bool bdrv_requests_pending(BlockDriverState *bs)
1910 {
1911     if (!QLIST_EMPTY(&bs->tracked_requests)) {
1912         return true;
1913     }
1914     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1915         return true;
1916     }
1917     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
1918         return true;
1919     }
1920     if (bs->file && bdrv_requests_pending(bs->file)) {
1921         return true;
1922     }
1923     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1924         return true;
1925     }
1926     return false;
1927 }
1928 
1929 static bool bdrv_drain_one(BlockDriverState *bs)
1930 {
1931     bool bs_busy;
1932 
1933     bdrv_flush_io_queue(bs);
1934     bdrv_start_throttled_reqs(bs);
1935     bs_busy = bdrv_requests_pending(bs);
1936     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
1937     return bs_busy;
1938 }
1939 
1940 /*
1941  * Wait for pending requests to complete on a single BlockDriverState subtree
1942  *
1943  * See the warning in bdrv_drain_all().  This function can only be called if
1944  * you are sure nothing can generate I/O because you have op blockers
1945  * installed.
1946  *
1947  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
1948  * AioContext.
1949  */
1950 void bdrv_drain(BlockDriverState *bs)
1951 {
1952     while (bdrv_drain_one(bs)) {
1953         /* Keep iterating */
1954     }
1955 }
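
/*
 * An illustrative caller-side sketch for bdrv_drain(); the AioContext
 * must be held across the call:
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);
 *     aio_context_release(ctx);
 */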
1956 
1957 /*
1958  * Wait for pending requests to complete across all BlockDriverStates
1959  *
1960  * This function does not flush data to disk, use bdrv_flush_all() for that
1961  * after calling this function.
1962  *
1963  * Note that completion of an asynchronous I/O operation can trigger any
1964  * number of other I/O operations on other devices---for example a coroutine
1965  * can be arbitrarily complex and a constant flow of I/O can come until the
1966  * coroutine is complete.  Because of this, draining a single device's I/O
1967  * queue is unsafe in general; see the restrictions documented for bdrv_drain().
1968  */
1969 void bdrv_drain_all(void)
1970 {
1971     /* Always run first iteration so any pending completion BHs run */
1972     bool busy = true;
1973     BlockDriverState *bs;
1974 
1975     while (busy) {
1976         busy = false;
1977 
1978         QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1979             AioContext *aio_context = bdrv_get_aio_context(bs);
1980 
1981             aio_context_acquire(aio_context);
1982             busy |= bdrv_drain_one(bs);
1983             aio_context_release(aio_context);
1984         }
1985     }
1986 }
1987 
1988 /* make a BlockDriverState anonymous by removing it from the bdrv_states and
1989  * graph_bdrv_states lists.
1990  * Also, NUL-terminate the node_name to prevent a double remove. */
1991 void bdrv_make_anon(BlockDriverState *bs)
1992 {
1993     /*
1994      * Take care to remove bs from bdrv_states only when it's actually
1995      * in it.  Note that bs->device_list.tqe_prev is initially null,
1996      * and gets set to non-null by QTAILQ_INSERT_TAIL().  Establish
1997      * the useful invariant "bs in bdrv_states iff bs->tqe_prev" by
1998      * resetting it to null on remove.
1999      */
2000     if (bs->device_list.tqe_prev) {
2001         QTAILQ_REMOVE(&bdrv_states, bs, device_list);
2002         bs->device_list.tqe_prev = NULL;
2003     }
2004     if (bs->node_name[0] != '\0') {
2005         QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
2006     }
2007     bs->node_name[0] = '\0';
2008 }
2009 
2010 static void bdrv_rebind(BlockDriverState *bs)
2011 {
2012     if (bs->drv && bs->drv->bdrv_rebind) {
2013         bs->drv->bdrv_rebind(bs);
2014     }
2015 }
2016 
2017 static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
2018                                      BlockDriverState *bs_src)
2019 {
2020     /* move some fields that need to stay attached to the device */
2021 
2022     /* dev info */
2023     bs_dest->guest_block_size   = bs_src->guest_block_size;
2024     bs_dest->copy_on_read       = bs_src->copy_on_read;
2025 
2026     bs_dest->enable_write_cache = bs_src->enable_write_cache;
2027 
2028     /* i/o throttled req */
2029     memcpy(&bs_dest->throttle_state,
2030            &bs_src->throttle_state,
2031            sizeof(ThrottleState));
2032     bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
2033     bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
2034     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
2035 
2036     /* r/w error */
2037     bs_dest->on_read_error      = bs_src->on_read_error;
2038     bs_dest->on_write_error     = bs_src->on_write_error;
2039 
2040     /* i/o status */
2041     bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
2042     bs_dest->iostatus           = bs_src->iostatus;
2043 
2044     /* dirty bitmap */
2045     bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
2046 
2047     /* reference count */
2048     bs_dest->refcnt             = bs_src->refcnt;
2049 
2050     /* job */
2051     bs_dest->job                = bs_src->job;
2052 
2053     /* keep the same entry in bdrv_states */
2054     bs_dest->device_list = bs_src->device_list;
2055     bs_dest->blk = bs_src->blk;
2056 
2057     memcpy(bs_dest->op_blockers, bs_src->op_blockers,
2058            sizeof(bs_dest->op_blockers));
2059 }
2060 
2061 /*
2062  * Swap bs contents for two image chains while they are live,
2063  * while keeping required fields on the BlockDriverState that is
2064  * actually attached to a device.
2065  *
2066  * This will modify the BlockDriverState fields, and swap contents
2067  * between bs_new and bs_old. Both bs_new and bs_old are modified.
2068  *
2069  * bs_new must not be attached to a BlockBackend.
2070  *
2071  * This function does not create any image files.
2072  */
2073 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
2074 {
2075     BlockDriverState tmp;
2076 
2077     /* The code needs to swap the node_name, but simply swapping node_list
2078      * won't work: first remove the nodes from the graph list, then do the
2079      * swap, and finally insert them back if needed.
2080      */
2081     if (bs_new->node_name[0] != '\0') {
2082         QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
2083     }
2084     if (bs_old->node_name[0] != '\0') {
2085         QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
2086     }
2087 
2088     /* bs_new must be unattached and shouldn't have anything fancy enabled */
2089     assert(!bs_new->blk);
2090     assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
2091     assert(bs_new->job == NULL);
2092     assert(bs_new->io_limits_enabled == false);
2093     assert(!throttle_have_timer(&bs_new->throttle_state));
2094 
2095     tmp = *bs_new;
2096     *bs_new = *bs_old;
2097     *bs_old = tmp;
2098 
2099     /* there are some fields that should not be swapped, move them back */
2100     bdrv_move_feature_fields(&tmp, bs_old);
2101     bdrv_move_feature_fields(bs_old, bs_new);
2102     bdrv_move_feature_fields(bs_new, &tmp);
2103 
2104     /* bs_new must remain unattached */
2105     assert(!bs_new->blk);
2106 
2107     /* Check a few fields that should remain attached to the device */
2108     assert(bs_new->job == NULL);
2109     assert(bs_new->io_limits_enabled == false);
2110     assert(!throttle_have_timer(&bs_new->throttle_state));
2111 
2112     /* insert the nodes back into the graph node list if needed */
2113     if (bs_new->node_name[0] != '\0') {
2114         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
2115     }
2116     if (bs_old->node_name[0] != '\0') {
2117         QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
2118     }
2119 
2120     bdrv_rebind(bs_new);
2121     bdrv_rebind(bs_old);
2122 }
2123 
2124 /*
2125  * Add new bs contents at the top of an image chain while the chain is
2126  * live, while keeping required fields on the top layer.
2127  *
2128  * This will modify the BlockDriverState fields, and swap contents
2129  * between bs_new and bs_top. Both bs_new and bs_top are modified.
2130  *
2131  * bs_new must not be attached to a BlockBackend.
2132  *
2133  * This function does not create any image files.
2134  */
2135 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
2136 {
2137     bdrv_swap(bs_new, bs_top);
2138 
2139     /* After the swap, bs_new holds the old contents of bs_top (the
2140      * previous top of the chain), so make it bs_top's backing file. */
2141     bdrv_set_backing_hd(bs_top, bs_new);
2142 }
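
/*
 * An illustrative sketch: installing a freshly opened, unattached overlay
 * on top of the current top layer, e.g. when taking an external snapshot
 * (bs_new and bs_top stand for hypothetical states here):
 *
 *     bdrv_append(bs_new, bs_top);
 *
 * Afterwards bs_top keeps its device attachment but carries the new
 * overlay's contents, with the old top image as its backing file.
 */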
2143 
2144 static void bdrv_delete(BlockDriverState *bs)
2145 {
2146     assert(!bs->job);
2147     assert(bdrv_op_blocker_is_empty(bs));
2148     assert(!bs->refcnt);
2149     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
2150 
2151     bdrv_close(bs);
2152 
2153     /* remove from list, if necessary */
2154     bdrv_make_anon(bs);
2155 
2156     g_free(bs);
2157 }
2158 
2159 /*
2160  * Run consistency checks on an image
2161  *
2162  * Returns 0 if the check could be completed (it doesn't mean that the image is
2163  * free of errors) or -errno when an internal error occurred. The results of the
2164  * check are stored in res.
2165  */
2166 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2167 {
2168     if (bs->drv == NULL) {
2169         return -ENOMEDIUM;
2170     }
2171     if (bs->drv->bdrv_check == NULL) {
2172         return -ENOTSUP;
2173     }
2174 
2175     memset(res, 0, sizeof(*res));
2176     return bs->drv->bdrv_check(bs, res, fix);
2177 }
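
/*
 * An illustrative check-and-repair sketch (assuming the format driver
 * implements .bdrv_check):
 *
 *     BdrvCheckResult result;
 *     int ret = bdrv_check(bs, &result, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);
 *
 *     if (ret == 0 && (result.corruptions || result.leaks)) {
 *         ... the check completed, but problems remain in the image ...
 *     }
 */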
2178 
2179 #define COMMIT_BUF_SECTORS 2048
2180 
2181 /* commit COW file into the raw image */
2182 int bdrv_commit(BlockDriverState *bs)
2183 {
2184     BlockDriver *drv = bs->drv;
2185     int64_t sector, total_sectors, length, backing_length;
2186     int n, ro, open_flags;
2187     int ret = 0;
2188     uint8_t *buf = NULL;
2189     char filename[PATH_MAX];
2190 
2191     if (!drv) {
2192         return -ENOMEDIUM;
         }
2193 
2194     if (!bs->backing_hd) {
2195         return -ENOTSUP;
2196     }
2197 
2198     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT, NULL) ||
2199         bdrv_op_is_blocked(bs->backing_hd, BLOCK_OP_TYPE_COMMIT, NULL)) {
2200         return -EBUSY;
2201     }
2202 
2203     ro = bs->backing_hd->read_only;
2204     /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2205     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2206     open_flags =  bs->backing_hd->open_flags;
2207 
2208     if (ro) {
2209         if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2210             return -EACCES;
2211         }
2212     }
2213 
2214     length = bdrv_getlength(bs);
2215     if (length < 0) {
2216         ret = length;
2217         goto ro_cleanup;
2218     }
2219 
2220     backing_length = bdrv_getlength(bs->backing_hd);
2221     if (backing_length < 0) {
2222         ret = backing_length;
2223         goto ro_cleanup;
2224     }
2225 
2226     /* If our top snapshot is larger than the backing file image,
2227      * grow the backing file image if possible.  If not possible,
2228      * we must return an error */
2229     if (length > backing_length) {
2230         ret = bdrv_truncate(bs->backing_hd, length);
2231         if (ret < 0) {
2232             goto ro_cleanup;
2233         }
2234     }
2235 
2236     total_sectors = length >> BDRV_SECTOR_BITS;
2237 
2238     /* qemu_try_blockalign() for bs will choose an alignment that works for
2239      * bs->backing_hd as well, so no need to compare the alignment manually. */
2240     buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2241     if (buf == NULL) {
2242         ret = -ENOMEM;
2243         goto ro_cleanup;
2244     }
2245 
2246     for (sector = 0; sector < total_sectors; sector += n) {
2247         ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2248         if (ret < 0) {
2249             goto ro_cleanup;
2250         }
2251         if (ret) {
2252             ret = bdrv_read(bs, sector, buf, n);
2253             if (ret < 0) {
2254                 goto ro_cleanup;
2255             }
2256 
2257             ret = bdrv_write(bs->backing_hd, sector, buf, n);
2258             if (ret < 0) {
2259                 goto ro_cleanup;
2260             }
2261         }
2262     }
2263 
2264     if (drv->bdrv_make_empty) {
2265         ret = drv->bdrv_make_empty(bs);
2266         if (ret < 0) {
2267             goto ro_cleanup;
2268         }
2269         bdrv_flush(bs);
2270     }
2271 
2272     /*
2273      * Make sure all data we wrote to the backing device is actually
2274      * stable on disk.
2275      */
2276     if (bs->backing_hd) {
2277         bdrv_flush(bs->backing_hd);
2278     }
2279 
2280     ret = 0;
2281 ro_cleanup:
2282     qemu_vfree(buf);
2283 
2284     if (ro) {
2285         /* ignoring error return here */
2286         bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2287     }
2288 
2289     return ret;
2290 }
2291 
2292 int bdrv_commit_all(void)
2293 {
2294     BlockDriverState *bs;
2295 
2296     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2297         AioContext *aio_context = bdrv_get_aio_context(bs);
2298 
2299         aio_context_acquire(aio_context);
2300         if (bs->drv && bs->backing_hd) {
2301             int ret = bdrv_commit(bs);
2302             if (ret < 0) {
2303                 aio_context_release(aio_context);
2304                 return ret;
2305             }
2306         }
2307         aio_context_release(aio_context);
2308     }
2309     return 0;
2310 }
2311 
2312 /**
2313  * Remove an active request from the tracked requests list
2314  *
2315  * This function should be called when a tracked request is completing.
2316  */
2317 static void tracked_request_end(BdrvTrackedRequest *req)
2318 {
2319     if (req->serialising) {
2320         req->bs->serialising_in_flight--;
2321     }
2322 
2323     QLIST_REMOVE(req, list);
2324     qemu_co_queue_restart_all(&req->wait_queue);
2325 }
2326 
2327 /**
2328  * Add an active request to the tracked requests list
2329  */
2330 static void tracked_request_begin(BdrvTrackedRequest *req,
2331                                   BlockDriverState *bs,
2332                                   int64_t offset,
2333                                   unsigned int bytes, bool is_write)
2334 {
2335     *req = (BdrvTrackedRequest){
2336         .bs = bs,
2337         .offset         = offset,
2338         .bytes          = bytes,
2339         .is_write       = is_write,
2340         .co             = qemu_coroutine_self(),
2341         .serialising    = false,
2342         .overlap_offset = offset,
2343         .overlap_bytes  = bytes,
2344     };
2345 
2346     qemu_co_queue_init(&req->wait_queue);
2347 
2348     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2349 }
2350 
2351 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2352 {
2353     int64_t overlap_offset = req->offset & ~(align - 1);
2354     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2355                                - overlap_offset;
2356 
2357     if (!req->serialising) {
2358         req->bs->serialising_in_flight++;
2359         req->serialising = true;
2360     }
2361 
2362     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2363     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2364 }
2365 
2366 /**
2367  * Round a region to cluster boundaries
2368  */
2369 void bdrv_round_to_clusters(BlockDriverState *bs,
2370                             int64_t sector_num, int nb_sectors,
2371                             int64_t *cluster_sector_num,
2372                             int *cluster_nb_sectors)
2373 {
2374     BlockDriverInfo bdi;
2375 
2376     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2377         *cluster_sector_num = sector_num;
2378         *cluster_nb_sectors = nb_sectors;
2379     } else {
2380         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2381         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2382         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2383                                             nb_sectors, c);
2384     }
2385 }
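
/*
 * Worked example: with a 64 KiB cluster size, c is 128 sectors, so a
 * request covering sectors [130, 140) is rounded out to the aligned
 * range [128, 256):
 *
 *     bdrv_round_to_clusters(bs, 130, 10, &start, &num);
 *     (yields start == 128, num == 128)
 */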
2386 
2387 static int bdrv_get_cluster_size(BlockDriverState *bs)
2388 {
2389     BlockDriverInfo bdi;
2390     int ret;
2391 
2392     ret = bdrv_get_info(bs, &bdi);
2393     if (ret < 0 || bdi.cluster_size == 0) {
2394         return bs->request_alignment;
2395     } else {
2396         return bdi.cluster_size;
2397     }
2398 }
2399 
2400 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2401                                      int64_t offset, unsigned int bytes)
2402 {
2403     /*        aaaa   bbbb */
2404     if (offset >= req->overlap_offset + req->overlap_bytes) {
2405         return false;
2406     }
2407     /* bbbb   aaaa        */
2408     if (req->overlap_offset >= offset + bytes) {
2409         return false;
2410     }
2411     return true;
2412 }
2413 
2414 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2415 {
2416     BlockDriverState *bs = self->bs;
2417     BdrvTrackedRequest *req;
2418     bool retry;
2419     bool waited = false;
2420 
2421     if (!bs->serialising_in_flight) {
2422         return false;
2423     }
2424 
2425     do {
2426         retry = false;
2427         QLIST_FOREACH(req, &bs->tracked_requests, list) {
2428             if (req == self || (!req->serialising && !self->serialising)) {
2429                 continue;
2430             }
2431             if (tracked_request_overlaps(req, self->overlap_offset,
2432                                          self->overlap_bytes))
2433             {
2434                 /* Hitting this means there was a reentrant request, for
2435                  * example, a block driver issuing nested requests.  This must
2436                  * never happen since it means deadlock.
2437                  */
2438                 assert(qemu_coroutine_self() != req->co);
2439 
2440                 /* If the request is already (indirectly) waiting for us, or
2441                  * will wait for us as soon as it wakes up, then just go on
2442                  * (instead of producing a deadlock in the former case). */
2443                 if (!req->waiting_for) {
2444                     self->waiting_for = req;
2445                     qemu_co_queue_wait(&req->wait_queue);
2446                     self->waiting_for = NULL;
2447                     retry = true;
2448                     waited = true;
2449                     break;
2450                 }
2451             }
2452         }
2453     } while (retry);
2454 
2455     return waited;
2456 }
2457 
2458 /*
2459  * Return values:
2460  * 0        - success
2461  * -EINVAL  - backing format specified, but no file
2462  * -ENOSPC  - can't update the backing file because no space is left in the
2463  *            image file header
2464  * -ENOTSUP - format driver doesn't support changing the backing file
2465  */
2466 int bdrv_change_backing_file(BlockDriverState *bs,
2467     const char *backing_file, const char *backing_fmt)
2468 {
2469     BlockDriver *drv = bs->drv;
2470     int ret;
2471 
2472     /* Backing file format doesn't make sense without a backing file */
2473     if (backing_fmt && !backing_file) {
2474         return -EINVAL;
2475     }
2476 
2477     if (drv->bdrv_change_backing_file != NULL) {
2478         ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2479     } else {
2480         ret = -ENOTSUP;
2481     }
2482 
2483     if (ret == 0) {
2484         pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2485         pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2486     }
2487     return ret;
2488 }
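
/*
 * An illustrative sketch (hypothetical file names): rebase bs onto a new
 * backing file and record that in the image header:
 *
 *     ret = bdrv_change_backing_file(bs, "new-base.qcow2", "qcow2");
 *     if (ret == -ENOTSUP) {
 *         ... the format driver cannot rewrite its header ...
 *     }
 */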
2489 
2490 /*
2491  * Finds the image layer in the chain that has 'bs' as its backing file.
2492  *
2493  * active is the current topmost image.
2494  *
2495  * Returns NULL if bs is not found in active's image chain,
2496  * or if active == bs.
2497  *
2498  * Returns the bottommost base image if bs == NULL.
2499  */
2500 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2501                                     BlockDriverState *bs)
2502 {
2503     while (active && bs != active->backing_hd) {
2504         active = active->backing_hd;
2505     }
2506 
2507     return active;
2508 }
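
/*
 * Worked example: for the chain base <- mid <- top, with top as the
 * active layer, bdrv_find_overlay(top, base) returns mid,
 * bdrv_find_overlay(top, mid) returns top, and
 * bdrv_find_overlay(top, NULL) returns base.
 */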
2509 
2510 /* Given a BDS, searches for the base layer. */
2511 BlockDriverState *bdrv_find_base(BlockDriverState *bs)
2512 {
2513     return bdrv_find_overlay(bs, NULL);
2514 }
2515 
2516 typedef struct BlkIntermediateStates {
2517     BlockDriverState *bs;
2518     QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2519 } BlkIntermediateStates;
2520 
2521 
2522 /*
2523  * Drops images above 'base' up to and including 'top', and sets the image
2524  * above 'top' to have base as its backing file.
2525  *
2526  * Requires that the overlay to 'top' is opened r/w, so that the backing file
2527  * information in 'bs' can be properly updated.
2528  *
2529  * E.g., this will convert the following chain:
2530  * bottom <- base <- intermediate <- top <- active
2531  *
2532  * to
2533  *
2534  * bottom <- base <- active
2535  *
2536  * It is allowed for bottom==base, in which case it converts:
2537  *
2538  * base <- intermediate <- top <- active
2539  *
2540  * to
2541  *
2542  * base <- active
2543  *
2544  * If backing_file_str is non-NULL, it will be used when modifying top's
2545  * overlay image metadata.
2546  *
2547  * Error conditions:
2548  *  if active == top, that is considered an error
2549  *
2550  */
2551 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2552                            BlockDriverState *base, const char *backing_file_str)
2553 {
2554     BlockDriverState *intermediate;
2555     BlockDriverState *base_bs = NULL;
2556     BlockDriverState *new_top_bs = NULL;
2557     BlkIntermediateStates *intermediate_state, *next;
2558     int ret = -EIO;
2559 
2560     QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2561     QSIMPLEQ_INIT(&states_to_delete);
2562 
2563     if (!top->drv || !base->drv) {
2564         goto exit;
2565     }
2566 
2567     new_top_bs = bdrv_find_overlay(active, top);
2568 
2569     if (new_top_bs == NULL) {
2570         /* we could not find the image above 'top'; this is an error */
2571         goto exit;
2572     }
2573 
2574     /* special case of new_top_bs->backing_hd already pointing to base - nothing
2575      * to do, no intermediate images */
2576     if (new_top_bs->backing_hd == base) {
2577         ret = 0;
2578         goto exit;
2579     }
2580 
2581     intermediate = top;
2582 
2583     /* now we will go down through the list, and add each BDS we find
2584      * into our deletion queue, until we hit the 'base'
2585      */
2586     while (intermediate) {
2587         intermediate_state = g_new0(BlkIntermediateStates, 1);
2588         intermediate_state->bs = intermediate;
2589         QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2590 
2591         if (intermediate->backing_hd == base) {
2592             base_bs = intermediate->backing_hd;
2593             break;
2594         }
2595         intermediate = intermediate->backing_hd;
2596     }
2597     if (base_bs == NULL) {
2598         /* Something went wrong: we did not end at the base.  Safely
2599          * unravel everything, and exit with an error. */
2600         goto exit;
2601     }
2602 
2603     /* success - we can delete the intermediate states, and link top->base */
2604     backing_file_str = backing_file_str ? backing_file_str : base_bs->filename;
2605     ret = bdrv_change_backing_file(new_top_bs, backing_file_str,
2606                                    base_bs->drv ? base_bs->drv->format_name : "");
2607     if (ret) {
2608         goto exit;
2609     }
2610     bdrv_set_backing_hd(new_top_bs, base_bs);
2611 
2612     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2613         /* so that bdrv_close() does not recursively close the chain */
2614         bdrv_set_backing_hd(intermediate_state->bs, NULL);
2615         bdrv_unref(intermediate_state->bs);
2616     }
2617     ret = 0;
2618 
2619 exit:
2620     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2621         g_free(intermediate_state);
2622     }
2623     return ret;
2624 }
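
/*
 * An illustrative sketch: for the chain base <- top <- active, collapsing
 * the intermediate image into its base:
 *
 *     ret = bdrv_drop_intermediate(active, top, base, NULL);
 *
 * On success, active's backing file is base, and every image above base
 * up to and including top has been dropped from the chain and unref'd.
 */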
2625 
2626 
2627 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2628                                    size_t size)
2629 {
2630     int64_t len;
2631 
2632     if (size > INT_MAX) {
2633         return -EIO;
2634     }
2635 
2636     if (!bdrv_is_inserted(bs)) {
2637         return -ENOMEDIUM;
         }
2638 
2639     if (bs->growable) {
2640         return 0;
         }
2641 
2642     len = bdrv_getlength(bs);
2643 
2644     if (offset < 0) {
2645         return -EIO;
         }
2646 
2647     if ((offset > len) || (len - offset < size)) {
2648         return -EIO;
         }
2649 
2650     return 0;
2651 }
2652 
2653 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2654                               int nb_sectors)
2655 {
2656     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2657         return -EIO;
2658     }
2659 
2660     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2661                                    nb_sectors * BDRV_SECTOR_SIZE);
2662 }
2663 
2664 typedef struct RwCo {
2665     BlockDriverState *bs;
2666     int64_t offset;
2667     QEMUIOVector *qiov;
2668     bool is_write;
2669     int ret;
2670     BdrvRequestFlags flags;
2671 } RwCo;
2672 
2673 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2674 {
2675     RwCo *rwco = opaque;
2676 
2677     if (!rwco->is_write) {
2678         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2679                                       rwco->qiov->size, rwco->qiov,
2680                                       rwco->flags);
2681     } else {
2682         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2683                                        rwco->qiov->size, rwco->qiov,
2684                                        rwco->flags);
2685     }
2686 }
2687 
2688 /*
2689  * Process a vectored synchronous request using coroutines
2690  */
2691 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2692                         QEMUIOVector *qiov, bool is_write,
2693                         BdrvRequestFlags flags)
2694 {
2695     Coroutine *co;
2696     RwCo rwco = {
2697         .bs = bs,
2698         .offset = offset,
2699         .qiov = qiov,
2700         .is_write = is_write,
2701         .ret = NOT_DONE,
2702         .flags = flags,
2703     };
2704 
2705     /**
2706      * In a synchronous call context, while the vcpu is blocked, the
2707      * throttling timer will not fire; so I/O throttling has to be disabled
2708      * here if it has been enabled.
2709      */
2710     if (bs->io_limits_enabled) {
2711         fprintf(stderr, "Disabling I/O throttling on '%s' due "
2712                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
2713         bdrv_io_limits_disable(bs);
2714     }
2715 
2716     if (qemu_in_coroutine()) {
2717         /* Fast-path if already in coroutine context */
2718         bdrv_rw_co_entry(&rwco);
2719     } else {
2720         AioContext *aio_context = bdrv_get_aio_context(bs);
2721 
2722         co = qemu_coroutine_create(bdrv_rw_co_entry);
2723         qemu_coroutine_enter(co, &rwco);
2724         while (rwco.ret == NOT_DONE) {
2725             aio_poll(aio_context, true);
2726         }
2727     }
2728     return rwco.ret;
2729 }
2730 
2731 /*
2732  * Process a synchronous request using coroutines
2733  */
2734 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2735                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
2736 {
2737     QEMUIOVector qiov;
2738     struct iovec iov = {
2739         .iov_base = (void *)buf,
2740         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2741     };
2742 
2743     if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2744         return -EINVAL;
2745     }
2746 
2747     qemu_iovec_init_external(&qiov, &iov, 1);
2748     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2749                         &qiov, is_write, flags);
2750 }
2751 
2752 /* return < 0 if error. See bdrv_write() for the return codes */
2753 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2754               uint8_t *buf, int nb_sectors)
2755 {
2756     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2757 }
2758 
2759 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2760 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2761                           uint8_t *buf, int nb_sectors)
2762 {
2763     bool enabled;
2764     int ret;
2765 
2766     enabled = bs->io_limits_enabled;
2767     bs->io_limits_enabled = false;
2768     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2769     bs->io_limits_enabled = enabled;
2770     return ret;
2771 }
2772 
2773 /* Return < 0 if error. Important errors are:
2774   -EIO         generic I/O error (may happen for all errors)
2775   -ENOMEDIUM   No media inserted.
2776   -EINVAL      Invalid sector number or nb_sectors
2777   -EACCES      Trying to write a read-only device
2778 */
2779 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2780                const uint8_t *buf, int nb_sectors)
2781 {
2782     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2783 }
2784 
2785 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2786                       int nb_sectors, BdrvRequestFlags flags)
2787 {
2788     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2789                       BDRV_REQ_ZERO_WRITE | flags);
2790 }
2791 
2792 /*
2793  * Completely zero out a block device with the help of bdrv_write_zeroes.
2794  * The operation is sped up by checking the block status and only writing
2795  * zeroes to regions that do not already read back as zeroes. Optional
2796  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2797  *
2798  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2799  */
2800 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2801 {
2802     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
2803     int n;
2804 
2805     target_sectors = bdrv_nb_sectors(bs);
2806     if (target_sectors < 0) {
2807         return target_sectors;
2808     }
2809 
2810     for (;;) {
2811         nb_sectors = target_sectors - sector_num;
2812         if (nb_sectors <= 0) {
2813             return 0;
2814         }
2815         if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2816             nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
2817         }
2818         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2819         if (ret < 0) {
2820             error_report("error getting block status at sector %" PRId64 ": %s",
2821                          sector_num, strerror(-ret));
2822             return ret;
2823         }
2824         if (ret & BDRV_BLOCK_ZERO) {
2825             sector_num += n;
2826             continue;
2827         }
2828         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2829         if (ret < 0) {
2830             error_report("error writing zeroes at sector %" PRId64 ": %s",
2831                          sector_num, strerror(-ret));
2832             return ret;
2833         }
2834         sector_num += n;
2835     }
2836 }
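
/*
 * An illustrative sketch: zero out a whole device, letting the driver
 * discard blocks where it can:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */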
2837 
2838 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2839 {
2840     QEMUIOVector qiov;
2841     struct iovec iov = {
2842         .iov_base = (void *)buf,
2843         .iov_len = bytes,
2844     };
2845     int ret;
2846 
2847     if (bytes < 0) {
2848         return -EINVAL;
2849     }
2850 
2851     qemu_iovec_init_external(&qiov, &iov, 1);
2852     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2853     if (ret < 0) {
2854         return ret;
2855     }
2856 
2857     return bytes;
2858 }
2859 
2860 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2861 {
2862     int ret;
2863 
2864     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2865     if (ret < 0) {
2866         return ret;
2867     }
2868 
2869     return qiov->size;
2870 }
2871 
2872 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2873                 const void *buf, int bytes)
2874 {
2875     QEMUIOVector qiov;
2876     struct iovec iov = {
2877         .iov_base   = (void *) buf,
2878         .iov_len    = bytes,
2879     };
2880 
2881     if (bytes < 0) {
2882         return -EINVAL;
2883     }
2884 
2885     qemu_iovec_init_external(&qiov, &iov, 1);
2886     return bdrv_pwritev(bs, offset, &qiov);
2887 }
2888 
2889 /*
2890  * Writes to the file and ensures that no writes are reordered across this
2891  * request (acts as a barrier)
2892  *
2893  * Returns 0 on success, -errno in error cases.
2894  */
2895 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2896     const void *buf, int count)
2897 {
2898     int ret;
2899 
2900     ret = bdrv_pwrite(bs, offset, buf, count);
2901     if (ret < 0) {
2902         return ret;
2903     }
2904 
2905     /* No flush needed for cache modes that already do it */
2906     if (bs->enable_write_cache) {
2907         bdrv_flush(bs);
2908     }
2909 
2910     return 0;
2911 }
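
/*
 * An illustrative sketch of an ordered metadata update, as a format
 * driver might issue it (header and header_offset are hypothetical):
 *
 *     ret = bdrv_pwrite_sync(bs->file, header_offset,
 *                            &header, sizeof(header));
 *
 * The barrier semantics guarantee that the header is stable on disk
 * before any later write can overtake it.
 */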
2912 
2913 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2914         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2915 {
2916     /* Perform I/O through a temporary buffer so that users who scribble over
2917      * their read buffer while the operation is in progress do not end up
2918      * modifying the image file.  This is critical for zero-copy guest I/O
2919      * where anything might happen inside guest memory.
2920      */
2921     void *bounce_buffer;
2922 
2923     BlockDriver *drv = bs->drv;
2924     struct iovec iov;
2925     QEMUIOVector bounce_qiov;
2926     int64_t cluster_sector_num;
2927     int cluster_nb_sectors;
2928     size_t skip_bytes;
2929     int ret;
2930 
2931     /* Cover entire cluster so no additional backing file I/O is required when
2932      * allocating a cluster in the image file.
2933      */
2934     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2935                            &cluster_sector_num, &cluster_nb_sectors);
2936 
2937     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2938                                    cluster_sector_num, cluster_nb_sectors);
2939 
2940     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2941     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
2942     if (bounce_buffer == NULL) {
2943         ret = -ENOMEM;
2944         goto err;
2945     }
2946 
2947     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2948 
2949     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2950                              &bounce_qiov);
2951     if (ret < 0) {
2952         goto err;
2953     }
2954 
2955     if (drv->bdrv_co_write_zeroes &&
2956         buffer_is_zero(bounce_buffer, iov.iov_len)) {
2957         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2958                                       cluster_nb_sectors, 0);
2959     } else {
2960         /* This does not change the data on the disk, it is not necessary
2961          * to flush even in cache=writethrough mode.
2962          */
2963         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2964                                   &bounce_qiov);
2965     }
2966 
2967     if (ret < 0) {
2968         /* It might be okay to ignore write errors for guest requests.  If this
2969          * is a deliberate copy-on-read then we don't want to ignore the error.
2970          * Simply report it in all cases.
2971          */
2972         goto err;
2973     }
2974 
2975     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2976     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2977                         nb_sectors * BDRV_SECTOR_SIZE);
2978 
2979 err:
2980     qemu_vfree(bounce_buffer);
2981     return ret;
2982 }
2983 
2984 /*
2985  * Forwards an already correctly aligned request to the BlockDriver. This
2986  * handles copy on read and zeroing after EOF; any other features must be
2987  * implemented by the caller.
2988  */
2989 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2990     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2991     int64_t align, QEMUIOVector *qiov, int flags)
2992 {
2993     BlockDriver *drv = bs->drv;
2994     int ret;
2995 
2996     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2997     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2998 
2999     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3000     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3001     assert(!qiov || bytes == qiov->size);
3002 
3003     /* Handle Copy on Read and associated serialisation */
3004     if (flags & BDRV_REQ_COPY_ON_READ) {
3005         /* If we touch the same cluster it counts as an overlap.  This
3006          * guarantees that allocating writes will be serialized and not race
3007          * with each other for the same cluster.  For example, in copy-on-read
3008          * it ensures that the CoR read and write operations are atomic and
3009          * guest writes cannot interleave between them. */
3010         mark_request_serialising(req, bdrv_get_cluster_size(bs));
3011     }
3012 
3013     wait_serialising_requests(req);
3014 
3015     if (flags & BDRV_REQ_COPY_ON_READ) {
3016         int pnum;
3017 
3018         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
3019         if (ret < 0) {
3020             goto out;
3021         }
3022 
3023         if (!ret || pnum != nb_sectors) {
3024             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
3025             goto out;
3026         }
3027     }
3028 
3029     /* Forward the request to the BlockDriver */
3030     if (!(bs->zero_beyond_eof && bs->growable)) {
3031         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
3032     } else {
3033         /* Read zeros after EOF of growable BDSes */
3034         int64_t total_sectors, max_nb_sectors;
3035 
3036         total_sectors = bdrv_nb_sectors(bs);
3037         if (total_sectors < 0) {
3038             ret = total_sectors;
3039             goto out;
3040         }
3041 
3042         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
3043                                   align >> BDRV_SECTOR_BITS);
3044         if (max_nb_sectors > 0) {
3045             QEMUIOVector local_qiov;
3046             size_t local_sectors;
3047 
3048             max_nb_sectors = MIN(max_nb_sectors, SIZE_MAX / BDRV_SECTOR_SIZE);
3049             local_sectors = MIN(max_nb_sectors, nb_sectors);
3050 
3051             qemu_iovec_init(&local_qiov, qiov->niov);
3052             qemu_iovec_concat(&local_qiov, qiov, 0,
3053                               local_sectors * BDRV_SECTOR_SIZE);
3054 
3055             ret = drv->bdrv_co_readv(bs, sector_num, local_sectors,
3056                                      &local_qiov);
3057 
3058             qemu_iovec_destroy(&local_qiov);
3059         } else {
3060             ret = 0;
3061         }
3062 
3063         /* Reading beyond end of file is supposed to produce zeroes */
3064         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
3065             uint64_t offset = MAX(0, total_sectors - sector_num);
3066             uint64_t bytes = (sector_num + nb_sectors - offset) *
3067                               BDRV_SECTOR_SIZE;
3068             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
3069         }
3070     }
3071 
3072 out:
3073     return ret;
3074 }
3075 
3076 /*
3077  * Handle a read request in coroutine context
3078  */
3079 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3080     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3081     BdrvRequestFlags flags)
3082 {
3083     BlockDriver *drv = bs->drv;
3084     BdrvTrackedRequest req;
3085 
3086     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3087     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3088     uint8_t *head_buf = NULL;
3089     uint8_t *tail_buf = NULL;
3090     QEMUIOVector local_qiov;
3091     bool use_local_qiov = false;
3092     int ret;
3093 
3094     if (!drv) {
3095         return -ENOMEDIUM;
3096     }
3097     if (bdrv_check_byte_request(bs, offset, bytes)) {
3098         return -EIO;
3099     }
3100 
3101     if (bs->copy_on_read) {
3102         flags |= BDRV_REQ_COPY_ON_READ;
3103     }
3104 
3105     /* throttling disk I/O */
3106     if (bs->io_limits_enabled) {
3107         bdrv_io_limits_intercept(bs, bytes, false);
3108     }
3109 
3110     /* Align read if necessary by padding qiov */
3111     if (offset & (align - 1)) {
3112         head_buf = qemu_blockalign(bs, align);
3113         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3114         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3115         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3116         use_local_qiov = true;
3117 
3118         bytes += offset & (align - 1);
3119         offset = offset & ~(align - 1);
3120     }
3121 
3122     if ((offset + bytes) & (align - 1)) {
3123         if (!use_local_qiov) {
3124             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3125             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3126             use_local_qiov = true;
3127         }
3128         tail_buf = qemu_blockalign(bs, align);
3129         qemu_iovec_add(&local_qiov, tail_buf,
3130                        align - ((offset + bytes) & (align - 1)));
3131 
3132         bytes = ROUND_UP(bytes, align);
3133     }
3134 
3135     tracked_request_begin(&req, bs, offset, bytes, false);
3136     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3137                               use_local_qiov ? &local_qiov : qiov,
3138                               flags);
3139     tracked_request_end(&req);
3140 
3141     if (use_local_qiov) {
3142         qemu_iovec_destroy(&local_qiov);
3143         qemu_vfree(head_buf);
3144         qemu_vfree(tail_buf);
3145     }
3146 
3147     return ret;
3148 }
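
/*
 * Worked example for the read alignment above, assuming a 512-byte
 * alignment: a read of bytes [700, 1700) is widened to [512, 2048);
 * head_buf absorbs bytes [512, 700), the caller's qiov receives
 * [700, 1700), and tail_buf absorbs [1700, 2048).
 */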
3149 
3150 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3151     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3152     BdrvRequestFlags flags)
3153 {
3154     if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3155         return -EINVAL;
3156     }
3157 
3158     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3159                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3160 }
3161 
3162 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3163     int nb_sectors, QEMUIOVector *qiov)
3164 {
3165     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3166 
3167     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3168 }
3169 
3170 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3171     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3172 {
3173     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3174 
3175     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3176                             BDRV_REQ_COPY_ON_READ);
3177 }
3178 
3179 /* If no limit is specified in the BlockLimits, use a default
3180  * of 32768 512-byte sectors (16 MiB) per request.
3181  */
3182 #define MAX_WRITE_ZEROES_DEFAULT 32768
3183 
3184 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3185     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3186 {
3187     BlockDriver *drv = bs->drv;
3188     QEMUIOVector qiov;
3189     struct iovec iov = {0};
3190     int ret = 0;
3191 
3192     int max_write_zeroes = bs->bl.max_write_zeroes ?
3193                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3194 
3195     while (nb_sectors > 0 && !ret) {
3196         int num = nb_sectors;
3197 
3198         /* Align request.  Block drivers can expect the "bulk" of the request
3199          * to be aligned.
3200          */
3201         if (bs->bl.write_zeroes_alignment
3202             && num > bs->bl.write_zeroes_alignment) {
3203             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3204                 /* Make a small request up to the first aligned sector.  */
3205                 num = bs->bl.write_zeroes_alignment;
3206                 num -= sector_num % bs->bl.write_zeroes_alignment;
3207             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3208                 /* Shorten the request to the last aligned sector.  num cannot
3209                  * underflow because num > bs->bl.write_zeroes_alignment.
3210                  */
3211                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3212             }
3213         }
3214 
3215         /* limit request size */
3216         if (num > max_write_zeroes) {
3217             num = max_write_zeroes;
3218         }
3219 
3220         ret = -ENOTSUP;
3221         /* First try the efficient write zeroes operation */
3222         if (drv->bdrv_co_write_zeroes) {
3223             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3224         }
3225 
3226         if (ret == -ENOTSUP) {
3227             /* Fall back to bounce buffer if write zeroes is unsupported */
3228             iov.iov_len = num * BDRV_SECTOR_SIZE;
3229             if (iov.iov_base == NULL) {
3230                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
3231                 if (iov.iov_base == NULL) {
3232                     ret = -ENOMEM;
3233                     goto fail;
3234                 }
3235                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3236             }
3237             qemu_iovec_init_external(&qiov, &iov, 1);
3238 
3239             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3240 
3241             /* Keep the bounce buffer around if it is big enough for
3242              * all future requests.
3243              */
3244             if (num < max_write_zeroes) {
3245                 qemu_vfree(iov.iov_base);
3246                 iov.iov_base = NULL;
3247             }
3248         }
3249 
3250         sector_num += num;
3251         nb_sectors -= num;
3252     }
3253 
3254 fail:
3255     qemu_vfree(iov.iov_base);
3256     return ret;
3257 }
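
/*
 * Worked example for the alignment logic above: with
 * bs->bl.write_zeroes_alignment == 8, a request for sectors [5, 25) is
 * split into three driver calls: [5, 8) up to the first aligned sector,
 * the aligned bulk [8, 24), and the unaligned tail [24, 25).
 */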
3258 
3259 /*
3260  * Forwards an already correctly aligned write request to the BlockDriver.
3261  */
3262 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3263     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3264     QEMUIOVector *qiov, int flags)
3265 {
3266     BlockDriver *drv = bs->drv;
3267     bool waited;
3268     int ret;
3269 
3270     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3271     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3272 
3273     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3274     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3275     assert(!qiov || bytes == qiov->size);
3276 
3277     waited = wait_serialising_requests(req);
3278     assert(!waited || !req->serialising);
3279     assert(req->overlap_offset <= offset);
3280     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3281 
3282     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3283 
3284     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
3285         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
3286         qemu_iovec_is_zero(qiov)) {
3287         flags |= BDRV_REQ_ZERO_WRITE;
3288         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
3289             flags |= BDRV_REQ_MAY_UNMAP;
3290         }
3291     }
3292 
3293     if (ret < 0) {
3294         /* Do nothing, write notifier decided to fail this request */
3295     } else if (flags & BDRV_REQ_ZERO_WRITE) {
3296         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3297         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3298     } else {
3299         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3300         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3301     }
3302     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3303 
3304     if (ret == 0 && !bs->enable_write_cache) {
3305         ret = bdrv_co_flush(bs);
3306     }
3307 
3308     bdrv_set_dirty(bs, sector_num, nb_sectors);
3309 
3310     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
3311 
3312     if (bs->growable && ret >= 0) {
3313         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3314     }
3315 
3316     return ret;
3317 }
3318 
3319 /*
3320  * Handle a write request in coroutine context
3321  */
3322 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3323     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3324     BdrvRequestFlags flags)
3325 {
3326     BdrvTrackedRequest req;
3327     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3328     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3329     uint8_t *head_buf = NULL;
3330     uint8_t *tail_buf = NULL;
3331     QEMUIOVector local_qiov;
3332     bool use_local_qiov = false;
3333     int ret;
3334 
3335     if (!bs->drv) {
3336         return -ENOMEDIUM;
3337     }
3338     if (bs->read_only) {
3339         return -EACCES;
3340     }
3341     if (bdrv_check_byte_request(bs, offset, bytes)) {
3342         return -EIO;
3343     }
3344 
3345     /* throttling disk I/O */
3346     if (bs->io_limits_enabled) {
3347         bdrv_io_limits_intercept(bs, bytes, true);
3348     }
3349 
3350     /*
3351      * Align write if necessary by performing a read-modify-write cycle.
3352      * Pad qiov with the read parts and be sure to have a tracked request not
3353      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3354      */
3355     tracked_request_begin(&req, bs, offset, bytes, true);
3356 
3357     if (offset & (align - 1)) {
3358         QEMUIOVector head_qiov;
3359         struct iovec head_iov;
3360 
3361         mark_request_serialising(&req, align);
3362         wait_serialising_requests(&req);
3363 
3364         head_buf = qemu_blockalign(bs, align);
3365         head_iov = (struct iovec) {
3366             .iov_base   = head_buf,
3367             .iov_len    = align,
3368         };
3369         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3370 
3371         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3372         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3373                                   align, &head_qiov, 0);
3374         if (ret < 0) {
3375             goto fail;
3376         }
3377         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3378 
3379         qemu_iovec_init(&local_qiov, qiov->niov + 2);
3380         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3381         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3382         use_local_qiov = true;
3383 
3384         bytes += offset & (align - 1);
3385         offset = offset & ~(align - 1);
3386     }
3387 
3388     if ((offset + bytes) & (align - 1)) {
3389         QEMUIOVector tail_qiov;
3390         struct iovec tail_iov;
3391         size_t tail_bytes;
3392         bool waited;
3393 
3394         mark_request_serialising(&req, align);
3395         waited = wait_serialising_requests(&req);
3396         assert(!waited || !use_local_qiov);
3397 
3398         tail_buf = qemu_blockalign(bs, align);
3399         tail_iov = (struct iovec) {
3400             .iov_base   = tail_buf,
3401             .iov_len    = align,
3402         };
3403         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3404 
3405         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3406         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3407                                   align, &tail_qiov, 0);
3408         if (ret < 0) {
3409             goto fail;
3410         }
3411         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3412 
3413         if (!use_local_qiov) {
3414             qemu_iovec_init(&local_qiov, qiov->niov + 1);
3415             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3416             use_local_qiov = true;
3417         }
3418 
3419         tail_bytes = (offset + bytes) & (align - 1);
3420         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3421 
3422         bytes = ROUND_UP(bytes, align);
3423     }
3424 
3425     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3426                                use_local_qiov ? &local_qiov : qiov,
3427                                flags);
3428 
3429 fail:
3430     tracked_request_end(&req);
3431 
3432     if (use_local_qiov) {
3433         qemu_iovec_destroy(&local_qiov);
3434     }
3435     qemu_vfree(head_buf);
3436     qemu_vfree(tail_buf);
3437 
3438     return ret;
3439 }
3440 
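/*
 * Editorial sketch, not part of the original file: the alignment arithmetic
 * used by the RMW path in bdrv_co_do_pwritev() above. With align = 512,
 * offset = 700 and bytes = 100, the head padding yields offset = 512 and
 * bytes = 288, and the tail padding rounds bytes up to 512, so a single
 * aligned sector is read, patched and written back.
 */
static inline void example_rmw_alignment(int64_t offset, unsigned int bytes,
                                         uint64_t align,
                                         int64_t *aligned_offset,
                                         unsigned int *aligned_bytes)
{
    unsigned int head = offset & (align - 1);         /* head padding bytes */

    *aligned_offset = offset - head;                  /* round down */
    *aligned_bytes = ROUND_UP(bytes + head, align);   /* round up with tail */
}
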
3441 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3442     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3443     BdrvRequestFlags flags)
3444 {
3445     if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3446         return -EINVAL;
3447     }
3448 
3449     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3450                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3451 }
3452 
3453 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3454     int nb_sectors, QEMUIOVector *qiov)
3455 {
3456     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3457 
3458     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3459 }
3460 
3461 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3462                                       int64_t sector_num, int nb_sectors,
3463                                       BdrvRequestFlags flags)
3464 {
3465     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3466 
3467     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3468         flags &= ~BDRV_REQ_MAY_UNMAP;
3469     }
3470 
3471     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3472                              BDRV_REQ_ZERO_WRITE | flags);
3473 }
3474 
3475 /**
3476  * Truncate file to 'offset' bytes (needed only for file protocols)
3477  */
3478 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3479 {
3480     BlockDriver *drv = bs->drv;
3481     int ret;
3482     if (!drv)
3483         return -ENOMEDIUM;
3484     if (!drv->bdrv_truncate)
3485         return -ENOTSUP;
3486     if (bs->read_only)
3487         return -EACCES;
3488 
3489     ret = drv->bdrv_truncate(bs, offset);
3490     if (ret == 0) {
3491         ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3492         if (bs->blk) {
3493             blk_dev_resize_cb(bs->blk);
3494         }
3495     }
3496     return ret;
3497 }
3498 
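/*
 * Editorial sketch, not part of the original file: a typical caller of
 * bdrv_truncate(). The helper name is hypothetical.
 */
static int example_grow_image(BlockDriverState *bs, int64_t new_size)
{
    int ret = bdrv_truncate(bs, new_size);

    if (ret < 0) {
        return ret;   /* -ENOMEDIUM, -ENOTSUP, -EACCES or a driver error */
    }
    /* On success the sector count has been refreshed and, if a device
     * model is attached, its resize callback has already been invoked. */
    return 0;
}
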
3499 /**
3500  * Length of an allocated file in bytes. Sparse files are counted by actual
3501  * allocated space. Return < 0 on error or if unknown.
3502  */
3503 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3504 {
3505     BlockDriver *drv = bs->drv;
3506     if (!drv) {
3507         return -ENOMEDIUM;
3508     }
3509     if (drv->bdrv_get_allocated_file_size) {
3510         return drv->bdrv_get_allocated_file_size(bs);
3511     }
3512     if (bs->file) {
3513         return bdrv_get_allocated_file_size(bs->file);
3514     }
3515     return -ENOTSUP;
3516 }
3517 
3518 /**
3519  * Return number of sectors on success, -errno on error.
3520  */
3521 int64_t bdrv_nb_sectors(BlockDriverState *bs)
3522 {
3523     BlockDriver *drv = bs->drv;
3524 
3525     if (!drv)
3526         return -ENOMEDIUM;
3527 
3528     if (drv->has_variable_length) {
3529         int ret = refresh_total_sectors(bs, bs->total_sectors);
3530         if (ret < 0) {
3531             return ret;
3532         }
3533     }
3534     return bs->total_sectors;
3535 }
3536 
3537 /**
3538  * Return length in bytes on success, -errno on error.
3539  * The length is always a multiple of BDRV_SECTOR_SIZE.
3540  */
3541 int64_t bdrv_getlength(BlockDriverState *bs)
3542 {
3543     int64_t ret = bdrv_nb_sectors(bs);
3544 
3545     return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE;
3546 }
3547 
3548 /* Return 0 as the number of sectors if no device is present or on error */
3549 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3550 {
3551     int64_t nb_sectors = bdrv_nb_sectors(bs);
3552 
3553     *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
3554 }
3555 
3556 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3557                        BlockdevOnError on_write_error)
3558 {
3559     bs->on_read_error = on_read_error;
3560     bs->on_write_error = on_write_error;
3561 }
3562 
3563 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3564 {
3565     return is_read ? bs->on_read_error : bs->on_write_error;
3566 }
3567 
3568 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3569 {
3570     BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3571 
3572     switch (on_err) {
3573     case BLOCKDEV_ON_ERROR_ENOSPC:
3574         return (error == ENOSPC) ?
3575                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
3576     case BLOCKDEV_ON_ERROR_STOP:
3577         return BLOCK_ERROR_ACTION_STOP;
3578     case BLOCKDEV_ON_ERROR_REPORT:
3579         return BLOCK_ERROR_ACTION_REPORT;
3580     case BLOCKDEV_ON_ERROR_IGNORE:
3581         return BLOCK_ERROR_ACTION_IGNORE;
3582     default:
3583         abort();
3584     }
3585 }
3586 
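/*
 * Editorial sketch, not part of the original file: how a device model might
 * combine bdrv_get_error_action() with bdrv_error_action() when a request
 * fails. 'error' must be a positive errno value (bdrv_error_action()
 * asserts this); the helper name and retry convention are hypothetical.
 */
static bool example_handle_rw_error(BlockDriverState *bs, bool is_read,
                                    int error)
{
    BlockErrorAction action = bdrv_get_error_action(bs, is_read, error);

    /* Emits the QMP event and, for BLOCK_ERROR_ACTION_STOP, stops the VM */
    bdrv_error_action(bs, action, is_read, error);

    /* Tell the caller whether the request should be retried after "cont" */
    return action == BLOCK_ERROR_ACTION_STOP;
}
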
3587 static void send_qmp_error_event(BlockDriverState *bs,
3588                                  BlockErrorAction action,
3589                                  bool is_read, int error)
3590 {
3591     IoOperationType optype;
3592 
3593     optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
3594     qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action,
3595                                    bdrv_iostatus_is_enabled(bs),
3596                                    error == ENOSPC, strerror(error),
3597                                    &error_abort);
3598 }
3599 
3600 /* This is done by device models because, while the block layer knows
3601  * about the error, it does not know whether an operation comes from
3602  * the device or the block layer (from a job, for example).
3603  */
3604 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3605                        bool is_read, int error)
3606 {
3607     assert(error >= 0);
3608 
3609     if (action == BLOCK_ERROR_ACTION_STOP) {
3610         /* First set the iostatus, so that "info block" returns an iostatus
3611          * that matches the events raised so far (an additional error iostatus
3612          * is fine, but not a lost one).
3613          */
3614         bdrv_iostatus_set_err(bs, error);
3615 
3616         /* Then raise the request to stop the VM and the event.
3617          * qemu_system_vmstop_request_prepare has two effects.  First,
3618          * it ensures that the STOP event always comes after the
3619          * BLOCK_IO_ERROR event.  Second, it ensures that even if management
3620          * can observe the STOP event and do a "cont" before the STOP
3621          * event is issued, the VM will not stop.  In this case, vm_start()
3622          * also ensures that the STOP/RESUME pair of events is emitted.
3623          */
3624         qemu_system_vmstop_request_prepare();
3625         send_qmp_error_event(bs, action, is_read, error);
3626         qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
3627     } else {
3628         send_qmp_error_event(bs, action, is_read, error);
3629     }
3630 }
3631 
3632 int bdrv_is_read_only(BlockDriverState *bs)
3633 {
3634     return bs->read_only;
3635 }
3636 
3637 int bdrv_is_sg(BlockDriverState *bs)
3638 {
3639     return bs->sg;
3640 }
3641 
3642 int bdrv_enable_write_cache(BlockDriverState *bs)
3643 {
3644     return bs->enable_write_cache;
3645 }
3646 
3647 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3648 {
3649     bs->enable_write_cache = wce;
3650 
3651     /* so a reopen() will preserve wce */
3652     if (wce) {
3653         bs->open_flags |= BDRV_O_CACHE_WB;
3654     } else {
3655         bs->open_flags &= ~BDRV_O_CACHE_WB;
3656     }
3657 }
3658 
3659 int bdrv_is_encrypted(BlockDriverState *bs)
3660 {
3661     if (bs->backing_hd && bs->backing_hd->encrypted)
3662         return 1;
3663     return bs->encrypted;
3664 }
3665 
3666 int bdrv_key_required(BlockDriverState *bs)
3667 {
3668     BlockDriverState *backing_hd = bs->backing_hd;
3669 
3670     if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3671         return 1;
3672     return (bs->encrypted && !bs->valid_key);
3673 }
3674 
3675 int bdrv_set_key(BlockDriverState *bs, const char *key)
3676 {
3677     int ret;
3678     if (bs->backing_hd && bs->backing_hd->encrypted) {
3679         ret = bdrv_set_key(bs->backing_hd, key);
3680         if (ret < 0)
3681             return ret;
3682         if (!bs->encrypted)
3683             return 0;
3684     }
3685     if (!bs->encrypted) {
3686         return -EINVAL;
3687     } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3688         return -ENOMEDIUM;
3689     }
3690     ret = bs->drv->bdrv_set_key(bs, key);
3691     if (ret < 0) {
3692         bs->valid_key = 0;
3693     } else if (!bs->valid_key) {
3694         bs->valid_key = 1;
3695         if (bs->blk) {
3696             /* call the change callback now, we skipped it on open */
3697             blk_dev_change_media_cb(bs->blk, true);
3698         }
3699     }
3700     return ret;
3701 }
3702 
3703 const char *bdrv_get_format_name(BlockDriverState *bs)
3704 {
3705     return bs->drv ? bs->drv->format_name : NULL;
3706 }
3707 
3708 static int qsort_strcmp(const void *a, const void *b)
3709 {
3710     return strcmp(*(char *const *)a, *(char *const *)b);
3711 }
3712 
3713 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3714                          void *opaque)
3715 {
3716     BlockDriver *drv;
3717     int count = 0;
3718     int i;
3719     const char **formats = NULL;
3720 
3721     QLIST_FOREACH(drv, &bdrv_drivers, list) {
3722         if (drv->format_name) {
3723             bool found = false;
3724             int i = count;
3725             while (formats && i && !found) {
3726                 found = !strcmp(formats[--i], drv->format_name);
3727             }
3728 
3729             if (!found) {
3730                 formats = g_renew(const char *, formats, count + 1);
3731                 formats[count++] = drv->format_name;
3732             }
3733         }
3734     }
3735 
3736     qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
3737 
3738     for (i = 0; i < count; i++) {
3739         it(opaque, formats[i]);
3740     }
3741 
3742     g_free(formats);
3743 }
3744 
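/*
 * Editorial sketch, not part of the original file: a minimal callback for
 * bdrv_iterate_format(), which receives each format name once, in sorted
 * order. Usage: bdrv_iterate_format(example_print_format, NULL);
 */
static void example_print_format(void *opaque, const char *name)
{
    printf("%s\n", name);
}
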
3745 /* Find the BlockDriverState of the block backend with the given name */
3746 /* TODO convert callers to blk_by_name(), then remove */
3747 BlockDriverState *bdrv_find(const char *name)
3748 {
3749     BlockBackend *blk = blk_by_name(name);
3750 
3751     return blk ? blk_bs(blk) : NULL;
3752 }
3753 
3754 /* Find a node in the graph of named BDS nodes by its node name */
3755 BlockDriverState *bdrv_find_node(const char *node_name)
3756 {
3757     BlockDriverState *bs;
3758 
3759     assert(node_name);
3760 
3761     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3762         if (!strcmp(node_name, bs->node_name)) {
3763             return bs;
3764         }
3765     }
3766     return NULL;
3767 }
3768 
3769 /* Put this QMP function here so it can access the static graph_bdrv_states. */
3770 BlockDeviceInfoList *bdrv_named_nodes_list(void)
3771 {
3772     BlockDeviceInfoList *list, *entry;
3773     BlockDriverState *bs;
3774 
3775     list = NULL;
3776     QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3777         entry = g_malloc0(sizeof(*entry));
3778         entry->value = bdrv_block_device_info(bs);
3779         entry->next = list;
3780         list = entry;
3781     }
3782 
3783     return list;
3784 }
3785 
3786 BlockDriverState *bdrv_lookup_bs(const char *device,
3787                                  const char *node_name,
3788                                  Error **errp)
3789 {
3790     BlockBackend *blk;
3791     BlockDriverState *bs;
3792 
3793     if (device) {
3794         blk = blk_by_name(device);
3795 
3796         if (blk) {
3797             return blk_bs(blk);
3798         }
3799     }
3800 
3801     if (node_name) {
3802         bs = bdrv_find_node(node_name);
3803 
3804         if (bs) {
3805             return bs;
3806         }
3807     }
3808 
3809     error_setg(errp, "Cannot find device=%s nor node_name=%s",
3810                      device ? device : "",
3811                      node_name ? node_name : "");
3812     return NULL;
3813 }
3814 
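/*
 * Editorial sketch, not part of the original file: resolving either a
 * backend device name or a node name via bdrv_lookup_bs(), propagating
 * the error in the usual QEMU style. The helper name is hypothetical.
 */
static BlockDriverState *example_resolve(const char *device,
                                         const char *node_name, Error **errp)
{
    Error *local_err = NULL;
    BlockDriverState *bs = bdrv_lookup_bs(device, node_name, &local_err);

    if (!bs) {
        error_propagate(errp, local_err);
    }
    return bs;
}
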
3815 /* If 'base' is in the same chain as 'top', return true. Otherwise,
3816  * return false.  If either argument is NULL, return false. */
3817 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
3818 {
3819     while (top && top != base) {
3820         top = top->backing_hd;
3821     }
3822 
3823     return top != NULL;
3824 }
3825 
3826 BlockDriverState *bdrv_next_node(BlockDriverState *bs)
3827 {
3828     if (!bs) {
3829         return QTAILQ_FIRST(&graph_bdrv_states);
3830     }
3831     return QTAILQ_NEXT(bs, node_list);
3832 }
3833 
3834 BlockDriverState *bdrv_next(BlockDriverState *bs)
3835 {
3836     if (!bs) {
3837         return QTAILQ_FIRST(&bdrv_states);
3838     }
3839     return QTAILQ_NEXT(bs, device_list);
3840 }
3841 
3842 const char *bdrv_get_node_name(const BlockDriverState *bs)
3843 {
3844     return bs->node_name;
3845 }
3846 
3847 /* TODO check what callers really want: bs->node_name or blk_name() */
3848 const char *bdrv_get_device_name(const BlockDriverState *bs)
3849 {
3850     return bs->blk ? blk_name(bs->blk) : "";
3851 }
3852 
3853 int bdrv_get_flags(BlockDriverState *bs)
3854 {
3855     return bs->open_flags;
3856 }
3857 
3858 int bdrv_flush_all(void)
3859 {
3860     BlockDriverState *bs;
3861     int result = 0;
3862 
3863     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3864         AioContext *aio_context = bdrv_get_aio_context(bs);
3865         int ret;
3866 
3867         aio_context_acquire(aio_context);
3868         ret = bdrv_flush(bs);
3869         if (ret < 0 && !result) {
3870             result = ret;
3871         }
3872         aio_context_release(aio_context);
3873     }
3874 
3875     return result;
3876 }
3877 
3878 int bdrv_has_zero_init_1(BlockDriverState *bs)
3879 {
3880     return 1;
3881 }
3882 
3883 int bdrv_has_zero_init(BlockDriverState *bs)
3884 {
3885     assert(bs->drv);
3886 
3887     /* If BS is a copy-on-write image, it is initialized to
3888        the contents of the base image, which may not be zeroes.  */
3889     if (bs->backing_hd) {
3890         return 0;
3891     }
3892     if (bs->drv->bdrv_has_zero_init) {
3893         return bs->drv->bdrv_has_zero_init(bs);
3894     }
3895 
3896     /* safe default */
3897     return 0;
3898 }
3899 
3900 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3901 {
3902     BlockDriverInfo bdi;
3903 
3904     if (bs->backing_hd) {
3905         return false;
3906     }
3907 
3908     if (bdrv_get_info(bs, &bdi) == 0) {
3909         return bdi.unallocated_blocks_are_zero;
3910     }
3911 
3912     return false;
3913 }
3914 
3915 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3916 {
3917     BlockDriverInfo bdi;
3918 
3919     if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3920         return false;
3921     }
3922 
3923     if (bdrv_get_info(bs, &bdi) == 0) {
3924         return bdi.can_write_zeroes_with_unmap;
3925     }
3926 
3927     return false;
3928 }
3929 
3930 typedef struct BdrvCoGetBlockStatusData {
3931     BlockDriverState *bs;
3932     BlockDriverState *base;
3933     int64_t sector_num;
3934     int nb_sectors;
3935     int *pnum;
3936     int64_t ret;
3937     bool done;
3938 } BdrvCoGetBlockStatusData;
3939 
3940 /*
3941  * Returns the allocation status of the specified sectors.
3942  * Drivers not implementing the functionality are assumed to not support
3943  * backing files, hence all their sectors are reported as allocated.
3944  *
3945  * If 'sector_num' is beyond the end of the disk image the return value is 0
3946  * and 'pnum' is set to 0.
3947  *
3948  * 'pnum' is set to the number of sectors (including and immediately following
3949  * the specified sector) that are known to be in the same
3950  * allocated/unallocated state.
3951  *
3952  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
3953  * beyond the end of the disk image it will be clamped.
3954  */
3955 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3956                                                      int64_t sector_num,
3957                                                      int nb_sectors, int *pnum)
3958 {
3959     int64_t total_sectors;
3960     int64_t n;
3961     int64_t ret, ret2;
3962 
3963     total_sectors = bdrv_nb_sectors(bs);
3964     if (total_sectors < 0) {
3965         return total_sectors;
3966     }
3967 
3968     if (sector_num >= total_sectors) {
3969         *pnum = 0;
3970         return 0;
3971     }
3972 
3973     n = total_sectors - sector_num;
3974     if (n < nb_sectors) {
3975         nb_sectors = n;
3976     }
3977 
3978     if (!bs->drv->bdrv_co_get_block_status) {
3979         *pnum = nb_sectors;
3980         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
3981         if (bs->drv->protocol_name) {
3982             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3983         }
3984         return ret;
3985     }
3986 
3987     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3988     if (ret < 0) {
3989         *pnum = 0;
3990         return ret;
3991     }
3992 
3993     if (ret & BDRV_BLOCK_RAW) {
3994         assert(ret & BDRV_BLOCK_OFFSET_VALID);
3995         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3996                                      *pnum, pnum);
3997     }
3998 
3999     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
4000         ret |= BDRV_BLOCK_ALLOCATED;
4001     }
4002 
4003     if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
4004         if (bdrv_unallocated_blocks_are_zero(bs)) {
4005             ret |= BDRV_BLOCK_ZERO;
4006         } else if (bs->backing_hd) {
4007             BlockDriverState *bs2 = bs->backing_hd;
4008             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
4009             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
4010                 ret |= BDRV_BLOCK_ZERO;
4011             }
4012         }
4013     }
4014 
4015     if (bs->file &&
4016         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
4017         (ret & BDRV_BLOCK_OFFSET_VALID)) {
4018         int file_pnum;
4019 
4020         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
4021                                         *pnum, &file_pnum);
4022         if (ret2 >= 0) {
4023             /* Ignore errors.  This is just providing extra information, it
4024              * is useful but not necessary.
4025              */
4026             if (!file_pnum) {
4027                 /* !file_pnum indicates an offset at or beyond the EOF; it is
4028                  * perfectly valid for the format block driver to point to such
4029                  * offsets, so catch it and mark everything as zero */
4030                 ret |= BDRV_BLOCK_ZERO;
4031             } else {
4032                 /* Limit request to the range reported by the protocol driver */
4033                 *pnum = file_pnum;
4034                 ret |= (ret2 & BDRV_BLOCK_ZERO);
4035             }
4036         }
4037     }
4038 
4039     return ret;
4040 }
4041 
4042 /* Coroutine wrapper for bdrv_get_block_status() */
4043 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
4044 {
4045     BdrvCoGetBlockStatusData *data = opaque;
4046     BlockDriverState *bs = data->bs;
4047 
4048     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
4049                                          data->pnum);
4050     data->done = true;
4051 }
4052 
4053 /*
4054  * Synchronous wrapper around bdrv_co_get_block_status().
4055  *
4056  * See bdrv_co_get_block_status() for details.
4057  */
4058 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
4059                               int nb_sectors, int *pnum)
4060 {
4061     Coroutine *co;
4062     BdrvCoGetBlockStatusData data = {
4063         .bs = bs,
4064         .sector_num = sector_num,
4065         .nb_sectors = nb_sectors,
4066         .pnum = pnum,
4067         .done = false,
4068     };
4069 
4070     if (qemu_in_coroutine()) {
4071         /* Fast-path if already in coroutine context */
4072         bdrv_get_block_status_co_entry(&data);
4073     } else {
4074         AioContext *aio_context = bdrv_get_aio_context(bs);
4075 
4076         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
4077         qemu_coroutine_enter(co, &data);
4078         while (!data.done) {
4079             aio_poll(aio_context, true);
4080         }
4081     }
4082     return data.ret;
4083 }
4084 
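/*
 * Editorial sketch, not part of the original file: decoding the value
 * returned by bdrv_get_block_status(). The BDRV_BLOCK_* flags live in the
 * low bits of the int64_t; the host offset encoded in the upper bits is
 * only meaningful when BDRV_BLOCK_OFFSET_VALID is set.
 */
static void example_decode_status(BlockDriverState *bs, int64_t sector_num)
{
    int pnum;
    int64_t ret = bdrv_get_block_status(bs, sector_num, 1, &pnum);

    if (ret < 0) {
        return;                          /* -errno */
    }
    if (ret & BDRV_BLOCK_ZERO) {
        /* the sector reads as zeroes */
    }
    if (ret & BDRV_BLOCK_ALLOCATED) {
        /* allocated in this layer rather than in a backing file */
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        int64_t host_offset = ret & BDRV_BLOCK_OFFSET_MASK;
        (void)host_offset;               /* byte position in bs->file */
    }
}
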
4085 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
4086                                    int nb_sectors, int *pnum)
4087 {
4088     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
4089     if (ret < 0) {
4090         return ret;
4091     }
4092     return !!(ret & BDRV_BLOCK_ALLOCATED);
4093 }
4094 
4095 /*
4096  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
4097  *
4098  * Return true if the given sector is allocated in any image between
4099  * BASE (exclusive) and TOP (inclusive).  BASE can be NULL to check if the given
4100  * sector is allocated in any image of the chain.  Return false otherwise.
4101  *
4102  * 'pnum' is set to the number of sectors (including and immediately following
4103  *  the specified sector) that are known to be in the same
4104  *  allocated/unallocated state.
4105  *
4106  */
4107 int bdrv_is_allocated_above(BlockDriverState *top,
4108                             BlockDriverState *base,
4109                             int64_t sector_num,
4110                             int nb_sectors, int *pnum)
4111 {
4112     BlockDriverState *intermediate;
4113     int ret, n = nb_sectors;
4114 
4115     intermediate = top;
4116     while (intermediate && intermediate != base) {
4117         int pnum_inter;
4118         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
4119                                 &pnum_inter);
4120         if (ret < 0) {
4121             return ret;
4122         } else if (ret) {
4123             *pnum = pnum_inter;
4124             return 1;
4125         }
4126 
4127         /*
4128          * [sector_num, nb_sectors] is unallocated on top but an
4129          * intermediate image might have
4130          *
4131          * [sector_num+x, nb_sectors] allocated.
4132          */
4133         if (n > pnum_inter &&
4134             (intermediate == top ||
4135              sector_num + pnum_inter < intermediate->total_sectors)) {
4136             n = pnum_inter;
4137         }
4138 
4139         intermediate = intermediate->backing_hd;
4140     }
4141 
4142     *pnum = n;
4143     return 0;
4144 }
4145 
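/*
 * Editorial sketch, not part of the original file: walking a range with
 * bdrv_is_allocated_above(), e.g. to decide what must be copied when
 * committing TOP down towards BASE. The helper name is hypothetical.
 */
static int example_scan_above(BlockDriverState *top, BlockDriverState *base,
                              int64_t sector_num, int nb_sectors)
{
    while (nb_sectors > 0) {
        int n;
        int ret = bdrv_is_allocated_above(top, base, sector_num,
                                          nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            /* sectors [sector_num, sector_num + n) are allocated above BASE */
        }
        sector_num += n;
        nb_sectors -= n;
    }
    return 0;
}
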
4146 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4147 {
4148     if (bs->backing_hd && bs->backing_hd->encrypted)
4149         return bs->backing_file;
4150     else if (bs->encrypted)
4151         return bs->filename;
4152     else
4153         return NULL;
4154 }
4155 
4156 void bdrv_get_backing_filename(BlockDriverState *bs,
4157                                char *filename, int filename_size)
4158 {
4159     pstrcpy(filename, filename_size, bs->backing_file);
4160 }
4161 
4162 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
4163                           const uint8_t *buf, int nb_sectors)
4164 {
4165     BlockDriver *drv = bs->drv;
4166     if (!drv)
4167         return -ENOMEDIUM;
4168     if (!drv->bdrv_write_compressed)
4169         return -ENOTSUP;
4170     if (bdrv_check_request(bs, sector_num, nb_sectors))
4171         return -EIO;
4172 
4173     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4174 
4175     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
4176 }
4177 
4178 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4179 {
4180     BlockDriver *drv = bs->drv;
4181     if (!drv)
4182         return -ENOMEDIUM;
4183     if (!drv->bdrv_get_info)
4184         return -ENOTSUP;
4185     memset(bdi, 0, sizeof(*bdi));
4186     return drv->bdrv_get_info(bs, bdi);
4187 }
4188 
4189 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
4190 {
4191     BlockDriver *drv = bs->drv;
4192     if (drv && drv->bdrv_get_specific_info) {
4193         return drv->bdrv_get_specific_info(bs);
4194     }
4195     return NULL;
4196 }
4197 
4198 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
4199                       int64_t pos, int size)
4200 {
4201     QEMUIOVector qiov;
4202     struct iovec iov = {
4203         .iov_base   = (void *) buf,
4204         .iov_len    = size,
4205     };
4206 
4207     qemu_iovec_init_external(&qiov, &iov, 1);
4208     return bdrv_writev_vmstate(bs, &qiov, pos);
4209 }
4210 
4211 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
4212 {
4213     BlockDriver *drv = bs->drv;
4214 
4215     if (!drv) {
4216         return -ENOMEDIUM;
4217     } else if (drv->bdrv_save_vmstate) {
4218         return drv->bdrv_save_vmstate(bs, qiov, pos);
4219     } else if (bs->file) {
4220         return bdrv_writev_vmstate(bs->file, qiov, pos);
4221     }
4222 
4223     return -ENOTSUP;
4224 }
4225 
4226 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
4227                       int64_t pos, int size)
4228 {
4229     BlockDriver *drv = bs->drv;
4230     if (!drv)
4231         return -ENOMEDIUM;
4232     if (drv->bdrv_load_vmstate)
4233         return drv->bdrv_load_vmstate(bs, buf, pos, size);
4234     if (bs->file)
4235         return bdrv_load_vmstate(bs->file, buf, pos, size);
4236     return -ENOTSUP;
4237 }
4238 
4239 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4240 {
4241     if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4242         return;
4243     }
4244 
4245     bs->drv->bdrv_debug_event(bs, event);
4246 }
4247 
4248 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4249                           const char *tag)
4250 {
4251     while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4252         bs = bs->file;
4253     }
4254 
4255     if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4256         return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4257     }
4258 
4259     return -ENOTSUP;
4260 }
4261 
4262 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4263 {
4264     while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4265         bs = bs->file;
4266     }
4267 
4268     if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4269         return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4270     }
4271 
4272     return -ENOTSUP;
4273 }
4274 
4275 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4276 {
4277     while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
4278         bs = bs->file;
4279     }
4280 
4281     if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4282         return bs->drv->bdrv_debug_resume(bs, tag);
4283     }
4284 
4285     return -ENOTSUP;
4286 }
4287 
4288 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4289 {
4290     while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4291         bs = bs->file;
4292     }
4293 
4294     if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4295         return bs->drv->bdrv_debug_is_suspended(bs, tag);
4296     }
4297 
4298     return false;
4299 }
4300 
4301 int bdrv_is_snapshot(BlockDriverState *bs)
4302 {
4303     return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4304 }
4305 
4306 /* backing_file can either be relative, or absolute, or a protocol.  If it is
4307  * relative, it must be relative to the chain.  So, passing in bs->filename
4308  * from a BDS as backing_file should not be done, as that may be relative to
4309  * the CWD rather than the chain. */
4310 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4311         const char *backing_file)
4312 {
4313     char *filename_full = NULL;
4314     char *backing_file_full = NULL;
4315     char *filename_tmp = NULL;
4316     int is_protocol = 0;
4317     BlockDriverState *curr_bs = NULL;
4318     BlockDriverState *retval = NULL;
4319 
4320     if (!bs || !bs->drv || !backing_file) {
4321         return NULL;
4322     }
4323 
4324     filename_full     = g_malloc(PATH_MAX);
4325     backing_file_full = g_malloc(PATH_MAX);
4326     filename_tmp      = g_malloc(PATH_MAX);
4327 
4328     is_protocol = path_has_protocol(backing_file);
4329 
4330     for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4331 
4332         /* If either of the filename paths is actually a protocol, then
4333          * compare unmodified paths; otherwise make paths relative */
4334         if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4335             if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4336                 retval = curr_bs->backing_hd;
4337                 break;
4338             }
4339         } else {
4340             /* If not an absolute filename path, make it relative to the current
4341              * image's filename path */
4342             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4343                          backing_file);
4344 
4345             /* We are going to compare absolute pathnames */
4346             if (!realpath(filename_tmp, filename_full)) {
4347                 continue;
4348             }
4349 
4350             /* We need to make sure the backing filename we are comparing against
4351              * is relative to the current image filename (or absolute) */
4352             path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4353                          curr_bs->backing_file);
4354 
4355             if (!realpath(filename_tmp, backing_file_full)) {
4356                 continue;
4357             }
4358 
4359             if (strcmp(backing_file_full, filename_full) == 0) {
4360                 retval = curr_bs->backing_hd;
4361                 break;
4362             }
4363         }
4364     }
4365 
4366     g_free(filename_full);
4367     g_free(backing_file_full);
4368     g_free(filename_tmp);
4369     return retval;
4370 }
4371 
4372 int bdrv_get_backing_file_depth(BlockDriverState *bs)
4373 {
4374     if (!bs->drv) {
4375         return 0;
4376     }
4377 
4378     if (!bs->backing_hd) {
4379         return 0;
4380     }
4381 
4382     return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4383 }
4384 
4385 /**************************************************************/
4386 /* async I/Os */
4387 
4388 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4389                            QEMUIOVector *qiov, int nb_sectors,
4390                            BlockCompletionFunc *cb, void *opaque)
4391 {
4392     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4393 
4394     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4395                                  cb, opaque, false);
4396 }
4397 
4398 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4399                             QEMUIOVector *qiov, int nb_sectors,
4400                             BlockCompletionFunc *cb, void *opaque)
4401 {
4402     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4403 
4404     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4405                                  cb, opaque, true);
4406 }
4407 
4408 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4409         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4410         BlockCompletionFunc *cb, void *opaque)
4411 {
4412     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4413 
4414     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4415                                  BDRV_REQ_ZERO_WRITE | flags,
4416                                  cb, opaque, true);
4417 }
4418 
4419 
4420 typedef struct MultiwriteCB {
4421     int error;
4422     int num_requests;
4423     int num_callbacks;
4424     struct {
4425         BlockCompletionFunc *cb;
4426         void *opaque;
4427         QEMUIOVector *free_qiov;
4428     } callbacks[];
4429 } MultiwriteCB;
4430 
4431 static void multiwrite_user_cb(MultiwriteCB *mcb)
4432 {
4433     int i;
4434 
4435     for (i = 0; i < mcb->num_callbacks; i++) {
4436         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4437         if (mcb->callbacks[i].free_qiov) {
4438             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4439         }
4440         g_free(mcb->callbacks[i].free_qiov);
4441     }
4442 }
4443 
4444 static void multiwrite_cb(void *opaque, int ret)
4445 {
4446     MultiwriteCB *mcb = opaque;
4447 
4448     trace_multiwrite_cb(mcb, ret);
4449 
4450     if (ret < 0 && !mcb->error) {
4451         mcb->error = ret;
4452     }
4453 
4454     mcb->num_requests--;
4455     if (mcb->num_requests == 0) {
4456         multiwrite_user_cb(mcb);
4457         g_free(mcb);
4458     }
4459 }
4460 
4461 static int multiwrite_req_compare(const void *a, const void *b)
4462 {
4463     const BlockRequest *req1 = a, *req2 = b;
4464 
4465     /*
4466      * Note that we can't simply subtract req2->sector from req1->sector
4467      * here as that could overflow the return value.
4468      */
4469     if (req1->sector > req2->sector) {
4470         return 1;
4471     } else if (req1->sector < req2->sector) {
4472         return -1;
4473     } else {
4474         return 0;
4475     }
4476 }
4477 
4478 /*
4479  * Takes a bunch of requests and tries to merge them. Returns the number of
4480  * requests that remain after merging.
4481  */
4482 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4483     int num_reqs, MultiwriteCB *mcb)
4484 {
4485     int i, outidx;
4486 
4487     // Sort requests by start sector
4488     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4489 
4490     // Check if adjacent requests touch the same clusters. If so, combine them,
4491     // filling up gaps with zero sectors.
4492     outidx = 0;
4493     for (i = 1; i < num_reqs; i++) {
4494         int merge = 0;
4495         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4496 
4497         // Handle exactly sequential writes and overlapping writes.
4498         if (reqs[i].sector <= oldreq_last) {
4499             merge = 1;
4500         }
4501 
4502         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4503             merge = 0;
4504         }
4505 
4506         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
4507             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
4508             merge = 0;
4509         }
4510 
4511         if (merge) {
4512             size_t size;
4513             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4514             qemu_iovec_init(qiov,
4515                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4516 
4517             // Add the first request to the merged one. If the requests are
4518             // overlapping, drop the last sectors of the first request.
4519             size = (reqs[i].sector - reqs[outidx].sector) << 9;
4520             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4521 
4522             // We shouldn't need to add any zeros between the two requests
4523             assert(reqs[i].sector <= oldreq_last);
4524 
4525             // Add the second request
4526             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4527 
4528             // Add tail of first request, if necessary
4529             if (qiov->size < reqs[outidx].qiov->size) {
4530                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
4531                                   reqs[outidx].qiov->size - qiov->size);
4532             }
4533 
4534             reqs[outidx].nb_sectors = qiov->size >> 9;
4535             reqs[outidx].qiov = qiov;
4536 
4537             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4538         } else {
4539             outidx++;
4540             reqs[outidx].sector     = reqs[i].sector;
4541             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4542             reqs[outidx].qiov       = reqs[i].qiov;
4543         }
4544     }
4545 
4546     return outidx + 1;
4547 }
4548 
4549 /*
4550  * Submit multiple AIO write requests at once.
4551  *
4552  * On success, the function returns 0 and all requests in the reqs array have
4553  * been submitted. On error, this function returns -1, and any of the
4554  * requests may or may not be submitted yet. In particular, this means that the
4555  * callback will be called for some of the requests, for others it won't. The
4556  * caller must check the error field of the BlockRequest to wait for the right
4557  * callbacks (if error != 0, no callback will be called).
4558  *
4559  * The implementation may modify the contents of the reqs array, e.g. to merge
4560  * requests. However, the fields opaque and error are left unmodified as they
4561  * are used to signal failure for a single request to the caller.
4562  */
4563 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4564 {
4565     MultiwriteCB *mcb;
4566     int i;
4567 
4568     /* don't submit writes if we don't have a medium */
4569     if (bs->drv == NULL) {
4570         for (i = 0; i < num_reqs; i++) {
4571             reqs[i].error = -ENOMEDIUM;
4572         }
4573         return -1;
4574     }
4575 
4576     if (num_reqs == 0) {
4577         return 0;
4578     }
4579 
4580     // Create MultiwriteCB structure
4581     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4582     mcb->num_requests = 0;
4583     mcb->num_callbacks = num_reqs;
4584 
4585     for (i = 0; i < num_reqs; i++) {
4586         mcb->callbacks[i].cb = reqs[i].cb;
4587         mcb->callbacks[i].opaque = reqs[i].opaque;
4588     }
4589 
4590     // Check for mergeable requests
4591     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4592 
4593     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4594 
4595     /* Run the aio requests. */
4596     mcb->num_requests = num_reqs;
4597     for (i = 0; i < num_reqs; i++) {
4598         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4599                               reqs[i].nb_sectors, reqs[i].flags,
4600                               multiwrite_cb, mcb,
4601                               true);
4602     }
4603 
4604     return 0;
4605 }
4606 
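/*
 * Editorial sketch, not part of the original file: submitting two adjacent
 * writes through bdrv_aio_multiwrite(). multiwrite_merge() above may fuse
 * them into one driver request, but each original callback still fires
 * exactly once. The callback and helper names are hypothetical.
 */
static void example_multiwrite_cb(void *opaque, int ret)
{
    /* called once per original request with the (possibly merged) result */
}

static int example_submit_pair(BlockDriverState *bs,
                               QEMUIOVector *qiov0, QEMUIOVector *qiov1)
{
    BlockRequest reqs[2] = {
        {
            .sector     = 0,
            .nb_sectors = qiov0->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov0,
            .cb         = example_multiwrite_cb,
        },
        {
            .sector     = qiov0->size >> BDRV_SECTOR_BITS,
            .nb_sectors = qiov1->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov1,
            .cb         = example_multiwrite_cb,
        },
    };

    return bdrv_aio_multiwrite(bs, reqs, 2);
}
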
4607 void bdrv_aio_cancel(BlockAIOCB *acb)
4608 {
4609     qemu_aio_ref(acb);
4610     bdrv_aio_cancel_async(acb);
4611     while (acb->refcnt > 1) {
4612         if (acb->aiocb_info->get_aio_context) {
4613             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
4614         } else if (acb->bs) {
4615             aio_poll(bdrv_get_aio_context(acb->bs), true);
4616         } else {
4617             abort();
4618         }
4619     }
4620     qemu_aio_unref(acb);
4621 }
4622 
4623 /* Async version of aio cancel. The caller is not blocked if the acb implements
4624  * cancel_async; otherwise we do nothing and let the request complete normally.
4625  * In either case the completion callback must be called. */
4626 void bdrv_aio_cancel_async(BlockAIOCB *acb)
4627 {
4628     if (acb->aiocb_info->cancel_async) {
4629         acb->aiocb_info->cancel_async(acb);
4630     }
4631 }
4632 
4633 /**************************************************************/
4634 /* async block device emulation */
4635 
4636 typedef struct BlockAIOCBSync {
4637     BlockAIOCB common;
4638     QEMUBH *bh;
4639     int ret;
4640     /* vector translation state */
4641     QEMUIOVector *qiov;
4642     uint8_t *bounce;
4643     int is_write;
4644 } BlockAIOCBSync;
4645 
4646 static const AIOCBInfo bdrv_em_aiocb_info = {
4647     .aiocb_size         = sizeof(BlockAIOCBSync),
4648 };
4649 
4650 static void bdrv_aio_bh_cb(void *opaque)
4651 {
4652     BlockAIOCBSync *acb = opaque;
4653 
4654     if (!acb->is_write && acb->ret >= 0) {
4655         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4656     }
4657     qemu_vfree(acb->bounce);
4658     acb->common.cb(acb->common.opaque, acb->ret);
4659     qemu_bh_delete(acb->bh);
4660     acb->bh = NULL;
4661     qemu_aio_unref(acb);
4662 }
4663 
4664 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4665                                       int64_t sector_num,
4666                                       QEMUIOVector *qiov,
4667                                       int nb_sectors,
4668                                       BlockCompletionFunc *cb,
4669                                       void *opaque,
4670                                       int is_write)
4671 
4672 {
4673     BlockAIOCBSync *acb;
4674 
4675     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4676     acb->is_write = is_write;
4677     acb->qiov = qiov;
4678     acb->bounce = qemu_try_blockalign(bs, qiov->size);
4679     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
4680 
4681     if (acb->bounce == NULL) {
4682         acb->ret = -ENOMEM;
4683     } else if (is_write) {
4684         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4685         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4686     } else {
4687         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4688     }
4689 
4690     qemu_bh_schedule(acb->bh);
4691 
4692     return &acb->common;
4693 }
4694 
4695 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4696         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4697         BlockCompletionFunc *cb, void *opaque)
4698 {
4699     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4700 }
4701 
4702 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4703         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4704         BlockCompletionFunc *cb, void *opaque)
4705 {
4706     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4707 }
4708 
4709 
4710 typedef struct BlockAIOCBCoroutine {
4711     BlockAIOCB common;
4712     BlockRequest req;
4713     bool is_write;
4714     bool *done;
4715     QEMUBH* bh;
4716 } BlockAIOCBCoroutine;
4717 
4718 static const AIOCBInfo bdrv_em_co_aiocb_info = {
4719     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
4720 };
4721 
4722 static void bdrv_co_em_bh(void *opaque)
4723 {
4724     BlockAIOCBCoroutine *acb = opaque;
4725 
4726     acb->common.cb(acb->common.opaque, acb->req.error);
4727 
4728     qemu_bh_delete(acb->bh);
4729     qemu_aio_unref(acb);
4730 }
4731 
4732 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4733 static void coroutine_fn bdrv_co_do_rw(void *opaque)
4734 {
4735     BlockAIOCBCoroutine *acb = opaque;
4736     BlockDriverState *bs = acb->common.bs;
4737 
4738     if (!acb->is_write) {
4739         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4740             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4741     } else {
4742         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4743             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4744     }
4745 
4746     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4747     qemu_bh_schedule(acb->bh);
4748 }
4749 
4750 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4751                                          int64_t sector_num,
4752                                          QEMUIOVector *qiov,
4753                                          int nb_sectors,
4754                                          BdrvRequestFlags flags,
4755                                          BlockCompletionFunc *cb,
4756                                          void *opaque,
4757                                          bool is_write)
4758 {
4759     Coroutine *co;
4760     BlockAIOCBCoroutine *acb;
4761 
4762     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4763     acb->req.sector = sector_num;
4764     acb->req.nb_sectors = nb_sectors;
4765     acb->req.qiov = qiov;
4766     acb->req.flags = flags;
4767     acb->is_write = is_write;
4768 
4769     co = qemu_coroutine_create(bdrv_co_do_rw);
4770     qemu_coroutine_enter(co, acb);
4771 
4772     return &acb->common;
4773 }
4774 
4775 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4776 {
4777     BlockAIOCBCoroutine *acb = opaque;
4778     BlockDriverState *bs = acb->common.bs;
4779 
4780     acb->req.error = bdrv_co_flush(bs);
4781     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4782     qemu_bh_schedule(acb->bh);
4783 }
4784 
4785 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4786         BlockCompletionFunc *cb, void *opaque)
4787 {
4788     trace_bdrv_aio_flush(bs, opaque);
4789 
4790     Coroutine *co;
4791     BlockAIOCBCoroutine *acb;
4792 
4793     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4794 
4795     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4796     qemu_coroutine_enter(co, acb);
4797 
4798     return &acb->common;
4799 }
4800 
4801 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4802 {
4803     BlockAIOCBCoroutine *acb = opaque;
4804     BlockDriverState *bs = acb->common.bs;
4805 
4806     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4807     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
4808     qemu_bh_schedule(acb->bh);
4809 }
4810 
4811 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4812         int64_t sector_num, int nb_sectors,
4813         BlockCompletionFunc *cb, void *opaque)
4814 {
4815     Coroutine *co;
4816     BlockAIOCBCoroutine *acb;
4817 
4818     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4819 
4820     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4821     acb->req.sector = sector_num;
4822     acb->req.nb_sectors = nb_sectors;
4823     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4824     qemu_coroutine_enter(co, acb);
4825 
4826     return &acb->common;
4827 }
4828 
4829 void bdrv_init(void)
4830 {
4831     module_call_init(MODULE_INIT_BLOCK);
4832 }
4833 
4834 void bdrv_init_with_whitelist(void)
4835 {
4836     use_bdrv_whitelist = 1;
4837     bdrv_init();
4838 }
4839 
4840 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4841                    BlockCompletionFunc *cb, void *opaque)
4842 {
4843     BlockAIOCB *acb;
4844 
4845     acb = g_slice_alloc(aiocb_info->aiocb_size);
4846     acb->aiocb_info = aiocb_info;
4847     acb->bs = bs;
4848     acb->cb = cb;
4849     acb->opaque = opaque;
4850     acb->refcnt = 1;
4851     return acb;
4852 }
4853 
4854 void qemu_aio_ref(void *p)
4855 {
4856     BlockAIOCB *acb = p;
4857     acb->refcnt++;
4858 }
4859 
4860 void qemu_aio_unref(void *p)
4861 {
4862     BlockAIOCB *acb = p;
4863     assert(acb->refcnt > 0);
4864     if (--acb->refcnt == 0) {
4865         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4866     }
4867 }
4868 
4869 /**************************************************************/
4870 /* Coroutine block device emulation */
4871 
4872 typedef struct CoroutineIOCompletion {
4873     Coroutine *coroutine;
4874     int ret;
4875 } CoroutineIOCompletion;
4876 
4877 static void bdrv_co_io_em_complete(void *opaque, int ret)
4878 {
4879     CoroutineIOCompletion *co = opaque;
4880 
4881     co->ret = ret;
4882     qemu_coroutine_enter(co->coroutine, NULL);
4883 }
4884 
4885 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4886                                       int nb_sectors, QEMUIOVector *iov,
4887                                       bool is_write)
4888 {
4889     CoroutineIOCompletion co = {
4890         .coroutine = qemu_coroutine_self(),
4891     };
4892     BlockAIOCB *acb;
4893 
4894     if (is_write) {
4895         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4896                                        bdrv_co_io_em_complete, &co);
4897     } else {
4898         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4899                                       bdrv_co_io_em_complete, &co);
4900     }
4901 
4902     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4903     if (!acb) {
4904         return -EIO;
4905     }
4906     qemu_coroutine_yield();
4907 
4908     return co.ret;
4909 }
4910 
4911 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4912                                          int64_t sector_num, int nb_sectors,
4913                                          QEMUIOVector *iov)
4914 {
4915     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4916 }
4917 
4918 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4919                                          int64_t sector_num, int nb_sectors,
4920                                          QEMUIOVector *iov)
4921 {
4922     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4923 }
4924 
4925 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4926 {
4927     RwCo *rwco = opaque;
4928 
4929     rwco->ret = bdrv_co_flush(rwco->bs);
4930 }
4931 
4932 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4933 {
4934     int ret;
4935 
4936     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4937         return 0;
4938     }
4939 
4940     /* Write back cached data to the OS even with cache=unsafe */
4941     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4942     if (bs->drv->bdrv_co_flush_to_os) {
4943         ret = bs->drv->bdrv_co_flush_to_os(bs);
4944         if (ret < 0) {
4945             return ret;
4946         }
4947     }
4948 
4949     /* But don't actually force it to the disk with cache=unsafe */
4950     if (bs->open_flags & BDRV_O_NO_FLUSH) {
4951         goto flush_parent;
4952     }
4953 
4954     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4955     if (bs->drv->bdrv_co_flush_to_disk) {
4956         ret = bs->drv->bdrv_co_flush_to_disk(bs);
4957     } else if (bs->drv->bdrv_aio_flush) {
4958         BlockAIOCB *acb;
4959         CoroutineIOCompletion co = {
4960             .coroutine = qemu_coroutine_self(),
4961         };
4962 
4963         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4964         if (acb == NULL) {
4965             ret = -EIO;
4966         } else {
4967             qemu_coroutine_yield();
4968             ret = co.ret;
4969         }
4970     } else {
4971         /*
4972          * Some block drivers always operate in either writethrough or unsafe
4973          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4974          * know how the server works (because the behaviour is hardcoded or
4975          * depends on server-side configuration), so we can't ensure that
4976          * everything is safe on disk. Returning an error doesn't work because
4977          * that would break guests even if the server operates in writethrough
4978          * mode.
4979          *
4980          * Let's hope the user knows what he's doing.
4981          */
4982         ret = 0;
4983     }
4984     if (ret < 0) {
4985         return ret;
4986     }
4987 
4988     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
4989      * in the case of cache=unsafe, so there are no useless flushes.
4990      */
4991 flush_parent:
4992     return bdrv_co_flush(bs->file);
4993 }
4994 
4995 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
4996 {
4997     Error *local_err = NULL;
4998     int ret;
4999 
5000     if (!bs->drv)  {
5001         return;
5002     }
5003 
5004     if (!(bs->open_flags & BDRV_O_INCOMING)) {
5005         return;
5006     }
5007     bs->open_flags &= ~BDRV_O_INCOMING;
5008 
5009     if (bs->drv->bdrv_invalidate_cache) {
5010         bs->drv->bdrv_invalidate_cache(bs, &local_err);
5011     } else if (bs->file) {
5012         bdrv_invalidate_cache(bs->file, &local_err);
5013     }
5014     if (local_err) {
5015         error_propagate(errp, local_err);
5016         return;
5017     }
5018 
5019     ret = refresh_total_sectors(bs, bs->total_sectors);
5020     if (ret < 0) {
5021         error_setg_errno(errp, -ret, "Could not refresh total sector count");
5022         return;
5023     }
5024 }
5025 
5026 void bdrv_invalidate_cache_all(Error **errp)
5027 {
5028     BlockDriverState *bs;
5029     Error *local_err = NULL;
5030 
5031     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5032         AioContext *aio_context = bdrv_get_aio_context(bs);
5033 
5034         aio_context_acquire(aio_context);
5035         bdrv_invalidate_cache(bs, &local_err);
5036         aio_context_release(aio_context);
5037         if (local_err) {
5038             error_propagate(errp, local_err);
5039             return;
5040         }
5041     }
5042 }
5043 
5044 int bdrv_flush(BlockDriverState *bs)
5045 {
5046     Coroutine *co;
5047     RwCo rwco = {
5048         .bs = bs,
5049         .ret = NOT_DONE,
5050     };
5051 
5052     if (qemu_in_coroutine()) {
5053         /* Fast-path if already in coroutine context */
5054         bdrv_flush_co_entry(&rwco);
5055     } else {
5056         AioContext *aio_context = bdrv_get_aio_context(bs);
5057 
5058         co = qemu_coroutine_create(bdrv_flush_co_entry);
5059         qemu_coroutine_enter(co, &rwco);
5060         while (rwco.ret == NOT_DONE) {
5061             aio_poll(aio_context, true);
5062         }
5063     }
5064 
5065     return rwco.ret;
5066 }
5067 
5068 typedef struct DiscardCo {
5069     BlockDriverState *bs;
5070     int64_t sector_num;
5071     int nb_sectors;
5072     int ret;
5073 } DiscardCo;
5074 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
5075 {
5076     DiscardCo *rwco = opaque;
5077 
5078     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
5079 }
5080 
5081 /* If no limit is specified in the BlockLimits, use a default
5082  * of 32768 512-byte sectors (16 MiB) per request.
5083  */
5084 #define MAX_DISCARD_DEFAULT 32768
5085 
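/* Discard (unmap) a range of sectors.  The request is clipped against the
 * device's discard alignment and split into pieces of at most max_discard
 * sectors, so one call may turn into several driver requests.  -ENOTSUP from
 * the driver is treated as success because discards are only advisory. */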
5086 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
5087                                  int nb_sectors)
5088 {
5089     int max_discard;
5090 
5091     if (!bs->drv) {
5092         return -ENOMEDIUM;
5093     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
5094         return -EIO;
5095     } else if (bs->read_only) {
5096         return -EROFS;
5097     }
5098 
5099     bdrv_reset_dirty(bs, sector_num, nb_sectors);
5100 
5101     /* Do nothing if disabled.  */
5102     if (!(bs->open_flags & BDRV_O_UNMAP)) {
5103         return 0;
5104     }
5105 
5106     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
5107         return 0;
5108     }
5109 
5110     max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
5111     while (nb_sectors > 0) {
5112         int ret;
5113         int num = nb_sectors;
5114 
5115         /* align request */
5116         if (bs->bl.discard_alignment &&
5117             num >= bs->bl.discard_alignment &&
5118             sector_num % bs->bl.discard_alignment) {
5119             if (num > bs->bl.discard_alignment) {
5120                 num = bs->bl.discard_alignment;
5121             }
5122             num -= sector_num % bs->bl.discard_alignment;
5123         }
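        /* Worked example (hypothetical numbers): with discard_alignment = 8
         * and a request starting at sector_num = 10 for 100 sectors, num is
         * first clipped to 8 and then reduced by 10 % 8 = 2, so this
         * iteration covers 6 sectors and the next one starts at the aligned
         * sector 16. */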
5124 
5125         /* limit request size */
5126         if (num > max_discard) {
5127             num = max_discard;
5128         }
5129 
5130         if (bs->drv->bdrv_co_discard) {
5131             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
5132         } else {
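            /* Emulate the coroutine interface on top of the AIO interface:
             * submit the request with a completion callback that re-enters
             * this coroutine, then yield until the callback runs. */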
5133             BlockAIOCB *acb;
5134             CoroutineIOCompletion co = {
5135                 .coroutine = qemu_coroutine_self(),
5136             };
5137 
5138             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
5139                                             bdrv_co_io_em_complete, &co);
5140             if (acb == NULL) {
5141                 return -EIO;
5142             } else {
5143                 qemu_coroutine_yield();
5144                 ret = co.ret;
5145             }
5146         }
5147         if (ret && ret != -ENOTSUP) {
5148             return ret;
5149         }
5150 
5151         sector_num += num;
5152         nb_sectors -= num;
5153     }
5154     return 0;
5155 }
5156 
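/* Synchronous wrapper around bdrv_co_discard(), using the same coroutine
 * fast-path / aio_poll() pattern as bdrv_flush() above. */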
5157 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
5158 {
5159     Coroutine *co;
5160     DiscardCo rwco = {
5161         .bs = bs,
5162         .sector_num = sector_num,
5163         .nb_sectors = nb_sectors,
5164         .ret = NOT_DONE,
5165     };
5166 
5167     if (qemu_in_coroutine()) {
5168         /* Fast-path if already in coroutine context */
5169         bdrv_discard_co_entry(&rwco);
5170     } else {
5171         AioContext *aio_context = bdrv_get_aio_context(bs);
5172 
5173         co = qemu_coroutine_create(bdrv_discard_co_entry);
5174         qemu_coroutine_enter(co, &rwco);
5175         while (rwco.ret == NOT_DONE) {
5176             aio_poll(aio_context, true);
5177         }
5178     }
5179 
5180     return rwco.ret;
5181 }
5182 
5183 /**************************************************************/
5184 /* removable device support */
5185 
5186 /**
5187  * Return TRUE if the media is present
5188  */
5189 int bdrv_is_inserted(BlockDriverState *bs)
5190 {
5191     BlockDriver *drv = bs->drv;
5192 
5193     if (!drv)
5194         return 0;
5195     if (!drv->bdrv_is_inserted)
5196         return 1;
5197     return drv->bdrv_is_inserted(bs);
5198 }
5199 
5200 /**
5201  * Return whether the media changed since the last call to this
5202  * function, or -ENOTSUP if we don't know.  Most drivers don't know.
5203  */
5204 int bdrv_media_changed(BlockDriverState *bs)
5205 {
5206     BlockDriver *drv = bs->drv;
5207 
5208     if (drv && drv->bdrv_media_changed) {
5209         return drv->bdrv_media_changed(bs);
5210     }
5211     return -ENOTSUP;
5212 }
5213 
5214 /**
5215  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
5216  */
5217 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
5218 {
5219     BlockDriver *drv = bs->drv;
5220     const char *device_name;
5221 
5222     if (drv && drv->bdrv_eject) {
5223         drv->bdrv_eject(bs, eject_flag);
5224     }
5225 
5226     device_name = bdrv_get_device_name(bs);
5227     if (device_name[0] != '\0') {
5228         qapi_event_send_device_tray_moved(device_name,
5229                                           eject_flag, &error_abort);
5230     }
5231 }
5232 
5233 /**
5234  * Lock or unlock the media (if it is locked, the user won't be able
5235  * to eject it manually).
5236  */
5237 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
5238 {
5239     BlockDriver *drv = bs->drv;
5240 
5241     trace_bdrv_lock_medium(bs, locked);
5242 
5243     if (drv && drv->bdrv_lock_medium) {
5244         drv->bdrv_lock_medium(bs, locked);
5245     }
5246 }
5247 
5248 /* needed for generic scsi interface */
5249 
5250 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5251 {
5252     BlockDriver *drv = bs->drv;
5253 
5254     if (drv && drv->bdrv_ioctl)
5255         return drv->bdrv_ioctl(bs, req, buf);
5256     return -ENOTSUP;
5257 }
5258 
5259 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5260         unsigned long int req, void *buf,
5261         BlockCompletionFunc *cb, void *opaque)
5262 {
5263     BlockDriver *drv = bs->drv;
5264 
5265     if (drv && drv->bdrv_aio_ioctl)
5266         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5267     return NULL;
5268 }
5269 
5270 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5271 {
5272     bs->guest_block_size = align;
5273 }
5274 
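/* Buffer allocation helpers that honour the memory alignment required for
 * I/O on bs (relevant e.g. for O_DIRECT).  The qemu_try_* variants return
 * NULL on failure while the plain variants abort on failure, and the *0
 * variants additionally zero the buffer. */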
5275 void *qemu_blockalign(BlockDriverState *bs, size_t size)
5276 {
5277     return qemu_memalign(bdrv_opt_mem_align(bs), size);
5278 }
5279 
5280 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
5281 {
5282     return memset(qemu_blockalign(bs, size), 0, size);
5283 }
5284 
5285 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
5286 {
5287     size_t align = bdrv_opt_mem_align(bs);
5288 
5289     /* Ensure that NULL is never returned on success */
5290     assert(align > 0);
5291     if (size == 0) {
5292         size = align;
5293     }
5294 
5295     return qemu_try_memalign(align, size);
5296 }
5297 
5298 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
5299 {
5300     void *mem = qemu_try_blockalign(bs, size);
5301 
5302     if (mem) {
5303         memset(mem, 0, size);
5304     }
5305 
5306     return mem;
5307 }
5308 
5309 /*
5310  * Check if all memory in this vector is sector aligned.
5311  */
5312 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5313 {
5314     int i;
5315     size_t alignment = bdrv_opt_mem_align(bs);
5316 
5317     for (i = 0; i < qiov->niov; i++) {
5318         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5319             return false;
5320         }
5321         if (qiov->iov[i].iov_len % alignment) {
5322             return false;
5323         }
5324     }
5325 
5326     return true;
5327 }
5328 
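/* Create a dirty bitmap for bs.  granularity is in bytes, must be a power of
 * two and at least BDRV_SECTOR_SIZE; it is converted to a sector count whose
 * log2 becomes the HBitmap granularity. */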
5329 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
5330                                           Error **errp)
5331 {
5332     int64_t bitmap_size;
5333     BdrvDirtyBitmap *bitmap;
5334 
5335     assert((granularity & (granularity - 1)) == 0);
5336 
5337     granularity >>= BDRV_SECTOR_BITS;
5338     assert(granularity);
5339     bitmap_size = bdrv_nb_sectors(bs);
5340     if (bitmap_size < 0) {
5341         error_setg_errno(errp, -bitmap_size, "Could not get length of device");
5342         errno = -bitmap_size;
5343         return NULL;
5344     }
5345     bitmap = g_new0(BdrvDirtyBitmap, 1);
5346     bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5347     QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5348     return bitmap;
5349 }
5350 
5351 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5352 {
5353     BdrvDirtyBitmap *bm, *next;
5354     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5355         if (bm == bitmap) {
5356             QLIST_REMOVE(bitmap, list);
5357             hbitmap_free(bitmap->bitmap);
5358             g_free(bitmap);
5359             return;
5360         }
5361     }
5362 }
5363 
5364 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5365 {
5366     BdrvDirtyBitmap *bm;
5367     BlockDirtyInfoList *list = NULL;
5368     BlockDirtyInfoList **plist = &list;
5369 
5370     QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5371         BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1);
5372         BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1);
5373         info->count = bdrv_get_dirty_count(bs, bm);
5374         info->granularity =
5375             ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5376         entry->value = info;
5377         *plist = entry;
5378         plist = &entry->next;
5379     }
5380 
5381     return list;
5382 }
5383 
5384 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5385 {
5386     if (bitmap) {
5387         return hbitmap_get(bitmap->bitmap, sector);
5388     } else {
5389         return 0;
5390     }
5391 }
5392 
5393 void bdrv_dirty_iter_init(BlockDriverState *bs,
5394                           BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5395 {
5396     hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5397 }
5398 
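/* A typical consumer walks the bitmap with an HBitmapIter, roughly like the
 * following sketch (hypothetical caller code; sectors_per_chunk is whatever
 * granule the caller copies at a time):
 *
 *     HBitmapIter hbi;
 *     int64_t sector;
 *
 *     bdrv_dirty_iter_init(bs, bitmap, &hbi);
 *     while ((sector = hbitmap_iter_next(&hbi)) >= 0) {
 *         ... copy out the dirty data starting at sector ...
 *         bdrv_reset_dirty(bs, sector, sectors_per_chunk);
 *     }
 */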
5399 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5400                     int nr_sectors)
5401 {
5402     BdrvDirtyBitmap *bitmap;
5403     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5404         hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5405     }
5406 }
5407 
5408 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5409 {
5410     BdrvDirtyBitmap *bitmap;
5411     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5412         hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5413     }
5414 }
5415 
5416 int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5417 {
5418     return hbitmap_count(bitmap->bitmap);
5419 }
5420 
5421 /* Get a reference to bs */
5422 void bdrv_ref(BlockDriverState *bs)
5423 {
5424     bs->refcnt++;
5425 }
5426 
5427 /* Release a previously grabbed reference to bs.
5428  * If after releasing, reference count is zero, the BlockDriverState is
5429  * deleted. */
5430 void bdrv_unref(BlockDriverState *bs)
5431 {
5432     if (!bs) {
5433         return;
5434     }
5435     assert(bs->refcnt > 0);
5436     if (--bs->refcnt == 0) {
5437         bdrv_delete(bs);
5438     }
5439 }
5440 
5441 struct BdrvOpBlocker {
5442     Error *reason;
5443     QLIST_ENTRY(BdrvOpBlocker) list;
5444 };
5445 
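/* Op blockers: each BlockOpType has a list of reasons why that operation is
 * currently forbidden on this BDS, and the operation is allowed only while
 * the list is empty.  bdrv_op_block() pushes a reason onto the list;
 * bdrv_op_unblock() removes every entry registered with the same reason. */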
5446 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
5447 {
5448     BdrvOpBlocker *blocker;
5449     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5450     if (!QLIST_EMPTY(&bs->op_blockers[op])) {
5451         blocker = QLIST_FIRST(&bs->op_blockers[op]);
5452         if (errp) {
5453             error_setg(errp, "Device '%s' is busy: %s",
5454                        bdrv_get_device_name(bs),
5455                        error_get_pretty(blocker->reason));
5456         }
5457         return true;
5458     }
5459     return false;
5460 }
5461 
5462 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
5463 {
5464     BdrvOpBlocker *blocker;
5465     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5466 
5467     blocker = g_new0(BdrvOpBlocker, 1);
5468     blocker->reason = reason;
5469     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
5470 }
5471 
5472 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
5473 {
5474     BdrvOpBlocker *blocker, *next;
5475     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
5476     QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
5477         if (blocker->reason == reason) {
5478             QLIST_REMOVE(blocker, list);
5479             g_free(blocker);
5480         }
5481     }
5482 }
5483 
5484 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
5485 {
5486     int i;
5487     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5488         bdrv_op_block(bs, i, reason);
5489     }
5490 }
5491 
5492 void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
5493 {
5494     int i;
5495     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5496         bdrv_op_unblock(bs, i, reason);
5497     }
5498 }
5499 
5500 bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
5501 {
5502     int i;
5503 
5504     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
5505         if (!QLIST_EMPTY(&bs->op_blockers[i])) {
5506             return false;
5507         }
5508     }
5509     return true;
5510 }
5511 
5512 void bdrv_iostatus_enable(BlockDriverState *bs)
5513 {
5514     bs->iostatus_enabled = true;
5515     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5516 }
5517 
5518 /* The I/O status is only enabled if the drive explicitly
5519  * enables it _and_ the VM is configured to stop on errors */
5520 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5521 {
5522     return (bs->iostatus_enabled &&
5523            (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5524             bs->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
5525             bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5526 }
5527 
5528 void bdrv_iostatus_disable(BlockDriverState *bs)
5529 {
5530     bs->iostatus_enabled = false;
5531 }
5532 
5533 void bdrv_iostatus_reset(BlockDriverState *bs)
5534 {
5535     if (bdrv_iostatus_is_enabled(bs)) {
5536         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5537         if (bs->job) {
5538             block_job_iostatus_reset(bs->job);
5539         }
5540     }
5541 }
5542 
5543 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5544 {
5545     assert(bdrv_iostatus_is_enabled(bs));
5546     if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5547         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5548                                          BLOCK_DEVICE_IO_STATUS_FAILED;
5549     }
5550 }
5551 
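/* Create a new disk image.  fmt names the format driver, options is a
 * comma-separated "-o" style option string, and img_size may be overridden
 * by an explicit size option.  If no size is given at all (img_size is -1
 * and no size option is parsed), the size is taken from the backing file,
 * if one was specified. */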
5552 void bdrv_img_create(const char *filename, const char *fmt,
5553                      const char *base_filename, const char *base_fmt,
5554                      char *options, uint64_t img_size, int flags,
5555                      Error **errp, bool quiet)
5556 {
5557     QemuOptsList *create_opts = NULL;
5558     QemuOpts *opts = NULL;
5559     const char *backing_fmt, *backing_file;
5560     int64_t size;
5561     BlockDriver *drv, *proto_drv;
5562     BlockDriver *backing_drv = NULL;
5563     Error *local_err = NULL;
5564     int ret = 0;
5565 
5566     /* Find driver and parse its options */
5567     drv = bdrv_find_format(fmt);
5568     if (!drv) {
5569         error_setg(errp, "Unknown file format '%s'", fmt);
5570         return;
5571     }
5572 
5573     proto_drv = bdrv_find_protocol(filename, true);
5574     if (!proto_drv) {
5575         error_setg(errp, "Unknown protocol '%s'", filename);
5576         return;
5577     }
5578 
5579     create_opts = qemu_opts_append(create_opts, drv->create_opts);
5580     create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
5581 
5582     /* Create parameter list with default values */
5583     opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
5584     qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size);
5585 
5586     /* Parse -o options */
5587     if (options) {
5588         if (qemu_opts_do_parse(opts, options, NULL) != 0) {
5589             error_setg(errp, "Invalid options for file format '%s'", fmt);
5590             goto out;
5591         }
5592     }
5593 
5594     if (base_filename) {
5595         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename)) {
5596             error_setg(errp, "Backing file not supported for file format '%s'",
5597                        fmt);
5598             goto out;
5599         }
5600     }
5601 
5602     if (base_fmt) {
5603         if (qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5604             error_setg(errp, "Backing file format not supported for file "
5605                              "format '%s'", fmt);
5606             goto out;
5607         }
5608     }
5609 
5610     backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5611     if (backing_file) {
5612         if (!strcmp(filename, backing_file)) {
5613             error_setg(errp, "Error: Trying to create an image with the "
5614                              "same filename as the backing file");
5615             goto out;
5616         }
5617     }
5618 
5619     backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5620     if (backing_fmt) {
5621         backing_drv = bdrv_find_format(backing_fmt);
5622         if (!backing_drv) {
5623             error_setg(errp, "Unknown backing file format '%s'",
5624                        backing_fmt);
5625             goto out;
5626         }
5627     }
5628 
5629     /* The size for the image must always be specified, with one exception:
5630      * if we are using a backing file, we can obtain the size from there. */
5631     size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5632     if (size == -1) {
5633         if (backing_file) {
5634             BlockDriverState *bs;
5635             int64_t size;
5636             int back_flags;
5637 
5638             /* backing files always opened read-only */
5639             back_flags =
5640                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5641 
5642             bs = NULL;
5643             ret = bdrv_open(&bs, backing_file, NULL, NULL, back_flags,
5644                             backing_drv, &local_err);
5645             if (ret < 0) {
5646                 goto out;
5647             }
5648             size = bdrv_getlength(bs);
5649             if (size < 0) {
5650                 error_setg_errno(errp, -size, "Could not get size of '%s'",
5651                                  backing_file);
5652                 bdrv_unref(bs);
5653                 goto out;
5654             }
5655 
5656             qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size);
5657 
5658             bdrv_unref(bs);
5659         } else {
5660             error_setg(errp, "Image creation needs a size parameter");
5661             goto out;
5662         }
5663     }
5664 
5665     if (!quiet) {
5666         printf("Formatting '%s', fmt=%s ", filename, fmt);
5667         qemu_opts_print(opts);
5668         puts("");
5669     }
5670 
5671     ret = bdrv_create(drv, filename, opts, &local_err);
5672 
5673     if (ret == -EFBIG) {
5674         /* This is generally a better message than whatever the driver would
5675          * deliver (especially because of the cluster_size_hint), since that
5676          * is most probably not much different from "image too large". */
5677         const char *cluster_size_hint = "";
5678         if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
5679             cluster_size_hint = " (try using a larger cluster size)";
5680         }
5681         error_setg(errp, "The image size is too large for file format '%s'"
5682                    "%s", fmt, cluster_size_hint);
5683         error_free(local_err);
5684         local_err = NULL;
5685     }
5686 
5687 out:
5688     qemu_opts_del(opts);
5689     qemu_opts_free(create_opts);
5690     if (local_err) {
5691         error_propagate(errp, local_err);
5692     }
5693 }
5694 
5695 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5696 {
5697     return bs->aio_context;
5698 }
5699 
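/* Detach bs, its file and its backing_hd recursively from their current
 * AioContext.  Callers must make sure no requests are in flight; see
 * bdrv_set_aio_context() below, which drains all requests before switching
 * contexts. */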
5700 void bdrv_detach_aio_context(BlockDriverState *bs)
5701 {
5702     BdrvAioNotifier *baf;
5703 
5704     if (!bs->drv) {
5705         return;
5706     }
5707 
5708     QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
5709         baf->detach_aio_context(baf->opaque);
5710     }
5711 
5712     if (bs->io_limits_enabled) {
5713         throttle_detach_aio_context(&bs->throttle_state);
5714     }
5715     if (bs->drv->bdrv_detach_aio_context) {
5716         bs->drv->bdrv_detach_aio_context(bs);
5717     }
5718     if (bs->file) {
5719         bdrv_detach_aio_context(bs->file);
5720     }
5721     if (bs->backing_hd) {
5722         bdrv_detach_aio_context(bs->backing_hd);
5723     }
5724 
5725     bs->aio_context = NULL;
5726 }
5727 
5728 void bdrv_attach_aio_context(BlockDriverState *bs,
5729                              AioContext *new_context)
5730 {
5731     BdrvAioNotifier *ban;
5732 
5733     if (!bs->drv) {
5734         return;
5735     }
5736 
5737     bs->aio_context = new_context;
5738 
5739     if (bs->backing_hd) {
5740         bdrv_attach_aio_context(bs->backing_hd, new_context);
5741     }
5742     if (bs->file) {
5743         bdrv_attach_aio_context(bs->file, new_context);
5744     }
5745     if (bs->drv->bdrv_attach_aio_context) {
5746         bs->drv->bdrv_attach_aio_context(bs, new_context);
5747     }
5748     if (bs->io_limits_enabled) {
5749         throttle_attach_aio_context(&bs->throttle_state, new_context);
5750     }
5751 
5752     QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
5753         ban->attached_aio_context(new_context, ban->opaque);
5754     }
5755 }
5756 
5757 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
5758 {
5759     bdrv_drain_all(); /* ensure there are no in-flight requests */
5760 
5761     bdrv_detach_aio_context(bs);
5762 
5763     /* This function executes in the old AioContext so acquire the new one in
5764      * case it runs in a different thread.
5765      */
5766     aio_context_acquire(new_context);
5767     bdrv_attach_aio_context(bs, new_context);
5768     aio_context_release(new_context);
5769 }
5770 
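/* Register a pair of callbacks that run around every AioContext switch of
 * bs: detach_aio_context() is invoked before detaching (still in the old
 * context) and attached_aio_context() after attaching, with the new context.
 * To unregister, bdrv_remove_aio_context_notifier() must be called with the
 * exact same three pointers, otherwise it aborts. */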
5771 void bdrv_add_aio_context_notifier(BlockDriverState *bs,
5772         void (*attached_aio_context)(AioContext *new_context, void *opaque),
5773         void (*detach_aio_context)(void *opaque), void *opaque)
5774 {
5775     BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
5776     *ban = (BdrvAioNotifier){
5777         .attached_aio_context = attached_aio_context,
5778         .detach_aio_context   = detach_aio_context,
5779         .opaque               = opaque
5780     };
5781 
5782     QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
5783 }
5784 
5785 void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
5786                                       void (*attached_aio_context)(AioContext *,
5787                                                                    void *),
5788                                       void (*detach_aio_context)(void *),
5789                                       void *opaque)
5790 {
5791     BdrvAioNotifier *ban, *ban_next;
5792 
5793     QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5794         if (ban->attached_aio_context == attached_aio_context &&
5795             ban->detach_aio_context   == detach_aio_context   &&
5796             ban->opaque               == opaque)
5797         {
5798             QLIST_REMOVE(ban, list);
5799             g_free(ban);
5800 
5801             return;
5802         }
5803     }
5804 
5805     abort();
5806 }
5807 
5808 void bdrv_add_before_write_notifier(BlockDriverState *bs,
5809                                     NotifierWithReturn *notifier)
5810 {
5811     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5812 }
5813 
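/* Change options of an existing image in place.  Only format drivers that
 * implement .bdrv_amend_options (e.g. qcow2) support this; everything else
 * returns -ENOTSUP. */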
5814 int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
5815                        BlockDriverAmendStatusCB *status_cb)
5816 {
5817     if (!bs->drv->bdrv_amend_options) {
5818         return -ENOTSUP;
5819     }
5820     return bs->drv->bdrv_amend_options(bs, opts, status_cb);
5821 }
5822 
5823 /* This function is called by the bdrv_recurse_is_first_non_filter method
5824  * of block filters and by bdrv_is_first_non_filter.
5825  * It is used to test whether the given bs is the candidate, or to recurse
5826  * further down the node graph.
5827  */
5828 bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5829                                       BlockDriverState *candidate)
5830 {
5831     /* return false if basic checks fail */
5832     if (!bs || !bs->drv) {
5833         return false;
5834     }
5835 
5836     /* the code reached a non-filter block driver -> check if the bs is
5837      * the same as the candidate. This is the recursion termination condition.
5838      */
5839     if (!bs->drv->is_filter) {
5840         return bs == candidate;
5841     }
5842     /* Down this path the driver is a block filter driver */
5843 
5844     /* If the block filter recursion method is defined use it to recurse down
5845      * the node graph.
5846      */
5847     if (bs->drv->bdrv_recurse_is_first_non_filter) {
5848         return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5849     }
5850 
5851     /* the driver is a block filter but does not allow recursion -> return
5852      * false */
5853     return false;
5854 }
5855 
5856 /* This function checks whether the candidate is the first non-filter bs down
5857  * its bs chain. Since we don't have pointers to parents it explores all bs
5858  * chains from the top. Some filters can choose not to pass down the recursion.
5859  */
5860 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5861 {
5862     BlockDriverState *bs;
5863 
5864     /* walk down the bs forest recursively */
5865     QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5866         bool perm;
5867 
5868         /* try to recurse in this top level bs */
5869         perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5870 
5871         /* candidate is the first non filter */
5872         if (perm) {
5873             return true;
5874         }
5875     }
5876 
5877     return false;
5878 }
5879 
5880 BlockDriverState *check_to_replace_node(const char *node_name, Error **errp)
5881 {
5882     BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
5883     AioContext *aio_context;
5884 
5885     if (!to_replace_bs) {
5886         error_setg(errp, "Node name '%s' not found", node_name);
5887         return NULL;
5888     }
5889 
5890     aio_context = bdrv_get_aio_context(to_replace_bs);
5891     aio_context_acquire(aio_context);
5892 
5893     if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
5894         to_replace_bs = NULL;
5895         goto out;
5896     }
5897 
5898     /* We don't want an arbitrary node of the BDS chain to be replaced, only
5899      * the topmost non-filter, in order to prevent data corruption.
5900      * Another benefit is that this test excludes backing files which are
5901      * blocked by the backing blockers.
5902      */
5903     if (!bdrv_is_first_non_filter(to_replace_bs)) {
5904         error_setg(errp, "Only top most non filter can be replaced");
5905         to_replace_bs = NULL;
5906         goto out;
5907     }
5908 
5909 out:
5910     aio_context_release(aio_context);
5911     return to_replace_bs;
5912 }
5913 
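/* I/O queue plugging: bdrv_io_plug() hints to the driver (or to the first
 * layer below that implements it) that subsequent requests may be batched,
 * and bdrv_io_unplug() submits the accumulated batch. */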
5914 void bdrv_io_plug(BlockDriverState *bs)
5915 {
5916     BlockDriver *drv = bs->drv;
5917     if (drv && drv->bdrv_io_plug) {
5918         drv->bdrv_io_plug(bs);
5919     } else if (bs->file) {
5920         bdrv_io_plug(bs->file);
5921     }
5922 }
5923 
5924 void bdrv_io_unplug(BlockDriverState *bs)
5925 {
5926     BlockDriver *drv = bs->drv;
5927     if (drv && drv->bdrv_io_unplug) {
5928         drv->bdrv_io_unplug(bs);
5929     } else if (bs->file) {
5930         bdrv_io_unplug(bs->file);
5931     }
5932 }
5933 
5934 void bdrv_flush_io_queue(BlockDriverState *bs)
5935 {
5936     BlockDriver *drv = bs->drv;
5937     if (drv && drv->bdrv_flush_io_queue) {
5938         drv->bdrv_flush_io_queue(bs);
5939     } else if (bs->file) {
5940         bdrv_flush_io_queue(bs->file);
5941     }
5942 }
5943 
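/* Copy all driver-specific options of this BDS level (keys without a '.' and
 * excluding "node-name") from bs->options into d.  Returns true if at least
 * one option was copied. */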
5944 static bool append_open_options(QDict *d, BlockDriverState *bs)
5945 {
5946     const QDictEntry *entry;
5947     bool found_any = false;
5948 
5949     for (entry = qdict_first(bs->options); entry;
5950          entry = qdict_next(bs->options, entry))
5951     {
5952         /* Only take options for this level and exclude all non-driver-specific
5953          * options */
5954         if (!strchr(qdict_entry_key(entry), '.') &&
5955             strcmp(qdict_entry_key(entry), "node-name"))
5956         {
5957             qobject_incref(qdict_entry_value(entry));
5958             qdict_put_obj(d, qdict_entry_key(entry), qdict_entry_value(entry));
5959             found_any = true;
5960         }
5961     }
5962 
5963     return found_any;
5964 }
5965 
5966 /* Updates the following BDS fields:
5967  *  - exact_filename: A filename which may be used for opening a block device
5968  *                    which (mostly) equals the given BDS (even without any
5969  *                    other options; so reading and writing must return the same
5970  *                    results, but caching etc. may be different)
5971  *  - full_open_options: Options which, when given when opening a block device
5972  *                       (without a filename), result in a BDS (mostly)
5973  *                       equalling the given one
5974  *  - filename: If exact_filename is set, it is copied here. Otherwise,
5975  *              full_open_options is converted to a JSON object, prefixed with
5976  *              "json:" (for use through the JSON pseudo protocol) and put here.
5977  */
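/* As a (hypothetical) example, a qcow2 image opened on top of a raw file
 * might end up with full_open_options of the form
 *
 *     {"driver": "qcow2",
 *      "file": {"driver": "file", "filename": "disk.qcow2"}}
 *
 * and, if no exact_filename can be produced, with a filename of
 * "json:{...}" built from exactly that QDict. */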
5978 void bdrv_refresh_filename(BlockDriverState *bs)
5979 {
5980     BlockDriver *drv = bs->drv;
5981     QDict *opts;
5982 
5983     if (!drv) {
5984         return;
5985     }
5986 
5987     /* This BDS's file name will most probably depend on its file's name, so
5988      * refresh that first */
5989     if (bs->file) {
5990         bdrv_refresh_filename(bs->file);
5991     }
5992 
5993     if (drv->bdrv_refresh_filename) {
5994         /* Obsolete information is of no use here, so drop the old file name
5995          * information before refreshing it */
5996         bs->exact_filename[0] = '\0';
5997         if (bs->full_open_options) {
5998             QDECREF(bs->full_open_options);
5999             bs->full_open_options = NULL;
6000         }
6001 
6002         drv->bdrv_refresh_filename(bs);
6003     } else if (bs->file) {
6004         /* Try to reconstruct valid information from the underlying file */
6005         bool has_open_options;
6006 
6007         bs->exact_filename[0] = '\0';
6008         if (bs->full_open_options) {
6009             QDECREF(bs->full_open_options);
6010             bs->full_open_options = NULL;
6011         }
6012 
6013         opts = qdict_new();
6014         has_open_options = append_open_options(opts, bs);
6015 
6016         /* If no specific options have been given for this BDS, the filename of
6017          * the underlying file should suffice for this one as well */
6018         if (bs->file->exact_filename[0] && !has_open_options) {
6019             strcpy(bs->exact_filename, bs->file->exact_filename);
6020         }
6021         /* Reconstructing the full options QDict is simple for most format block
6022          * drivers, as long as the full options are known for the underlying
6023          * file BDS. The full options QDict of that file BDS should somehow
6024          * contain a representation of the filename, therefore the following
6025          * suffices without querying the (exact_)filename of this BDS. */
6026         if (bs->file->full_open_options) {
6027             qdict_put_obj(opts, "driver",
6028                           QOBJECT(qstring_from_str(drv->format_name)));
6029             QINCREF(bs->file->full_open_options);
6030             qdict_put_obj(opts, "file", QOBJECT(bs->file->full_open_options));
6031 
6032             bs->full_open_options = opts;
6033         } else {
6034             QDECREF(opts);
6035         }
6036     } else if (!bs->full_open_options && qdict_size(bs->options)) {
6037         /* There is no underlying file BDS (at least referenced by BDS.file),
6038          * so the full options QDict should be equal to the options given
6039          * specifically for this block device when it was opened (plus the
6040          * driver specification).
6041          * Because those options don't change, there is no need to update
6042          * full_open_options when it's already set. */
6043 
6044         opts = qdict_new();
6045         append_open_options(opts, bs);
6046         qdict_put_obj(opts, "driver",
6047                       QOBJECT(qstring_from_str(drv->format_name)));
6048 
6049         if (bs->exact_filename[0]) {
6050             /* This may not work for all block protocol drivers (some may
6051              * require this filename to be parsed), but we have to find some
6052              * default solution here, so just include it. If some block driver
6053              * does not support pure options without any filename at all or
6054              * needs some special format of the options QDict, it needs to
6055              * implement the driver-specific bdrv_refresh_filename() function.
6056              */
6057             qdict_put_obj(opts, "filename",
6058                           QOBJECT(qstring_from_str(bs->exact_filename)));
6059         }
6060 
6061         bs->full_open_options = opts;
6062     }
6063 
6064     if (bs->exact_filename[0]) {
6065         pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
6066     } else if (bs->full_open_options) {
6067         QString *json = qobject_to_json(QOBJECT(bs->full_open_options));
6068         snprintf(bs->filename, sizeof(bs->filename), "json:%s",
6069                  qstring_get_str(json));
6070         QDECREF(json);
6071     }
6072 }
6073 
6074 /* The purpose of this accessor function is to allow the device models to
6075  * access the BlockAcctStats structure embedded inside a BlockDriverState
6076  * without being aware of the BlockDriverState structure layout.
6077  * It will go away when the BlockAcctStats structure is moved inside
6078  * the device models.
6079  */
6080 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
6081 {
6082     return &bs->stats;
6083 }
6084