xref: /openbmc/qemu/block/io.c (revision c80f6e9c)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "trace.h"
27 #include "sysemu/block-backend.h"
28 #include "block/blockjob.h"
29 #include "block/block_int.h"
30 #include "block/throttle-groups.h"
31 #include "qapi/error.h"
32 #include "qemu/error-report.h"
33 
34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
35 
36 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
37         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
38         BlockCompletionFunc *cb, void *opaque);
39 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
40         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
41         BlockCompletionFunc *cb, void *opaque);
42 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
43                                          int64_t sector_num, int nb_sectors,
44                                          QEMUIOVector *iov);
45 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
46                                          int64_t sector_num, int nb_sectors,
47                                          QEMUIOVector *iov);
48 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
49                                          int64_t sector_num,
50                                          QEMUIOVector *qiov,
51                                          int nb_sectors,
52                                          BdrvRequestFlags flags,
53                                          BlockCompletionFunc *cb,
54                                          void *opaque,
55                                          bool is_write);
56 static void coroutine_fn bdrv_co_do_rw(void *opaque);
57 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
58     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
59 
60 /* throttling disk I/O limits */
61 void bdrv_set_io_limits(BlockDriverState *bs,
62                         ThrottleConfig *cfg)
63 {
64     int i;
65 
66     throttle_group_config(bs, cfg);
67 
68     for (i = 0; i < 2; i++) {
69         qemu_co_enter_next(&bs->throttled_reqs[i]);
70     }
71 }
72 
73 /* this function drains all the throttled I/Os */
74 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
75 {
76     bool drained = false;
77     bool enabled = bs->io_limits_enabled;
78     int i;
79 
80     bs->io_limits_enabled = false;
81 
82     for (i = 0; i < 2; i++) {
83         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
84             drained = true;
85         }
86     }
87 
88     bs->io_limits_enabled = enabled;
89 
90     return drained;
91 }
92 
93 void bdrv_io_limits_disable(BlockDriverState *bs)
94 {
95     bs->io_limits_enabled = false;
96     bdrv_start_throttled_reqs(bs);
97     throttle_group_unregister_bs(bs);
98 }
99 
100 /* should be called before bdrv_set_io_limits if a limit is set */
101 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
102 {
103     assert(!bs->io_limits_enabled);
104     throttle_group_register_bs(bs, group);
105     bs->io_limits_enabled = true;
106 }
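
/*
 * Illustrative sketch, not part of the original file: enable throttling on a
 * BlockDriverState and then apply a limit, in the order required by the
 * comment above.  The group name and the 10 MB/s figure are made up for the
 * example.
 */
static void example_enable_throttling(BlockDriverState *bs)
{
    ThrottleConfig cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; /* bytes per second */

    bdrv_io_limits_enable(bs, "example-group");   /* hypothetical group name */
    bdrv_set_io_limits(bs, &cfg);
}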
107 
108 void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
109 {
110     /* this bs is not part of any group */
111     if (!bs->throttle_state) {
112         return;
113     }
114 
115     /* this bs is already part of the group that we want */
116     if (!g_strcmp0(throttle_group_get_name(bs), group)) {
117         return;
118     }
119 
120     /* need to change the group this bs belongs to */
121     bdrv_io_limits_disable(bs);
122     bdrv_io_limits_enable(bs, group);
123 }
124 
125 void bdrv_setup_io_funcs(BlockDriver *bdrv)
126 {
127     /* Block drivers without coroutine functions need emulation */
128     if (!bdrv->bdrv_co_readv) {
129         bdrv->bdrv_co_readv = bdrv_co_readv_em;
130         bdrv->bdrv_co_writev = bdrv_co_writev_em;
131 
132         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
133          * the block driver lacks aio we need to emulate that too.
134          */
135         if (!bdrv->bdrv_aio_readv) {
136             /* add AIO emulation layer */
137             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
138             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
139         }
140     }
141 }
142 
143 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
144 {
145     BlockDriver *drv = bs->drv;
146     Error *local_err = NULL;
147 
148     memset(&bs->bl, 0, sizeof(bs->bl));
149 
150     if (!drv) {
151         return;
152     }
153 
154     /* Take some limits from the children as a default */
155     if (bs->file) {
156         bdrv_refresh_limits(bs->file->bs, &local_err);
157         if (local_err) {
158             error_propagate(errp, local_err);
159             return;
160         }
161         bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
162         bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
163         bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
164         bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
165         bs->bl.max_iov = bs->file->bs->bl.max_iov;
166     } else {
167         bs->bl.min_mem_alignment = 512;
168         bs->bl.opt_mem_alignment = getpagesize();
169 
170         /* Safe default since most protocols use readv()/writev()/etc */
171         bs->bl.max_iov = IOV_MAX;
172     }
173 
174     if (bs->backing) {
175         bdrv_refresh_limits(bs->backing->bs, &local_err);
176         if (local_err) {
177             error_propagate(errp, local_err);
178             return;
179         }
180         bs->bl.opt_transfer_length =
181             MAX(bs->bl.opt_transfer_length,
182                 bs->backing->bs->bl.opt_transfer_length);
183         bs->bl.max_transfer_length =
184             MIN_NON_ZERO(bs->bl.max_transfer_length,
185                          bs->backing->bs->bl.max_transfer_length);
186         bs->bl.opt_mem_alignment =
187             MAX(bs->bl.opt_mem_alignment,
188                 bs->backing->bs->bl.opt_mem_alignment);
189         bs->bl.min_mem_alignment =
190             MAX(bs->bl.min_mem_alignment,
191                 bs->backing->bs->bl.min_mem_alignment);
192         bs->bl.max_iov =
193             MIN(bs->bl.max_iov,
194                 bs->backing->bs->bl.max_iov);
195     }
196 
197     /* Then let the driver override it */
198     if (drv->bdrv_refresh_limits) {
199         drv->bdrv_refresh_limits(bs, errp);
200     }
201 }
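
/*
 * Worked example (added for illustration, not in the original file): if
 * bs->file reports opt_transfer_length 8 and max_transfer_length 0 (meaning
 * unlimited) while the backing file reports 16 and 2048, the merge above
 * yields opt_transfer_length = MAX(8, 16) = 16 and max_transfer_length =
 * MIN_NON_ZERO(0, 2048) = 2048, since MIN_NON_ZERO treats zero as "no limit"
 * rather than as the smallest value.
 */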
202 
203 /**
204  * The copy-on-read flag is actually a reference count so multiple users may
205  * use the feature without worrying about clobbering its previous state.
206  * Copy-on-read stays enabled until all users have called to disable it.
207  */
208 void bdrv_enable_copy_on_read(BlockDriverState *bs)
209 {
210     bs->copy_on_read++;
211 }
212 
213 void bdrv_disable_copy_on_read(BlockDriverState *bs)
214 {
215     assert(bs->copy_on_read > 0);
216     bs->copy_on_read--;
217 }
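
/*
 * Illustrative sketch, not part of the original file: because copy-on-read
 * is reference counted, independent users only have to keep their
 * enable/disable calls balanced.
 */
static void example_copy_on_read_refcount(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);    /* user A */
    bdrv_enable_copy_on_read(bs);    /* user B */
    bdrv_disable_copy_on_read(bs);   /* user A is done; still enabled for B */
    bdrv_disable_copy_on_read(bs);   /* user B is done; now fully disabled */
}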
218 
219 /* Check if any requests are in-flight (including throttled requests) */
220 bool bdrv_requests_pending(BlockDriverState *bs)
221 {
222     BdrvChild *child;
223 
224     if (!QLIST_EMPTY(&bs->tracked_requests)) {
225         return true;
226     }
227     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
228         return true;
229     }
230     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
231         return true;
232     }
233 
234     QLIST_FOREACH(child, &bs->children, next) {
235         if (bdrv_requests_pending(child->bs)) {
236             return true;
237         }
238     }
239 
240     return false;
241 }
242 
243 static void bdrv_drain_recurse(BlockDriverState *bs)
244 {
245     BdrvChild *child;
246 
247     if (bs->drv && bs->drv->bdrv_drain) {
248         bs->drv->bdrv_drain(bs);
249     }
250     QLIST_FOREACH(child, &bs->children, next) {
251         bdrv_drain_recurse(child->bs);
252     }
253 }
254 
255 /*
256  * Wait for pending requests to complete on a single BlockDriverState subtree,
257  * and suspend the block driver's internal I/O until the next request arrives.
258  *
259  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
260  * AioContext.
261  *
262  * Only this BlockDriverState's AioContext is run, so in-flight requests must
263  * not depend on events in other AioContexts.  In that case, use
264  * bdrv_drain_all() instead.
265  */
266 void bdrv_drain(BlockDriverState *bs)
267 {
268     bool busy = true;
269 
270     bdrv_drain_recurse(bs);
271     while (busy) {
272         /* Keep iterating */
273          bdrv_flush_io_queue(bs);
274          busy = bdrv_requests_pending(bs);
275          busy |= aio_poll(bdrv_get_aio_context(bs), busy);
276     }
277 }
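
/*
 * Illustrative sketch, not part of the original file: per the comment above,
 * the caller holds the BlockDriverState's AioContext around bdrv_drain().
 */
static void example_quiesce(BlockDriverState *bs)
{
    AioContext *ctx = bdrv_get_aio_context(bs);

    aio_context_acquire(ctx);
    bdrv_drain(bs);              /* no requests are in flight after this */
    /* ... inspect or reconfigure bs while it is quiescent ... */
    aio_context_release(ctx);
}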
278 
279 /*
280  * Wait for pending requests to complete across all BlockDriverStates
281  *
282  * This function does not flush data to disk, use bdrv_flush_all() for that
283  * after calling this function.
284  */
285 void bdrv_drain_all(void)
286 {
287     /* Always run first iteration so any pending completion BHs run */
288     bool busy = true;
289     BlockDriverState *bs = NULL;
290     GSList *aio_ctxs = NULL, *ctx;
291 
292     while ((bs = bdrv_next(bs))) {
293         AioContext *aio_context = bdrv_get_aio_context(bs);
294 
295         aio_context_acquire(aio_context);
296         if (bs->job) {
297             block_job_pause(bs->job);
298         }
299         bdrv_drain_recurse(bs);
300         aio_context_release(aio_context);
301 
302         if (!g_slist_find(aio_ctxs, aio_context)) {
303             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
304         }
305     }
306 
307     /* Note that completion of an asynchronous I/O operation can trigger any
308      * number of other I/O operations on other devices---for example a
309      * coroutine can submit an I/O request to another device in response to
310      * request completion.  Therefore we must keep looping until there was no
311      * more activity rather than simply draining each device independently.
312      */
313     while (busy) {
314         busy = false;
315 
316         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
317             AioContext *aio_context = ctx->data;
318             bs = NULL;
319 
320             aio_context_acquire(aio_context);
321             while ((bs = bdrv_next(bs))) {
322                 if (aio_context == bdrv_get_aio_context(bs)) {
323                     bdrv_flush_io_queue(bs);
324                     if (bdrv_requests_pending(bs)) {
325                         busy = true;
326                         aio_poll(aio_context, busy);
327                     }
328                 }
329             }
330             busy |= aio_poll(aio_context, false);
331             aio_context_release(aio_context);
332         }
333     }
334 
335     bs = NULL;
336     while ((bs = bdrv_next(bs))) {
337         AioContext *aio_context = bdrv_get_aio_context(bs);
338 
339         aio_context_acquire(aio_context);
340         if (bs->job) {
341             block_job_resume(bs->job);
342         }
343         aio_context_release(aio_context);
344     }
345     g_slist_free(aio_ctxs);
346 }
347 
348 /**
349  * Remove an active request from the tracked requests list
350  *
351  * This function should be called when a tracked request is completing.
352  */
353 static void tracked_request_end(BdrvTrackedRequest *req)
354 {
355     if (req->serialising) {
356         req->bs->serialising_in_flight--;
357     }
358 
359     QLIST_REMOVE(req, list);
360     qemu_co_queue_restart_all(&req->wait_queue);
361 }
362 
363 /**
364  * Add an active request to the tracked requests list
365  */
366 static void tracked_request_begin(BdrvTrackedRequest *req,
367                                   BlockDriverState *bs,
368                                   int64_t offset,
369                                   unsigned int bytes,
370                                   enum BdrvTrackedRequestType type)
371 {
372     *req = (BdrvTrackedRequest){
373         .bs = bs,
374         .offset         = offset,
375         .bytes          = bytes,
376         .type           = type,
377         .co             = qemu_coroutine_self(),
378         .serialising    = false,
379         .overlap_offset = offset,
380         .overlap_bytes  = bytes,
381     };
382 
383     qemu_co_queue_init(&req->wait_queue);
384 
385     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
386 }
387 
388 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
389 {
390     int64_t overlap_offset = req->offset & ~(align - 1);
391     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
392                                - overlap_offset;
393 
394     if (!req->serialising) {
395         req->bs->serialising_in_flight++;
396         req->serialising = true;
397     }
398 
399     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
400     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
401 }
402 
403 /**
404  * Round a region to cluster boundaries
405  */
406 void bdrv_round_to_clusters(BlockDriverState *bs,
407                             int64_t sector_num, int nb_sectors,
408                             int64_t *cluster_sector_num,
409                             int *cluster_nb_sectors)
410 {
411     BlockDriverInfo bdi;
412 
413     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
414         *cluster_sector_num = sector_num;
415         *cluster_nb_sectors = nb_sectors;
416     } else {
417         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
418         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
419         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
420                                             nb_sectors, c);
421     }
422 }
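
/*
 * Worked example (added for illustration, not in the original file): with a
 * 64 KiB cluster size, c = 65536 / 512 = 128 sectors, so a request for
 * sector_num = 130, nb_sectors = 10 is widened to cluster_sector_num = 128
 * and cluster_nb_sectors = 128, i.e. exactly the cluster containing it.
 */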
423 
424 static int bdrv_get_cluster_size(BlockDriverState *bs)
425 {
426     BlockDriverInfo bdi;
427     int ret;
428 
429     ret = bdrv_get_info(bs, &bdi);
430     if (ret < 0 || bdi.cluster_size == 0) {
431         return bs->request_alignment;
432     } else {
433         return bdi.cluster_size;
434     }
435 }
436 
437 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
438                                      int64_t offset, unsigned int bytes)
439 {
440     /*        aaaa   bbbb */
441     if (offset >= req->overlap_offset + req->overlap_bytes) {
442         return false;
443     }
444     /* bbbb   aaaa        */
445     if (req->overlap_offset >= offset + bytes) {
446         return false;
447     }
448     return true;
449 }
450 
451 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
452 {
453     BlockDriverState *bs = self->bs;
454     BdrvTrackedRequest *req;
455     bool retry;
456     bool waited = false;
457 
458     if (!bs->serialising_in_flight) {
459         return false;
460     }
461 
462     do {
463         retry = false;
464         QLIST_FOREACH(req, &bs->tracked_requests, list) {
465             if (req == self || (!req->serialising && !self->serialising)) {
466                 continue;
467             }
468             if (tracked_request_overlaps(req, self->overlap_offset,
469                                          self->overlap_bytes))
470             {
471                 /* Hitting this means there was a reentrant request, for
472                  * example, a block driver issuing nested requests.  This must
473                  * never happen since it means deadlock.
474                  */
475                 assert(qemu_coroutine_self() != req->co);
476 
477                 /* If the request is already (indirectly) waiting for us, or
478                  * will wait for us as soon as it wakes up, then just go on
479                  * (instead of producing a deadlock in the former case). */
480                 if (!req->waiting_for) {
481                     self->waiting_for = req;
482                     qemu_co_queue_wait(&req->wait_queue);
483                     self->waiting_for = NULL;
484                     retry = true;
485                     waited = true;
486                     break;
487                 }
488             }
489         }
490     } while (retry);
491 
492     return waited;
493 }
494 
495 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
496                                    size_t size)
497 {
498     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
499         return -EIO;
500     }
501 
502     if (!bdrv_is_inserted(bs)) {
503         return -ENOMEDIUM;
504     }
505 
506     if (offset < 0) {
507         return -EIO;
508     }
509 
510     return 0;
511 }
512 
513 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
514                               int nb_sectors)
515 {
516     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
517         return -EIO;
518     }
519 
520     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
521                                    nb_sectors * BDRV_SECTOR_SIZE);
522 }
523 
524 typedef struct RwCo {
525     BlockDriverState *bs;
526     int64_t offset;
527     QEMUIOVector *qiov;
528     bool is_write;
529     int ret;
530     BdrvRequestFlags flags;
531 } RwCo;
532 
533 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
534 {
535     RwCo *rwco = opaque;
536 
537     if (!rwco->is_write) {
538         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
539                                       rwco->qiov->size, rwco->qiov,
540                                       rwco->flags);
541     } else {
542         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
543                                        rwco->qiov->size, rwco->qiov,
544                                        rwco->flags);
545     }
546 }
547 
548 /*
549  * Process a vectored synchronous request using coroutines
550  */
551 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
552                         QEMUIOVector *qiov, bool is_write,
553                         BdrvRequestFlags flags)
554 {
555     Coroutine *co;
556     RwCo rwco = {
557         .bs = bs,
558         .offset = offset,
559         .qiov = qiov,
560         .is_write = is_write,
561         .ret = NOT_DONE,
562         .flags = flags,
563     };
564 
565     /**
566      * In a synchronous call context the vCPU is blocked, so the throttling
567      * timer cannot fire; therefore I/O throttling has to be disabled here
568      * if it has been enabled.
569      */
570     if (bs->io_limits_enabled) {
571         fprintf(stderr, "Disabling I/O throttling on '%s' due "
572                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
573         bdrv_io_limits_disable(bs);
574     }
575 
576     if (qemu_in_coroutine()) {
577         /* Fast-path if already in coroutine context */
578         bdrv_rw_co_entry(&rwco);
579     } else {
580         AioContext *aio_context = bdrv_get_aio_context(bs);
581 
582         co = qemu_coroutine_create(bdrv_rw_co_entry);
583         qemu_coroutine_enter(co, &rwco);
584         while (rwco.ret == NOT_DONE) {
585             aio_poll(aio_context, true);
586         }
587     }
588     return rwco.ret;
589 }
590 
591 /*
592  * Process a synchronous request using coroutines
593  */
594 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
595                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
596 {
597     QEMUIOVector qiov;
598     struct iovec iov = {
599         .iov_base = (void *)buf,
600         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
601     };
602 
603     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
604         return -EINVAL;
605     }
606 
607     qemu_iovec_init_external(&qiov, &iov, 1);
608     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
609                         &qiov, is_write, flags);
610 }
611 
612 /* return < 0 if error. See bdrv_write() for the return codes */
613 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
614               uint8_t *buf, int nb_sectors)
615 {
616     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
617 }
618 
619 /* Return < 0 if error. Important errors are:
620   -EIO         generic I/O error (may happen for all errors)
621   -ENOMEDIUM   No media inserted.
622   -EINVAL      Invalid sector number or nb_sectors
623   -EACCES      Trying to write to a read-only device
624 */
625 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
626                const uint8_t *buf, int nb_sectors)
627 {
628     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
629 }
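
/*
 * Illustrative sketch, not part of the original file: synchronous
 * sector-based I/O using the helpers above.  Error handling follows the
 * return codes documented for bdrv_write().
 */
static int example_copy_first_sector(BlockDriverState *src,
                                     BlockDriverState *dst)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret;

    ret = bdrv_read(src, 0, buf, 1);
    if (ret < 0) {
        return ret;
    }
    return bdrv_write(dst, 0, buf, 1);
}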
630 
631 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
632                       int nb_sectors, BdrvRequestFlags flags)
633 {
634     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
635                       BDRV_REQ_ZERO_WRITE | flags);
636 }
637 
638 /*
639  * Completely zero out a block device with the help of bdrv_write_zeroes.
640  * The operation is sped up by checking the block status and only writing
641  * zeroes to the device if they currently do not return zeroes. Optional
642  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
643  *
644  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
645  */
646 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
647 {
648     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
649     BlockDriverState *file;
650     int n;
651 
652     target_sectors = bdrv_nb_sectors(bs);
653     if (target_sectors < 0) {
654         return target_sectors;
655     }
656 
657     for (;;) {
658         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
659         if (nb_sectors <= 0) {
660             return 0;
661         }
662         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
663         if (ret < 0) {
664             error_report("error getting block status at sector %" PRId64 ": %s",
665                          sector_num, strerror(-ret));
666             return ret;
667         }
668         if (ret & BDRV_BLOCK_ZERO) {
669             sector_num += n;
670             continue;
671         }
672         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
673         if (ret < 0) {
674             error_report("error writing zeroes at sector %" PRId64 ": %s",
675                          sector_num, strerror(-ret));
676             return ret;
677         }
678         sector_num += n;
679     }
680 }
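
/*
 * Illustrative sketch, not part of the original file: zero an entire image
 * while letting the driver unmap/discard ranges where it can.
 */
static int example_wipe_image(BlockDriverState *bs)
{
    return bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
}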
681 
682 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
683 {
684     QEMUIOVector qiov;
685     struct iovec iov = {
686         .iov_base = (void *)buf,
687         .iov_len = bytes,
688     };
689     int ret;
690 
691     if (bytes < 0) {
692         return -EINVAL;
693     }
694 
695     qemu_iovec_init_external(&qiov, &iov, 1);
696     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
697     if (ret < 0) {
698         return ret;
699     }
700 
701     return bytes;
702 }
703 
704 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
705 {
706     int ret;
707 
708     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
709     if (ret < 0) {
710         return ret;
711     }
712 
713     return qiov->size;
714 }
715 
716 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
717                 const void *buf, int bytes)
718 {
719     QEMUIOVector qiov;
720     struct iovec iov = {
721         .iov_base   = (void *) buf,
722         .iov_len    = bytes,
723     };
724 
725     if (bytes < 0) {
726         return -EINVAL;
727     }
728 
729     qemu_iovec_init_external(&qiov, &iov, 1);
730     return bdrv_pwritev(bs, offset, &qiov);
731 }
732 
733 /*
734  * Writes to the file and ensures that no writes are reordered across this
735  * request (acts as a barrier)
736  *
737  * Returns 0 on success, -errno in error cases.
738  */
739 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
740     const void *buf, int count)
741 {
742     int ret;
743 
744     ret = bdrv_pwrite(bs, offset, buf, count);
745     if (ret < 0) {
746         return ret;
747     }
748 
749     /* No flush needed for cache modes that already do it */
750     if (bs->enable_write_cache) {
751         bdrv_flush(bs);
752     }
753 
754     return 0;
755 }
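
/*
 * Illustrative sketch, not part of the original file: update a small
 * on-disk field with barrier semantics.  The offset and value are made up,
 * and byte order is ignored for brevity.
 */
static int example_update_header_field(BlockDriverState *bs)
{
    uint32_t magic = 0x4d455845;   /* hypothetical header magic */

    return bdrv_pwrite_sync(bs, 0, &magic, sizeof(magic));
}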
756 
757 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
758         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
759 {
760     /* Perform I/O through a temporary buffer so that users who scribble over
761      * their read buffer while the operation is in progress do not end up
762      * modifying the image file.  This is critical for zero-copy guest I/O
763      * where anything might happen inside guest memory.
764      */
765     void *bounce_buffer;
766 
767     BlockDriver *drv = bs->drv;
768     struct iovec iov;
769     QEMUIOVector bounce_qiov;
770     int64_t cluster_sector_num;
771     int cluster_nb_sectors;
772     size_t skip_bytes;
773     int ret;
774 
775     /* Cover entire cluster so no additional backing file I/O is required when
776      * allocating a cluster in the image file.
777      */
778     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
779                            &cluster_sector_num, &cluster_nb_sectors);
780 
781     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
782                                    cluster_sector_num, cluster_nb_sectors);
783 
784     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
785     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
786     if (bounce_buffer == NULL) {
787         ret = -ENOMEM;
788         goto err;
789     }
790 
791     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
792 
793     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
794                              &bounce_qiov);
795     if (ret < 0) {
796         goto err;
797     }
798 
799     if (drv->bdrv_co_write_zeroes &&
800         buffer_is_zero(bounce_buffer, iov.iov_len)) {
801         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
802                                       cluster_nb_sectors, 0);
803     } else {
804         /* This does not change the data on the disk, so it is not necessary
805          * to flush even in cache=writethrough mode.
806          */
807         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
808                                   &bounce_qiov);
809     }
810 
811     if (ret < 0) {
812         /* It might be okay to ignore write errors for guest requests.  If this
813          * is a deliberate copy-on-read then we don't want to ignore the error.
814          * Simply report it in all cases.
815          */
816         goto err;
817     }
818 
819     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
820     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
821                         nb_sectors * BDRV_SECTOR_SIZE);
822 
823 err:
824     qemu_vfree(bounce_buffer);
825     return ret;
826 }
827 
828 /*
829  * Forwards an already correctly aligned request to the BlockDriver. This
830  * handles copy on read and zeroing after EOF; any other features must be
831  * implemented by the caller.
832  */
833 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
834     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
835     int64_t align, QEMUIOVector *qiov, int flags)
836 {
837     BlockDriver *drv = bs->drv;
838     int ret;
839 
840     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
841     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
842 
843     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
844     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
845     assert(!qiov || bytes == qiov->size);
846 
847     /* Handle Copy on Read and associated serialisation */
848     if (flags & BDRV_REQ_COPY_ON_READ) {
849         /* If we touch the same cluster it counts as an overlap.  This
850          * guarantees that allocating writes will be serialized and not race
851          * with each other for the same cluster.  For example, in copy-on-read
852          * it ensures that the CoR read and write operations are atomic and
853          * guest writes cannot interleave between them. */
854         mark_request_serialising(req, bdrv_get_cluster_size(bs));
855     }
856 
857     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
858         wait_serialising_requests(req);
859     }
860 
861     if (flags & BDRV_REQ_COPY_ON_READ) {
862         int pnum;
863 
864         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
865         if (ret < 0) {
866             goto out;
867         }
868 
869         if (!ret || pnum != nb_sectors) {
870             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
871             goto out;
872         }
873     }
874 
875     /* Forward the request to the BlockDriver */
876     if (!bs->zero_beyond_eof) {
877         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
878     } else {
879         /* Read zeros after EOF */
880         int64_t total_sectors, max_nb_sectors;
881 
882         total_sectors = bdrv_nb_sectors(bs);
883         if (total_sectors < 0) {
884             ret = total_sectors;
885             goto out;
886         }
887 
888         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
889                                   align >> BDRV_SECTOR_BITS);
890         if (nb_sectors < max_nb_sectors) {
891             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
892         } else if (max_nb_sectors > 0) {
893             QEMUIOVector local_qiov;
894 
895             qemu_iovec_init(&local_qiov, qiov->niov);
896             qemu_iovec_concat(&local_qiov, qiov, 0,
897                               max_nb_sectors * BDRV_SECTOR_SIZE);
898 
899             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
900                                      &local_qiov);
901 
902             qemu_iovec_destroy(&local_qiov);
903         } else {
904             ret = 0;
905         }
906 
907         /* Reading beyond end of file is supposed to produce zeroes */
908         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
909             uint64_t offset = MAX(0, total_sectors - sector_num);
910             uint64_t bytes = (sector_num + nb_sectors - offset) *
911                               BDRV_SECTOR_SIZE;
912             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
913         }
914     }
915 
916 out:
917     return ret;
918 }
919 
920 /*
921  * Handle a read request in coroutine context
922  */
923 int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
924     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
925     BdrvRequestFlags flags)
926 {
927     BlockDriver *drv = bs->drv;
928     BdrvTrackedRequest req;
929 
930     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
931     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
932     uint8_t *head_buf = NULL;
933     uint8_t *tail_buf = NULL;
934     QEMUIOVector local_qiov;
935     bool use_local_qiov = false;
936     int ret;
937 
938     if (!drv) {
939         return -ENOMEDIUM;
940     }
941 
942     ret = bdrv_check_byte_request(bs, offset, bytes);
943     if (ret < 0) {
944         return ret;
945     }
946 
947     /* Don't do copy-on-read if we read data before write operation */
948     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
949         flags |= BDRV_REQ_COPY_ON_READ;
950     }
951 
952     /* throttling disk I/O */
953     if (bs->io_limits_enabled) {
954         throttle_group_co_io_limits_intercept(bs, bytes, false);
955     }
956 
957     /* Align read if necessary by padding qiov */
958     if (offset & (align - 1)) {
959         head_buf = qemu_blockalign(bs, align);
960         qemu_iovec_init(&local_qiov, qiov->niov + 2);
961         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
962         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
963         use_local_qiov = true;
964 
965         bytes += offset & (align - 1);
966         offset = offset & ~(align - 1);
967     }
968 
969     if ((offset + bytes) & (align - 1)) {
970         if (!use_local_qiov) {
971             qemu_iovec_init(&local_qiov, qiov->niov + 1);
972             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
973             use_local_qiov = true;
974         }
975         tail_buf = qemu_blockalign(bs, align);
976         qemu_iovec_add(&local_qiov, tail_buf,
977                        align - ((offset + bytes) & (align - 1)));
978 
979         bytes = ROUND_UP(bytes, align);
980     }
981 
982     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
983     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
984                               use_local_qiov ? &local_qiov : qiov,
985                               flags);
986     tracked_request_end(&req);
987 
988     if (use_local_qiov) {
989         qemu_iovec_destroy(&local_qiov);
990         qemu_vfree(head_buf);
991         qemu_vfree(tail_buf);
992     }
993 
994     return ret;
995 }
996 
997 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
998     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
999     BdrvRequestFlags flags)
1000 {
1001     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1002         return -EINVAL;
1003     }
1004 
1005     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1006                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1007 }
1008 
1009 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1010     int nb_sectors, QEMUIOVector *qiov)
1011 {
1012     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1013 
1014     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1015 }
1016 
1017 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
1018     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1019 {
1020     trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
1021 
1022     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1023                             BDRV_REQ_NO_SERIALISING);
1024 }
1025 
1026 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1027     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1028 {
1029     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1030 
1031     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1032                             BDRV_REQ_COPY_ON_READ);
1033 }
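
/*
 * Illustrative sketch, not part of the original file: reading one sector
 * from coroutine context through the vectored interface.
 */
static int coroutine_fn example_co_read_one_sector(BlockDriverState *bs,
                                                   void *buf)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_co_readv(bs, 0, 1, &qiov);
}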
1034 
1035 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1036 
1037 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1038     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
1039 {
1040     BlockDriver *drv = bs->drv;
1041     QEMUIOVector qiov;
1042     struct iovec iov = {0};
1043     int ret = 0;
1044 
1045     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
1046                                         BDRV_REQUEST_MAX_SECTORS);
1047 
1048     while (nb_sectors > 0 && !ret) {
1049         int num = nb_sectors;
1050 
1051         /* Align request.  Block drivers can expect the "bulk" of the request
1052          * to be aligned.
1053          */
1054         if (bs->bl.write_zeroes_alignment
1055             && num > bs->bl.write_zeroes_alignment) {
1056             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
1057                 /* Make a small request up to the first aligned sector.  */
1058                 num = bs->bl.write_zeroes_alignment;
1059                 num -= sector_num % bs->bl.write_zeroes_alignment;
1060             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
1061                 /* Shorten the request to the last aligned sector.  num cannot
1062                  * underflow because num > bs->bl.write_zeroes_alignment.
1063                  */
1064                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
1065             }
1066         }
1067 
1068         /* limit request size */
1069         if (num > max_write_zeroes) {
1070             num = max_write_zeroes;
1071         }
1072 
1073         ret = -ENOTSUP;
1074         /* First try the efficient write zeroes operation */
1075         if (drv->bdrv_co_write_zeroes) {
1076             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
1077         }
1078 
1079         if (ret == -ENOTSUP) {
1080             /* Fall back to bounce buffer if write zeroes is unsupported */
1081             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1082                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1083             num = MIN(num, max_xfer_len);
1084             iov.iov_len = num * BDRV_SECTOR_SIZE;
1085             if (iov.iov_base == NULL) {
1086                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
1087                 if (iov.iov_base == NULL) {
1088                     ret = -ENOMEM;
1089                     goto fail;
1090                 }
1091                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
1092             }
1093             qemu_iovec_init_external(&qiov, &iov, 1);
1094 
1095             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
1096 
1097             /* Keep the bounce buffer around if it is big enough for
1098              * all future requests.
1099              */
1100             if (num < max_xfer_len) {
1101                 qemu_vfree(iov.iov_base);
1102                 iov.iov_base = NULL;
1103             }
1104         }
1105 
1106         sector_num += num;
1107         nb_sectors -= num;
1108     }
1109 
1110 fail:
1111     qemu_vfree(iov.iov_base);
1112     return ret;
1113 }
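
/*
 * Worked example (added for illustration, not in the original file): with
 * write_zeroes_alignment = 8, a request for sector_num = 5, nb_sectors = 20
 * is split by the loop above into three driver calls: 3 sectors up to the
 * first aligned boundary (sector 8), then 16 aligned sectors, then a
 * 1-sector unaligned tail.
 */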
1114 
1115 /*
1116  * Forwards an already correctly aligned write request to the BlockDriver.
1117  */
1118 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1119     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1120     QEMUIOVector *qiov, int flags)
1121 {
1122     BlockDriver *drv = bs->drv;
1123     bool waited;
1124     int ret;
1125 
1126     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1127     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1128 
1129     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1130     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1131     assert(!qiov || bytes == qiov->size);
1132 
1133     waited = wait_serialising_requests(req);
1134     assert(!waited || !req->serialising);
1135     assert(req->overlap_offset <= offset);
1136     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1137 
1138     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1139 
1140     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1141         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
1142         qemu_iovec_is_zero(qiov)) {
1143         flags |= BDRV_REQ_ZERO_WRITE;
1144         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1145             flags |= BDRV_REQ_MAY_UNMAP;
1146         }
1147     }
1148 
1149     if (ret < 0) {
1150         /* Do nothing, write notifier decided to fail this request */
1151     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1152         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1153         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
1154     } else {
1155         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1156         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1157     }
1158     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1159 
1160     if (ret == 0 && !bs->enable_write_cache) {
1161         ret = bdrv_co_flush(bs);
1162     }
1163 
1164     bdrv_set_dirty(bs, sector_num, nb_sectors);
1165 
1166     if (bs->wr_highest_offset < offset + bytes) {
1167         bs->wr_highest_offset = offset + bytes;
1168     }
1169 
1170     if (ret >= 0) {
1171         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1172     }
1173 
1174     return ret;
1175 }
1176 
1177 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1178                                                 int64_t offset,
1179                                                 unsigned int bytes,
1180                                                 BdrvRequestFlags flags,
1181                                                 BdrvTrackedRequest *req)
1182 {
1183     uint8_t *buf = NULL;
1184     QEMUIOVector local_qiov;
1185     struct iovec iov;
1186     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1187     unsigned int head_padding_bytes, tail_padding_bytes;
1188     int ret = 0;
1189 
1190     head_padding_bytes = offset & (align - 1);
1191     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1192 
1193 
1194     assert(flags & BDRV_REQ_ZERO_WRITE);
1195     if (head_padding_bytes || tail_padding_bytes) {
1196         buf = qemu_blockalign(bs, align);
1197         iov = (struct iovec) {
1198             .iov_base   = buf,
1199             .iov_len    = align,
1200         };
1201         qemu_iovec_init_external(&local_qiov, &iov, 1);
1202     }
1203     if (head_padding_bytes) {
1204         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1205 
1206         /* RMW the unaligned part before head. */
1207         mark_request_serialising(req, align);
1208         wait_serialising_requests(req);
1209         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1210         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1211                                   align, &local_qiov, 0);
1212         if (ret < 0) {
1213             goto fail;
1214         }
1215         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1216 
1217         memset(buf + head_padding_bytes, 0, zero_bytes);
1218         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1219                                    &local_qiov,
1220                                    flags & ~BDRV_REQ_ZERO_WRITE);
1221         if (ret < 0) {
1222             goto fail;
1223         }
1224         offset += zero_bytes;
1225         bytes -= zero_bytes;
1226     }
1227 
1228     assert(!bytes || (offset & (align - 1)) == 0);
1229     if (bytes >= align) {
1230         /* Write the aligned part in the middle. */
1231         uint64_t aligned_bytes = bytes & ~(align - 1);
1232         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1233                                    NULL, flags);
1234         if (ret < 0) {
1235             goto fail;
1236         }
1237         bytes -= aligned_bytes;
1238         offset += aligned_bytes;
1239     }
1240 
1241     assert(!bytes || (offset & (align - 1)) == 0);
1242     if (bytes) {
1243         assert(align == tail_padding_bytes + bytes);
1244         /* RMW the unaligned part after tail. */
1245         mark_request_serialising(req, align);
1246         wait_serialising_requests(req);
1247         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1248         ret = bdrv_aligned_preadv(bs, req, offset, align,
1249                                   align, &local_qiov, 0);
1250         if (ret < 0) {
1251             goto fail;
1252         }
1253         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1254 
1255         memset(buf, 0, bytes);
1256         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1257                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1258     }
1259 fail:
1260     qemu_vfree(buf);
1261     return ret;
1262 
1263 }
1264 
1265 /*
1266  * Handle a write request in coroutine context
1267  */
1268 int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
1269     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1270     BdrvRequestFlags flags)
1271 {
1272     BdrvTrackedRequest req;
1273     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1274     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1275     uint8_t *head_buf = NULL;
1276     uint8_t *tail_buf = NULL;
1277     QEMUIOVector local_qiov;
1278     bool use_local_qiov = false;
1279     int ret;
1280 
1281     if (!bs->drv) {
1282         return -ENOMEDIUM;
1283     }
1284     if (bs->read_only) {
1285         return -EPERM;
1286     }
1287     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1288 
1289     ret = bdrv_check_byte_request(bs, offset, bytes);
1290     if (ret < 0) {
1291         return ret;
1292     }
1293 
1294     /* throttling disk I/O */
1295     if (bs->io_limits_enabled) {
1296         throttle_group_co_io_limits_intercept(bs, bytes, true);
1297     }
1298 
1299     /*
1300      * Align write if necessary by performing a read-modify-write cycle.
1301      * Pad qiov with the read parts and be sure to have a tracked request not
1302      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1303      */
1304     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1305 
1306     if (!qiov) {
1307         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1308         goto out;
1309     }
1310 
1311     if (offset & (align - 1)) {
1312         QEMUIOVector head_qiov;
1313         struct iovec head_iov;
1314 
1315         mark_request_serialising(&req, align);
1316         wait_serialising_requests(&req);
1317 
1318         head_buf = qemu_blockalign(bs, align);
1319         head_iov = (struct iovec) {
1320             .iov_base   = head_buf,
1321             .iov_len    = align,
1322         };
1323         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1324 
1325         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1326         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1327                                   align, &head_qiov, 0);
1328         if (ret < 0) {
1329             goto fail;
1330         }
1331         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1332 
1333         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1334         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1335         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1336         use_local_qiov = true;
1337 
1338         bytes += offset & (align - 1);
1339         offset = offset & ~(align - 1);
1340     }
1341 
1342     if ((offset + bytes) & (align - 1)) {
1343         QEMUIOVector tail_qiov;
1344         struct iovec tail_iov;
1345         size_t tail_bytes;
1346         bool waited;
1347 
1348         mark_request_serialising(&req, align);
1349         waited = wait_serialising_requests(&req);
1350         assert(!waited || !use_local_qiov);
1351 
1352         tail_buf = qemu_blockalign(bs, align);
1353         tail_iov = (struct iovec) {
1354             .iov_base   = tail_buf,
1355             .iov_len    = align,
1356         };
1357         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1358 
1359         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1360         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1361                                   align, &tail_qiov, 0);
1362         if (ret < 0) {
1363             goto fail;
1364         }
1365         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1366 
1367         if (!use_local_qiov) {
1368             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1369             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1370             use_local_qiov = true;
1371         }
1372 
1373         tail_bytes = (offset + bytes) & (align - 1);
1374         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1375 
1376         bytes = ROUND_UP(bytes, align);
1377     }
1378 
1379     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1380                                use_local_qiov ? &local_qiov : qiov,
1381                                flags);
1382 
1383 fail:
1384 
1385     if (use_local_qiov) {
1386         qemu_iovec_destroy(&local_qiov);
1387     }
1388     qemu_vfree(head_buf);
1389     qemu_vfree(tail_buf);
1390 out:
1391     tracked_request_end(&req);
1392     return ret;
1393 }
1394 
1395 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1396     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1397     BdrvRequestFlags flags)
1398 {
1399     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1400         return -EINVAL;
1401     }
1402 
1403     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1404                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1405 }
1406 
1407 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1408     int nb_sectors, QEMUIOVector *qiov)
1409 {
1410     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1411 
1412     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1413 }
1414 
1415 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1416                                       int64_t sector_num, int nb_sectors,
1417                                       BdrvRequestFlags flags)
1418 {
1419     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
1420 
1421     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1422         flags &= ~BDRV_REQ_MAY_UNMAP;
1423     }
1424 
1425     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1426                              BDRV_REQ_ZERO_WRITE | flags);
1427 }
1428 
1429 typedef struct BdrvCoGetBlockStatusData {
1430     BlockDriverState *bs;
1431     BlockDriverState *base;
1432     BlockDriverState **file;
1433     int64_t sector_num;
1434     int nb_sectors;
1435     int *pnum;
1436     int64_t ret;
1437     bool done;
1438 } BdrvCoGetBlockStatusData;
1439 
1440 /*
1441  * Returns the allocation status of the specified sectors.
1442  * Drivers not implementing the functionality are assumed to not support
1443  * backing files, hence all their sectors are reported as allocated.
1444  *
1445  * If 'sector_num' is beyond the end of the disk image the return value is 0
1446  * and 'pnum' is set to 0.
1447  *
1448  * 'pnum' is set to the number of sectors (including and immediately following
1449  * the specified sector) that are known to be in the same
1450  * allocated/unallocated state.
1451  *
1452  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1453  * beyond the end of the disk image it will be clamped.
1454  *
1455  * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
1456  * points to the BDS which the sector range is allocated in.
1457  */
1458 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1459                                                      int64_t sector_num,
1460                                                      int nb_sectors, int *pnum,
1461                                                      BlockDriverState **file)
1462 {
1463     int64_t total_sectors;
1464     int64_t n;
1465     int64_t ret, ret2;
1466 
1467     total_sectors = bdrv_nb_sectors(bs);
1468     if (total_sectors < 0) {
1469         return total_sectors;
1470     }
1471 
1472     if (sector_num >= total_sectors) {
1473         *pnum = 0;
1474         return 0;
1475     }
1476 
1477     n = total_sectors - sector_num;
1478     if (n < nb_sectors) {
1479         nb_sectors = n;
1480     }
1481 
1482     if (!bs->drv->bdrv_co_get_block_status) {
1483         *pnum = nb_sectors;
1484         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1485         if (bs->drv->protocol_name) {
1486             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1487         }
1488         return ret;
1489     }
1490 
1491     *file = NULL;
1492     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1493                                             file);
1494     if (ret < 0) {
1495         *pnum = 0;
1496         return ret;
1497     }
1498 
1499     if (ret & BDRV_BLOCK_RAW) {
1500         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1501         return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1502                                      *pnum, pnum, file);
1503     }
1504 
1505     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1506         ret |= BDRV_BLOCK_ALLOCATED;
1507     } else {
1508         if (bdrv_unallocated_blocks_are_zero(bs)) {
1509             ret |= BDRV_BLOCK_ZERO;
1510         } else if (bs->backing) {
1511             BlockDriverState *bs2 = bs->backing->bs;
1512             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1513             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1514                 ret |= BDRV_BLOCK_ZERO;
1515             }
1516         }
1517     }
1518 
1519     if (*file && *file != bs &&
1520         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1521         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1522         BlockDriverState *file2;
1523         int file_pnum;
1524 
1525         ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1526                                         *pnum, &file_pnum, &file2);
1527         if (ret2 >= 0) {
1528             /* Ignore errors.  This is just providing extra information; it
1529              * is useful but not necessary.
1530              */
1531             if (!file_pnum) {
1532                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1533                  * perfectly valid for the format block driver to point to such
1534                  * offsets, so catch it and mark everything as zero */
1535                 ret |= BDRV_BLOCK_ZERO;
1536             } else {
1537                 /* Limit request to the range reported by the protocol driver */
1538                 *pnum = file_pnum;
1539                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1540             }
1541         }
1542     }
1543 
1544     return ret;
1545 }
1546 
1547 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1548         BlockDriverState *base,
1549         int64_t sector_num,
1550         int nb_sectors,
1551         int *pnum,
1552         BlockDriverState **file)
1553 {
1554     BlockDriverState *p;
1555     int64_t ret = 0;
1556 
1557     assert(bs != base);
1558     for (p = bs; p != base; p = backing_bs(p)) {
1559         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1560         if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1561             break;
1562         }
1563         /* [sector_num, pnum] unallocated on this layer, which could be only
1564          * the first part of [sector_num, nb_sectors].  */
1565         nb_sectors = MIN(nb_sectors, *pnum);
1566     }
1567     return ret;
1568 }
1569 
1570 /* Coroutine wrapper for bdrv_get_block_status_above() */
1571 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1572 {
1573     BdrvCoGetBlockStatusData *data = opaque;
1574 
1575     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1576                                                data->sector_num,
1577                                                data->nb_sectors,
1578                                                data->pnum,
1579                                                data->file);
1580     data->done = true;
1581 }
1582 
1583 /*
1584  * Synchronous wrapper around bdrv_co_get_block_status_above().
1585  *
1586  * See bdrv_co_get_block_status_above() for details.
1587  */
1588 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1589                                     BlockDriverState *base,
1590                                     int64_t sector_num,
1591                                     int nb_sectors, int *pnum,
1592                                     BlockDriverState **file)
1593 {
1594     Coroutine *co;
1595     BdrvCoGetBlockStatusData data = {
1596         .bs = bs,
1597         .base = base,
1598         .file = file,
1599         .sector_num = sector_num,
1600         .nb_sectors = nb_sectors,
1601         .pnum = pnum,
1602         .done = false,
1603     };
1604 
1605     if (qemu_in_coroutine()) {
1606         /* Fast-path if already in coroutine context */
1607         bdrv_get_block_status_above_co_entry(&data);
1608     } else {
1609         AioContext *aio_context = bdrv_get_aio_context(bs);
1610 
1611         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1612         qemu_coroutine_enter(co, &data);
1613         while (!data.done) {
1614             aio_poll(aio_context, true);
1615         }
1616     }
1617     return data.ret;
1618 }
1619 
1620 int64_t bdrv_get_block_status(BlockDriverState *bs,
1621                               int64_t sector_num,
1622                               int nb_sectors, int *pnum,
1623                               BlockDriverState **file)
1624 {
1625     return bdrv_get_block_status_above(bs, backing_bs(bs),
1626                                        sector_num, nb_sectors, pnum, file);
1627 }
1628 
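/*
 * Illustrative sketch: how a caller might walk an image with
 * bdrv_get_block_status() and act on the returned BDRV_BLOCK_* flags.  The
 * helper name example_count_zero_sectors and its use are hypothetical; only
 * the bdrv_*() calls and flags are the ones defined in this file.
 */
static int64_t example_count_zero_sectors(BlockDriverState *bs)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector_num = 0;
    int64_t zeroes = 0;

    while (sector_num < total) {
        BlockDriverState *file;
        int pnum;
        int64_t ret = bdrv_get_block_status(bs, sector_num,
                                            MIN(total - sector_num, INT_MAX),
                                            &pnum, &file);
        if (ret < 0) {
            return ret;             /* propagate the driver's -errno */
        }
        if (pnum == 0) {
            break;                  /* no progress possible, stop */
        }
        if (ret & BDRV_BLOCK_ZERO) {
            zeroes += pnum;         /* the next pnum sectors read as zeroes */
        }
        sector_num += pnum;         /* pnum sectors share the same status */
    }
    return zeroes;
}
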
1629 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1630                                    int nb_sectors, int *pnum)
1631 {
1632     BlockDriverState *file;
1633     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1634                                         &file);
1635     if (ret < 0) {
1636         return ret;
1637     }
1638     return !!(ret & BDRV_BLOCK_ALLOCATED);
1639 }
1640 
1641 /*
1642  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1643  *
1644  * Return true if the given sector is allocated in any image between
1645  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
1646  * sector is allocated in any image of the chain.  Return false otherwise.
1647  *
1648  * 'pnum' is set to the number of sectors (including and immediately following
1649  *  the specified sector) that are known to be in the same
1650  *  allocated/unallocated state.
1651  *
1652  */
1653 int bdrv_is_allocated_above(BlockDriverState *top,
1654                             BlockDriverState *base,
1655                             int64_t sector_num,
1656                             int nb_sectors, int *pnum)
1657 {
1658     BlockDriverState *intermediate;
1659     int ret, n = nb_sectors;
1660 
1661     intermediate = top;
1662     while (intermediate && intermediate != base) {
1663         int pnum_inter;
1664         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1665                                 &pnum_inter);
1666         if (ret < 0) {
1667             return ret;
1668         } else if (ret) {
1669             *pnum = pnum_inter;
1670             return 1;
1671         }
1672 
1673         /*
1674          * [sector_num, nb_sectors] is unallocated on top but intermediate
1675          * might have
1676          *
1677          * [sector_num+x, nb_sectors] allocated.
1678          */
1679         if (n > pnum_inter &&
1680             (intermediate == top ||
1681              sector_num + pnum_inter < intermediate->total_sectors)) {
1682             n = pnum_inter;
1683         }
1684 
1685         intermediate = backing_bs(intermediate);
1686     }
1687 
1688     *pnum = n;
1689     return 0;
1690 }
1691 
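/*
 * Illustrative sketch: using bdrv_is_allocated_above() to decide whether a
 * sector range above 'base' carries data that would need copying.  The helper
 * example_range_needs_copy is hypothetical; only bdrv_is_allocated_above()
 * itself is from this file.
 */
static int example_range_needs_copy(BlockDriverState *top,
                                    BlockDriverState *base,
                                    int64_t sector_num, int nb_sectors)
{
    while (nb_sectors > 0) {
        int pnum;
        int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
                                          &pnum);
        if (ret != 0) {
            return ret;          /* allocated somewhere above base, or -errno */
        }
        sector_num += pnum;      /* the next pnum sectors are unallocated */
        nb_sectors -= pnum;
    }
    return 0;
}
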
1692 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1693                           const uint8_t *buf, int nb_sectors)
1694 {
1695     BlockDriver *drv = bs->drv;
1696     int ret;
1697 
1698     if (!drv) {
1699         return -ENOMEDIUM;
1700     }
1701     if (!drv->bdrv_write_compressed) {
1702         return -ENOTSUP;
1703     }
1704     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1705     if (ret < 0) {
1706         return ret;
1707     }
1708 
1709     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1710 
1711     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1712 }
1713 
1714 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1715                       int64_t pos, int size)
1716 {
1717     QEMUIOVector qiov;
1718     struct iovec iov = {
1719         .iov_base   = (void *) buf,
1720         .iov_len    = size,
1721     };
1722 
1723     qemu_iovec_init_external(&qiov, &iov, 1);
1724     return bdrv_writev_vmstate(bs, &qiov, pos);
1725 }
1726 
1727 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1728 {
1729     BlockDriver *drv = bs->drv;
1730 
1731     if (!drv) {
1732         return -ENOMEDIUM;
1733     } else if (drv->bdrv_save_vmstate) {
1734         return drv->bdrv_save_vmstate(bs, qiov, pos);
1735     } else if (bs->file) {
1736         return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
1737     }
1738 
1739     return -ENOTSUP;
1740 }
1741 
1742 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1743                       int64_t pos, int size)
1744 {
1745     BlockDriver *drv = bs->drv;
1746     if (!drv)
1747         return -ENOMEDIUM;
1748     if (drv->bdrv_load_vmstate)
1749         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1750     if (bs->file)
1751         return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
1752     return -ENOTSUP;
1753 }
1754 
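/*
 * Illustrative sketch: a round trip through bdrv_save_vmstate() and
 * bdrv_load_vmstate().  The helper example_vmstate_roundtrip and its buffers
 * are hypothetical; only the two bdrv_*_vmstate() calls are from this file.
 */
static int example_vmstate_roundtrip(BlockDriverState *bs,
                                     const uint8_t *out, uint8_t *in,
                                     int64_t pos, int size)
{
    int ret = bdrv_save_vmstate(bs, out, pos, size);
    if (ret < 0) {
        return ret;              /* -ENOMEDIUM, -ENOTSUP or a driver error */
    }
    return bdrv_load_vmstate(bs, in, pos, size);
}
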
1755 /**************************************************************/
1756 /* async I/Os */
1757 
1758 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1759                            QEMUIOVector *qiov, int nb_sectors,
1760                            BlockCompletionFunc *cb, void *opaque)
1761 {
1762     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1763 
1764     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1765                                  cb, opaque, false);
1766 }
1767 
1768 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1769                             QEMUIOVector *qiov, int nb_sectors,
1770                             BlockCompletionFunc *cb, void *opaque)
1771 {
1772     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1773 
1774     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1775                                  cb, opaque, true);
1776 }
1777 
1778 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
1779         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
1780         BlockCompletionFunc *cb, void *opaque)
1781 {
1782     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
1783 
1784     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
1785                                  BDRV_REQ_ZERO_WRITE | flags,
1786                                  cb, opaque, true);
1787 }
1788 
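/*
 * Illustrative sketch: the completion-callback pattern used by the
 * bdrv_aio_*() functions above.  example_read_done and example_start_read are
 * hypothetical; bdrv_aio_readv(), the BlockCompletionFunc signature and
 * qemu_iovec_init_external() are the real interfaces used in this file.
 * The iov/qiov/buffer passed in must stay alive until the callback runs.
 */
static void example_read_done(void *opaque, int ret)
{
    /* ret is 0 on success or a negative errno; opaque is what we passed in */
    bool *done = opaque;
    *done = true;
}

static BlockAIOCB *example_start_read(BlockDriverState *bs, void *buf,
                                      int64_t sector_num, int nb_sectors,
                                      struct iovec *iov, QEMUIOVector *qiov,
                                      bool *done)
{
    iov->iov_base = buf;
    iov->iov_len  = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(qiov, iov, 1);

    /* The request runs asynchronously; example_read_done fires on completion */
    return bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                          example_read_done, done);
}
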
1789 
1790 typedef struct MultiwriteCB {
1791     int error;
1792     int num_requests;
1793     int num_callbacks;
1794     struct {
1795         BlockCompletionFunc *cb;
1796         void *opaque;
1797         QEMUIOVector *free_qiov;
1798     } callbacks[];
1799 } MultiwriteCB;
1800 
1801 static void multiwrite_user_cb(MultiwriteCB *mcb)
1802 {
1803     int i;
1804 
1805     for (i = 0; i < mcb->num_callbacks; i++) {
1806         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1807         if (mcb->callbacks[i].free_qiov) {
1808             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
1809         }
1810         g_free(mcb->callbacks[i].free_qiov);
1811     }
1812 }
1813 
1814 static void multiwrite_cb(void *opaque, int ret)
1815 {
1816     MultiwriteCB *mcb = opaque;
1817 
1818     trace_multiwrite_cb(mcb, ret);
1819 
1820     if (ret < 0 && !mcb->error) {
1821         mcb->error = ret;
1822     }
1823 
1824     mcb->num_requests--;
1825     if (mcb->num_requests == 0) {
1826         multiwrite_user_cb(mcb);
1827         g_free(mcb);
1828     }
1829 }
1830 
1831 static int multiwrite_req_compare(const void *a, const void *b)
1832 {
1833     const BlockRequest *req1 = a, *req2 = b;
1834 
1835     /*
1836      * Note that we can't simply subtract req2->sector from req1->sector
1837      * here as that could overflow the return value.
1838      */
1839     if (req1->sector > req2->sector) {
1840         return 1;
1841     } else if (req1->sector < req2->sector) {
1842         return -1;
1843     } else {
1844         return 0;
1845     }
1846 }
1847 
1848 /*
1849  * Takes a bunch of requests and tries to merge them. Returns the number of
1850  * requests that remain after merging.
1851  */
1852 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
1853     int num_reqs, MultiwriteCB *mcb)
1854 {
1855     int i, outidx;
1856 
1857     // Sort requests by start sector
1858     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
1859 
1860     // Check if adjacent requests touch the same clusters. If so, combine them,
1861     // filling up gaps with zero sectors.
1862     outidx = 0;
1863     for (i = 1; i < num_reqs; i++) {
1864         int merge = 0;
1865         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
1866 
1867         // Handle exactly sequential writes and overlapping writes.
1868         if (reqs[i].sector <= oldreq_last) {
1869             merge = 1;
1870         }
1871 
1872         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
1873             bs->bl.max_iov) {
1874             merge = 0;
1875         }
1876 
1877         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
1878             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
1879             merge = 0;
1880         }
1881 
1882         if (merge) {
1883             size_t size;
1884             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
1885             qemu_iovec_init(qiov,
1886                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
1887 
1888             // Add the first request to the merged one. If the requests are
1889             // overlapping, drop the last sectors of the first request.
1890             size = (reqs[i].sector - reqs[outidx].sector) << 9;
1891             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
1892 
1893             // We should not need to add any zeros between the two requests
1894             assert (reqs[i].sector <= oldreq_last);
1895 
1896             // Add the second request
1897             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
1898 
1899             // Add tail of first request, if necessary
1900             if (qiov->size < reqs[outidx].qiov->size) {
1901                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
1902                                   reqs[outidx].qiov->size - qiov->size);
1903             }
1904 
1905             reqs[outidx].nb_sectors = qiov->size >> 9;
1906             reqs[outidx].qiov = qiov;
1907 
1908             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
1909         } else {
1910             outidx++;
1911             reqs[outidx].sector     = reqs[i].sector;
1912             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
1913             reqs[outidx].qiov       = reqs[i].qiov;
1914         }
1915     }
1916 
1917     if (bs->blk) {
1918         block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
1919                               num_reqs - outidx - 1);
1920     }
1921 
1922     return outidx + 1;
1923 }
1924 
1925 /*
1926  * Submit multiple AIO write requests at once.
1927  *
1928  * On success, the function returns 0 and all requests in the reqs array have
1929  * been submitted. On error, this function returns -1, and any of the
1930  * requests may or may not be submitted yet. In particular, this means that the
1931  * callback will be called for some of the requests, for others it won't. The
1932  * caller must check the error field of the BlockRequest to wait for the right
1933  * callbacks (if error != 0, no callback will be called).
1934  *
1935  * The implementation may modify the contents of the reqs array, e.g. to merge
1936  * requests. However, the fields opaque and error are left unmodified as they
1937  * are used to signal failure for a single request to the caller.
1938  */
1939 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
1940 {
1941     MultiwriteCB *mcb;
1942     int i;
1943 
1944     /* don't submit writes if we don't have a medium */
1945     if (bs->drv == NULL) {
1946         for (i = 0; i < num_reqs; i++) {
1947             reqs[i].error = -ENOMEDIUM;
1948         }
1949         return -1;
1950     }
1951 
1952     if (num_reqs == 0) {
1953         return 0;
1954     }
1955 
1956     // Create MultiwriteCB structure
1957     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
1958     mcb->num_requests = 0;
1959     mcb->num_callbacks = num_reqs;
1960 
1961     for (i = 0; i < num_reqs; i++) {
1962         mcb->callbacks[i].cb = reqs[i].cb;
1963         mcb->callbacks[i].opaque = reqs[i].opaque;
1964     }
1965 
1966     // Check for mergeable requests
1967     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
1968 
1969     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
1970 
1971     /* Run the aio requests. */
1972     mcb->num_requests = num_reqs;
1973     for (i = 0; i < num_reqs; i++) {
1974         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
1975                               reqs[i].nb_sectors, reqs[i].flags,
1976                               multiwrite_cb, mcb,
1977                               true);
1978     }
1979 
1980     return 0;
1981 }
1982 
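/*
 * Illustrative sketch: how a caller (e.g. a device model batching guest
 * writes) might submit two requests through bdrv_aio_multiwrite().  The helper
 * example_submit_pair and its fixed sector offsets are hypothetical; the
 * BlockRequest fields filled in here are the ones consumed above.
 */
static int example_submit_pair(BlockDriverState *bs,
                               QEMUIOVector *qiov1, QEMUIOVector *qiov2,
                               BlockCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2];

    memset(reqs, 0, sizeof(reqs));

    reqs[0].sector     = 0;
    reqs[0].nb_sectors = qiov1->size >> BDRV_SECTOR_BITS;
    reqs[0].qiov       = qiov1;
    reqs[0].cb         = cb;
    reqs[0].opaque     = opaque;

    reqs[1].sector     = reqs[0].nb_sectors;   /* immediately after the first */
    reqs[1].nb_sectors = qiov2->size >> BDRV_SECTOR_BITS;
    reqs[1].qiov       = qiov2;
    reqs[1].cb         = cb;
    reqs[1].opaque     = opaque;

    /* On failure (-1), reqs[i].error tells the caller which callbacks to expect */
    return bdrv_aio_multiwrite(bs, reqs, 2);
}
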
1983 void bdrv_aio_cancel(BlockAIOCB *acb)
1984 {
1985     qemu_aio_ref(acb);
1986     bdrv_aio_cancel_async(acb);
1987     while (acb->refcnt > 1) {
1988         if (acb->aiocb_info->get_aio_context) {
1989             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
1990         } else if (acb->bs) {
1991             aio_poll(bdrv_get_aio_context(acb->bs), true);
1992         } else {
1993             abort();
1994         }
1995     }
1996     qemu_aio_unref(acb);
1997 }
1998 
1999 /* Async version of aio cancel. The caller is not blocked if the acb implements
2000  * cancel_async; otherwise we do nothing and let the request complete normally.
2001  * In either case the completion callback must be called. */
2002 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2003 {
2004     if (acb->aiocb_info->cancel_async) {
2005         acb->aiocb_info->cancel_async(acb);
2006     }
2007 }
2008 
2009 /**************************************************************/
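/*
 * Illustrative sketch: cancelling an in-flight request.  bdrv_aio_cancel()
 * blocks until the completion callback has run, while bdrv_aio_cancel_async()
 * only requests cancellation.  example_cancel_read is a hypothetical helper.
 */
static void example_cancel_read(BlockAIOCB *acb, bool wait)
{
    if (wait) {
        bdrv_aio_cancel(acb);        /* returns only after the callback ran */
    } else {
        bdrv_aio_cancel_async(acb);  /* callback will still be invoked later */
    }
}
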
2010 /* async block device emulation */
2011 
2012 typedef struct BlockAIOCBSync {
2013     BlockAIOCB common;
2014     QEMUBH *bh;
2015     int ret;
2016     /* vector translation state */
2017     QEMUIOVector *qiov;
2018     uint8_t *bounce;
2019     int is_write;
2020 } BlockAIOCBSync;
2021 
2022 static const AIOCBInfo bdrv_em_aiocb_info = {
2023     .aiocb_size         = sizeof(BlockAIOCBSync),
2024 };
2025 
2026 static void bdrv_aio_bh_cb(void *opaque)
2027 {
2028     BlockAIOCBSync *acb = opaque;
2029 
2030     if (!acb->is_write && acb->ret >= 0) {
2031         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
2032     }
2033     qemu_vfree(acb->bounce);
2034     acb->common.cb(acb->common.opaque, acb->ret);
2035     qemu_bh_delete(acb->bh);
2036     acb->bh = NULL;
2037     qemu_aio_unref(acb);
2038 }
2039 
2040 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2041                                       int64_t sector_num,
2042                                       QEMUIOVector *qiov,
2043                                       int nb_sectors,
2044                                       BlockCompletionFunc *cb,
2045                                       void *opaque,
2046                                       int is_write)
2048 {
2049     BlockAIOCBSync *acb;
2050 
2051     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
2052     acb->is_write = is_write;
2053     acb->qiov = qiov;
2054     acb->bounce = qemu_try_blockalign(bs, qiov->size);
2055     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
2056 
2057     if (acb->bounce == NULL) {
2058         acb->ret = -ENOMEM;
2059     } else if (is_write) {
2060         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
2061         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2062     } else {
2063         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2064     }
2065 
2066     qemu_bh_schedule(acb->bh);
2067 
2068     return &acb->common;
2069 }
2070 
2071 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2072         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2073         BlockCompletionFunc *cb, void *opaque)
2074 {
2075     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2076 }
2077 
2078 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2079         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2080         BlockCompletionFunc *cb, void *opaque)
2081 {
2082     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2083 }
2084 
2085 
2086 typedef struct BlockAIOCBCoroutine {
2087     BlockAIOCB common;
2088     BlockRequest req;
2089     bool is_write;
2090     bool need_bh;
2091     bool *done;
2092     QEMUBH* bh;
2093 } BlockAIOCBCoroutine;
2094 
2095 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2096     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2097 };
2098 
2099 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2100 {
2101     if (!acb->need_bh) {
2102         acb->common.cb(acb->common.opaque, acb->req.error);
2103         qemu_aio_unref(acb);
2104     }
2105 }
2106 
2107 static void bdrv_co_em_bh(void *opaque)
2108 {
2109     BlockAIOCBCoroutine *acb = opaque;
2110 
2111     assert(!acb->need_bh);
2112     qemu_bh_delete(acb->bh);
2113     bdrv_co_complete(acb);
2114 }
2115 
2116 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2117 {
2118     acb->need_bh = false;
2119     if (acb->req.error != -EINPROGRESS) {
2120         BlockDriverState *bs = acb->common.bs;
2121 
2122         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2123         qemu_bh_schedule(acb->bh);
2124     }
2125 }
2126 
2127 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2128 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2129 {
2130     BlockAIOCBCoroutine *acb = opaque;
2131     BlockDriverState *bs = acb->common.bs;
2132 
2133     if (!acb->is_write) {
2134         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2135             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2136     } else {
2137         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2138             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2139     }
2140 
2141     bdrv_co_complete(acb);
2142 }
2143 
2144 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2145                                          int64_t sector_num,
2146                                          QEMUIOVector *qiov,
2147                                          int nb_sectors,
2148                                          BdrvRequestFlags flags,
2149                                          BlockCompletionFunc *cb,
2150                                          void *opaque,
2151                                          bool is_write)
2152 {
2153     Coroutine *co;
2154     BlockAIOCBCoroutine *acb;
2155 
2156     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2157     acb->need_bh = true;
2158     acb->req.error = -EINPROGRESS;
2159     acb->req.sector = sector_num;
2160     acb->req.nb_sectors = nb_sectors;
2161     acb->req.qiov = qiov;
2162     acb->req.flags = flags;
2163     acb->is_write = is_write;
2164 
2165     co = qemu_coroutine_create(bdrv_co_do_rw);
2166     qemu_coroutine_enter(co, acb);
2167 
2168     bdrv_co_maybe_schedule_bh(acb);
2169     return &acb->common;
2170 }
2171 
2172 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2173 {
2174     BlockAIOCBCoroutine *acb = opaque;
2175     BlockDriverState *bs = acb->common.bs;
2176 
2177     acb->req.error = bdrv_co_flush(bs);
2178     bdrv_co_complete(acb);
2179 }
2180 
2181 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2182         BlockCompletionFunc *cb, void *opaque)
2183 {
2184     trace_bdrv_aio_flush(bs, opaque);
2185 
2186     Coroutine *co;
2187     BlockAIOCBCoroutine *acb;
2188 
2189     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2190     acb->need_bh = true;
2191     acb->req.error = -EINPROGRESS;
2192 
2193     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2194     qemu_coroutine_enter(co, acb);
2195 
2196     bdrv_co_maybe_schedule_bh(acb);
2197     return &acb->common;
2198 }
2199 
2200 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2201 {
2202     BlockAIOCBCoroutine *acb = opaque;
2203     BlockDriverState *bs = acb->common.bs;
2204 
2205     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2206     bdrv_co_complete(acb);
2207 }
2208 
2209 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2210         int64_t sector_num, int nb_sectors,
2211         BlockCompletionFunc *cb, void *opaque)
2212 {
2213     Coroutine *co;
2214     BlockAIOCBCoroutine *acb;
2215 
2216     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2217 
2218     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2219     acb->need_bh = true;
2220     acb->req.error = -EINPROGRESS;
2221     acb->req.sector = sector_num;
2222     acb->req.nb_sectors = nb_sectors;
2223     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2224     qemu_coroutine_enter(co, acb);
2225 
2226     bdrv_co_maybe_schedule_bh(acb);
2227     return &acb->common;
2228 }
2229 
2230 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2231                    BlockCompletionFunc *cb, void *opaque)
2232 {
2233     BlockAIOCB *acb;
2234 
2235     acb = g_malloc(aiocb_info->aiocb_size);
2236     acb->aiocb_info = aiocb_info;
2237     acb->bs = bs;
2238     acb->cb = cb;
2239     acb->opaque = opaque;
2240     acb->refcnt = 1;
2241     return acb;
2242 }
2243 
2244 void qemu_aio_ref(void *p)
2245 {
2246     BlockAIOCB *acb = p;
2247     acb->refcnt++;
2248 }
2249 
2250 void qemu_aio_unref(void *p)
2251 {
2252     BlockAIOCB *acb = p;
2253     assert(acb->refcnt > 0);
2254     if (--acb->refcnt == 0) {
2255         g_free(acb);
2256     }
2257 }
2258 
2259 /**************************************************************/
2260 /* Coroutine block device emulation */
2261 
2262 typedef struct CoroutineIOCompletion {
2263     Coroutine *coroutine;
2264     int ret;
2265 } CoroutineIOCompletion;
2266 
2267 static void bdrv_co_io_em_complete(void *opaque, int ret)
2268 {
2269     CoroutineIOCompletion *co = opaque;
2270 
2271     co->ret = ret;
2272     qemu_coroutine_enter(co->coroutine, NULL);
2273 }
2274 
2275 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2276                                       int nb_sectors, QEMUIOVector *iov,
2277                                       bool is_write)
2278 {
2279     CoroutineIOCompletion co = {
2280         .coroutine = qemu_coroutine_self(),
2281     };
2282     BlockAIOCB *acb;
2283 
2284     if (is_write) {
2285         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2286                                        bdrv_co_io_em_complete, &co);
2287     } else {
2288         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2289                                       bdrv_co_io_em_complete, &co);
2290     }
2291 
2292     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2293     if (!acb) {
2294         return -EIO;
2295     }
2296     qemu_coroutine_yield();
2297 
2298     return co.ret;
2299 }
2300 
2301 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2302                                          int64_t sector_num, int nb_sectors,
2303                                          QEMUIOVector *iov)
2304 {
2305     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2306 }
2307 
2308 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2309                                          int64_t sector_num, int nb_sectors,
2310                                          QEMUIOVector *iov)
2311 {
2312     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2313 }
2314 
2315 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2316 {
2317     RwCo *rwco = opaque;
2318 
2319     rwco->ret = bdrv_co_flush(rwco->bs);
2320 }
2321 
2322 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2323 {
2324     int ret;
2325     BdrvTrackedRequest req;
2326 
2327     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2328         bdrv_is_sg(bs)) {
2329         return 0;
2330     }
2331 
2332     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2333     /* Write back cached data to the OS even with cache=unsafe */
2334     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2335     if (bs->drv->bdrv_co_flush_to_os) {
2336         ret = bs->drv->bdrv_co_flush_to_os(bs);
2337         if (ret < 0) {
2338             goto out;
2339         }
2340     }
2341 
2342     /* But don't actually force it to the disk with cache=unsafe */
2343     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2344         goto flush_parent;
2345     }
2346 
2347     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2348     if (bs->drv->bdrv_co_flush_to_disk) {
2349         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2350     } else if (bs->drv->bdrv_aio_flush) {
2351         BlockAIOCB *acb;
2352         CoroutineIOCompletion co = {
2353             .coroutine = qemu_coroutine_self(),
2354         };
2355 
2356         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2357         if (acb == NULL) {
2358             ret = -EIO;
2359         } else {
2360             qemu_coroutine_yield();
2361             ret = co.ret;
2362         }
2363     } else {
2364         /*
2365          * Some block drivers always operate in either writethrough or unsafe
2366          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2367          * know how the server works (because the behaviour is hardcoded or
2368          * depends on server-side configuration), so we can't ensure that
2369          * everything is safe on disk. Returning an error doesn't work because
2370          * that would break guests even if the server operates in writethrough
2371          * mode.
2372          *
2373          * Let's hope the user knows what they're doing.
2374          */
2375         ret = 0;
2376     }
2377     if (ret < 0) {
2378         goto out;
2379     }
2380 
2381     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2382      * set in the case of cache=unsafe, so there are no useless flushes.
2383      */
2384 flush_parent:
2385     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2386 out:
2387     tracked_request_end(&req);
2388     return ret;
2389 }
2390 
2391 int bdrv_flush(BlockDriverState *bs)
2392 {
2393     Coroutine *co;
2394     RwCo rwco = {
2395         .bs = bs,
2396         .ret = NOT_DONE,
2397     };
2398 
2399     if (qemu_in_coroutine()) {
2400         /* Fast-path if already in coroutine context */
2401         bdrv_flush_co_entry(&rwco);
2402     } else {
2403         AioContext *aio_context = bdrv_get_aio_context(bs);
2404 
2405         co = qemu_coroutine_create(bdrv_flush_co_entry);
2406         qemu_coroutine_enter(co, &rwco);
2407         while (rwco.ret == NOT_DONE) {
2408             aio_poll(aio_context, true);
2409         }
2410     }
2411 
2412     return rwco.ret;
2413 }
2414 
2415 typedef struct DiscardCo {
2416     BlockDriverState *bs;
2417     int64_t sector_num;
2418     int nb_sectors;
2419     int ret;
2420 } DiscardCo;
2421 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2422 {
2423     DiscardCo *rwco = opaque;
2424 
2425     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2426 }
2427 
2428 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2429                                  int nb_sectors)
2430 {
2431     BdrvTrackedRequest req;
2432     int max_discard, ret;
2433 
2434     if (!bs->drv) {
2435         return -ENOMEDIUM;
2436     }
2437 
2438     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2439     if (ret < 0) {
2440         return ret;
2441     } else if (bs->read_only) {
2442         return -EPERM;
2443     }
2444     assert(!(bs->open_flags & BDRV_O_INACTIVE));
2445 
2446     /* Do nothing if disabled.  */
2447     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2448         return 0;
2449     }
2450 
2451     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2452         return 0;
2453     }
2454 
2455     tracked_request_begin(&req, bs, sector_num, nb_sectors,
2456                           BDRV_TRACKED_DISCARD);
2457     bdrv_set_dirty(bs, sector_num, nb_sectors);
2458 
2459     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2460     while (nb_sectors > 0) {
2462         int num = nb_sectors;
2463 
2464         /* align request */
2465         if (bs->bl.discard_alignment &&
2466             num >= bs->bl.discard_alignment &&
2467             sector_num % bs->bl.discard_alignment) {
2468             if (num > bs->bl.discard_alignment) {
2469                 num = bs->bl.discard_alignment;
2470             }
2471             num -= sector_num % bs->bl.discard_alignment;
2472         }
2473 
2474         /* limit request size */
2475         if (num > max_discard) {
2476             num = max_discard;
2477         }
2478 
2479         if (bs->drv->bdrv_co_discard) {
2480             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2481         } else {
2482             BlockAIOCB *acb;
2483             CoroutineIOCompletion co = {
2484                 .coroutine = qemu_coroutine_self(),
2485             };
2486 
2487             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2488                                             bdrv_co_io_em_complete, &co);
2489             if (acb == NULL) {
2490                 ret = -EIO;
2491                 goto out;
2492             } else {
2493                 qemu_coroutine_yield();
2494                 ret = co.ret;
2495             }
2496         }
2497         if (ret && ret != -ENOTSUP) {
2498             goto out;
2499         }
2500 
2501         sector_num += num;
2502         nb_sectors -= num;
2503     }
2504     ret = 0;
2505 out:
2506     tracked_request_end(&req);
2507     return ret;
2508 }
2509 
2510 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2511 {
2512     Coroutine *co;
2513     DiscardCo rwco = {
2514         .bs = bs,
2515         .sector_num = sector_num,
2516         .nb_sectors = nb_sectors,
2517         .ret = NOT_DONE,
2518     };
2519 
2520     if (qemu_in_coroutine()) {
2521         /* Fast-path if already in coroutine context */
2522         bdrv_discard_co_entry(&rwco);
2523     } else {
2524         AioContext *aio_context = bdrv_get_aio_context(bs);
2525 
2526         co = qemu_coroutine_create(bdrv_discard_co_entry);
2527         qemu_coroutine_enter(co, &rwco);
2528         while (rwco.ret == NOT_DONE) {
2529             aio_poll(aio_context, true);
2530         }
2531     }
2532 
2533     return rwco.ret;
2534 }
2535 
2536 typedef struct {
2537     CoroutineIOCompletion *co;
2538     QEMUBH *bh;
2539 } BdrvIoctlCompletionData;
2540 
2541 static void bdrv_ioctl_bh_cb(void *opaque)
2542 {
2543     BdrvIoctlCompletionData *data = opaque;
2544 
2545     bdrv_co_io_em_complete(data->co, -ENOTSUP);
2546     qemu_bh_delete(data->bh);
2547 }
2548 
2549 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2550 {
2551     BlockDriver *drv = bs->drv;
2552     BdrvTrackedRequest tracked_req;
2553     CoroutineIOCompletion co = {
2554         .coroutine = qemu_coroutine_self(),
2555     };
2556     BlockAIOCB *acb;
2557 
2558     tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2559     if (!drv || !drv->bdrv_aio_ioctl) {
2560         co.ret = -ENOTSUP;
2561         goto out;
2562     }
2563 
2564     acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2565     if (!acb) {
2566         BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
2567         data->bh = aio_bh_new(bdrv_get_aio_context(bs),
2568                                 bdrv_ioctl_bh_cb, data);
2569         data->co = &co;
2570         qemu_bh_schedule(data->bh);
2571     }
2572     qemu_coroutine_yield();
2573 out:
2574     tracked_request_end(&tracked_req);
2575     return co.ret;
2576 }
2577 
2578 typedef struct {
2579     BlockDriverState *bs;
2580     int req;
2581     void *buf;
2582     int ret;
2583 } BdrvIoctlCoData;
2584 
2585 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2586 {
2587     BdrvIoctlCoData *data = opaque;
2588     data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2589 }
2590 
2591 /* needed for generic scsi interface */
2592 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2593 {
2594     BdrvIoctlCoData data = {
2595         .bs = bs,
2596         .req = req,
2597         .buf = buf,
2598         .ret = -EINPROGRESS,
2599     };
2600 
2601     if (qemu_in_coroutine()) {
2602         /* Fast-path if already in coroutine context */
2603         bdrv_co_ioctl_entry(&data);
2604     } else {
2605         Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2606 
2607         qemu_coroutine_enter(co, &data);
2608         while (data.ret == -EINPROGRESS) {
2609             aio_poll(bdrv_get_aio_context(bs), true);
2610         }
2611     }
2612     return data.ret;
2613 }
2614 
2615 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2616 {
2617     BlockAIOCBCoroutine *acb = opaque;
2618     acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2619                                       acb->req.req, acb->req.buf);
2620     bdrv_co_complete(acb);
2621 }
2622 
2623 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2624         unsigned long int req, void *buf,
2625         BlockCompletionFunc *cb, void *opaque)
2626 {
2627     BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2628                                             bs, cb, opaque);
2629     Coroutine *co;
2630 
2631     acb->need_bh = true;
2632     acb->req.error = -EINPROGRESS;
2633     acb->req.req = req;
2634     acb->req.buf = buf;
2635     co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2636     qemu_coroutine_enter(co, acb);
2637 
2638     bdrv_co_maybe_schedule_bh(acb);
2639     return &acb->common;
2640 }
2641 
2642 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2643 {
2644     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2645 }
2646 
2647 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2648 {
2649     return memset(qemu_blockalign(bs, size), 0, size);
2650 }
2651 
2652 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2653 {
2654     size_t align = bdrv_opt_mem_align(bs);
2655 
2656     /* Ensure that NULL is never returned on success */
2657     assert(align > 0);
2658     if (size == 0) {
2659         size = align;
2660     }
2661 
2662     return qemu_try_memalign(align, size);
2663 }
2664 
2665 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2666 {
2667     void *mem = qemu_try_blockalign(bs, size);
2668 
2669     if (mem) {
2670         memset(mem, 0, size);
2671     }
2672 
2673     return mem;
2674 }
2675 
2676 /*
2677  * Check if all memory in this vector is sector aligned.
2678  */
2679 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2680 {
2681     int i;
2682     size_t alignment = bdrv_min_mem_align(bs);
2683 
2684     for (i = 0; i < qiov->niov; i++) {
2685         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2686             return false;
2687         }
2688         if (qiov->iov[i].iov_len % alignment) {
2689             return false;
2690         }
2691     }
2692 
2693     return true;
2694 }
2695 
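/*
 * Illustrative sketch: wrapping a caller-supplied buffer in a QEMUIOVector and
 * using bdrv_qiov_is_aligned() to decide whether a bounce buffer (e.g. one
 * obtained with qemu_try_blockalign()) is needed instead.  The helper
 * example_qiov_from_buffer is hypothetical.
 */
static bool example_qiov_from_buffer(BlockDriverState *bs, void *buf,
                                     size_t size, struct iovec *iov,
                                     QEMUIOVector *qiov)
{
    iov->iov_base = buf;
    iov->iov_len  = size;
    qemu_iovec_init_external(qiov, iov, 1);

    /* true only if both the address and the length meet the memory alignment */
    return bdrv_qiov_is_aligned(bs, qiov);
}
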
2696 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2697                                     NotifierWithReturn *notifier)
2698 {
2699     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2700 }
2701 
2702 void bdrv_io_plug(BlockDriverState *bs)
2703 {
2704     BlockDriver *drv = bs->drv;
2705     if (drv && drv->bdrv_io_plug) {
2706         drv->bdrv_io_plug(bs);
2707     } else if (bs->file) {
2708         bdrv_io_plug(bs->file->bs);
2709     }
2710 }
2711 
2712 void bdrv_io_unplug(BlockDriverState *bs)
2713 {
2714     BlockDriver *drv = bs->drv;
2715     if (drv && drv->bdrv_io_unplug) {
2716         drv->bdrv_io_unplug(bs);
2717     } else if (bs->file) {
2718         bdrv_io_unplug(bs->file->bs);
2719     }
2720 }
2721 
2722 void bdrv_flush_io_queue(BlockDriverState *bs)
2723 {
2724     BlockDriver *drv = bs->drv;
2725     if (drv && drv->bdrv_flush_io_queue) {
2726         drv->bdrv_flush_io_queue(bs);
2727     } else if (bs->file) {
2728         bdrv_flush_io_queue(bs->file->bs);
2729     }
2730     bdrv_start_throttled_reqs(bs);
2731 }
2732 
2733 void bdrv_drained_begin(BlockDriverState *bs)
2734 {
2735     if (!bs->quiesce_counter++) {
2736         aio_disable_external(bdrv_get_aio_context(bs));
2737     }
2738     bdrv_drain(bs);
2739 }
2740 
2741 void bdrv_drained_end(BlockDriverState *bs)
2742 {
2743     assert(bs->quiesce_counter > 0);
2744     if (--bs->quiesce_counter > 0) {
2745         return;
2746     }
2747     aio_enable_external(bdrv_get_aio_context(bs));
2748 }
2749
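
/*
 * Illustrative sketch: the intended pairing of bdrv_drained_begin() and
 * bdrv_drained_end() around an operation that must not race with new external
 * I/O.  example_quiesced_flush is hypothetical; the drained-section calls and
 * bdrv_flush() are the ones defined above.
 */
static int example_quiesced_flush(BlockDriverState *bs)
{
    int ret;

    bdrv_drained_begin(bs);   /* stop external requests, drain in-flight ones */
    ret = bdrv_flush(bs);     /* now safe to act on a quiescent device */
    bdrv_drained_end(bs);     /* resume external request processing */

    return ret;
}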