xref: /openbmc/qemu/block/io.c (revision 87c9b5e0)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "trace.h"
27 #include "sysemu/block-backend.h"
28 #include "block/blockjob.h"
29 #include "block/block_int.h"
30 #include "block/throttle-groups.h"
31 #include "qemu/error-report.h"
32 
33 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
34 
35 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
36         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
37         BlockCompletionFunc *cb, void *opaque);
38 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
39         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
40         BlockCompletionFunc *cb, void *opaque);
41 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
42                                          int64_t sector_num, int nb_sectors,
43                                          QEMUIOVector *iov);
44 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
45                                          int64_t sector_num, int nb_sectors,
46                                          QEMUIOVector *iov);
47 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
48     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
49     BdrvRequestFlags flags);
50 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
51     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
52     BdrvRequestFlags flags);
53 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
54                                          int64_t sector_num,
55                                          QEMUIOVector *qiov,
56                                          int nb_sectors,
57                                          BdrvRequestFlags flags,
58                                          BlockCompletionFunc *cb,
59                                          void *opaque,
60                                          bool is_write);
61 static void coroutine_fn bdrv_co_do_rw(void *opaque);
62 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
63     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
64 
65 /* throttling disk I/O limits */
66 void bdrv_set_io_limits(BlockDriverState *bs,
67                         ThrottleConfig *cfg)
68 {
69     int i;
70 
71     throttle_group_config(bs, cfg);
72 
73     for (i = 0; i < 2; i++) {
74         qemu_co_enter_next(&bs->throttled_reqs[i]);
75     }
76 }
77 
78 /* this function drains all the throttled I/Os */
79 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
80 {
81     bool drained = false;
82     bool enabled = bs->io_limits_enabled;
83     int i;
84 
85     bs->io_limits_enabled = false;
86 
87     for (i = 0; i < 2; i++) {
88         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
89             drained = true;
90         }
91     }
92 
93     bs->io_limits_enabled = enabled;
94 
95     return drained;
96 }
97 
98 void bdrv_io_limits_disable(BlockDriverState *bs)
99 {
100     bs->io_limits_enabled = false;
101     bdrv_start_throttled_reqs(bs);
102     throttle_group_unregister_bs(bs);
103 }
104 
105 /* should be called before bdrv_set_io_limits if a limit is set */
106 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
107 {
108     assert(!bs->io_limits_enabled);
109     throttle_group_register_bs(bs, group);
110     bs->io_limits_enabled = true;
111 }
112 
113 void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
114 {
115     /* this bs is not part of any group */
116     if (!bs->throttle_state) {
117         return;
118     }
119 
120     /* this bs is a part of the same group as the one we want */
121     if (!g_strcmp0(throttle_group_get_name(bs), group)) {
122         return;
123     }
124 
125     /* need to change the group this bs belongs to */
126     bdrv_io_limits_disable(bs);
127     bdrv_io_limits_enable(bs, group);
128 }
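
/*
 * Illustrative sketch of the throttling lifecycle, assuming an opened
 * BlockDriverState *bs and a caller-defined ThrottleConfig cfg (the group
 * names below are arbitrary):
 *
 *     bdrv_io_limits_enable(bs, "limits0");       // join group "limits0"
 *     bdrv_set_io_limits(bs, &cfg);               // then apply the limits
 *     bdrv_io_limits_update_group(bs, "limits1"); // move to another group
 *     bdrv_io_limits_disable(bs);                 // stop throttling this bs
 */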
129 
130 void bdrv_setup_io_funcs(BlockDriver *bdrv)
131 {
132     /* Block drivers without coroutine functions need emulation */
133     if (!bdrv->bdrv_co_readv) {
134         bdrv->bdrv_co_readv = bdrv_co_readv_em;
135         bdrv->bdrv_co_writev = bdrv_co_writev_em;
136 
137         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
138          * the block driver lacks aio we need to emulate that too.
139          */
140         if (!bdrv->bdrv_aio_readv) {
141             /* add AIO emulation layer */
142             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
143             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
144         }
145     }
146 }
147 
148 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
149 {
150     BlockDriver *drv = bs->drv;
151     Error *local_err = NULL;
152 
153     memset(&bs->bl, 0, sizeof(bs->bl));
154 
155     if (!drv) {
156         return;
157     }
158 
159     /* Take some limits from the children as a default */
160     if (bs->file) {
161         bdrv_refresh_limits(bs->file->bs, &local_err);
162         if (local_err) {
163             error_propagate(errp, local_err);
164             return;
165         }
166         bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
167         bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
168         bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
169         bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
170         bs->bl.max_iov = bs->file->bs->bl.max_iov;
171     } else {
172         bs->bl.min_mem_alignment = 512;
173         bs->bl.opt_mem_alignment = getpagesize();
174 
175         /* Safe default since most protocols use readv()/writev()/etc */
176         bs->bl.max_iov = IOV_MAX;
177     }
178 
179     if (bs->backing) {
180         bdrv_refresh_limits(bs->backing->bs, &local_err);
181         if (local_err) {
182             error_propagate(errp, local_err);
183             return;
184         }
185         bs->bl.opt_transfer_length =
186             MAX(bs->bl.opt_transfer_length,
187                 bs->backing->bs->bl.opt_transfer_length);
188         bs->bl.max_transfer_length =
189             MIN_NON_ZERO(bs->bl.max_transfer_length,
190                          bs->backing->bs->bl.max_transfer_length);
191         bs->bl.opt_mem_alignment =
192             MAX(bs->bl.opt_mem_alignment,
193                 bs->backing->bs->bl.opt_mem_alignment);
194         bs->bl.min_mem_alignment =
195             MAX(bs->bl.min_mem_alignment,
196                 bs->backing->bs->bl.min_mem_alignment);
197         bs->bl.max_iov =
198             MIN(bs->bl.max_iov,
199                 bs->backing->bs->bl.max_iov);
200     }
201 
202     /* Then let the driver override it */
203     if (drv->bdrv_refresh_limits) {
204         drv->bdrv_refresh_limits(bs, errp);
205     }
206 }
207 
208 /**
209  * The copy-on-read flag is actually a reference count so multiple users may
210  * use the feature without worrying about clobbering its previous state.
211  * Copy-on-read stays enabled until all users have disabled it again.
212  */
213 void bdrv_enable_copy_on_read(BlockDriverState *bs)
214 {
215     bs->copy_on_read++;
216 }
217 
218 void bdrv_disable_copy_on_read(BlockDriverState *bs)
219 {
220     assert(bs->copy_on_read > 0);
221     bs->copy_on_read--;
222 }
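
/*
 * Illustrative sketch: because the flag is a reference count, users simply
 * pair the two calls around the period in which they need copy-on-read
 * (assuming an opened BlockDriverState *bs):
 *
 *     bdrv_enable_copy_on_read(bs);
 *     // ... issue reads that should populate the top image ...
 *     bdrv_disable_copy_on_read(bs);
 */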
223 
224 /* Check if any requests are in-flight (including throttled requests) */
225 bool bdrv_requests_pending(BlockDriverState *bs)
226 {
227     BdrvChild *child;
228 
229     if (!QLIST_EMPTY(&bs->tracked_requests)) {
230         return true;
231     }
232     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
233         return true;
234     }
235     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
236         return true;
237     }
238 
239     QLIST_FOREACH(child, &bs->children, next) {
240         if (bdrv_requests_pending(child->bs)) {
241             return true;
242         }
243     }
244 
245     return false;
246 }
247 
248 static void bdrv_drain_recurse(BlockDriverState *bs)
249 {
250     BdrvChild *child;
251 
252     if (bs->drv && bs->drv->bdrv_drain) {
253         bs->drv->bdrv_drain(bs);
254     }
255     QLIST_FOREACH(child, &bs->children, next) {
256         bdrv_drain_recurse(child->bs);
257     }
258 }
259 
260 /*
261  * Wait for pending requests to complete on a single BlockDriverState subtree,
262  * and suspend the block driver's internal I/O until the next request arrives.
263  *
264  * Note that unlike bdrv_drain_all(), the caller must hold the AioContext of
265  * the BlockDriverState.
266  *
267  * Only this BlockDriverState's AioContext is run, so in-flight requests must
268  * not depend on events in other AioContexts.  If they do, use
269  * bdrv_drain_all() instead.
270  */
271 void bdrv_drain(BlockDriverState *bs)
272 {
273     bool busy = true;
274 
275     bdrv_drain_recurse(bs);
276     while (busy) {
277         /* Keep iterating */
278         bdrv_flush_io_queue(bs);
279         busy = bdrv_requests_pending(bs);
280         busy |= aio_poll(bdrv_get_aio_context(bs), busy);
281     }
282 }
283 
284 /*
285  * Wait for pending requests to complete across all BlockDriverStates
286  *
287  * This function does not flush data to disk, use bdrv_flush_all() for that
288  * after calling this function.
289  */
290 void bdrv_drain_all(void)
291 {
292     /* Always run first iteration so any pending completion BHs run */
293     bool busy = true;
294     BlockDriverState *bs = NULL;
295     GSList *aio_ctxs = NULL, *ctx;
296 
297     while ((bs = bdrv_next(bs))) {
298         AioContext *aio_context = bdrv_get_aio_context(bs);
299 
300         aio_context_acquire(aio_context);
301         if (bs->job) {
302             block_job_pause(bs->job);
303         }
304         aio_context_release(aio_context);
305 
306         if (!g_slist_find(aio_ctxs, aio_context)) {
307             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
308         }
309     }
310 
311     /* Note that completion of an asynchronous I/O operation can trigger any
312      * number of other I/O operations on other devices---for example a
313      * coroutine can submit an I/O request to another device in response to
314      * request completion.  Therefore we must keep looping until there is no
315      * more activity rather than simply draining each device independently.
316      */
317     while (busy) {
318         busy = false;
319 
320         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
321             AioContext *aio_context = ctx->data;
322             bs = NULL;
323 
324             aio_context_acquire(aio_context);
325             while ((bs = bdrv_next(bs))) {
326                 if (aio_context == bdrv_get_aio_context(bs)) {
327                     bdrv_flush_io_queue(bs);
328                     if (bdrv_requests_pending(bs)) {
329                         busy = true;
330                         aio_poll(aio_context, busy);
331                     }
332                 }
333             }
334             busy |= aio_poll(aio_context, false);
335             aio_context_release(aio_context);
336         }
337     }
338 
339     bs = NULL;
340     while ((bs = bdrv_next(bs))) {
341         AioContext *aio_context = bdrv_get_aio_context(bs);
342 
343         aio_context_acquire(aio_context);
344         if (bs->job) {
345             block_job_resume(bs->job);
346         }
347         aio_context_release(aio_context);
348     }
349     g_slist_free(aio_ctxs);
350 }
351 
352 /**
353  * Remove an active request from the tracked requests list
354  *
355  * This function should be called when a tracked request is completing.
356  */
357 static void tracked_request_end(BdrvTrackedRequest *req)
358 {
359     if (req->serialising) {
360         req->bs->serialising_in_flight--;
361     }
362 
363     QLIST_REMOVE(req, list);
364     qemu_co_queue_restart_all(&req->wait_queue);
365 }
366 
367 /**
368  * Add an active request to the tracked requests list
369  */
370 static void tracked_request_begin(BdrvTrackedRequest *req,
371                                   BlockDriverState *bs,
372                                   int64_t offset,
373                                   unsigned int bytes,
374                                   enum BdrvTrackedRequestType type)
375 {
376     *req = (BdrvTrackedRequest){
377         .bs = bs,
378         .offset         = offset,
379         .bytes          = bytes,
380         .type           = type,
381         .co             = qemu_coroutine_self(),
382         .serialising    = false,
383         .overlap_offset = offset,
384         .overlap_bytes  = bytes,
385     };
386 
387     qemu_co_queue_init(&req->wait_queue);
388 
389     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
390 }
391 
392 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
393 {
394     int64_t overlap_offset = req->offset & ~(align - 1);
395     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
396                                - overlap_offset;
397 
398     if (!req->serialising) {
399         req->bs->serialising_in_flight++;
400         req->serialising = true;
401     }
402 
403     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
404     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
405 }
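
/*
 * Worked example for the rounding above: with req->offset = 1000,
 * req->bytes = 100 and align = 512, overlap_offset becomes 512 and
 * overlap_bytes becomes ROUND_UP(1100, 512) - 512 = 1024, i.e. the request
 * conflicts with anything touching the byte range [512, 1536).
 */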
406 
407 /**
408  * Round a region to cluster boundaries
409  */
410 void bdrv_round_to_clusters(BlockDriverState *bs,
411                             int64_t sector_num, int nb_sectors,
412                             int64_t *cluster_sector_num,
413                             int *cluster_nb_sectors)
414 {
415     BlockDriverInfo bdi;
416 
417     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
418         *cluster_sector_num = sector_num;
419         *cluster_nb_sectors = nb_sectors;
420     } else {
421         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
422         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
423         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
424                                             nb_sectors, c);
425     }
426 }
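
/*
 * Worked example: with a 64 KiB cluster size (c = 128 sectors of 512 bytes),
 * a request for sectors [100, 150) is widened to [0, 256), i.e.
 * *cluster_sector_num = 0 and *cluster_nb_sectors = 256, so that it covers
 * both clusters it touches.
 */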
427 
428 static int bdrv_get_cluster_size(BlockDriverState *bs)
429 {
430     BlockDriverInfo bdi;
431     int ret;
432 
433     ret = bdrv_get_info(bs, &bdi);
434     if (ret < 0 || bdi.cluster_size == 0) {
435         return bs->request_alignment;
436     } else {
437         return bdi.cluster_size;
438     }
439 }
440 
441 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
442                                      int64_t offset, unsigned int bytes)
443 {
444     /*        aaaa   bbbb */
445     if (offset >= req->overlap_offset + req->overlap_bytes) {
446         return false;
447     }
448     /* bbbb   aaaa        */
449     if (req->overlap_offset >= offset + bytes) {
450         return false;
451     }
452     return true;
453 }
454 
455 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
456 {
457     BlockDriverState *bs = self->bs;
458     BdrvTrackedRequest *req;
459     bool retry;
460     bool waited = false;
461 
462     if (!bs->serialising_in_flight) {
463         return false;
464     }
465 
466     do {
467         retry = false;
468         QLIST_FOREACH(req, &bs->tracked_requests, list) {
469             if (req == self || (!req->serialising && !self->serialising)) {
470                 continue;
471             }
472             if (tracked_request_overlaps(req, self->overlap_offset,
473                                          self->overlap_bytes))
474             {
475                 /* Hitting this means there was a reentrant request, for
476                  * example, a block driver issuing nested requests.  This must
477                  * never happen since it means deadlock.
478                  */
479                 assert(qemu_coroutine_self() != req->co);
480 
481                 /* If the request is already (indirectly) waiting for us, or
482                  * will wait for us as soon as it wakes up, then just go on
483                  * (instead of producing a deadlock in the former case). */
484                 if (!req->waiting_for) {
485                     self->waiting_for = req;
486                     qemu_co_queue_wait(&req->wait_queue);
487                     self->waiting_for = NULL;
488                     retry = true;
489                     waited = true;
490                     break;
491                 }
492             }
493         }
494     } while (retry);
495 
496     return waited;
497 }
498 
499 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
500                                    size_t size)
501 {
502     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
503         return -EIO;
504     }
505 
506     if (!bdrv_is_inserted(bs)) {
507         return -ENOMEDIUM;
508     }
509 
510     if (offset < 0) {
511         return -EIO;
512     }
513 
514     return 0;
515 }
516 
517 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
518                               int nb_sectors)
519 {
520     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
521         return -EIO;
522     }
523 
524     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
525                                    nb_sectors * BDRV_SECTOR_SIZE);
526 }
527 
528 typedef struct RwCo {
529     BlockDriverState *bs;
530     int64_t offset;
531     QEMUIOVector *qiov;
532     bool is_write;
533     int ret;
534     BdrvRequestFlags flags;
535 } RwCo;
536 
537 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
538 {
539     RwCo *rwco = opaque;
540 
541     if (!rwco->is_write) {
542         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
543                                       rwco->qiov->size, rwco->qiov,
544                                       rwco->flags);
545     } else {
546         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
547                                        rwco->qiov->size, rwco->qiov,
548                                        rwco->flags);
549     }
550 }
551 
552 /*
553  * Process a vectored synchronous request using coroutines
554  */
555 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
556                         QEMUIOVector *qiov, bool is_write,
557                         BdrvRequestFlags flags)
558 {
559     Coroutine *co;
560     RwCo rwco = {
561         .bs = bs,
562         .offset = offset,
563         .qiov = qiov,
564         .is_write = is_write,
565         .ret = NOT_DONE,
566         .flags = flags,
567     };
568 
569     /**
570      * In a synchronous call context the vcpu is blocked, so the throttling
571      * timer will not fire; the I/O throttling function therefore has to be
572      * disabled here if it has been enabled.
573      */
574     if (bs->io_limits_enabled) {
575         fprintf(stderr, "Disabling I/O throttling on '%s' due "
576                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
577         bdrv_io_limits_disable(bs);
578     }
579 
580     if (qemu_in_coroutine()) {
581         /* Fast-path if already in coroutine context */
582         bdrv_rw_co_entry(&rwco);
583     } else {
584         AioContext *aio_context = bdrv_get_aio_context(bs);
585 
586         co = qemu_coroutine_create(bdrv_rw_co_entry);
587         qemu_coroutine_enter(co, &rwco);
588         while (rwco.ret == NOT_DONE) {
589             aio_poll(aio_context, true);
590         }
591     }
592     return rwco.ret;
593 }
594 
595 /*
596  * Process a synchronous request using coroutines
597  */
598 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
599                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
600 {
601     QEMUIOVector qiov;
602     struct iovec iov = {
603         .iov_base = (void *)buf,
604         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
605     };
606 
607     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
608         return -EINVAL;
609     }
610 
611     qemu_iovec_init_external(&qiov, &iov, 1);
612     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
613                         &qiov, is_write, flags);
614 }
615 
616 /* return < 0 if error. See bdrv_write() for the return codes */
617 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
618               uint8_t *buf, int nb_sectors)
619 {
620     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
621 }
622 
623 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
624 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
625                           uint8_t *buf, int nb_sectors)
626 {
627     bool enabled;
628     int ret;
629 
630     enabled = bs->io_limits_enabled;
631     bs->io_limits_enabled = false;
632     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
633     bs->io_limits_enabled = enabled;
634     return ret;
635 }
636 
637 /* Return < 0 if error. Important errors are:
638   -EIO         generic I/O error (may happen for all errors)
639   -ENOMEDIUM   No media inserted.
640   -EINVAL      Invalid sector number or nb_sectors
641   -EPERM       Trying to write a read-only device
642 */
643 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
644                const uint8_t *buf, int nb_sectors)
645 {
646     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
647 }
648 
649 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
650                       int nb_sectors, BdrvRequestFlags flags)
651 {
652     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
653                       BDRV_REQ_ZERO_WRITE | flags);
654 }
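
/*
 * Illustrative sketch, assuming an opened BlockDriverState *bs: zero the
 * first megabyte (2048 sectors of 512 bytes) and allow the driver to unmap
 * the range instead of writing explicit zeroes where possible:
 *
 *     ret = bdrv_write_zeroes(bs, 0, 2048, BDRV_REQ_MAY_UNMAP);
 */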
655 
656 /*
657  * Completely zero out a block device with the help of bdrv_write_zeroes.
658  * The operation is sped up by checking the block status and only writing
659  * zeroes to sectors that do not already read back as zeroes.  Optional
660  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
661  *
662  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
663  */
664 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
665 {
666     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
667     BlockDriverState *file;
668     int n;
669 
670     target_sectors = bdrv_nb_sectors(bs);
671     if (target_sectors < 0) {
672         return target_sectors;
673     }
674 
675     for (;;) {
676         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
677         if (nb_sectors <= 0) {
678             return 0;
679         }
680         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
681         if (ret < 0) {
682             error_report("error getting block status at sector %" PRId64 ": %s",
683                          sector_num, strerror(-ret));
684             return ret;
685         }
686         if (ret & BDRV_BLOCK_ZERO) {
687             sector_num += n;
688             continue;
689         }
690         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
691         if (ret < 0) {
692             error_report("error writing zeroes at sector %" PRId64 ": %s",
693                          sector_num, strerror(-ret));
694             return ret;
695         }
696         sector_num += n;
697     }
698 }
699 
700 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
701 {
702     QEMUIOVector qiov;
703     struct iovec iov = {
704         .iov_base = (void *)buf,
705         .iov_len = bytes,
706     };
707     int ret;
708 
709     if (bytes < 0) {
710         return -EINVAL;
711     }
712 
713     qemu_iovec_init_external(&qiov, &iov, 1);
714     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
715     if (ret < 0) {
716         return ret;
717     }
718 
719     return bytes;
720 }
721 
722 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
723 {
724     int ret;
725 
726     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
727     if (ret < 0) {
728         return ret;
729     }
730 
731     return qiov->size;
732 }
733 
734 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
735                 const void *buf, int bytes)
736 {
737     QEMUIOVector qiov;
738     struct iovec iov = {
739         .iov_base   = (void *) buf,
740         .iov_len    = bytes,
741     };
742 
743     if (bytes < 0) {
744         return -EINVAL;
745     }
746 
747     qemu_iovec_init_external(&qiov, &iov, 1);
748     return bdrv_pwritev(bs, offset, &qiov);
749 }
750 
751 /*
752  * Writes to the file and ensures that no writes are reordered across this
753  * request (acts as a barrier)
754  *
755  * Returns 0 on success, -errno in error cases.
756  */
757 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
758     const void *buf, int count)
759 {
760     int ret;
761 
762     ret = bdrv_pwrite(bs, offset, buf, count);
763     if (ret < 0) {
764         return ret;
765     }
766 
767     /* No flush needed for cache modes that already do it */
768     if (bs->enable_write_cache) {
769         bdrv_flush(bs);
770     }
771 
772     return 0;
773 }
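
/*
 * Illustrative sketch, assuming an opened BlockDriverState *bs: update a
 * 512-byte header so that the write cannot be reordered with later metadata
 * writes (the buffer and the offset 0 are hypothetical):
 *
 *     uint8_t header[512];
 *
 *     ret = bdrv_pread(bs, 0, header, sizeof(header));
 *     if (ret >= 0) {
 *         // ... modify header fields ...
 *         ret = bdrv_pwrite_sync(bs, 0, header, sizeof(header));
 *     }
 */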
774 
775 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
776         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
777 {
778     /* Perform I/O through a temporary buffer so that users who scribble over
779      * their read buffer while the operation is in progress do not end up
780      * modifying the image file.  This is critical for zero-copy guest I/O
781      * where anything might happen inside guest memory.
782      */
783     void *bounce_buffer;
784 
785     BlockDriver *drv = bs->drv;
786     struct iovec iov;
787     QEMUIOVector bounce_qiov;
788     int64_t cluster_sector_num;
789     int cluster_nb_sectors;
790     size_t skip_bytes;
791     int ret;
792 
793     /* Cover entire cluster so no additional backing file I/O is required when
794      * allocating a cluster in the image file.
795      */
796     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
797                            &cluster_sector_num, &cluster_nb_sectors);
798 
799     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
800                                    cluster_sector_num, cluster_nb_sectors);
801 
802     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
803     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
804     if (bounce_buffer == NULL) {
805         ret = -ENOMEM;
806         goto err;
807     }
808 
809     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
810 
811     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
812                              &bounce_qiov);
813     if (ret < 0) {
814         goto err;
815     }
816 
817     if (drv->bdrv_co_write_zeroes &&
818         buffer_is_zero(bounce_buffer, iov.iov_len)) {
819         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
820                                       cluster_nb_sectors, 0);
821     } else {
822         /* This does not change the data on the disk, it is not necessary
823          * to flush even in cache=writethrough mode.
824          */
825         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
826                                   &bounce_qiov);
827     }
828 
829     if (ret < 0) {
830         /* It might be okay to ignore write errors for guest requests.  If this
831          * is a deliberate copy-on-read then we don't want to ignore the error.
832          * Simply report it in all cases.
833          */
834         goto err;
835     }
836 
837     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
838     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
839                         nb_sectors * BDRV_SECTOR_SIZE);
840 
841 err:
842     qemu_vfree(bounce_buffer);
843     return ret;
844 }
845 
846 /*
847  * Forwards an already correctly aligned request to the BlockDriver. This
848  * handles copy on read and zeroing after EOF; any other features must be
849  * implemented by the caller.
850  */
851 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
852     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
853     int64_t align, QEMUIOVector *qiov, int flags)
854 {
855     BlockDriver *drv = bs->drv;
856     int ret;
857 
858     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
859     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
860 
861     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
862     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
863     assert(!qiov || bytes == qiov->size);
864 
865     /* Handle Copy on Read and associated serialisation */
866     if (flags & BDRV_REQ_COPY_ON_READ) {
867         /* If we touch the same cluster it counts as an overlap.  This
868          * guarantees that allocating writes will be serialized and not race
869          * with each other for the same cluster.  For example, in copy-on-read
870          * it ensures that the CoR read and write operations are atomic and
871          * guest writes cannot interleave between them. */
872         mark_request_serialising(req, bdrv_get_cluster_size(bs));
873     }
874 
875     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
876         wait_serialising_requests(req);
877     }
878 
879     if (flags & BDRV_REQ_COPY_ON_READ) {
880         int pnum;
881 
882         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
883         if (ret < 0) {
884             goto out;
885         }
886 
887         if (!ret || pnum != nb_sectors) {
888             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
889             goto out;
890         }
891     }
892 
893     /* Forward the request to the BlockDriver */
894     if (!bs->zero_beyond_eof) {
895         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
896     } else {
897         /* Read zeros after EOF */
898         int64_t total_sectors, max_nb_sectors;
899 
900         total_sectors = bdrv_nb_sectors(bs);
901         if (total_sectors < 0) {
902             ret = total_sectors;
903             goto out;
904         }
905 
906         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
907                                   align >> BDRV_SECTOR_BITS);
908         if (nb_sectors < max_nb_sectors) {
909             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
910         } else if (max_nb_sectors > 0) {
911             QEMUIOVector local_qiov;
912 
913             qemu_iovec_init(&local_qiov, qiov->niov);
914             qemu_iovec_concat(&local_qiov, qiov, 0,
915                               max_nb_sectors * BDRV_SECTOR_SIZE);
916 
917             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
918                                      &local_qiov);
919 
920             qemu_iovec_destroy(&local_qiov);
921         } else {
922             ret = 0;
923         }
924 
925         /* Reading beyond end of file is supposed to produce zeroes */
926         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
927             uint64_t offset = MAX(0, total_sectors - sector_num);
928             uint64_t bytes = (sector_num + nb_sectors - offset) *
929                               BDRV_SECTOR_SIZE;
930             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
931         }
932     }
933 
934 out:
935     return ret;
936 }
937 
938 /*
939  * Handle a read request in coroutine context
940  */
941 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
942     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
943     BdrvRequestFlags flags)
944 {
945     BlockDriver *drv = bs->drv;
946     BdrvTrackedRequest req;
947 
948     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
949     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
950     uint8_t *head_buf = NULL;
951     uint8_t *tail_buf = NULL;
952     QEMUIOVector local_qiov;
953     bool use_local_qiov = false;
954     int ret;
955 
956     if (!drv) {
957         return -ENOMEDIUM;
958     }
959 
960     ret = bdrv_check_byte_request(bs, offset, bytes);
961     if (ret < 0) {
962         return ret;
963     }
964 
965     /* Don't do copy-on-read if we are reading data before a write operation */
966     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
967         flags |= BDRV_REQ_COPY_ON_READ;
968     }
969 
970     /* throttling disk I/O */
971     if (bs->io_limits_enabled) {
972         throttle_group_co_io_limits_intercept(bs, bytes, false);
973     }
974 
975     /* Align read if necessary by padding qiov */
976     if (offset & (align - 1)) {
977         head_buf = qemu_blockalign(bs, align);
978         qemu_iovec_init(&local_qiov, qiov->niov + 2);
979         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
980         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
981         use_local_qiov = true;
982 
983         bytes += offset & (align - 1);
984         offset = offset & ~(align - 1);
985     }
986 
987     if ((offset + bytes) & (align - 1)) {
988         if (!use_local_qiov) {
989             qemu_iovec_init(&local_qiov, qiov->niov + 1);
990             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
991             use_local_qiov = true;
992         }
993         tail_buf = qemu_blockalign(bs, align);
994         qemu_iovec_add(&local_qiov, tail_buf,
995                        align - ((offset + bytes) & (align - 1)));
996 
997         bytes = ROUND_UP(bytes, align);
998     }
999 
1000     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1001     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1002                               use_local_qiov ? &local_qiov : qiov,
1003                               flags);
1004     tracked_request_end(&req);
1005 
1006     if (use_local_qiov) {
1007         qemu_iovec_destroy(&local_qiov);
1008         qemu_vfree(head_buf);
1009         qemu_vfree(tail_buf);
1010     }
1011 
1012     return ret;
1013 }
1014 
1015 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1016     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1017     BdrvRequestFlags flags)
1018 {
1019     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1020         return -EINVAL;
1021     }
1022 
1023     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1024                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1025 }
1026 
1027 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1028     int nb_sectors, QEMUIOVector *qiov)
1029 {
1030     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1031 
1032     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1033 }
1034 
1035 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
1036     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1037 {
1038     trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
1039 
1040     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1041                             BDRV_REQ_NO_SERIALISING);
1042 }
1043 
1044 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1045     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1046 {
1047     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1048 
1049     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1050                             BDRV_REQ_COPY_ON_READ);
1051 }
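
/*
 * Illustrative sketch of a caller in coroutine context, assuming an opened
 * BlockDriverState *bs and a buffer buf of len bytes, len being a multiple
 * of BDRV_SECTOR_SIZE:
 *
 *     QEMUIOVector qiov;
 *     struct iovec iov = {
 *         .iov_base = buf,
 *         .iov_len  = len,
 *     };
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     ret = bdrv_co_readv(bs, sector_num, len / BDRV_SECTOR_SIZE, &qiov);
 */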
1052 
1053 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1054 
1055 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1056     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
1057 {
1058     BlockDriver *drv = bs->drv;
1059     QEMUIOVector qiov;
1060     struct iovec iov = {0};
1061     int ret = 0;
1062 
1063     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
1064                                         BDRV_REQUEST_MAX_SECTORS);
1065 
1066     while (nb_sectors > 0 && !ret) {
1067         int num = nb_sectors;
1068 
1069         /* Align request.  Block drivers can expect the "bulk" of the request
1070          * to be aligned.
1071          */
1072         if (bs->bl.write_zeroes_alignment
1073             && num > bs->bl.write_zeroes_alignment) {
1074             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
1075                 /* Make a small request up to the first aligned sector.  */
1076                 num = bs->bl.write_zeroes_alignment;
1077                 num -= sector_num % bs->bl.write_zeroes_alignment;
1078             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
1079                 /* Shorten the request to the last aligned sector.  num cannot
1080                  * underflow because num > bs->bl.write_zeroes_alignment.
1081                  */
1082                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
1083             }
1084         }
1085 
1086         /* limit request size */
1087         if (num > max_write_zeroes) {
1088             num = max_write_zeroes;
1089         }
1090 
1091         ret = -ENOTSUP;
1092         /* First try the efficient write zeroes operation */
1093         if (drv->bdrv_co_write_zeroes) {
1094             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
1095         }
1096 
1097         if (ret == -ENOTSUP) {
1098             /* Fall back to bounce buffer if write zeroes is unsupported */
1099             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1100                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1101             num = MIN(num, max_xfer_len);
1102             iov.iov_len = num * BDRV_SECTOR_SIZE;
1103             if (iov.iov_base == NULL) {
1104                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
1105                 if (iov.iov_base == NULL) {
1106                     ret = -ENOMEM;
1107                     goto fail;
1108                 }
1109                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
1110             }
1111             qemu_iovec_init_external(&qiov, &iov, 1);
1112 
1113             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
1114 
1115             /* Keep bounce buffer around if it is big enough for
1116              * all future requests.
1117              */
1118             if (num < max_xfer_len) {
1119                 qemu_vfree(iov.iov_base);
1120                 iov.iov_base = NULL;
1121             }
1122         }
1123 
1124         sector_num += num;
1125         nb_sectors -= num;
1126     }
1127 
1128 fail:
1129     qemu_vfree(iov.iov_base);
1130     return ret;
1131 }
1132 
1133 /*
1134  * Forwards an already correctly aligned write request to the BlockDriver.
1135  */
1136 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1137     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1138     QEMUIOVector *qiov, int flags)
1139 {
1140     BlockDriver *drv = bs->drv;
1141     bool waited;
1142     int ret;
1143 
1144     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1145     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1146 
1147     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1148     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1149     assert(!qiov || bytes == qiov->size);
1150 
1151     waited = wait_serialising_requests(req);
1152     assert(!waited || !req->serialising);
1153     assert(req->overlap_offset <= offset);
1154     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1155 
1156     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1157 
1158     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1159         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
1160         qemu_iovec_is_zero(qiov)) {
1161         flags |= BDRV_REQ_ZERO_WRITE;
1162         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1163             flags |= BDRV_REQ_MAY_UNMAP;
1164         }
1165     }
1166 
1167     if (ret < 0) {
1168         /* Do nothing, write notifier decided to fail this request */
1169     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1170         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1171         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
1172     } else {
1173         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1174         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1175     }
1176     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1177 
1178     if (ret == 0 && !bs->enable_write_cache) {
1179         ret = bdrv_co_flush(bs);
1180     }
1181 
1182     bdrv_set_dirty(bs, sector_num, nb_sectors);
1183 
1184     if (bs->wr_highest_offset < offset + bytes) {
1185         bs->wr_highest_offset = offset + bytes;
1186     }
1187 
1188     if (ret >= 0) {
1189         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1190     }
1191 
1192     return ret;
1193 }
1194 
1195 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1196                                                 int64_t offset,
1197                                                 unsigned int bytes,
1198                                                 BdrvRequestFlags flags,
1199                                                 BdrvTrackedRequest *req)
1200 {
1201     uint8_t *buf = NULL;
1202     QEMUIOVector local_qiov;
1203     struct iovec iov;
1204     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1205     unsigned int head_padding_bytes, tail_padding_bytes;
1206     int ret = 0;
1207 
1208     head_padding_bytes = offset & (align - 1);
1209     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1210 
1211 
1212     assert(flags & BDRV_REQ_ZERO_WRITE);
1213     if (head_padding_bytes || tail_padding_bytes) {
1214         buf = qemu_blockalign(bs, align);
1215         iov = (struct iovec) {
1216             .iov_base   = buf,
1217             .iov_len    = align,
1218         };
1219         qemu_iovec_init_external(&local_qiov, &iov, 1);
1220     }
1221     if (head_padding_bytes) {
1222         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1223 
1224         /* RMW the unaligned part before head. */
1225         mark_request_serialising(req, align);
1226         wait_serialising_requests(req);
1227         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1228         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1229                                   align, &local_qiov, 0);
1230         if (ret < 0) {
1231             goto fail;
1232         }
1233         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1234 
1235         memset(buf + head_padding_bytes, 0, zero_bytes);
1236         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1237                                    &local_qiov,
1238                                    flags & ~BDRV_REQ_ZERO_WRITE);
1239         if (ret < 0) {
1240             goto fail;
1241         }
1242         offset += zero_bytes;
1243         bytes -= zero_bytes;
1244     }
1245 
1246     assert(!bytes || (offset & (align - 1)) == 0);
1247     if (bytes >= align) {
1248         /* Write the aligned part in the middle. */
1249         uint64_t aligned_bytes = bytes & ~(align - 1);
1250         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1251                                    NULL, flags);
1252         if (ret < 0) {
1253             goto fail;
1254         }
1255         bytes -= aligned_bytes;
1256         offset += aligned_bytes;
1257     }
1258 
1259     assert(!bytes || (offset & (align - 1)) == 0);
1260     if (bytes) {
1261         assert(align == tail_padding_bytes + bytes);
1262         /* RMW the unaligned part after tail. */
1263         mark_request_serialising(req, align);
1264         wait_serialising_requests(req);
1265         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1266         ret = bdrv_aligned_preadv(bs, req, offset, align,
1267                                   align, &local_qiov, 0);
1268         if (ret < 0) {
1269             goto fail;
1270         }
1271         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1272 
1273         memset(buf, 0, bytes);
1274         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1275                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1276     }
1277 fail:
1278     qemu_vfree(buf);
1279     return ret;
1280 
1281 }
1282 
1283 /*
1284  * Handle a write request in coroutine context
1285  */
1286 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
1287     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1288     BdrvRequestFlags flags)
1289 {
1290     BdrvTrackedRequest req;
1291     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1292     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1293     uint8_t *head_buf = NULL;
1294     uint8_t *tail_buf = NULL;
1295     QEMUIOVector local_qiov;
1296     bool use_local_qiov = false;
1297     int ret;
1298 
1299     if (!bs->drv) {
1300         return -ENOMEDIUM;
1301     }
1302     if (bs->read_only) {
1303         return -EPERM;
1304     }
1305     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1306 
1307     ret = bdrv_check_byte_request(bs, offset, bytes);
1308     if (ret < 0) {
1309         return ret;
1310     }
1311 
1312     /* throttling disk I/O */
1313     if (bs->io_limits_enabled) {
1314         throttle_group_co_io_limits_intercept(bs, bytes, true);
1315     }
1316 
1317     /*
1318      * Align write if necessary by performing a read-modify-write cycle.
1319      * Pad qiov with the read parts and be sure to have a tracked request not
1320      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1321      */
1322     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1323 
1324     if (!qiov) {
1325         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1326         goto out;
1327     }
1328 
1329     if (offset & (align - 1)) {
1330         QEMUIOVector head_qiov;
1331         struct iovec head_iov;
1332 
1333         mark_request_serialising(&req, align);
1334         wait_serialising_requests(&req);
1335 
1336         head_buf = qemu_blockalign(bs, align);
1337         head_iov = (struct iovec) {
1338             .iov_base   = head_buf,
1339             .iov_len    = align,
1340         };
1341         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1342 
1343         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1344         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1345                                   align, &head_qiov, 0);
1346         if (ret < 0) {
1347             goto fail;
1348         }
1349         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1350 
1351         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1352         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1353         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1354         use_local_qiov = true;
1355 
1356         bytes += offset & (align - 1);
1357         offset = offset & ~(align - 1);
1358     }
1359 
1360     if ((offset + bytes) & (align - 1)) {
1361         QEMUIOVector tail_qiov;
1362         struct iovec tail_iov;
1363         size_t tail_bytes;
1364         bool waited;
1365 
1366         mark_request_serialising(&req, align);
1367         waited = wait_serialising_requests(&req);
1368         assert(!waited || !use_local_qiov);
1369 
1370         tail_buf = qemu_blockalign(bs, align);
1371         tail_iov = (struct iovec) {
1372             .iov_base   = tail_buf,
1373             .iov_len    = align,
1374         };
1375         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1376 
1377         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1378         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1379                                   align, &tail_qiov, 0);
1380         if (ret < 0) {
1381             goto fail;
1382         }
1383         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1384 
1385         if (!use_local_qiov) {
1386             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1387             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1388             use_local_qiov = true;
1389         }
1390 
1391         tail_bytes = (offset + bytes) & (align - 1);
1392         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1393 
1394         bytes = ROUND_UP(bytes, align);
1395     }
1396 
1397     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1398                                use_local_qiov ? &local_qiov : qiov,
1399                                flags);
1400 
1401 fail:
1402 
1403     if (use_local_qiov) {
1404         qemu_iovec_destroy(&local_qiov);
1405     }
1406     qemu_vfree(head_buf);
1407     qemu_vfree(tail_buf);
1408 out:
1409     tracked_request_end(&req);
1410     return ret;
1411 }
1412 
1413 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1414     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1415     BdrvRequestFlags flags)
1416 {
1417     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1418         return -EINVAL;
1419     }
1420 
1421     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1422                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1423 }
1424 
1425 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1426     int nb_sectors, QEMUIOVector *qiov)
1427 {
1428     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1429 
1430     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1431 }
1432 
1433 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1434                                       int64_t sector_num, int nb_sectors,
1435                                       BdrvRequestFlags flags)
1436 {
1437     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
1438 
1439     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1440         flags &= ~BDRV_REQ_MAY_UNMAP;
1441     }
1442 
1443     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1444                              BDRV_REQ_ZERO_WRITE | flags);
1445 }
1446 
1447 int bdrv_flush_all(void)
1448 {
1449     BlockDriverState *bs = NULL;
1450     int result = 0;
1451 
1452     while ((bs = bdrv_next(bs))) {
1453         AioContext *aio_context = bdrv_get_aio_context(bs);
1454         int ret;
1455 
1456         aio_context_acquire(aio_context);
1457         ret = bdrv_flush(bs);
1458         if (ret < 0 && !result) {
1459             result = ret;
1460         }
1461         aio_context_release(aio_context);
1462     }
1463 
1464     return result;
1465 }
1466 
1467 typedef struct BdrvCoGetBlockStatusData {
1468     BlockDriverState *bs;
1469     BlockDriverState *base;
1470     BlockDriverState **file;
1471     int64_t sector_num;
1472     int nb_sectors;
1473     int *pnum;
1474     int64_t ret;
1475     bool done;
1476 } BdrvCoGetBlockStatusData;
1477 
1478 /*
1479  * Returns the allocation status of the specified sectors.
1480  * Drivers not implementing the functionality are assumed to not support
1481  * backing files, hence all their sectors are reported as allocated.
1482  *
1483  * If 'sector_num' is beyond the end of the disk image the return value is 0
1484  * and 'pnum' is set to 0.
1485  *
1486  * 'pnum' is set to the number of sectors (including and immediately following
1487  * the specified sector) that are known to be in the same
1488  * allocated/unallocated state.
1489  *
1490  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1491  * beyond the end of the disk image it will be clamped.
1492  *
1493  * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is
1494  * set, 'file' points to the BDS in which the sector range is allocated.
1495  */
1496 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1497                                                      int64_t sector_num,
1498                                                      int nb_sectors, int *pnum,
1499                                                      BlockDriverState **file)
1500 {
1501     int64_t total_sectors;
1502     int64_t n;
1503     int64_t ret, ret2;
1504 
1505     total_sectors = bdrv_nb_sectors(bs);
1506     if (total_sectors < 0) {
1507         return total_sectors;
1508     }
1509 
1510     if (sector_num >= total_sectors) {
1511         *pnum = 0;
1512         return 0;
1513     }
1514 
1515     n = total_sectors - sector_num;
1516     if (n < nb_sectors) {
1517         nb_sectors = n;
1518     }
1519 
1520     if (!bs->drv->bdrv_co_get_block_status) {
1521         *pnum = nb_sectors;
1522         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1523         if (bs->drv->protocol_name) {
1524             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1525         }
1526         return ret;
1527     }
1528 
1529     *file = NULL;
1530     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1531                                             file);
1532     if (ret < 0) {
1533         *pnum = 0;
1534         return ret;
1535     }
1536 
1537     if (ret & BDRV_BLOCK_RAW) {
1538         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1539         return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1540                                      *pnum, pnum, file);
1541     }
1542 
1543     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1544         ret |= BDRV_BLOCK_ALLOCATED;
1545     } else {
1546         if (bdrv_unallocated_blocks_are_zero(bs)) {
1547             ret |= BDRV_BLOCK_ZERO;
1548         } else if (bs->backing) {
1549             BlockDriverState *bs2 = bs->backing->bs;
1550             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1551             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1552                 ret |= BDRV_BLOCK_ZERO;
1553             }
1554         }
1555     }
1556 
1557     if (*file && *file != bs &&
1558         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1559         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1560         BlockDriverState *file2;
1561         int file_pnum;
1562 
1563         ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1564                                         *pnum, &file_pnum, &file2);
1565         if (ret2 >= 0) {
1566             /* Ignore errors.  This is just providing extra information, it
1567              * is useful but not necessary.
1568              */
1569             if (!file_pnum) {
1570                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1571                  * perfectly valid for the format block driver to point to such
1572                  * offsets, so catch it and mark everything as zero */
1573                 ret |= BDRV_BLOCK_ZERO;
1574             } else {
1575                 /* Limit request to the range reported by the protocol driver */
1576                 *pnum = file_pnum;
1577                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1578             }
1579         }
1580     }
1581 
1582     return ret;
1583 }
1584 
1585 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1586         BlockDriverState *base,
1587         int64_t sector_num,
1588         int nb_sectors,
1589         int *pnum,
1590         BlockDriverState **file)
1591 {
1592     BlockDriverState *p;
1593     int64_t ret = 0;
1594 
1595     assert(bs != base);
1596     for (p = bs; p != base; p = backing_bs(p)) {
1597         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1598         if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1599             break;
1600         }
1601         /* [sector_num, *pnum] is unallocated on this layer, which may be only
1602          * the first part of [sector_num, nb_sectors].  */
1603         nb_sectors = MIN(nb_sectors, *pnum);
1604     }
1605     return ret;
1606 }
1607 
1608 /* Coroutine wrapper for bdrv_get_block_status_above() */
1609 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1610 {
1611     BdrvCoGetBlockStatusData *data = opaque;
1612 
1613     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1614                                                data->sector_num,
1615                                                data->nb_sectors,
1616                                                data->pnum,
1617                                                data->file);
1618     data->done = true;
1619 }
1620 
1621 /*
1622  * Synchronous wrapper around bdrv_co_get_block_status_above().
1623  *
1624  * See bdrv_co_get_block_status_above() for details.
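 *
 * If called outside coroutine context, a coroutine is created and the
 * BDS's AioContext is polled until the request completes.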
1625  */
1626 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1627                                     BlockDriverState *base,
1628                                     int64_t sector_num,
1629                                     int nb_sectors, int *pnum,
1630                                     BlockDriverState **file)
1631 {
1632     Coroutine *co;
1633     BdrvCoGetBlockStatusData data = {
1634         .bs = bs,
1635         .base = base,
1636         .file = file,
1637         .sector_num = sector_num,
1638         .nb_sectors = nb_sectors,
1639         .pnum = pnum,
1640         .done = false,
1641     };
1642 
1643     if (qemu_in_coroutine()) {
1644         /* Fast-path if already in coroutine context */
1645         bdrv_get_block_status_above_co_entry(&data);
1646     } else {
1647         AioContext *aio_context = bdrv_get_aio_context(bs);
1648 
1649         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1650         qemu_coroutine_enter(co, &data);
1651         while (!data.done) {
1652             aio_poll(aio_context, true);
1653         }
1654     }
1655     return data.ret;
1656 }
1657 
1658 int64_t bdrv_get_block_status(BlockDriverState *bs,
1659                               int64_t sector_num,
1660                               int nb_sectors, int *pnum,
1661                               BlockDriverState **file)
1662 {
1663     return bdrv_get_block_status_above(bs, backing_bs(bs),
1664                                        sector_num, nb_sectors, pnum, file);
1665 }
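/*
 * Illustrative sketch of a hypothetical caller: one way the status word
 * returned by bdrv_get_block_status() can be decoded.  The flag tests and
 * the ret >> BDRV_SECTOR_BITS conversion mirror what
 * bdrv_co_get_block_status() does above.
 *
 *     BlockDriverState *file;
 *     int pnum;
 *     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors,
 *                                         &pnum, &file);
 *     if (ret < 0) {
 *         return ret;                 // query failed
 *     }
 *     if (ret & BDRV_BLOCK_ZERO) {
 *         // the next pnum sectors read as zeroes
 *     } else if (ret & BDRV_BLOCK_DATA) {
 *         // the next pnum sectors contain data
 *     }
 *     if ((ret & BDRV_BLOCK_OFFSET_VALID) && file) {
 *         // data lives in 'file', starting at host sector
 *         // ret >> BDRV_SECTOR_BITS
 *     }
 */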
1666 
1667 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1668                                    int nb_sectors, int *pnum)
1669 {
1670     BlockDriverState *file;
1671     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1672                                         &file);
1673     if (ret < 0) {
1674         return ret;
1675     }
1676     return !!(ret & BDRV_BLOCK_ALLOCATED);
1677 }
1678 
1679 /*
1680  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1681  *
1682  * Return true if the given sector is allocated in any image between
1683  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
1684  * sector is allocated in any image of the chain.  Return false otherwise,
1685  * or a negative errno on failure.
1686  * 'pnum' is set to the number of sectors (including and immediately following
1687  *  the specified sector) that are known to be in the same
1688  *  allocated/unallocated state.
1689  *
1690  */
1691 int bdrv_is_allocated_above(BlockDriverState *top,
1692                             BlockDriverState *base,
1693                             int64_t sector_num,
1694                             int nb_sectors, int *pnum)
1695 {
1696     BlockDriverState *intermediate;
1697     int ret, n = nb_sectors;
1698 
1699     intermediate = top;
1700     while (intermediate && intermediate != base) {
1701         int pnum_inter;
1702         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1703                                 &pnum_inter);
1704         if (ret < 0) {
1705             return ret;
1706         } else if (ret) {
1707             *pnum = pnum_inter;
1708             return 1;
1709         }
1710 
1711         /*
1712          * [sector_num, nb_sectors] is unallocated on top but intermediate
1713          * might have
1714          *
1715          * [sector_num+x, nb_sectors] allocated.
1716          */
1717         if (n > pnum_inter &&
1718             (intermediate == top ||
1719              sector_num + pnum_inter < intermediate->total_sectors)) {
1720             n = pnum_inter;
1721         }
1722 
1723         intermediate = backing_bs(intermediate);
1724     }
1725 
1726     *pnum = n;
1727     return 0;
1728 }
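/*
 * Hypothetical usage sketch: scanning a range and advancing by the *pnum
 * value that bdrv_is_allocated_above() fills in, so runs of sectors with a
 * uniform allocation state are skipped in one step.  'start', 'end' and the
 * error handling are placeholders.
 *
 *     int64_t sector = start;
 *     while (sector < end) {
 *         int pnum;
 *         int nb = MIN(end - sector, BDRV_REQUEST_MAX_SECTORS);
 *         int ret = bdrv_is_allocated_above(top, base, sector, nb, &pnum);
 *         if (ret < 0) {
 *             return ret;
 *         }
 *         if (ret) {
 *             // sectors [sector, sector + pnum) are allocated somewhere
 *             // between top and base (inclusive)
 *         }
 *         sector += pnum;
 *     }
 */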
1729 
1730 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1731                           const uint8_t *buf, int nb_sectors)
1732 {
1733     BlockDriver *drv = bs->drv;
1734     int ret;
1735 
1736     if (!drv) {
1737         return -ENOMEDIUM;
1738     }
1739     if (!drv->bdrv_write_compressed) {
1740         return -ENOTSUP;
1741     }
1742     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1743     if (ret < 0) {
1744         return ret;
1745     }
1746 
1747     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1748 
1749     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1750 }
1751 
1752 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1753                       int64_t pos, int size)
1754 {
1755     QEMUIOVector qiov;
1756     struct iovec iov = {
1757         .iov_base   = (void *) buf,
1758         .iov_len    = size,
1759     };
1760 
1761     qemu_iovec_init_external(&qiov, &iov, 1);
1762     return bdrv_writev_vmstate(bs, &qiov, pos);
1763 }
1764 
1765 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1766 {
1767     BlockDriver *drv = bs->drv;
1768 
1769     if (!drv) {
1770         return -ENOMEDIUM;
1771     } else if (drv->bdrv_save_vmstate) {
1772         return drv->bdrv_save_vmstate(bs, qiov, pos);
1773     } else if (bs->file) {
1774         return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
1775     }
1776 
1777     return -ENOTSUP;
1778 }
1779 
1780 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1781                       int64_t pos, int size)
1782 {
1783     BlockDriver *drv = bs->drv;
1784     if (!drv)
1785         return -ENOMEDIUM;
1786     if (drv->bdrv_load_vmstate)
1787         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1788     if (bs->file)
1789         return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
1790     return -ENOTSUP;
1791 }
1792 
1793 /**************************************************************/
1794 /* async I/Os */
1795 
1796 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1797                            QEMUIOVector *qiov, int nb_sectors,
1798                            BlockCompletionFunc *cb, void *opaque)
1799 {
1800     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1801 
1802     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1803                                  cb, opaque, false);
1804 }
1805 
1806 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1807                             QEMUIOVector *qiov, int nb_sectors,
1808                             BlockCompletionFunc *cb, void *opaque)
1809 {
1810     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1811 
1812     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1813                                  cb, opaque, true);
1814 }
1815 
1816 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
1817         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
1818         BlockCompletionFunc *cb, void *opaque)
1819 {
1820     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
1821 
1822     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
1823                                  BDRV_REQ_ZERO_WRITE | flags,
1824                                  cb, opaque, true);
1825 }
1826 
1827 
1828 typedef struct MultiwriteCB {
1829     int error;
1830     int num_requests;
1831     int num_callbacks;
1832     struct {
1833         BlockCompletionFunc *cb;
1834         void *opaque;
1835         QEMUIOVector *free_qiov;
1836     } callbacks[];
1837 } MultiwriteCB;
1838 
1839 static void multiwrite_user_cb(MultiwriteCB *mcb)
1840 {
1841     int i;
1842 
1843     for (i = 0; i < mcb->num_callbacks; i++) {
1844         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1845         if (mcb->callbacks[i].free_qiov) {
1846             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
1847         }
1848         g_free(mcb->callbacks[i].free_qiov);
1849     }
1850 }
1851 
1852 static void multiwrite_cb(void *opaque, int ret)
1853 {
1854     MultiwriteCB *mcb = opaque;
1855 
1856     trace_multiwrite_cb(mcb, ret);
1857 
1858     if (ret < 0 && !mcb->error) {
1859         mcb->error = ret;
1860     }
1861 
1862     mcb->num_requests--;
1863     if (mcb->num_requests == 0) {
1864         multiwrite_user_cb(mcb);
1865         g_free(mcb);
1866     }
1867 }
1868 
1869 static int multiwrite_req_compare(const void *a, const void *b)
1870 {
1871     const BlockRequest *req1 = a, *req2 = b;
1872 
1873     /*
1874      * Note that we can't simply subtract req2->sector from req1->sector
1875      * here as that could overflow the return value.
1876      */
1877     if (req1->sector > req2->sector) {
1878         return 1;
1879     } else if (req1->sector < req2->sector) {
1880         return -1;
1881     } else {
1882         return 0;
1883     }
1884 }
1885 
1886 /*
1887  * Takes a bunch of requests and tries to merge them. Returns the number of
1888  * requests that remain after merging.
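 *
 * For example, a write of 8 sectors at sector 0 and a write of 8 sectors at
 * sector 8 satisfy reqs[i].sector <= oldreq_last and are candidates for
 * merging into a single 16-sector request; requests separated by an
 * unwritten gap are never merged.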
1889  */
1890 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
1891     int num_reqs, MultiwriteCB *mcb)
1892 {
1893     int i, outidx;
1894 
1895     // Sort requests by start sector
1896     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
1897 
1898     // Check if adjacent requests touch the same clusters. If so, combine them.
1899     // Only sequential or overlapping requests are merged, so no gap filling is needed.
1900     outidx = 0;
1901     for (i = 1; i < num_reqs; i++) {
1902         int merge = 0;
1903         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
1904 
1905         // Handle exactly sequential writes and overlapping writes.
1906         if (reqs[i].sector <= oldreq_last) {
1907             merge = 1;
1908         }
1909 
1910         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
1911             bs->bl.max_iov) {
1912             merge = 0;
1913         }
1914 
1915         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
1916             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
1917             merge = 0;
1918         }
1919 
1920         if (merge) {
1921             size_t size;
1922             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
1923             qemu_iovec_init(qiov,
1924                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
1925 
1926             // Add the first request to the merged one. If the requests are
1927             // overlapping, drop the last sectors of the first request.
1928             size = (reqs[i].sector - reqs[outidx].sector) << 9;
1929             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
1930 
1931             // We should not need to add any zeros between the two requests
1932             assert(reqs[i].sector <= oldreq_last);
1933 
1934             // Add the second request
1935             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
1936 
1937             // Add tail of first request, if necessary
1938             if (qiov->size < reqs[outidx].qiov->size) {
1939                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
1940                                   reqs[outidx].qiov->size - qiov->size);
1941             }
1942 
1943             reqs[outidx].nb_sectors = qiov->size >> 9;
1944             reqs[outidx].qiov = qiov;
1945 
1946             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
1947         } else {
1948             outidx++;
1949             reqs[outidx].sector     = reqs[i].sector;
1950             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
1951             reqs[outidx].qiov       = reqs[i].qiov;
1952         }
1953     }
1954 
1955     if (bs->blk) {
1956         block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
1957                               num_reqs - outidx - 1);
1958     }
1959 
1960     return outidx + 1;
1961 }
1962 
1963 /*
1964  * Submit multiple AIO write requests at once.
1965  *
1966  * On success, the function returns 0 and all requests in the reqs array have
1967  * been submitted. On error, this function returns -1 and individual requests
1968  * may or may not have been submitted; in particular, the callback will be
1969  * invoked for some requests but not for others. The caller must check the
1970  * error field of each BlockRequest to know which callbacks to wait for
1971  * (if error != 0, no callback will be invoked for that request).
1972  *
1973  * The implementation may modify the contents of the reqs array, e.g. to merge
1974  * requests. However, the fields opaque and error are left unmodified as they
1975  * are used to signal failure for a single request to the caller.
1976  */
1977 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
1978 {
1979     MultiwriteCB *mcb;
1980     int i;
1981 
1982     /* don't submit writes if we don't have a medium */
1983     if (bs->drv == NULL) {
1984         for (i = 0; i < num_reqs; i++) {
1985             reqs[i].error = -ENOMEDIUM;
1986         }
1987         return -1;
1988     }
1989 
1990     if (num_reqs == 0) {
1991         return 0;
1992     }
1993 
1994     // Create MultiwriteCB structure
1995     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
1996     mcb->num_requests = 0;
1997     mcb->num_callbacks = num_reqs;
1998 
1999     for (i = 0; i < num_reqs; i++) {
2000         mcb->callbacks[i].cb = reqs[i].cb;
2001         mcb->callbacks[i].opaque = reqs[i].opaque;
2002     }
2003 
2004     // Check for mergeable requests
2005     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2006 
2007     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2008 
2009     /* Run the aio requests. */
2010     mcb->num_requests = num_reqs;
2011     for (i = 0; i < num_reqs; i++) {
2012         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
2013                               reqs[i].nb_sectors, reqs[i].flags,
2014                               multiwrite_cb, mcb,
2015                               true);
2016     }
2017 
2018     return 0;
2019 }
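/*
 * Hypothetical caller sketch for bdrv_aio_multiwrite(): my_cb, ctx0/ctx1 and
 * qiov0/qiov1 are placeholders.  Field usage matches the BlockRequest fields
 * consumed above; the error contract is the one documented before the
 * function.
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8,  .qiov = qiov0,
 *           .cb = my_cb, .opaque = ctx0 },
 *         { .sector = 64, .nb_sectors = 16, .qiov = qiov1,
 *           .cb = my_cb, .opaque = ctx1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // only entries with reqs[i].error == 0 will still get a callback
 *     }
 */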
2020 
2021 void bdrv_aio_cancel(BlockAIOCB *acb)
2022 {
2023     qemu_aio_ref(acb);
2024     bdrv_aio_cancel_async(acb);
2025     while (acb->refcnt > 1) {
2026         if (acb->aiocb_info->get_aio_context) {
2027             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2028         } else if (acb->bs) {
2029             aio_poll(bdrv_get_aio_context(acb->bs), true);
2030         } else {
2031             abort();
2032         }
2033     }
2034     qemu_aio_unref(acb);
2035 }
2036 
2037 /* Async version of aio cancel. The caller is not blocked if the acb implements
2038  * cancel_async; otherwise we do nothing and let the request complete normally.
2039  * In either case the completion callback must be called. */
2040 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2041 {
2042     if (acb->aiocb_info->cancel_async) {
2043         acb->aiocb_info->cancel_async(acb);
2044     }
2045 }
2046 
2047 /**************************************************************/
2048 /* async block device emulation */
2049 
2050 typedef struct BlockAIOCBSync {
2051     BlockAIOCB common;
2052     QEMUBH *bh;
2053     int ret;
2054     /* vector translation state */
2055     QEMUIOVector *qiov;
2056     uint8_t *bounce;
2057     int is_write;
2058 } BlockAIOCBSync;
2059 
2060 static const AIOCBInfo bdrv_em_aiocb_info = {
2061     .aiocb_size         = sizeof(BlockAIOCBSync),
2062 };
2063 
2064 static void bdrv_aio_bh_cb(void *opaque)
2065 {
2066     BlockAIOCBSync *acb = opaque;
2067 
2068     if (!acb->is_write && acb->ret >= 0) {
2069         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
2070     }
2071     qemu_vfree(acb->bounce);
2072     acb->common.cb(acb->common.opaque, acb->ret);
2073     qemu_bh_delete(acb->bh);
2074     acb->bh = NULL;
2075     qemu_aio_unref(acb);
2076 }
2077 
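/*
 * Emulate asynchronous I/O on top of a driver that only provides synchronous
 * bdrv_read/bdrv_write: the request is carried out immediately through a
 * bounce buffer and the completion callback is delivered from a bottom half
 * (bdrv_aio_bh_cb above), so the caller still sees AIO semantics.
 */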
2078 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2079                                       int64_t sector_num,
2080                                       QEMUIOVector *qiov,
2081                                       int nb_sectors,
2082                                       BlockCompletionFunc *cb,
2083                                       void *opaque,
2084                                       int is_write)
2086 {
2087     BlockAIOCBSync *acb;
2088 
2089     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
2090     acb->is_write = is_write;
2091     acb->qiov = qiov;
2092     acb->bounce = qemu_try_blockalign(bs, qiov->size);
2093     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
2094 
2095     if (acb->bounce == NULL) {
2096         acb->ret = -ENOMEM;
2097     } else if (is_write) {
2098         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
2099         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2100     } else {
2101         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2102     }
2103 
2104     qemu_bh_schedule(acb->bh);
2105 
2106     return &acb->common;
2107 }
2108 
2109 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2110         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2111         BlockCompletionFunc *cb, void *opaque)
2112 {
2113     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2114 }
2115 
2116 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2117         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2118         BlockCompletionFunc *cb, void *opaque)
2119 {
2120     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2121 }
2122 
2123 
2124 typedef struct BlockAIOCBCoroutine {
2125     BlockAIOCB common;
2126     BlockRequest req;
2127     bool is_write;
2128     bool need_bh;
2129     bool *done;
2130     QEMUBH *bh;
2131 } BlockAIOCBCoroutine;
2132 
2133 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2134     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2135 };
2136 
2137 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2138 {
2139     if (!acb->need_bh) {
2140         acb->common.cb(acb->common.opaque, acb->req.error);
2141         qemu_aio_unref(acb);
2142     }
2143 }
2144 
2145 static void bdrv_co_em_bh(void *opaque)
2146 {
2147     BlockAIOCBCoroutine *acb = opaque;
2148 
2149     assert(!acb->need_bh);
2150     qemu_bh_delete(acb->bh);
2151     bdrv_co_complete(acb);
2152 }
2153 
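/*
 * Make sure the completion callback never runs before the submitting
 * function (e.g. bdrv_co_aio_rw_vector() below) has returned the AIOCB to
 * its caller: while acb->need_bh is set, bdrv_co_complete() does nothing;
 * here need_bh is cleared and, if the request already finished during
 * submission, completion is deferred to a bottom half.
 */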
2154 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2155 {
2156     acb->need_bh = false;
2157     if (acb->req.error != -EINPROGRESS) {
2158         BlockDriverState *bs = acb->common.bs;
2159 
2160         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2161         qemu_bh_schedule(acb->bh);
2162     }
2163 }
2164 
2165 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2166 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2167 {
2168     BlockAIOCBCoroutine *acb = opaque;
2169     BlockDriverState *bs = acb->common.bs;
2170 
2171     if (!acb->is_write) {
2172         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2173             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2174     } else {
2175         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2176             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2177     }
2178 
2179     bdrv_co_complete(acb);
2180 }
2181 
2182 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2183                                          int64_t sector_num,
2184                                          QEMUIOVector *qiov,
2185                                          int nb_sectors,
2186                                          BdrvRequestFlags flags,
2187                                          BlockCompletionFunc *cb,
2188                                          void *opaque,
2189                                          bool is_write)
2190 {
2191     Coroutine *co;
2192     BlockAIOCBCoroutine *acb;
2193 
2194     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2195     acb->need_bh = true;
2196     acb->req.error = -EINPROGRESS;
2197     acb->req.sector = sector_num;
2198     acb->req.nb_sectors = nb_sectors;
2199     acb->req.qiov = qiov;
2200     acb->req.flags = flags;
2201     acb->is_write = is_write;
2202 
2203     co = qemu_coroutine_create(bdrv_co_do_rw);
2204     qemu_coroutine_enter(co, acb);
2205 
2206     bdrv_co_maybe_schedule_bh(acb);
2207     return &acb->common;
2208 }
2209 
2210 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2211 {
2212     BlockAIOCBCoroutine *acb = opaque;
2213     BlockDriverState *bs = acb->common.bs;
2214 
2215     acb->req.error = bdrv_co_flush(bs);
2216     bdrv_co_complete(acb);
2217 }
2218 
2219 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2220         BlockCompletionFunc *cb, void *opaque)
2221 {
2222     Coroutine *co;
2223     BlockAIOCBCoroutine *acb;
2224 
2225     trace_bdrv_aio_flush(bs, opaque);
2226 
2227     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2228     acb->need_bh = true;
2229     acb->req.error = -EINPROGRESS;
2230 
2231     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2232     qemu_coroutine_enter(co, acb);
2233 
2234     bdrv_co_maybe_schedule_bh(acb);
2235     return &acb->common;
2236 }
2237 
2238 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2239 {
2240     BlockAIOCBCoroutine *acb = opaque;
2241     BlockDriverState *bs = acb->common.bs;
2242 
2243     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2244     bdrv_co_complete(acb);
2245 }
2246 
2247 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2248         int64_t sector_num, int nb_sectors,
2249         BlockCompletionFunc *cb, void *opaque)
2250 {
2251     Coroutine *co;
2252     BlockAIOCBCoroutine *acb;
2253 
2254     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2255 
2256     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2257     acb->need_bh = true;
2258     acb->req.error = -EINPROGRESS;
2259     acb->req.sector = sector_num;
2260     acb->req.nb_sectors = nb_sectors;
2261     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2262     qemu_coroutine_enter(co, acb);
2263 
2264     bdrv_co_maybe_schedule_bh(acb);
2265     return &acb->common;
2266 }
2267 
2268 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2269                    BlockCompletionFunc *cb, void *opaque)
2270 {
2271     BlockAIOCB *acb;
2272 
2273     acb = g_malloc(aiocb_info->aiocb_size);
2274     acb->aiocb_info = aiocb_info;
2275     acb->bs = bs;
2276     acb->cb = cb;
2277     acb->opaque = opaque;
2278     acb->refcnt = 1;
2279     return acb;
2280 }
2281 
2282 void qemu_aio_ref(void *p)
2283 {
2284     BlockAIOCB *acb = p;
2285     acb->refcnt++;
2286 }
2287 
2288 void qemu_aio_unref(void *p)
2289 {
2290     BlockAIOCB *acb = p;
2291     assert(acb->refcnt > 0);
2292     if (--acb->refcnt == 0) {
2293         g_free(acb);
2294     }
2295 }
2296 
2297 /**************************************************************/
2298 /* Coroutine block device emulation */
2299 
2300 typedef struct CoroutineIOCompletion {
2301     Coroutine *coroutine;
2302     int ret;
2303 } CoroutineIOCompletion;
2304 
2305 static void bdrv_co_io_em_complete(void *opaque, int ret)
2306 {
2307     CoroutineIOCompletion *co = opaque;
2308 
2309     co->ret = ret;
2310     qemu_coroutine_enter(co->coroutine, NULL);
2311 }
2312 
2313 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
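/*
 * Emulate coroutine I/O on top of a driver's AIO interface: issue the
 * driver's bdrv_aio_readv/bdrv_aio_writev and yield; bdrv_co_io_em_complete()
 * re-enters the coroutine with the request's return value once the AIO
 * request finishes.
 */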
2314                                       int nb_sectors, QEMUIOVector *iov,
2315                                       bool is_write)
2316 {
2317     CoroutineIOCompletion co = {
2318         .coroutine = qemu_coroutine_self(),
2319     };
2320     BlockAIOCB *acb;
2321 
2322     if (is_write) {
2323         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2324                                        bdrv_co_io_em_complete, &co);
2325     } else {
2326         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2327                                       bdrv_co_io_em_complete, &co);
2328     }
2329 
2330     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2331     if (!acb) {
2332         return -EIO;
2333     }
2334     qemu_coroutine_yield();
2335 
2336     return co.ret;
2337 }
2338 
2339 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2340                                          int64_t sector_num, int nb_sectors,
2341                                          QEMUIOVector *iov)
2342 {
2343     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2344 }
2345 
2346 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2347                                          int64_t sector_num, int nb_sectors,
2348                                          QEMUIOVector *iov)
2349 {
2350     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2351 }
2352 
2353 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2354 {
2355     RwCo *rwco = opaque;
2356 
2357     rwco->ret = bdrv_co_flush(rwco->bs);
2358 }
2359 
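/*
 * Flush in stages: first ask the format driver to write its internal state
 * out to the OS (bdrv_co_flush_to_os), then force the data to stable storage
 * (bdrv_co_flush_to_disk or the driver's AIO flush), and finally recurse
 * into bs->file so the protocol layer is flushed as well.  With
 * BDRV_O_NO_FLUSH (cache=unsafe) the flush-to-disk stage is skipped.
 */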
2360 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2361 {
2362     int ret;
2363     BdrvTrackedRequest req;
2364 
2365     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2366         bdrv_is_sg(bs)) {
2367         return 0;
2368     }
2369 
2370     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2371     /* Write back cached data to the OS even with cache=unsafe */
2372     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2373     if (bs->drv->bdrv_co_flush_to_os) {
2374         ret = bs->drv->bdrv_co_flush_to_os(bs);
2375         if (ret < 0) {
2376             goto out;
2377         }
2378     }
2379 
2380     /* But don't actually force it to the disk with cache=unsafe */
2381     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2382         goto flush_parent;
2383     }
2384 
2385     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2386     if (bs->drv->bdrv_co_flush_to_disk) {
2387         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2388     } else if (bs->drv->bdrv_aio_flush) {
2389         BlockAIOCB *acb;
2390         CoroutineIOCompletion co = {
2391             .coroutine = qemu_coroutine_self(),
2392         };
2393 
2394         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2395         if (acb == NULL) {
2396             ret = -EIO;
2397         } else {
2398             qemu_coroutine_yield();
2399             ret = co.ret;
2400         }
2401     } else {
2402         /*
2403          * Some block drivers always operate in either writethrough or unsafe
2404          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2405          * know how the server works (because the behaviour is hardcoded or
2406          * depends on server-side configuration), so we can't ensure that
2407          * everything is safe on disk. Returning an error doesn't work because
2408          * that would break guests even if the server operates in writethrough
2409          * mode.
2410          *
2411          * Let's hope the user knows what he's doing.
2412          */
2413         ret = 0;
2414     }
2415     if (ret < 0) {
2416         goto out;
2417     }
2418 
2419     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2420      * in the case of cache=unsafe, so there are no useless flushes.
2421      */
2422 flush_parent:
2423     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2424 out:
2425     tracked_request_end(&req);
2426     return ret;
2427 }
2428 
2429 int bdrv_flush(BlockDriverState *bs)
2430 {
2431     Coroutine *co;
2432     RwCo rwco = {
2433         .bs = bs,
2434         .ret = NOT_DONE,
2435     };
2436 
2437     if (qemu_in_coroutine()) {
2438         /* Fast-path if already in coroutine context */
2439         bdrv_flush_co_entry(&rwco);
2440     } else {
2441         AioContext *aio_context = bdrv_get_aio_context(bs);
2442 
2443         co = qemu_coroutine_create(bdrv_flush_co_entry);
2444         qemu_coroutine_enter(co, &rwco);
2445         while (rwco.ret == NOT_DONE) {
2446             aio_poll(aio_context, true);
2447         }
2448     }
2449 
2450     return rwco.ret;
2451 }
2452 
2453 typedef struct DiscardCo {
2454     BlockDriverState *bs;
2455     int64_t sector_num;
2456     int nb_sectors;
2457     int ret;
2458 } DiscardCo;
2459 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2460 {
2461     DiscardCo *rwco = opaque;
2462 
2463     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2464 }
2465 
2466 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2467                                  int nb_sectors)
2468 {
2469     BdrvTrackedRequest req;
2470     int max_discard, ret;
2471 
2472     if (!bs->drv) {
2473         return -ENOMEDIUM;
2474     }
2475 
2476     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2477     if (ret < 0) {
2478         return ret;
2479     } else if (bs->read_only) {
2480         return -EPERM;
2481     }
2482     assert(!(bs->open_flags & BDRV_O_INACTIVE));
2483 
2484     /* Do nothing if disabled.  */
2485     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2486         return 0;
2487     }
2488 
2489     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2490         return 0;
2491     }
2492 
2493     tracked_request_begin(&req, bs, sector_num, nb_sectors,
2494                           BDRV_TRACKED_DISCARD);
2495     bdrv_set_dirty(bs, sector_num, nb_sectors);
2496 
2497     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2498     while (nb_sectors > 0) {
2499         int ret;
2500         int num = nb_sectors;
2501 
2502         /* align request */
2503         if (bs->bl.discard_alignment &&
2504             num >= bs->bl.discard_alignment &&
2505             sector_num % bs->bl.discard_alignment) {
2506             if (num > bs->bl.discard_alignment) {
2507                 num = bs->bl.discard_alignment;
2508             }
2509             num -= sector_num % bs->bl.discard_alignment;
2510         }
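        /*
         * Worked example with hypothetical limits: for discard_alignment == 8,
         * sector_num == 5 and nb_sectors == 20, num is first clamped to 8 and
         * then reduced by 5 % 8, so only sectors 5..7 are discarded in this
         * pass; the next iteration starts at the aligned sector 8.
         */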
2511 
2512         /* limit request size */
2513         if (num > max_discard) {
2514             num = max_discard;
2515         }
2516 
2517         if (bs->drv->bdrv_co_discard) {
2518             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2519         } else {
2520             BlockAIOCB *acb;
2521             CoroutineIOCompletion co = {
2522                 .coroutine = qemu_coroutine_self(),
2523             };
2524 
2525             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2526                                             bdrv_co_io_em_complete, &co);
2527             if (acb == NULL) {
2528                 ret = -EIO;
2529                 goto out;
2530             } else {
2531                 qemu_coroutine_yield();
2532                 ret = co.ret;
2533             }
2534         }
2535         if (ret && ret != -ENOTSUP) {
2536             goto out;
2537         }
2538 
2539         sector_num += num;
2540         nb_sectors -= num;
2541     }
2542     ret = 0;
2543 out:
2544     tracked_request_end(&req);
2545     return ret;
2546 }
2547 
2548 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2549 {
2550     Coroutine *co;
2551     DiscardCo rwco = {
2552         .bs = bs,
2553         .sector_num = sector_num,
2554         .nb_sectors = nb_sectors,
2555         .ret = NOT_DONE,
2556     };
2557 
2558     if (qemu_in_coroutine()) {
2559         /* Fast-path if already in coroutine context */
2560         bdrv_discard_co_entry(&rwco);
2561     } else {
2562         AioContext *aio_context = bdrv_get_aio_context(bs);
2563 
2564         co = qemu_coroutine_create(bdrv_discard_co_entry);
2565         qemu_coroutine_enter(co, &rwco);
2566         while (rwco.ret == NOT_DONE) {
2567             aio_poll(aio_context, true);
2568         }
2569     }
2570 
2571     return rwco.ret;
2572 }
2573 
2574 typedef struct {
2575     CoroutineIOCompletion *co;
2576     QEMUBH *bh;
2577 } BdrvIoctlCompletionData;
2578 
2579 static void bdrv_ioctl_bh_cb(void *opaque)
2580 {
2581     BdrvIoctlCompletionData *data = opaque;
2582 
2583     bdrv_co_io_em_complete(data->co, -ENOTSUP);
2584     qemu_bh_delete(data->bh);
2585 }
2586 
2587 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2588 {
2589     BlockDriver *drv = bs->drv;
2590     BdrvTrackedRequest tracked_req;
2591     CoroutineIOCompletion co = {
2592         .coroutine = qemu_coroutine_self(),
2593     };
2594     BlockAIOCB *acb;
2595 
2596     tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2597     if (!drv || !drv->bdrv_aio_ioctl) {
2598         co.ret = -ENOTSUP;
2599         goto out;
2600     }
2601 
2602     acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2603     if (!acb) {
2604         BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
2605         data->bh = aio_bh_new(bdrv_get_aio_context(bs),
2606                                 bdrv_ioctl_bh_cb, data);
2607         data->co = &co;
2608         qemu_bh_schedule(data->bh);
2609     }
2610     qemu_coroutine_yield();
2611 out:
2612     tracked_request_end(&tracked_req);
2613     return co.ret;
2614 }
2615 
2616 typedef struct {
2617     BlockDriverState *bs;
2618     int req;
2619     void *buf;
2620     int ret;
2621 } BdrvIoctlCoData;
2622 
2623 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2624 {
2625     BdrvIoctlCoData *data = opaque;
2626     data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2627 }
2628 
2629 /* needed for generic scsi interface */
2630 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2631 {
2632     BdrvIoctlCoData data = {
2633         .bs = bs,
2634         .req = req,
2635         .buf = buf,
2636         .ret = -EINPROGRESS,
2637     };
2638 
2639     if (qemu_in_coroutine()) {
2640         /* Fast-path if already in coroutine context */
2641         bdrv_co_ioctl_entry(&data);
2642     } else {
2643         Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2644 
2645         qemu_coroutine_enter(co, &data);
2646         while (data.ret == -EINPROGRESS) {
2647             aio_poll(bdrv_get_aio_context(bs), true);
2648         }
2649     }
2650     return data.ret;
2651 }
2652 
2653 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2654 {
2655     BlockAIOCBCoroutine *acb = opaque;
2656     acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2657                                       acb->req.req, acb->req.buf);
2658     bdrv_co_complete(acb);
2659 }
2660 
2661 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2662         unsigned long int req, void *buf,
2663         BlockCompletionFunc *cb, void *opaque)
2664 {
2665     BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2666                                             bs, cb, opaque);
2667     Coroutine *co;
2668 
2669     acb->need_bh = true;
2670     acb->req.error = -EINPROGRESS;
2671     acb->req.req = req;
2672     acb->req.buf = buf;
2673     co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2674     qemu_coroutine_enter(co, acb);
2675 
2676     bdrv_co_maybe_schedule_bh(acb);
2677     return &acb->common;
2678 }
2679 
2680 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2681 {
2682     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2683 }
2684 
2685 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2686 {
2687     return memset(qemu_blockalign(bs, size), 0, size);
2688 }
2689 
2690 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2691 {
2692     size_t align = bdrv_opt_mem_align(bs);
2693 
2694     /* Ensure that NULL is never returned on success */
2695     assert(align > 0);
2696     if (size == 0) {
2697         size = align;
2698     }
2699 
2700     return qemu_try_memalign(align, size);
2701 }
2702 
2703 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2704 {
2705     void *mem = qemu_try_blockalign(bs, size);
2706 
2707     if (mem) {
2708         memset(mem, 0, size);
2709     }
2710 
2711     return mem;
2712 }
2713 
2714 /*
2715  * Check if all memory in this vector is sector aligned.
2716  */
2717 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2718 {
2719     int i;
2720     size_t alignment = bdrv_min_mem_align(bs);
2721 
2722     for (i = 0; i < qiov->niov; i++) {
2723         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2724             return false;
2725         }
2726         if (qiov->iov[i].iov_len % alignment) {
2727             return false;
2728         }
2729     }
2730 
2731     return true;
2732 }
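/*
 * Hypothetical sketch: allocating a buffer with the backend's preferred
 * alignment and wrapping it in a single-element QEMUIOVector, as the vmstate
 * helpers above do with qemu_iovec_init_external().  'len' is a placeholder
 * and is assumed to be a multiple of bdrv_min_mem_align(bs).
 *
 *     void *buf = qemu_try_blockalign(bs, len);
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     struct iovec iov = { .iov_base = buf, .iov_len = len };
 *     QEMUIOVector qiov;
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 *     ...
 *     qemu_vfree(buf);
 */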
2733 
2734 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2735                                     NotifierWithReturn *notifier)
2736 {
2737     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2738 }
2739 
2740 void bdrv_io_plug(BlockDriverState *bs)
2741 {
2742     BlockDriver *drv = bs->drv;
2743     if (drv && drv->bdrv_io_plug) {
2744         drv->bdrv_io_plug(bs);
2745     } else if (bs->file) {
2746         bdrv_io_plug(bs->file->bs);
2747     }
2748 }
2749 
2750 void bdrv_io_unplug(BlockDriverState *bs)
2751 {
2752     BlockDriver *drv = bs->drv;
2753     if (drv && drv->bdrv_io_unplug) {
2754         drv->bdrv_io_unplug(bs);
2755     } else if (bs->file) {
2756         bdrv_io_unplug(bs->file->bs);
2757     }
2758 }
2759 
2760 void bdrv_flush_io_queue(BlockDriverState *bs)
2761 {
2762     BlockDriver *drv = bs->drv;
2763     if (drv && drv->bdrv_flush_io_queue) {
2764         drv->bdrv_flush_io_queue(bs);
2765     } else if (bs->file) {
2766         bdrv_flush_io_queue(bs->file->bs);
2767     }
2768     bdrv_start_throttled_reqs(bs);
2769 }
2770 
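/*
 * Quiesce external request sources for this BDS: the first (outermost)
 * bdrv_drained_begin() disables external AioContext handlers and then drains
 * in-flight requests; the matching bdrv_drained_end() re-enables them once
 * the quiesce counter drops back to zero.
 */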
2771 void bdrv_drained_begin(BlockDriverState *bs)
2772 {
2773     if (!bs->quiesce_counter++) {
2774         aio_disable_external(bdrv_get_aio_context(bs));
2775     }
2776     bdrv_drain(bs);
2777 }
2778 
2779 void bdrv_drained_end(BlockDriverState *bs)
2780 {
2781     assert(bs->quiesce_counter > 0);
2782     if (--bs->quiesce_counter > 0) {
2783         return;
2784     }
2785     aio_enable_external(bdrv_get_aio_context(bs));
2786 }
2787