xref: /openbmc/qemu/block/io.c (revision 9967e4fec0dc6c0157f37574dd33ddd03ca3bee8)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "trace.h"
26 #include "sysemu/block-backend.h"
27 #include "block/blockjob.h"
28 #include "block/block_int.h"
29 #include "block/throttle-groups.h"
30 #include "qemu/error-report.h"
31 
32 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
33 
34 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
35         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
36         BlockCompletionFunc *cb, void *opaque);
37 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
38         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
39         BlockCompletionFunc *cb, void *opaque);
40 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
41                                          int64_t sector_num, int nb_sectors,
42                                          QEMUIOVector *iov);
43 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
44                                          int64_t sector_num, int nb_sectors,
45                                          QEMUIOVector *iov);
46 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
47     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
48     BdrvRequestFlags flags);
49 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
50     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
51     BdrvRequestFlags flags);
52 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
53                                          int64_t sector_num,
54                                          QEMUIOVector *qiov,
55                                          int nb_sectors,
56                                          BdrvRequestFlags flags,
57                                          BlockCompletionFunc *cb,
58                                          void *opaque,
59                                          bool is_write);
60 static void coroutine_fn bdrv_co_do_rw(void *opaque);
61 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
62     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
63 
64 /* throttling disk I/O limits */
65 void bdrv_set_io_limits(BlockDriverState *bs,
66                         ThrottleConfig *cfg)
67 {
68     int i;
69 
70     throttle_group_config(bs, cfg);
71 
72     for (i = 0; i < 2; i++) {
73         qemu_co_enter_next(&bs->throttled_reqs[i]);
74     }
75 }
76 
77 /* this function drains all the throttled I/Os */
78 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
79 {
80     bool drained = false;
81     bool enabled = bs->io_limits_enabled;
82     int i;
83 
84     bs->io_limits_enabled = false;
85 
86     for (i = 0; i < 2; i++) {
87         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
88             drained = true;
89         }
90     }
91 
92     bs->io_limits_enabled = enabled;
93 
94     return drained;
95 }
96 
97 void bdrv_io_limits_disable(BlockDriverState *bs)
98 {
99     bs->io_limits_enabled = false;
100     bdrv_start_throttled_reqs(bs);
101     throttle_group_unregister_bs(bs);
102 }
103 
104 /* should be called before bdrv_set_io_limits if a limit is set */
105 void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
106 {
107     assert(!bs->io_limits_enabled);
108     throttle_group_register_bs(bs, group);
109     bs->io_limits_enabled = true;
110 }
111 
112 void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
113 {
114     /* this bs is not part of any group */
115     if (!bs->throttle_state) {
116         return;
117     }
118 
119     /* this bs is a part of the same group as the one we want */
120     if (!g_strcmp0(throttle_group_get_name(bs), group)) {
121         return;
122     }
123 
124     /* need to change the group this bs belongs to */
125     bdrv_io_limits_disable(bs);
126     bdrv_io_limits_enable(bs, group);
127 }
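
/*
 * Usage sketch (hypothetical caller, names illustrative only): switching a
 * device to another throttle group and then (re)applying a configuration,
 * following the ordering documented above bdrv_io_limits_enable():
 *
 *     bdrv_io_limits_update_group(bs, "fast-disks");
 *     bdrv_set_io_limits(bs, &cfg);
 */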
128 
129 void bdrv_setup_io_funcs(BlockDriver *bdrv)
130 {
131     /* Block drivers without coroutine functions need emulation */
132     if (!bdrv->bdrv_co_readv) {
133         bdrv->bdrv_co_readv = bdrv_co_readv_em;
134         bdrv->bdrv_co_writev = bdrv_co_writev_em;
135 
136         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
137          * the block driver lacks aio we need to emulate that too.
138          */
139         if (!bdrv->bdrv_aio_readv) {
140             /* add AIO emulation layer */
141             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
142             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
143         }
144     }
145 }
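
/*
 * Illustration (hypothetical driver, not part of this file): a driver that
 * only provides the AIO callbacks still ends up with working coroutine entry
 * points, because bdrv_setup_io_funcs() fills them in with bdrv_co_readv_em()
 * and bdrv_co_writev_em():
 *
 *     static BlockDriver bdrv_example = {
 *         .format_name     = "example",
 *         .bdrv_aio_readv  = example_aio_readv,
 *         .bdrv_aio_writev = example_aio_writev,
 *     };
 *
 *     bdrv_setup_io_funcs(&bdrv_example);
 */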
146 
147 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
148 {
149     BlockDriver *drv = bs->drv;
150     Error *local_err = NULL;
151 
152     memset(&bs->bl, 0, sizeof(bs->bl));
153 
154     if (!drv) {
155         return;
156     }
157 
158     /* Take some limits from the children as a default */
159     if (bs->file) {
160         bdrv_refresh_limits(bs->file->bs, &local_err);
161         if (local_err) {
162             error_propagate(errp, local_err);
163             return;
164         }
165         bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
166         bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
167         bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
168         bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
169         bs->bl.max_iov = bs->file->bs->bl.max_iov;
170     } else {
171         bs->bl.min_mem_alignment = 512;
172         bs->bl.opt_mem_alignment = getpagesize();
173 
174         /* Safe default since most protocols use readv()/writev()/etc */
175         bs->bl.max_iov = IOV_MAX;
176     }
177 
178     if (bs->backing) {
179         bdrv_refresh_limits(bs->backing->bs, &local_err);
180         if (local_err) {
181             error_propagate(errp, local_err);
182             return;
183         }
184         bs->bl.opt_transfer_length =
185             MAX(bs->bl.opt_transfer_length,
186                 bs->backing->bs->bl.opt_transfer_length);
187         bs->bl.max_transfer_length =
188             MIN_NON_ZERO(bs->bl.max_transfer_length,
189                          bs->backing->bs->bl.max_transfer_length);
190         bs->bl.opt_mem_alignment =
191             MAX(bs->bl.opt_mem_alignment,
192                 bs->backing->bs->bl.opt_mem_alignment);
193         bs->bl.min_mem_alignment =
194             MAX(bs->bl.min_mem_alignment,
195                 bs->backing->bs->bl.min_mem_alignment);
196         bs->bl.max_iov =
197             MIN(bs->bl.max_iov,
198                 bs->backing->bs->bl.max_iov);
199     }
200 
201     /* Then let the driver override it */
202     if (drv->bdrv_refresh_limits) {
203         drv->bdrv_refresh_limits(bs, errp);
204     }
205 }
206 
207 /**
208  * The copy-on-read flag is actually a reference count so multiple users may
209  * use the feature without worrying about clobbering its previous state.
210  * Copy-on-read stays enabled until all users have disabled it again.
211  */
212 void bdrv_enable_copy_on_read(BlockDriverState *bs)
213 {
214     bs->copy_on_read++;
215 }
216 
217 void bdrv_disable_copy_on_read(BlockDriverState *bs)
218 {
219     assert(bs->copy_on_read > 0);
220     bs->copy_on_read--;
221 }
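
/*
 * Illustration: because the flag is a reference count, two independent users
 * can overlap without clobbering each other:
 *
 *     bdrv_enable_copy_on_read(bs);     copy_on_read == 1
 *     bdrv_enable_copy_on_read(bs);     copy_on_read == 2
 *     bdrv_disable_copy_on_read(bs);    copy_on_read == 1, still enabled
 *     bdrv_disable_copy_on_read(bs);    copy_on_read == 0, disabled again
 */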
222 
223 /* Check if any requests are in-flight (including throttled requests) */
224 bool bdrv_requests_pending(BlockDriverState *bs)
225 {
226     BdrvChild *child;
227 
228     if (!QLIST_EMPTY(&bs->tracked_requests)) {
229         return true;
230     }
231     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
232         return true;
233     }
234     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
235         return true;
236     }
237 
238     QLIST_FOREACH(child, &bs->children, next) {
239         if (bdrv_requests_pending(child->bs)) {
240             return true;
241         }
242     }
243 
244     return false;
245 }
246 
247 static void bdrv_drain_recurse(BlockDriverState *bs)
248 {
249     BdrvChild *child;
250 
251     if (bs->drv && bs->drv->bdrv_drain) {
252         bs->drv->bdrv_drain(bs);
253     }
254     QLIST_FOREACH(child, &bs->children, next) {
255         bdrv_drain_recurse(child->bs);
256     }
257 }
258 
259 /*
260  * Wait for pending requests to complete on a single BlockDriverState subtree,
261  * and suspend the block driver's internal I/O until the next request arrives.
262  *
263  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
264  * AioContext.
265  *
266  * Only this BlockDriverState's AioContext is run, so in-flight requests must
267  * not depend on events in other AioContexts.  In that case, use
268  * bdrv_drain_all() instead.
269  */
270 void bdrv_drain(BlockDriverState *bs)
271 {
272     bool busy = true;
273 
274     bdrv_drain_recurse(bs);
275     while (busy) {
276         /* Keep iterating */
277          bdrv_flush_io_queue(bs);
278          busy = bdrv_requests_pending(bs);
279          busy |= aio_poll(bdrv_get_aio_context(bs), busy);
280     }
281 }
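
/*
 * Caller sketch (illustrative): as noted above, bdrv_drain() expects the
 * BlockDriverState's AioContext to be held:
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);
 *     aio_context_release(ctx);
 */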
282 
283 /*
284  * Wait for pending requests to complete across all BlockDriverStates
285  *
286  * This function does not flush data to disk, use bdrv_flush_all() for that
287  * after calling this function.
288  */
289 void bdrv_drain_all(void)
290 {
291     /* Always run first iteration so any pending completion BHs run */
292     bool busy = true;
293     BlockDriverState *bs = NULL;
294     GSList *aio_ctxs = NULL, *ctx;
295 
296     while ((bs = bdrv_next(bs))) {
297         AioContext *aio_context = bdrv_get_aio_context(bs);
298 
299         aio_context_acquire(aio_context);
300         if (bs->job) {
301             block_job_pause(bs->job);
302         }
303         aio_context_release(aio_context);
304 
305         if (!g_slist_find(aio_ctxs, aio_context)) {
306             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
307         }
308     }
309 
310     /* Note that completion of an asynchronous I/O operation can trigger any
311      * number of other I/O operations on other devices---for example a
312      * coroutine can submit an I/O request to another device in response to
313      * request completion.  Therefore we must keep looping until there is no
314      * more activity rather than simply draining each device independently.
315      */
316     while (busy) {
317         busy = false;
318 
319         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
320             AioContext *aio_context = ctx->data;
321             bs = NULL;
322 
323             aio_context_acquire(aio_context);
324             while ((bs = bdrv_next(bs))) {
325                 if (aio_context == bdrv_get_aio_context(bs)) {
326                     bdrv_flush_io_queue(bs);
327                     if (bdrv_requests_pending(bs)) {
328                         busy = true;
329                         aio_poll(aio_context, busy);
330                     }
331                 }
332             }
333             busy |= aio_poll(aio_context, false);
334             aio_context_release(aio_context);
335         }
336     }
337 
338     bs = NULL;
339     while ((bs = bdrv_next(bs))) {
340         AioContext *aio_context = bdrv_get_aio_context(bs);
341 
342         aio_context_acquire(aio_context);
343         if (bs->job) {
344             block_job_resume(bs->job);
345         }
346         aio_context_release(aio_context);
347     }
348     g_slist_free(aio_ctxs);
349 }
350 
351 /**
352  * Remove an active request from the tracked requests list
353  *
354  * This function should be called when a tracked request is completing.
355  */
356 static void tracked_request_end(BdrvTrackedRequest *req)
357 {
358     if (req->serialising) {
359         req->bs->serialising_in_flight--;
360     }
361 
362     QLIST_REMOVE(req, list);
363     qemu_co_queue_restart_all(&req->wait_queue);
364 }
365 
366 /**
367  * Add an active request to the tracked requests list
368  */
369 static void tracked_request_begin(BdrvTrackedRequest *req,
370                                   BlockDriverState *bs,
371                                   int64_t offset,
372                                   unsigned int bytes,
373                                   enum BdrvTrackedRequestType type)
374 {
375     *req = (BdrvTrackedRequest){
376         .bs = bs,
377         .offset         = offset,
378         .bytes          = bytes,
379         .type           = type,
380         .co             = qemu_coroutine_self(),
381         .serialising    = false,
382         .overlap_offset = offset,
383         .overlap_bytes  = bytes,
384     };
385 
386     qemu_co_queue_init(&req->wait_queue);
387 
388     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
389 }
390 
391 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
392 {
393     int64_t overlap_offset = req->offset & ~(align - 1);
394     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
395                                - overlap_offset;
396 
397     if (!req->serialising) {
398         req->bs->serialising_in_flight++;
399         req->serialising = true;
400     }
401 
402     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
403     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
404 }
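
/*
 * Worked example (illustrative): with align = 4096, a request at offset 5000
 * with 100 bytes ends up with overlap_offset = 4096 and overlap_bytes = 4096,
 * i.e. the serialising window is widened to the enclosing 4 KiB block.
 */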
405 
406 /**
407  * Round a region to cluster boundaries
408  */
409 void bdrv_round_to_clusters(BlockDriverState *bs,
410                             int64_t sector_num, int nb_sectors,
411                             int64_t *cluster_sector_num,
412                             int *cluster_nb_sectors)
413 {
414     BlockDriverInfo bdi;
415 
416     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
417         *cluster_sector_num = sector_num;
418         *cluster_nb_sectors = nb_sectors;
419     } else {
420         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
421         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
422         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
423                                             nb_sectors, c);
424     }
425 }
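
/*
 * Worked example (illustrative): with a 64 KiB cluster size (128 sectors), a
 * request for sectors [130, 135) is widened to [128, 256), i.e.
 * *cluster_sector_num = 128 and *cluster_nb_sectors = 128.
 */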
426 
427 static int bdrv_get_cluster_size(BlockDriverState *bs)
428 {
429     BlockDriverInfo bdi;
430     int ret;
431 
432     ret = bdrv_get_info(bs, &bdi);
433     if (ret < 0 || bdi.cluster_size == 0) {
434         return bs->request_alignment;
435     } else {
436         return bdi.cluster_size;
437     }
438 }
439 
440 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
441                                      int64_t offset, unsigned int bytes)
442 {
443     /*        aaaa   bbbb */
444     if (offset >= req->overlap_offset + req->overlap_bytes) {
445         return false;
446     }
447     /* bbbb   aaaa        */
448     if (req->overlap_offset >= offset + bytes) {
449         return false;
450     }
451     return true;
452 }
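
/*
 * Illustration: the intervals are half-open, so a request covering
 * [4096, 8192) does not overlap one covering [8192, 12288), but it does
 * overlap [8191, 8192).
 */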
453 
454 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
455 {
456     BlockDriverState *bs = self->bs;
457     BdrvTrackedRequest *req;
458     bool retry;
459     bool waited = false;
460 
461     if (!bs->serialising_in_flight) {
462         return false;
463     }
464 
465     do {
466         retry = false;
467         QLIST_FOREACH(req, &bs->tracked_requests, list) {
468             if (req == self || (!req->serialising && !self->serialising)) {
469                 continue;
470             }
471             if (tracked_request_overlaps(req, self->overlap_offset,
472                                          self->overlap_bytes))
473             {
474                 /* Hitting this means there was a reentrant request, for
475                  * example, a block driver issuing nested requests.  This must
476                  * never happen since it means deadlock.
477                  */
478                 assert(qemu_coroutine_self() != req->co);
479 
480                 /* If the request is already (indirectly) waiting for us, or
481                  * will wait for us as soon as it wakes up, then just go on
482                  * (instead of producing a deadlock in the former case). */
483                 if (!req->waiting_for) {
484                     self->waiting_for = req;
485                     qemu_co_queue_wait(&req->wait_queue);
486                     self->waiting_for = NULL;
487                     retry = true;
488                     waited = true;
489                     break;
490                 }
491             }
492         }
493     } while (retry);
494 
495     return waited;
496 }
497 
498 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
499                                    size_t size)
500 {
501     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
502         return -EIO;
503     }
504 
505     if (!bdrv_is_inserted(bs)) {
506         return -ENOMEDIUM;
507     }
508 
509     if (offset < 0) {
510         return -EIO;
511     }
512 
513     return 0;
514 }
515 
516 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
517                               int nb_sectors)
518 {
519     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
520         return -EIO;
521     }
522 
523     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
524                                    nb_sectors * BDRV_SECTOR_SIZE);
525 }
526 
527 typedef struct RwCo {
528     BlockDriverState *bs;
529     int64_t offset;
530     QEMUIOVector *qiov;
531     bool is_write;
532     int ret;
533     BdrvRequestFlags flags;
534 } RwCo;
535 
536 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
537 {
538     RwCo *rwco = opaque;
539 
540     if (!rwco->is_write) {
541         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
542                                       rwco->qiov->size, rwco->qiov,
543                                       rwco->flags);
544     } else {
545         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
546                                        rwco->qiov->size, rwco->qiov,
547                                        rwco->flags);
548     }
549 }
550 
551 /*
552  * Process a vectored synchronous request using coroutines
553  */
554 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
555                         QEMUIOVector *qiov, bool is_write,
556                         BdrvRequestFlags flags)
557 {
558     Coroutine *co;
559     RwCo rwco = {
560         .bs = bs,
561         .offset = offset,
562         .qiov = qiov,
563         .is_write = is_write,
564         .ret = NOT_DONE,
565         .flags = flags,
566     };
567 
568     /**
569      * In a synchronous call context, when the vcpu is blocked, the throttling
570      * timer will not fire; so I/O throttling has to be disabled here if it
571      * has been enabled.
572      */
573     if (bs->io_limits_enabled) {
574         fprintf(stderr, "Disabling I/O throttling on '%s' due "
575                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
576         bdrv_io_limits_disable(bs);
577     }
578 
579     if (qemu_in_coroutine()) {
580         /* Fast-path if already in coroutine context */
581         bdrv_rw_co_entry(&rwco);
582     } else {
583         AioContext *aio_context = bdrv_get_aio_context(bs);
584 
585         co = qemu_coroutine_create(bdrv_rw_co_entry);
586         qemu_coroutine_enter(co, &rwco);
587         while (rwco.ret == NOT_DONE) {
588             aio_poll(aio_context, true);
589         }
590     }
591     return rwco.ret;
592 }
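
/*
 * Note (illustrative): when bdrv_prwv_co() is called outside coroutine
 * context, it spins in aio_poll() until bdrv_rw_co_entry() has stored a
 * result; rwco.ret is therefore pre-set to NOT_DONE, a sentinel that no
 * completed request ever returns.
 */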
593 
594 /*
595  * Process a synchronous request using coroutines
596  */
597 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
598                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
599 {
600     QEMUIOVector qiov;
601     struct iovec iov = {
602         .iov_base = (void *)buf,
603         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
604     };
605 
606     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
607         return -EINVAL;
608     }
609 
610     qemu_iovec_init_external(&qiov, &iov, 1);
611     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
612                         &qiov, is_write, flags);
613 }
614 
615 /* return < 0 if error. See bdrv_write() for the return codes */
616 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
617               uint8_t *buf, int nb_sectors)
618 {
619     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
620 }
621 
622 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
623 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
624                           uint8_t *buf, int nb_sectors)
625 {
626     bool enabled;
627     int ret;
628 
629     enabled = bs->io_limits_enabled;
630     bs->io_limits_enabled = false;
631     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
632     bs->io_limits_enabled = enabled;
633     return ret;
634 }
635 
636 /* Return < 0 if error. Important errors are:
637   -EIO         generic I/O error (may happen for all errors)
638   -ENOMEDIUM   No media inserted.
639   -EINVAL      Invalid sector number or nb_sectors
640   -EACCES      Trying to write a read-only device
641 */
642 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
643                const uint8_t *buf, int nb_sectors)
644 {
645     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
646 }
647 
648 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
649                       int nb_sectors, BdrvRequestFlags flags)
650 {
651     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
652                       BDRV_REQ_ZERO_WRITE | flags);
653 }
654 
655 /*
656  * Completely zero out a block device with the help of bdrv_write_zeroes.
657  * The operation is sped up by checking the block status and only writing
658  * zeroes to sectors that do not already read back as zeroes. Optional
659  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
660  *
661  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
662  */
663 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
664 {
665     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
666     int n;
667 
668     target_sectors = bdrv_nb_sectors(bs);
669     if (target_sectors < 0) {
670         return target_sectors;
671     }
672 
673     for (;;) {
674         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
675         if (nb_sectors <= 0) {
676             return 0;
677         }
678         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
679         if (ret < 0) {
680             error_report("error getting block status at sector %" PRId64 ": %s",
681                          sector_num, strerror(-ret));
682             return ret;
683         }
684         if (ret & BDRV_BLOCK_ZERO) {
685             sector_num += n;
686             continue;
687         }
688         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
689         if (ret < 0) {
690             error_report("error writing zeroes at sector %" PRId64 ": %s",
691                          sector_num, strerror(-ret));
692             return ret;
693         }
694         sector_num += n;
695     }
696 }
697 
698 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
699 {
700     QEMUIOVector qiov;
701     struct iovec iov = {
702         .iov_base = (void *)buf,
703         .iov_len = bytes,
704     };
705     int ret;
706 
707     if (bytes < 0) {
708         return -EINVAL;
709     }
710 
711     qemu_iovec_init_external(&qiov, &iov, 1);
712     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
713     if (ret < 0) {
714         return ret;
715     }
716 
717     return bytes;
718 }
719 
720 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
721 {
722     int ret;
723 
724     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
725     if (ret < 0) {
726         return ret;
727     }
728 
729     return qiov->size;
730 }
731 
732 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
733                 const void *buf, int bytes)
734 {
735     QEMUIOVector qiov;
736     struct iovec iov = {
737         .iov_base   = (void *) buf,
738         .iov_len    = bytes,
739     };
740 
741     if (bytes < 0) {
742         return -EINVAL;
743     }
744 
745     qemu_iovec_init_external(&qiov, &iov, 1);
746     return bdrv_pwritev(bs, offset, &qiov);
747 }
748 
749 /*
750  * Writes to the file and ensures that no writes are reordered across this
751  * request (acts as a barrier)
752  *
753  * Returns 0 on success, -errno in error cases.
754  */
755 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
756     const void *buf, int count)
757 {
758     int ret;
759 
760     ret = bdrv_pwrite(bs, offset, buf, count);
761     if (ret < 0) {
762         return ret;
763     }
764 
765     /* No flush needed for cache modes that already do it */
766     if (bs->enable_write_cache) {
767         bdrv_flush(bs);
768     }
769 
770     return 0;
771 }
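
/*
 * Typical use (illustrative): format drivers funnel critical on-disk metadata
 * updates (e.g. a table entry that later writes depend on) through
 * bdrv_pwrite_sync() so the update is stable before any dependent write is
 * issued.
 */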
772 
773 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
774         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
775 {
776     /* Perform I/O through a temporary buffer so that users who scribble over
777      * their read buffer while the operation is in progress do not end up
778      * modifying the image file.  This is critical for zero-copy guest I/O
779      * where anything might happen inside guest memory.
780      */
781     void *bounce_buffer;
782 
783     BlockDriver *drv = bs->drv;
784     struct iovec iov;
785     QEMUIOVector bounce_qiov;
786     int64_t cluster_sector_num;
787     int cluster_nb_sectors;
788     size_t skip_bytes;
789     int ret;
790 
791     /* Cover the entire cluster so that no additional backing file I/O is
792      * required when allocating the cluster in the image file.
793      */
794     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
795                            &cluster_sector_num, &cluster_nb_sectors);
796 
797     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
798                                    cluster_sector_num, cluster_nb_sectors);
799 
800     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
801     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
802     if (bounce_buffer == NULL) {
803         ret = -ENOMEM;
804         goto err;
805     }
806 
807     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
808 
809     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
810                              &bounce_qiov);
811     if (ret < 0) {
812         goto err;
813     }
814 
815     if (drv->bdrv_co_write_zeroes &&
816         buffer_is_zero(bounce_buffer, iov.iov_len)) {
817         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
818                                       cluster_nb_sectors, 0);
819     } else {
820         /* This does not change the data on the disk, so it is not necessary
821          * to flush even in cache=writethrough mode.
822          */
823         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
824                                   &bounce_qiov);
825     }
826 
827     if (ret < 0) {
828         /* It might be okay to ignore write errors for guest requests.  If this
829          * is a deliberate copy-on-read then we don't want to ignore the error.
830          * Simply report it in all cases.
831          */
832         goto err;
833     }
834 
835     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
836     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
837                         nb_sectors * BDRV_SECTOR_SIZE);
838 
839 err:
840     qemu_vfree(bounce_buffer);
841     return ret;
842 }
843 
844 /*
845  * Forwards an already correctly aligned request to the BlockDriver. This
846  * handles copy on read and zeroing after EOF; any other features must be
847  * implemented by the caller.
848  */
849 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
850     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
851     int64_t align, QEMUIOVector *qiov, int flags)
852 {
853     BlockDriver *drv = bs->drv;
854     int ret;
855 
856     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
857     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
858 
859     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
860     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
861     assert(!qiov || bytes == qiov->size);
862 
863     /* Handle Copy on Read and associated serialisation */
864     if (flags & BDRV_REQ_COPY_ON_READ) {
865         /* If we touch the same cluster it counts as an overlap.  This
866          * guarantees that allocating writes will be serialized and not race
867          * with each other for the same cluster.  For example, in copy-on-read
868          * it ensures that the CoR read and write operations are atomic and
869          * guest writes cannot interleave between them. */
870         mark_request_serialising(req, bdrv_get_cluster_size(bs));
871     }
872 
873     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
874         wait_serialising_requests(req);
875     }
876 
877     if (flags & BDRV_REQ_COPY_ON_READ) {
878         int pnum;
879 
880         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
881         if (ret < 0) {
882             goto out;
883         }
884 
885         if (!ret || pnum != nb_sectors) {
886             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
887             goto out;
888         }
889     }
890 
891     /* Forward the request to the BlockDriver */
892     if (!bs->zero_beyond_eof) {
893         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
894     } else {
895         /* Read zeros after EOF */
896         int64_t total_sectors, max_nb_sectors;
897 
898         total_sectors = bdrv_nb_sectors(bs);
899         if (total_sectors < 0) {
900             ret = total_sectors;
901             goto out;
902         }
903 
904         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
905                                   align >> BDRV_SECTOR_BITS);
906         if (nb_sectors < max_nb_sectors) {
907             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
908         } else if (max_nb_sectors > 0) {
909             QEMUIOVector local_qiov;
910 
911             qemu_iovec_init(&local_qiov, qiov->niov);
912             qemu_iovec_concat(&local_qiov, qiov, 0,
913                               max_nb_sectors * BDRV_SECTOR_SIZE);
914 
915             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
916                                      &local_qiov);
917 
918             qemu_iovec_destroy(&local_qiov);
919         } else {
920             ret = 0;
921         }
922 
923         /* Reading beyond end of file is supposed to produce zeroes */
924         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
925             uint64_t offset = MAX(0, total_sectors - sector_num);
926             uint64_t bytes = (sector_num + nb_sectors - offset) *
927                               BDRV_SECTOR_SIZE;
928             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
929         }
930     }
931 
932 out:
933     return ret;
934 }
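
/*
 * Illustration (assuming a 512-byte request alignment): with zero_beyond_eof
 * set and a 100-sector image, a read of sectors [96, 104) issues a driver
 * read for the first 4 sectors only and fills the rest of the qiov with
 * zeroes.
 */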
935 
936 /*
937  * Handle a read request in coroutine context
938  */
939 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
940     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
941     BdrvRequestFlags flags)
942 {
943     BlockDriver *drv = bs->drv;
944     BdrvTrackedRequest req;
945 
946     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
947     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
948     uint8_t *head_buf = NULL;
949     uint8_t *tail_buf = NULL;
950     QEMUIOVector local_qiov;
951     bool use_local_qiov = false;
952     int ret;
953 
954     if (!drv) {
955         return -ENOMEDIUM;
956     }
957 
958     ret = bdrv_check_byte_request(bs, offset, bytes);
959     if (ret < 0) {
960         return ret;
961     }
962 
963     /* Don't do copy-on-read if we are reading data ahead of a write operation */
964     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
965         flags |= BDRV_REQ_COPY_ON_READ;
966     }
967 
968     /* throttling disk I/O */
969     if (bs->io_limits_enabled) {
970         throttle_group_co_io_limits_intercept(bs, bytes, false);
971     }
972 
973     /* Align read if necessary by padding qiov */
974     if (offset & (align - 1)) {
975         head_buf = qemu_blockalign(bs, align);
976         qemu_iovec_init(&local_qiov, qiov->niov + 2);
977         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
978         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
979         use_local_qiov = true;
980 
981         bytes += offset & (align - 1);
982         offset = offset & ~(align - 1);
983     }
984 
985     if ((offset + bytes) & (align - 1)) {
986         if (!use_local_qiov) {
987             qemu_iovec_init(&local_qiov, qiov->niov + 1);
988             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
989             use_local_qiov = true;
990         }
991         tail_buf = qemu_blockalign(bs, align);
992         qemu_iovec_add(&local_qiov, tail_buf,
993                        align - ((offset + bytes) & (align - 1)));
994 
995         bytes = ROUND_UP(bytes, align);
996     }
997 
998     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
999     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1000                               use_local_qiov ? &local_qiov : qiov,
1001                               flags);
1002     tracked_request_end(&req);
1003 
1004     if (use_local_qiov) {
1005         qemu_iovec_destroy(&local_qiov);
1006         qemu_vfree(head_buf);
1007         qemu_vfree(tail_buf);
1008     }
1009 
1010     return ret;
1011 }
1012 
1013 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1014     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1015     BdrvRequestFlags flags)
1016 {
1017     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1018         return -EINVAL;
1019     }
1020 
1021     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1022                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1023 }
1024 
1025 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1026     int nb_sectors, QEMUIOVector *qiov)
1027 {
1028     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1029 
1030     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1031 }
1032 
1033 int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
1034     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1035 {
1036     trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
1037 
1038     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1039                             BDRV_REQ_NO_SERIALISING);
1040 }
1041 
1042 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1043     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1044 {
1045     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1046 
1047     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1048                             BDRV_REQ_COPY_ON_READ);
1049 }
1050 
1051 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1052 
1053 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1054     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
1055 {
1056     BlockDriver *drv = bs->drv;
1057     QEMUIOVector qiov;
1058     struct iovec iov = {0};
1059     int ret = 0;
1060 
1061     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
1062                                         BDRV_REQUEST_MAX_SECTORS);
1063 
1064     while (nb_sectors > 0 && !ret) {
1065         int num = nb_sectors;
1066 
1067         /* Align request.  Block drivers can expect the "bulk" of the request
1068          * to be aligned.
1069          */
1070         if (bs->bl.write_zeroes_alignment
1071             && num > bs->bl.write_zeroes_alignment) {
1072             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
1073                 /* Make a small request up to the first aligned sector.  */
1074                 num = bs->bl.write_zeroes_alignment;
1075                 num -= sector_num % bs->bl.write_zeroes_alignment;
1076             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
1077                 /* Shorten the request to the last aligned sector.  num cannot
1078                  * underflow because num > bs->bl.write_zeroes_alignment.
1079                  */
1080                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
1081             }
1082         }
1083 
1084         /* limit request size */
1085         if (num > max_write_zeroes) {
1086             num = max_write_zeroes;
1087         }
1088 
1089         ret = -ENOTSUP;
1090         /* First try the efficient write zeroes operation */
1091         if (drv->bdrv_co_write_zeroes) {
1092             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
1093         }
1094 
1095         if (ret == -ENOTSUP) {
1096             /* Fall back to bounce buffer if write zeroes is unsupported */
1097             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1098                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1099             num = MIN(num, max_xfer_len);
1100             iov.iov_len = num * BDRV_SECTOR_SIZE;
1101             if (iov.iov_base == NULL) {
1102                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
1103                 if (iov.iov_base == NULL) {
1104                     ret = -ENOMEM;
1105                     goto fail;
1106                 }
1107                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
1108             }
1109             qemu_iovec_init_external(&qiov, &iov, 1);
1110 
1111             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
1112 
1113             /* Keep the bounce buffer around if it is big enough for all
1114              * future requests.
1115              */
1116             if (num < max_xfer_len) {
1117                 qemu_vfree(iov.iov_base);
1118                 iov.iov_base = NULL;
1119             }
1120         }
1121 
1122         sector_num += num;
1123         nb_sectors -= num;
1124     }
1125 
1126 fail:
1127     qemu_vfree(iov.iov_base);
1128     return ret;
1129 }
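
/*
 * Illustration: with write_zeroes_alignment = 8, a request for sectors
 * [5, 25) is split into [5, 8), [8, 24) and [24, 25), so the driver sees an
 * aligned bulk request surrounded by small head and tail pieces.
 */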
1130 
1131 /*
1132  * Forwards an already correctly aligned write request to the BlockDriver.
1133  */
1134 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1135     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1136     QEMUIOVector *qiov, int flags)
1137 {
1138     BlockDriver *drv = bs->drv;
1139     bool waited;
1140     int ret;
1141 
1142     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1143     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1144 
1145     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1146     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1147     assert(!qiov || bytes == qiov->size);
1148 
1149     waited = wait_serialising_requests(req);
1150     assert(!waited || !req->serialising);
1151     assert(req->overlap_offset <= offset);
1152     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1153 
1154     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1155 
1156     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1157         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
1158         qemu_iovec_is_zero(qiov)) {
1159         flags |= BDRV_REQ_ZERO_WRITE;
1160         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1161             flags |= BDRV_REQ_MAY_UNMAP;
1162         }
1163     }
1164 
1165     if (ret < 0) {
1166         /* Do nothing, write notifier decided to fail this request */
1167     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1168         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1169         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
1170     } else {
1171         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1172         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1173     }
1174     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1175 
1176     if (ret == 0 && !bs->enable_write_cache) {
1177         ret = bdrv_co_flush(bs);
1178     }
1179 
1180     bdrv_set_dirty(bs, sector_num, nb_sectors);
1181 
1182     if (bs->wr_highest_offset < offset + bytes) {
1183         bs->wr_highest_offset = offset + bytes;
1184     }
1185 
1186     if (ret >= 0) {
1187         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1188     }
1189 
1190     return ret;
1191 }
1192 
1193 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1194                                                 int64_t offset,
1195                                                 unsigned int bytes,
1196                                                 BdrvRequestFlags flags,
1197                                                 BdrvTrackedRequest *req)
1198 {
1199     uint8_t *buf = NULL;
1200     QEMUIOVector local_qiov;
1201     struct iovec iov;
1202     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1203     unsigned int head_padding_bytes, tail_padding_bytes;
1204     int ret = 0;
1205 
1206     head_padding_bytes = offset & (align - 1);
1207     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1208 
1209 
1210     assert(flags & BDRV_REQ_ZERO_WRITE);
1211     if (head_padding_bytes || tail_padding_bytes) {
1212         buf = qemu_blockalign(bs, align);
1213         iov = (struct iovec) {
1214             .iov_base   = buf,
1215             .iov_len    = align,
1216         };
1217         qemu_iovec_init_external(&local_qiov, &iov, 1);
1218     }
1219     if (head_padding_bytes) {
1220         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1221 
1222         /* RMW the unaligned part before head. */
1223         mark_request_serialising(req, align);
1224         wait_serialising_requests(req);
1225         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1226         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1227                                   align, &local_qiov, 0);
1228         if (ret < 0) {
1229             goto fail;
1230         }
1231         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1232 
1233         memset(buf + head_padding_bytes, 0, zero_bytes);
1234         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1235                                    &local_qiov,
1236                                    flags & ~BDRV_REQ_ZERO_WRITE);
1237         if (ret < 0) {
1238             goto fail;
1239         }
1240         offset += zero_bytes;
1241         bytes -= zero_bytes;
1242     }
1243 
1244     assert(!bytes || (offset & (align - 1)) == 0);
1245     if (bytes >= align) {
1246         /* Write the aligned part in the middle. */
1247         uint64_t aligned_bytes = bytes & ~(align - 1);
1248         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1249                                    NULL, flags);
1250         if (ret < 0) {
1251             goto fail;
1252         }
1253         bytes -= aligned_bytes;
1254         offset += aligned_bytes;
1255     }
1256 
1257     assert(!bytes || (offset & (align - 1)) == 0);
1258     if (bytes) {
1259         assert(align == tail_padding_bytes + bytes);
1260         /* RMW the unaligned part after tail. */
1261         mark_request_serialising(req, align);
1262         wait_serialising_requests(req);
1263         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1264         ret = bdrv_aligned_preadv(bs, req, offset, align,
1265                                   align, &local_qiov, 0);
1266         if (ret < 0) {
1267             goto fail;
1268         }
1269         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1270 
1271         memset(buf, 0, bytes);
1272         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1273                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1274     }
1275 fail:
1276     qemu_vfree(buf);
1277     return ret;
1278 
1279 }
1280 
1281 /*
1282  * Handle a write request in coroutine context
1283  */
1284 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
1285     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1286     BdrvRequestFlags flags)
1287 {
1288     BdrvTrackedRequest req;
1289     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1290     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1291     uint8_t *head_buf = NULL;
1292     uint8_t *tail_buf = NULL;
1293     QEMUIOVector local_qiov;
1294     bool use_local_qiov = false;
1295     int ret;
1296 
1297     if (!bs->drv) {
1298         return -ENOMEDIUM;
1299     }
1300     if (bs->read_only) {
1301         return -EPERM;
1302     }
1303 
1304     ret = bdrv_check_byte_request(bs, offset, bytes);
1305     if (ret < 0) {
1306         return ret;
1307     }
1308 
1309     /* throttling disk I/O */
1310     if (bs->io_limits_enabled) {
1311         throttle_group_co_io_limits_intercept(bs, bytes, true);
1312     }
1313 
1314     /*
1315      * Align write if necessary by performing a read-modify-write cycle.
1316      * Pad qiov with the read parts and be sure to have a tracked request not
1317      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1318      */
1319     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1320 
1321     if (!qiov) {
1322         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1323         goto out;
1324     }
1325 
1326     if (offset & (align - 1)) {
1327         QEMUIOVector head_qiov;
1328         struct iovec head_iov;
1329 
1330         mark_request_serialising(&req, align);
1331         wait_serialising_requests(&req);
1332 
1333         head_buf = qemu_blockalign(bs, align);
1334         head_iov = (struct iovec) {
1335             .iov_base   = head_buf,
1336             .iov_len    = align,
1337         };
1338         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1339 
1340         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1341         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1342                                   align, &head_qiov, 0);
1343         if (ret < 0) {
1344             goto fail;
1345         }
1346         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1347 
1348         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1349         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1350         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1351         use_local_qiov = true;
1352 
1353         bytes += offset & (align - 1);
1354         offset = offset & ~(align - 1);
1355     }
1356 
1357     if ((offset + bytes) & (align - 1)) {
1358         QEMUIOVector tail_qiov;
1359         struct iovec tail_iov;
1360         size_t tail_bytes;
1361         bool waited;
1362 
1363         mark_request_serialising(&req, align);
1364         waited = wait_serialising_requests(&req);
1365         assert(!waited || !use_local_qiov);
1366 
1367         tail_buf = qemu_blockalign(bs, align);
1368         tail_iov = (struct iovec) {
1369             .iov_base   = tail_buf,
1370             .iov_len    = align,
1371         };
1372         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1373 
1374         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1375         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1376                                   align, &tail_qiov, 0);
1377         if (ret < 0) {
1378             goto fail;
1379         }
1380         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1381 
1382         if (!use_local_qiov) {
1383             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1384             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1385             use_local_qiov = true;
1386         }
1387 
1388         tail_bytes = (offset + bytes) & (align - 1);
1389         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1390 
1391         bytes = ROUND_UP(bytes, align);
1392     }
1393 
1394     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1395                                use_local_qiov ? &local_qiov : qiov,
1396                                flags);
1397 
1398 fail:
1399 
1400     if (use_local_qiov) {
1401         qemu_iovec_destroy(&local_qiov);
1402     }
1403     qemu_vfree(head_buf);
1404     qemu_vfree(tail_buf);
1405 out:
1406     tracked_request_end(&req);
1407     return ret;
1408 }
1409 
1410 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1411     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1412     BdrvRequestFlags flags)
1413 {
1414     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1415         return -EINVAL;
1416     }
1417 
1418     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1419                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1420 }
1421 
1422 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1423     int nb_sectors, QEMUIOVector *qiov)
1424 {
1425     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1426 
1427     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1428 }
1429 
1430 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1431                                       int64_t sector_num, int nb_sectors,
1432                                       BdrvRequestFlags flags)
1433 {
1434     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
1435 
1436     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1437         flags &= ~BDRV_REQ_MAY_UNMAP;
1438     }
1439 
1440     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1441                              BDRV_REQ_ZERO_WRITE | flags);
1442 }
1443 
1444 int bdrv_flush_all(void)
1445 {
1446     BlockDriverState *bs = NULL;
1447     int result = 0;
1448 
1449     while ((bs = bdrv_next(bs))) {
1450         AioContext *aio_context = bdrv_get_aio_context(bs);
1451         int ret;
1452 
1453         aio_context_acquire(aio_context);
1454         ret = bdrv_flush(bs);
1455         if (ret < 0 && !result) {
1456             result = ret;
1457         }
1458         aio_context_release(aio_context);
1459     }
1460 
1461     return result;
1462 }
1463 
1464 typedef struct BdrvCoGetBlockStatusData {
1465     BlockDriverState *bs;
1466     BlockDriverState *base;
1467     int64_t sector_num;
1468     int nb_sectors;
1469     int *pnum;
1470     int64_t ret;
1471     bool done;
1472 } BdrvCoGetBlockStatusData;
1473 
1474 /*
1475  * Returns the allocation status of the specified sectors.
1476  * Drivers not implementing the functionality are assumed to not support
1477  * backing files, hence all their sectors are reported as allocated.
1478  *
1479  * If 'sector_num' is beyond the end of the disk image the return value is 0
1480  * and 'pnum' is set to 0.
1481  *
1482  * 'pnum' is set to the number of sectors (including and immediately following
1483  * the specified sector) that are known to be in the same
1484  * allocated/unallocated state.
1485  *
1486  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1487  * beyond the end of the disk image it will be clamped.
1488  */
1489 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1490                                                      int64_t sector_num,
1491                                                      int nb_sectors, int *pnum)
1492 {
1493     int64_t total_sectors;
1494     int64_t n;
1495     int64_t ret, ret2;
1496 
1497     total_sectors = bdrv_nb_sectors(bs);
1498     if (total_sectors < 0) {
1499         return total_sectors;
1500     }
1501 
1502     if (sector_num >= total_sectors) {
1503         *pnum = 0;
1504         return 0;
1505     }
1506 
1507     n = total_sectors - sector_num;
1508     if (n < nb_sectors) {
1509         nb_sectors = n;
1510     }
1511 
1512     if (!bs->drv->bdrv_co_get_block_status) {
1513         *pnum = nb_sectors;
1514         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1515         if (bs->drv->protocol_name) {
1516             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1517         }
1518         return ret;
1519     }
1520 
1521     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
1522     if (ret < 0) {
1523         *pnum = 0;
1524         return ret;
1525     }
1526 
1527     if (ret & BDRV_BLOCK_RAW) {
1528         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1529         return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1530                                      *pnum, pnum);
1531     }
1532 
1533     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1534         ret |= BDRV_BLOCK_ALLOCATED;
1535     } else {
1536         if (bdrv_unallocated_blocks_are_zero(bs)) {
1537             ret |= BDRV_BLOCK_ZERO;
1538         } else if (bs->backing) {
1539             BlockDriverState *bs2 = bs->backing->bs;
1540             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1541             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1542                 ret |= BDRV_BLOCK_ZERO;
1543             }
1544         }
1545     }
1546 
1547     if (bs->file &&
1548         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1549         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1550         int file_pnum;
1551 
1552         ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1553                                         *pnum, &file_pnum);
1554         if (ret2 >= 0) {
1555             /* Ignore errors.  This is just providing extra information; it
1556              * is useful but not necessary.
1557              */
1558             if (!file_pnum) {
1559                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1560                  * perfectly valid for the format block driver to point to such
1561                  * offsets, so catch it and mark everything as zero */
1562                 ret |= BDRV_BLOCK_ZERO;
1563             } else {
1564                 /* Limit request to the range reported by the protocol driver */
1565                 *pnum = file_pnum;
1566                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1567             }
1568         }
1569     }
1570 
1571     return ret;
1572 }
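
/*
 * Illustration: a format driver without a bdrv_co_get_block_status callback
 * reports every in-range sector as BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
 * a sector that is unallocated in this layer but known to read back as zeroes
 * comes back with BDRV_BLOCK_ZERO set and BDRV_BLOCK_ALLOCATED clear.
 */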
1573 
1574 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1575         BlockDriverState *base,
1576         int64_t sector_num,
1577         int nb_sectors,
1578         int *pnum)
1579 {
1580     BlockDriverState *p;
1581     int64_t ret = 0;
1582 
1583     assert(bs != base);
1584     for (p = bs; p != base; p = backing_bs(p)) {
1585         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
1586         if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1587             break;
1588         }
1589         /* [sector_num, pnum] unallocated on this layer, which could be only
1590          * the first part of [sector_num, nb_sectors].  */
1591         nb_sectors = MIN(nb_sectors, *pnum);
1592     }
1593     return ret;
1594 }
1595 
1596 /* Coroutine wrapper for bdrv_get_block_status_above() */
1597 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1598 {
1599     BdrvCoGetBlockStatusData *data = opaque;
1600 
1601     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1602                                                data->sector_num,
1603                                                data->nb_sectors,
1604                                                data->pnum);
1605     data->done = true;
1606 }
1607 
1608 /*
1609  * Synchronous wrapper around bdrv_co_get_block_status_above().
1610  *
1611  * See bdrv_co_get_block_status_above() for details.
1612  */
1613 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1614                                     BlockDriverState *base,
1615                                     int64_t sector_num,
1616                                     int nb_sectors, int *pnum)
1617 {
1618     Coroutine *co;
1619     BdrvCoGetBlockStatusData data = {
1620         .bs = bs,
1621         .base = base,
1622         .sector_num = sector_num,
1623         .nb_sectors = nb_sectors,
1624         .pnum = pnum,
1625         .done = false,
1626     };
1627 
1628     if (qemu_in_coroutine()) {
1629         /* Fast-path if already in coroutine context */
1630         bdrv_get_block_status_above_co_entry(&data);
1631     } else {
1632         AioContext *aio_context = bdrv_get_aio_context(bs);
1633 
1634         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1635         qemu_coroutine_enter(co, &data);
1636         while (!data.done) {
1637             aio_poll(aio_context, true);
1638         }
1639     }
1640     return data.ret;
1641 }
1642 
1643 int64_t bdrv_get_block_status(BlockDriverState *bs,
1644                               int64_t sector_num,
1645                               int nb_sectors, int *pnum)
1646 {
1647     return bdrv_get_block_status_above(bs, backing_bs(bs),
1648                                        sector_num, nb_sectors, pnum);
1649 }
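
/*
 * Example (illustrative sketch, kept out of the build): one way a caller
 * might walk an image and classify each extent with bdrv_get_block_status().
 * The helper name and the 1024-sector query granularity are assumptions made
 * for the sketch, not part of the block layer API.
 */
#if 0
static void example_dump_block_status(BlockDriverState *bs)
{
    int64_t total = bdrv_nb_sectors(bs);
    int64_t sector = 0;

    while (sector < total) {
        int pnum;
        int64_t ret = bdrv_get_block_status(bs, sector,
                                            MIN(total - sector, 1024), &pnum);
        if (ret < 0) {
            break;
        }
        /* *pnum tells us how far this answer extends, so whole extents can
         * be skipped in one step. */
        printf("sectors %" PRId64 "+%d: %s%s\n", sector, pnum,
               (ret & BDRV_BLOCK_ALLOCATED) ? "allocated" : "unallocated",
               (ret & BDRV_BLOCK_ZERO) ? " (reads as zero)" : "");
        sector += pnum;
    }
}
#endif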
1650 
1651 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1652                                    int nb_sectors, int *pnum)
1653 {
1654     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
1655     if (ret < 0) {
1656         return ret;
1657     }
1658     return !!(ret & BDRV_BLOCK_ALLOCATED);
1659 }
1660 
1661 /*
1662  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1663  *
1664  * Return true if the given sector is allocated in any image between
1665  * BASE and TOP (TOP included, BASE excluded).  BASE can be NULL to check
1666  * the whole chain.  Return false otherwise.
1667  *
1668  * 'pnum' is set to the number of sectors (including and immediately following
1669  *  the specified sector) that are known to be in the same
1670  *  allocated/unallocated state.
1671  *
1672  */
1673 int bdrv_is_allocated_above(BlockDriverState *top,
1674                             BlockDriverState *base,
1675                             int64_t sector_num,
1676                             int nb_sectors, int *pnum)
1677 {
1678     BlockDriverState *intermediate;
1679     int ret, n = nb_sectors;
1680 
1681     intermediate = top;
1682     while (intermediate && intermediate != base) {
1683         int pnum_inter;
1684         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1685                                 &pnum_inter);
1686         if (ret < 0) {
1687             return ret;
1688         } else if (ret) {
1689             *pnum = pnum_inter;
1690             return 1;
1691         }
1692 
1693         /*
1694          * [sector_num, nb_sectors] is unallocated on top but intermediate
1695          * might have
1696          *
1697          * [sector_num+x, nb_sectors] allocated.
1698          */
1699         if (n > pnum_inter &&
1700             (intermediate == top ||
1701              sector_num + pnum_inter < intermediate->total_sectors)) {
1702             n = pnum_inter;
1703         }
1704 
1705         intermediate = backing_bs(intermediate);
1706     }
1707 
1708     *pnum = n;
1709     return 0;
1710 }
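
/*
 * Example (illustrative sketch, kept out of the build): using
 * bdrv_is_allocated_above() to count how many sectors of a range are
 * allocated somewhere above 'base', up to and including 'top', and would
 * therefore need copying before those layers could be dropped.  The helper
 * name is an assumption made for the sketch.
 */
#if 0
static int64_t example_count_allocated_above(BlockDriverState *top,
                                             BlockDriverState *base,
                                             int64_t sector_num, int nb_sectors)
{
    int64_t allocated = 0;

    while (nb_sectors > 0) {
        int pnum;
        int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
                                          &pnum);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            /* The first pnum sectors are allocated somewhere above base */
            allocated += pnum;
        }
        sector_num += pnum;
        nb_sectors -= pnum;
    }
    return allocated;
}
#endif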
1711 
1712 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1713                           const uint8_t *buf, int nb_sectors)
1714 {
1715     BlockDriver *drv = bs->drv;
1716     int ret;
1717 
1718     if (!drv) {
1719         return -ENOMEDIUM;
1720     }
1721     if (!drv->bdrv_write_compressed) {
1722         return -ENOTSUP;
1723     }
1724     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1725     if (ret < 0) {
1726         return ret;
1727     }
1728 
1729     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1730 
1731     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1732 }
1733 
1734 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1735                       int64_t pos, int size)
1736 {
1737     QEMUIOVector qiov;
1738     struct iovec iov = {
1739         .iov_base   = (void *) buf,
1740         .iov_len    = size,
1741     };
1742 
1743     qemu_iovec_init_external(&qiov, &iov, 1);
1744     return bdrv_writev_vmstate(bs, &qiov, pos);
1745 }
1746 
1747 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1748 {
1749     BlockDriver *drv = bs->drv;
1750 
1751     if (!drv) {
1752         return -ENOMEDIUM;
1753     } else if (drv->bdrv_save_vmstate) {
1754         return drv->bdrv_save_vmstate(bs, qiov, pos);
1755     } else if (bs->file) {
1756         return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
1757     }
1758 
1759     return -ENOTSUP;
1760 }
1761 
1762 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1763                       int64_t pos, int size)
1764 {
1765     BlockDriver *drv = bs->drv;
1766     if (!drv)
1767         return -ENOMEDIUM;
1768     if (drv->bdrv_load_vmstate)
1769         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1770     if (bs->file)
1771         return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
1772     return -ENOTSUP;
1773 }
1774 
1775 /**************************************************************/
1776 /* async I/Os */
1777 
1778 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1779                            QEMUIOVector *qiov, int nb_sectors,
1780                            BlockCompletionFunc *cb, void *opaque)
1781 {
1782     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1783 
1784     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1785                                  cb, opaque, false);
1786 }
1787 
1788 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1789                             QEMUIOVector *qiov, int nb_sectors,
1790                             BlockCompletionFunc *cb, void *opaque)
1791 {
1792     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1793 
1794     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1795                                  cb, opaque, true);
1796 }
1797 
1798 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
1799         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
1800         BlockCompletionFunc *cb, void *opaque)
1801 {
1802     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
1803 
1804     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
1805                                  BDRV_REQ_ZERO_WRITE | flags,
1806                                  cb, opaque, true);
1807 }
1808 
1809 
1810 typedef struct MultiwriteCB {
1811     int error;
1812     int num_requests;
1813     int num_callbacks;
1814     struct {
1815         BlockCompletionFunc *cb;
1816         void *opaque;
1817         QEMUIOVector *free_qiov;
1818     } callbacks[];
1819 } MultiwriteCB;
1820 
1821 static void multiwrite_user_cb(MultiwriteCB *mcb)
1822 {
1823     int i;
1824 
1825     for (i = 0; i < mcb->num_callbacks; i++) {
1826         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1827         if (mcb->callbacks[i].free_qiov) {
1828             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
1829         }
1830         g_free(mcb->callbacks[i].free_qiov);
1831     }
1832 }
1833 
1834 static void multiwrite_cb(void *opaque, int ret)
1835 {
1836     MultiwriteCB *mcb = opaque;
1837 
1838     trace_multiwrite_cb(mcb, ret);
1839 
1840     if (ret < 0 && !mcb->error) {
1841         mcb->error = ret;
1842     }
1843 
1844     mcb->num_requests--;
1845     if (mcb->num_requests == 0) {
1846         multiwrite_user_cb(mcb);
1847         g_free(mcb);
1848     }
1849 }
1850 
1851 static int multiwrite_req_compare(const void *a, const void *b)
1852 {
1853     const BlockRequest *req1 = a, *req2 = b;
1854 
1855     /*
1856      * Note that we can't simply subtract req2->sector from req1->sector
1857      * here as that could overflow the return value.
1858      */
1859     if (req1->sector > req2->sector) {
1860         return 1;
1861     } else if (req1->sector < req2->sector) {
1862         return -1;
1863     } else {
1864         return 0;
1865     }
1866 }
1867 
1868 /*
1869  * Takes a bunch of requests and tries to merge them. Returns the number of
1870  * requests that remain after merging.
1871  */
1872 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
1873     int num_reqs, MultiwriteCB *mcb)
1874 {
1875     int i, outidx;
1876 
1877     // Sort requests by start sector
1878     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
1879 
1880     // Check if adjacent requests are sequential or overlapping. If so, combine
1881     // them into a single request.
1882     outidx = 0;
1883     for (i = 1; i < num_reqs; i++) {
1884         int merge = 0;
1885         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
1886 
1887         // Handle exactly sequential writes and overlapping writes.
1888         if (reqs[i].sector <= oldreq_last) {
1889             merge = 1;
1890         }
1891 
1892         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
1893             bs->bl.max_iov) {
1894             merge = 0;
1895         }
1896 
1897         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
1898             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
1899             merge = 0;
1900         }
1901 
1902         if (merge) {
1903             size_t size;
1904             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
1905             qemu_iovec_init(qiov,
1906                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
1907 
1908             // Add the first request to the merged one. If the requests are
1909             // overlapping, drop the last sectors of the first request.
1910             size = (reqs[i].sector - reqs[outidx].sector) << 9;
1911             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
1912 
1913             // We shouldn't need to add any zeros between the two requests
1914             assert(reqs[i].sector <= oldreq_last);
1915 
1916             // Add the second request
1917             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
1918 
1919             // Add tail of first request, if necessary
1920             if (qiov->size < reqs[outidx].qiov->size) {
1921                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
1922                                   reqs[outidx].qiov->size - qiov->size);
1923             }
1924 
1925             reqs[outidx].nb_sectors = qiov->size >> 9;
1926             reqs[outidx].qiov = qiov;
1927 
1928             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
1929         } else {
1930             outidx++;
1931             reqs[outidx].sector     = reqs[i].sector;
1932             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
1933             reqs[outidx].qiov       = reqs[i].qiov;
1934         }
1935     }
1936 
1937     if (bs->blk) {
1938         block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
1939                               num_reqs - outidx - 1);
1940     }
1941 
1942     return outidx + 1;
1943 }
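
/*
 * Worked example for multiwrite_merge() above (illustrative numbers): three
 * sorted requests covering sectors 0+8, 8+8 and 100+8 come out as two
 * requests, 0+16 (the first two are exactly sequential, so their qiovs are
 * simply concatenated) and 100+8 (separated by a gap, so left alone).
 * Overlapping requests such as 0+8 and 4+8 are merged into 0+12, with the
 * overlapping sectors taken from the second request and the tail of the
 * first request re-appended only if it extends past the second one.
 */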
1944 
1945 /*
1946  * Submit multiple AIO write requests at once.
1947  *
1948  * On success, the function returns 0 and all requests in the reqs array have
1949  * been submitted. In the error case this function returns -1, and any of the
1950  * requests may or may not have been submitted yet. In particular, this means
1951  * that the callback will be called for some of the requests but not for
1952  * others. The caller must check the error field of each BlockRequest to know
1953  * which callbacks to wait for (if error != 0, no callback will be called).
1954  *
1955  * The implementation may modify the contents of the reqs array, e.g. to merge
1956  * requests. However, the fields opaque and error are left unmodified as they
1957  * are used to signal failure for a single request to the caller.
1958  */
1959 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
1960 {
1961     MultiwriteCB *mcb;
1962     int i;
1963 
1964     /* don't submit writes if we don't have a medium */
1965     if (bs->drv == NULL) {
1966         for (i = 0; i < num_reqs; i++) {
1967             reqs[i].error = -ENOMEDIUM;
1968         }
1969         return -1;
1970     }
1971 
1972     if (num_reqs == 0) {
1973         return 0;
1974     }
1975 
1976     // Create MultiwriteCB structure
1977     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
1978     mcb->num_requests = 0;
1979     mcb->num_callbacks = num_reqs;
1980 
1981     for (i = 0; i < num_reqs; i++) {
1982         mcb->callbacks[i].cb = reqs[i].cb;
1983         mcb->callbacks[i].opaque = reqs[i].opaque;
1984     }
1985 
1986     // Check for mergeable requests
1987     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
1988 
1989     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
1990 
1991     /* Run the aio requests. */
1992     mcb->num_requests = num_reqs;
1993     for (i = 0; i < num_reqs; i++) {
1994         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
1995                               reqs[i].nb_sectors, reqs[i].flags,
1996                               multiwrite_cb, mcb,
1997                               true);
1998     }
1999 
2000     return 0;
2001 }
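
/*
 * Example (illustrative sketch, kept out of the build): a minimal
 * bdrv_aio_multiwrite() caller.  The callback, the fixed sector numbers and
 * the two externally prepared qiovs are assumptions made for the sketch; the
 * point is the error handling described above: if -1 is returned, only the
 * requests whose error field is still 0 will get their callback.
 */
#if 0
static void example_multiwrite_cb(void *opaque, int ret)
{
    /* Called once per submitted request, with the merged result code */
}

static void example_submit_pair(BlockDriverState *bs,
                                QEMUIOVector *qiov0, QEMUIOVector *qiov1)
{
    BlockRequest reqs[2] = {
        {
            .sector     = 0,
            .nb_sectors = qiov0->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov0,
            .cb         = example_multiwrite_cb,
        },
        {
            .sector     = 64,
            .nb_sectors = qiov1->size >> BDRV_SECTOR_BITS,
            .qiov       = qiov1,
            .cb         = example_multiwrite_cb,
        },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        int i;
        for (i = 0; i < 2; i++) {
            if (reqs[i].error) {
                /* No callback will run for this request; handle it here */
            }
        }
    }
}
#endif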
2002 
2003 void bdrv_aio_cancel(BlockAIOCB *acb)
2004 {
2005     qemu_aio_ref(acb);
2006     bdrv_aio_cancel_async(acb);
2007     while (acb->refcnt > 1) {
2008         if (acb->aiocb_info->get_aio_context) {
2009             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2010         } else if (acb->bs) {
2011             aio_poll(bdrv_get_aio_context(acb->bs), true);
2012         } else {
2013             abort();
2014         }
2015     }
2016     qemu_aio_unref(acb);
2017 }
2018 
2019 /* Async version of aio cancel. The caller is not blocked if the acb implements
2020  * cancel_async; otherwise we do nothing and let the request complete normally.
2021  * In either case the completion callback must be called. */
2022 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2023 {
2024     if (acb->aiocb_info->cancel_async) {
2025         acb->aiocb_info->cancel_async(acb);
2026     }
2027 }
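
/*
 * Example (illustrative sketch, kept out of the build): cancelling an
 * in-flight read.  Whether the driver cancels immediately or lets the request
 * finish, the completion callback still runs exactly once, so it stays the
 * single place where the request's resources are released.  The names and the
 * fixed sector number are assumptions made for the sketch.
 */
#if 0
static void example_read_cb(void *opaque, int ret)
{
    /* ret is -ECANCELED if the driver could cancel, or the normal result if
     * the request had already completed; clean up in either case. */
}

static void example_cancel_read(BlockDriverState *bs, QEMUIOVector *qiov)
{
    BlockAIOCB *acb = bdrv_aio_readv(bs, 0, qiov,
                                     qiov->size >> BDRV_SECTOR_BITS,
                                     example_read_cb, NULL);

    /* Non-blocking: request cancellation if the driver supports it */
    bdrv_aio_cancel_async(acb);

    /* Blocking alternative: bdrv_aio_cancel(acb) polls the AioContext until
     * the callback has run and the last reference is dropped. */
}
#endif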
2028 
2029 /**************************************************************/
2030 /* async block device emulation */
2031 
2032 typedef struct BlockAIOCBSync {
2033     BlockAIOCB common;
2034     QEMUBH *bh;
2035     int ret;
2036     /* vector translation state */
2037     QEMUIOVector *qiov;
2038     uint8_t *bounce;
2039     int is_write;
2040 } BlockAIOCBSync;
2041 
2042 static const AIOCBInfo bdrv_em_aiocb_info = {
2043     .aiocb_size         = sizeof(BlockAIOCBSync),
2044 };
2045 
2046 static void bdrv_aio_bh_cb(void *opaque)
2047 {
2048     BlockAIOCBSync *acb = opaque;
2049 
2050     if (!acb->is_write && acb->ret >= 0) {
2051         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
2052     }
2053     qemu_vfree(acb->bounce);
2054     acb->common.cb(acb->common.opaque, acb->ret);
2055     qemu_bh_delete(acb->bh);
2056     acb->bh = NULL;
2057     qemu_aio_unref(acb);
2058 }
2059 
2060 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2061                                       int64_t sector_num,
2062                                       QEMUIOVector *qiov,
2063                                       int nb_sectors,
2064                                       BlockCompletionFunc *cb,
2065                                       void *opaque,
2066                                       int is_write)
2067 
2068 {
2069     BlockAIOCBSync *acb;
2070 
2071     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
2072     acb->is_write = is_write;
2073     acb->qiov = qiov;
2074     acb->bounce = qemu_try_blockalign(bs, qiov->size);
2075     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
2076 
2077     if (acb->bounce == NULL) {
2078         acb->ret = -ENOMEM;
2079     } else if (is_write) {
2080         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
2081         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2082     } else {
2083         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2084     }
2085 
2086     qemu_bh_schedule(acb->bh);
2087 
2088     return &acb->common;
2089 }
2090 
2091 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2092         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2093         BlockCompletionFunc *cb, void *opaque)
2094 {
2095     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2096 }
2097 
2098 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2099         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2100         BlockCompletionFunc *cb, void *opaque)
2101 {
2102     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2103 }
2104 
2105 
2106 typedef struct BlockAIOCBCoroutine {
2107     BlockAIOCB common;
2108     BlockRequest req;
2109     bool is_write;
2110     bool need_bh;
2111     bool *done;
2112     QEMUBH *bh;
2113 } BlockAIOCBCoroutine;
2114 
2115 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2116     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2117 };
2118 
2119 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2120 {
2121     if (!acb->need_bh) {
2122         acb->common.cb(acb->common.opaque, acb->req.error);
2123         qemu_aio_unref(acb);
2124     }
2125 }
2126 
2127 static void bdrv_co_em_bh(void *opaque)
2128 {
2129     BlockAIOCBCoroutine *acb = opaque;
2130 
2131     assert(!acb->need_bh);
2132     qemu_bh_delete(acb->bh);
2133     bdrv_co_complete(acb);
2134 }
2135 
2136 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2137 {
2138     acb->need_bh = false;
2139     if (acb->req.error != -EINPROGRESS) {
2140         BlockDriverState *bs = acb->common.bs;
2141 
2142         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2143         qemu_bh_schedule(acb->bh);
2144     }
2145 }
2146 
2147 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2148 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2149 {
2150     BlockAIOCBCoroutine *acb = opaque;
2151     BlockDriverState *bs = acb->common.bs;
2152 
2153     if (!acb->is_write) {
2154         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2155             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2156     } else {
2157         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2158             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2159     }
2160 
2161     bdrv_co_complete(acb);
2162 }
2163 
2164 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2165                                          int64_t sector_num,
2166                                          QEMUIOVector *qiov,
2167                                          int nb_sectors,
2168                                          BdrvRequestFlags flags,
2169                                          BlockCompletionFunc *cb,
2170                                          void *opaque,
2171                                          bool is_write)
2172 {
2173     Coroutine *co;
2174     BlockAIOCBCoroutine *acb;
2175 
2176     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2177     acb->need_bh = true;
2178     acb->req.error = -EINPROGRESS;
2179     acb->req.sector = sector_num;
2180     acb->req.nb_sectors = nb_sectors;
2181     acb->req.qiov = qiov;
2182     acb->req.flags = flags;
2183     acb->is_write = is_write;
2184 
2185     co = qemu_coroutine_create(bdrv_co_do_rw);
2186     qemu_coroutine_enter(co, acb);
2187 
2188     bdrv_co_maybe_schedule_bh(acb);
2189     return &acb->common;
2190 }
2191 
2192 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2193 {
2194     BlockAIOCBCoroutine *acb = opaque;
2195     BlockDriverState *bs = acb->common.bs;
2196 
2197     acb->req.error = bdrv_co_flush(bs);
2198     bdrv_co_complete(acb);
2199 }
2200 
2201 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2202         BlockCompletionFunc *cb, void *opaque)
2203 {
2204     Coroutine *co;
2205     BlockAIOCBCoroutine *acb;
2206 
2207     trace_bdrv_aio_flush(bs, opaque);
2208 
2209     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2210     acb->need_bh = true;
2211     acb->req.error = -EINPROGRESS;
2212 
2213     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2214     qemu_coroutine_enter(co, acb);
2215 
2216     bdrv_co_maybe_schedule_bh(acb);
2217     return &acb->common;
2218 }
2219 
2220 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2221 {
2222     BlockAIOCBCoroutine *acb = opaque;
2223     BlockDriverState *bs = acb->common.bs;
2224 
2225     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2226     bdrv_co_complete(acb);
2227 }
2228 
2229 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2230         int64_t sector_num, int nb_sectors,
2231         BlockCompletionFunc *cb, void *opaque)
2232 {
2233     Coroutine *co;
2234     BlockAIOCBCoroutine *acb;
2235 
2236     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2237 
2238     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2239     acb->need_bh = true;
2240     acb->req.error = -EINPROGRESS;
2241     acb->req.sector = sector_num;
2242     acb->req.nb_sectors = nb_sectors;
2243     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2244     qemu_coroutine_enter(co, acb);
2245 
2246     bdrv_co_maybe_schedule_bh(acb);
2247     return &acb->common;
2248 }
2249 
2250 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2251                    BlockCompletionFunc *cb, void *opaque)
2252 {
2253     BlockAIOCB *acb;
2254 
2255     acb = g_malloc(aiocb_info->aiocb_size);
2256     acb->aiocb_info = aiocb_info;
2257     acb->bs = bs;
2258     acb->cb = cb;
2259     acb->opaque = opaque;
2260     acb->refcnt = 1;
2261     return acb;
2262 }
2263 
2264 void qemu_aio_ref(void *p)
2265 {
2266     BlockAIOCB *acb = p;
2267     acb->refcnt++;
2268 }
2269 
2270 void qemu_aio_unref(void *p)
2271 {
2272     BlockAIOCB *acb = p;
2273     assert(acb->refcnt > 0);
2274     if (--acb->refcnt == 0) {
2275         g_free(acb);
2276     }
2277 }
2278 
2279 /**************************************************************/
2280 /* Coroutine block device emulation */
2281 
2282 typedef struct CoroutineIOCompletion {
2283     Coroutine *coroutine;
2284     int ret;
2285 } CoroutineIOCompletion;
2286 
2287 static void bdrv_co_io_em_complete(void *opaque, int ret)
2288 {
2289     CoroutineIOCompletion *co = opaque;
2290 
2291     co->ret = ret;
2292     qemu_coroutine_enter(co->coroutine, NULL);
2293 }
2294 
2295 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2296                                       int nb_sectors, QEMUIOVector *iov,
2297                                       bool is_write)
2298 {
2299     CoroutineIOCompletion co = {
2300         .coroutine = qemu_coroutine_self(),
2301     };
2302     BlockAIOCB *acb;
2303 
2304     if (is_write) {
2305         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2306                                        bdrv_co_io_em_complete, &co);
2307     } else {
2308         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2309                                       bdrv_co_io_em_complete, &co);
2310     }
2311 
2312     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2313     if (!acb) {
2314         return -EIO;
2315     }
2316     qemu_coroutine_yield();
2317 
2318     return co.ret;
2319 }
2320 
2321 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2322                                          int64_t sector_num, int nb_sectors,
2323                                          QEMUIOVector *iov)
2324 {
2325     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2326 }
2327 
2328 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2329                                          int64_t sector_num, int nb_sectors,
2330                                          QEMUIOVector *iov)
2331 {
2332     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2333 }
2334 
2335 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2336 {
2337     RwCo *rwco = opaque;
2338 
2339     rwco->ret = bdrv_co_flush(rwco->bs);
2340 }
2341 
2342 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2343 {
2344     int ret;
2345     BdrvTrackedRequest req;
2346 
2347     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2348         bdrv_is_sg(bs)) {
2349         return 0;
2350     }
2351 
2352     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2353     /* Write back cached data to the OS even with cache=unsafe */
2354     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2355     if (bs->drv->bdrv_co_flush_to_os) {
2356         ret = bs->drv->bdrv_co_flush_to_os(bs);
2357         if (ret < 0) {
2358             goto out;
2359         }
2360     }
2361 
2362     /* But don't actually force it to the disk with cache=unsafe */
2363     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2364         goto flush_parent;
2365     }
2366 
2367     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2368     if (bs->drv->bdrv_co_flush_to_disk) {
2369         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2370     } else if (bs->drv->bdrv_aio_flush) {
2371         BlockAIOCB *acb;
2372         CoroutineIOCompletion co = {
2373             .coroutine = qemu_coroutine_self(),
2374         };
2375 
2376         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2377         if (acb == NULL) {
2378             ret = -EIO;
2379         } else {
2380             qemu_coroutine_yield();
2381             ret = co.ret;
2382         }
2383     } else {
2384         /*
2385          * Some block drivers always operate in either writethrough or unsafe
2386          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2387          * know how the server works (because the behaviour is hardcoded or
2388          * depends on server-side configuration), so we can't ensure that
2389          * everything is safe on disk. Returning an error doesn't work because
2390          * that would break guests even if the server operates in writethrough
2391          * mode.
2392          *
2393          * Let's hope the user knows what they're doing.
2394          */
2395         ret = 0;
2396     }
2397     if (ret < 0) {
2398         goto out;
2399     }
2400 
2401     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2402      * in the case of cache=unsafe, so there are no useless flushes.
2403      */
2404 flush_parent:
2405     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2406 out:
2407     tracked_request_end(&req);
2408     return ret;
2409 }
2410 
2411 int bdrv_flush(BlockDriverState *bs)
2412 {
2413     Coroutine *co;
2414     RwCo rwco = {
2415         .bs = bs,
2416         .ret = NOT_DONE,
2417     };
2418 
2419     if (qemu_in_coroutine()) {
2420         /* Fast-path if already in coroutine context */
2421         bdrv_flush_co_entry(&rwco);
2422     } else {
2423         AioContext *aio_context = bdrv_get_aio_context(bs);
2424 
2425         co = qemu_coroutine_create(bdrv_flush_co_entry);
2426         qemu_coroutine_enter(co, &rwco);
2427         while (rwco.ret == NOT_DONE) {
2428             aio_poll(aio_context, true);
2429         }
2430     }
2431 
2432     return rwco.ret;
2433 }
2434 
2435 typedef struct DiscardCo {
2436     BlockDriverState *bs;
2437     int64_t sector_num;
2438     int nb_sectors;
2439     int ret;
2440 } DiscardCo;

2441 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2442 {
2443     DiscardCo *rwco = opaque;
2444 
2445     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2446 }
2447 
2448 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2449                                  int nb_sectors)
2450 {
2451     BdrvTrackedRequest req;
2452     int max_discard, ret;
2453 
2454     if (!bs->drv) {
2455         return -ENOMEDIUM;
2456     }
2457 
2458     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2459     if (ret < 0) {
2460         return ret;
2461     } else if (bs->read_only) {
2462         return -EPERM;
2463     }
2464 
2465     /* Do nothing if disabled.  */
2466     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2467         return 0;
2468     }
2469 
2470     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2471         return 0;
2472     }
2473 
2474     tracked_request_begin(&req, bs, sector_num, nb_sectors,
2475                           BDRV_TRACKED_DISCARD);
2476     bdrv_set_dirty(bs, sector_num, nb_sectors);
2477 
2478     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2479     while (nb_sectors > 0) {
2480         int num = nb_sectors;
2482 
2483         /* align request */
2484         if (bs->bl.discard_alignment &&
2485             num >= bs->bl.discard_alignment &&
2486             sector_num % bs->bl.discard_alignment) {
2487             if (num > bs->bl.discard_alignment) {
2488                 num = bs->bl.discard_alignment;
2489             }
2490             num -= sector_num % bs->bl.discard_alignment;
2491         }
2492 
2493         /* limit request size */
2494         if (num > max_discard) {
2495             num = max_discard;
2496         }
2497 
2498         if (bs->drv->bdrv_co_discard) {
2499             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2500         } else {
2501             BlockAIOCB *acb;
2502             CoroutineIOCompletion co = {
2503                 .coroutine = qemu_coroutine_self(),
2504             };
2505 
2506             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2507                                             bdrv_co_io_em_complete, &co);
2508             if (acb == NULL) {
2509                 ret = -EIO;
2510                 goto out;
2511             } else {
2512                 qemu_coroutine_yield();
2513                 ret = co.ret;
2514             }
2515         }
2516         if (ret && ret != -ENOTSUP) {
2517             goto out;
2518         }
2519 
2520         sector_num += num;
2521         nb_sectors -= num;
2522     }
2523     ret = 0;
2524 out:
2525     tracked_request_end(&req);
2526     return ret;
2527 }
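
/*
 * Worked example of the request splitting above, as a self-contained sketch
 * (kept out of the build): with discard_alignment = 8 and max_discard = 32,
 * a discard of sectors [5, 105) is issued as 5+3 (up to the next aligned
 * boundary), then 8+32, 40+32, 72+32 and finally 104+1.  The helper and its
 * parameters are assumptions made for the sketch.
 */
#if 0
static void example_split_discard(int64_t sector_num, int nb_sectors,
                                  int discard_alignment, int max_discard)
{
    while (nb_sectors > 0) {
        int num = nb_sectors;

        /* Shorten the first chunk so that later chunks start aligned */
        if (discard_alignment &&
            num >= discard_alignment &&
            sector_num % discard_alignment) {
            if (num > discard_alignment) {
                num = discard_alignment;
            }
            num -= sector_num % discard_alignment;
        }

        /* Clamp every chunk to the driver's limit */
        if (num > max_discard) {
            num = max_discard;
        }

        printf("discard %" PRId64 "+%d\n", sector_num, num);
        sector_num += num;
        nb_sectors -= num;
    }
}
#endif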
2528 
2529 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2530 {
2531     Coroutine *co;
2532     DiscardCo rwco = {
2533         .bs = bs,
2534         .sector_num = sector_num,
2535         .nb_sectors = nb_sectors,
2536         .ret = NOT_DONE,
2537     };
2538 
2539     if (qemu_in_coroutine()) {
2540         /* Fast-path if already in coroutine context */
2541         bdrv_discard_co_entry(&rwco);
2542     } else {
2543         AioContext *aio_context = bdrv_get_aio_context(bs);
2544 
2545         co = qemu_coroutine_create(bdrv_discard_co_entry);
2546         qemu_coroutine_enter(co, &rwco);
2547         while (rwco.ret == NOT_DONE) {
2548             aio_poll(aio_context, true);
2549         }
2550     }
2551 
2552     return rwco.ret;
2553 }
2554 
2555 typedef struct {
2556     CoroutineIOCompletion *co;
2557     QEMUBH *bh;
2558 } BdrvIoctlCompletionData;
2559 
2560 static void bdrv_ioctl_bh_cb(void *opaque)
2561 {
2562     BdrvIoctlCompletionData *data = opaque;
2563 
2564     bdrv_co_io_em_complete(data->co, -ENOTSUP);
2565     qemu_bh_delete(data->bh);
2566 }
2567 
2568 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2569 {
2570     BlockDriver *drv = bs->drv;
2571     BdrvTrackedRequest tracked_req;
2572     CoroutineIOCompletion co = {
2573         .coroutine = qemu_coroutine_self(),
2574     };
2575     BlockAIOCB *acb;
2576 
2577     tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2578     if (!drv || !drv->bdrv_aio_ioctl) {
2579         co.ret = -ENOTSUP;
2580         goto out;
2581     }
2582 
2583     acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2584     if (!acb) {
2585         BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
2586         data->bh = aio_bh_new(bdrv_get_aio_context(bs),
2587                                 bdrv_ioctl_bh_cb, data);
2588         data->co = &co;
2589         qemu_bh_schedule(data->bh);
2590     }
2591     qemu_coroutine_yield();
2592 out:
2593     tracked_request_end(&tracked_req);
2594     return co.ret;
2595 }
2596 
2597 typedef struct {
2598     BlockDriverState *bs;
2599     int req;
2600     void *buf;
2601     int ret;
2602 } BdrvIoctlCoData;
2603 
2604 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2605 {
2606     BdrvIoctlCoData *data = opaque;
2607     data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2608 }
2609 
2610 /* needed for generic scsi interface */
2611 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2612 {
2613     BdrvIoctlCoData data = {
2614         .bs = bs,
2615         .req = req,
2616         .buf = buf,
2617         .ret = -EINPROGRESS,
2618     };
2619 
2620     if (qemu_in_coroutine()) {
2621         /* Fast-path if already in coroutine context */
2622         bdrv_co_ioctl_entry(&data);
2623     } else {
2624         Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2625 
2626         qemu_coroutine_enter(co, &data);
2627         while (data.ret == -EINPROGRESS) {
2628             aio_poll(bdrv_get_aio_context(bs), true);
2629         }
2630     }
2631     return data.ret;
2632 }
2633 
2634 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2635 {
2636     BlockAIOCBCoroutine *acb = opaque;
2637     acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2638                                       acb->req.req, acb->req.buf);
2639     bdrv_co_complete(acb);
2640 }
2641 
2642 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2643         unsigned long int req, void *buf,
2644         BlockCompletionFunc *cb, void *opaque)
2645 {
2646     BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2647                                             bs, cb, opaque);
2648     Coroutine *co;
2649 
2650     acb->need_bh = true;
2651     acb->req.error = -EINPROGRESS;
2652     acb->req.req = req;
2653     acb->req.buf = buf;
2654     co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2655     qemu_coroutine_enter(co, acb);
2656 
2657     bdrv_co_maybe_schedule_bh(acb);
2658     return &acb->common;
2659 }
2660 
2661 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2662 {
2663     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2664 }
2665 
2666 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2667 {
2668     return memset(qemu_blockalign(bs, size), 0, size);
2669 }
2670 
2671 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2672 {
2673     size_t align = bdrv_opt_mem_align(bs);
2674 
2675     /* Ensure that NULL is never returned on success */
2676     assert(align > 0);
2677     if (size == 0) {
2678         size = align;
2679     }
2680 
2681     return qemu_try_memalign(align, size);
2682 }
2683 
2684 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2685 {
2686     void *mem = qemu_try_blockalign(bs, size);
2687 
2688     if (mem) {
2689         memset(mem, 0, size);
2690     }
2691 
2692     return mem;
2693 }
2694 
2695 /*
2696  * Check if all memory in this vector is sector aligned.
2697  */
2698 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2699 {
2700     int i;
2701     size_t alignment = bdrv_min_mem_align(bs);
2702 
2703     for (i = 0; i < qiov->niov; i++) {
2704         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2705             return false;
2706         }
2707         if (qiov->iov[i].iov_len % alignment) {
2708             return false;
2709         }
2710     }
2711 
2712     return true;
2713 }
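
/*
 * Example (illustrative sketch, kept out of the build): building a vector
 * that passes bdrv_qiov_is_aligned(), by allocating the buffer with
 * qemu_blockalign() and keeping its length a multiple of the alignment.
 * Assumes the usual power-of-two alignments; the helper name is an assumption
 * made for the sketch.
 */
#if 0
static bool example_build_aligned_qiov(BlockDriverState *bs, QEMUIOVector *qiov)
{
    size_t align = MAX(bdrv_opt_mem_align(bs), bdrv_min_mem_align(bs));
    void *buf = qemu_blockalign(bs, align);

    qemu_iovec_init(qiov, 1);
    qemu_iovec_add(qiov, buf, align);

    /* The base comes from qemu_blockalign() and the length is a multiple of
     * the minimum memory alignment, so the check above succeeds. */
    return bdrv_qiov_is_aligned(bs, qiov);
}
#endif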
2714 
2715 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2716                                     NotifierWithReturn *notifier)
2717 {
2718     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2719 }
2720 
2721 void bdrv_io_plug(BlockDriverState *bs)
2722 {
2723     BlockDriver *drv = bs->drv;
2724     if (drv && drv->bdrv_io_plug) {
2725         drv->bdrv_io_plug(bs);
2726     } else if (bs->file) {
2727         bdrv_io_plug(bs->file->bs);
2728     }
2729 }
2730 
2731 void bdrv_io_unplug(BlockDriverState *bs)
2732 {
2733     BlockDriver *drv = bs->drv;
2734     if (drv && drv->bdrv_io_unplug) {
2735         drv->bdrv_io_unplug(bs);
2736     } else if (bs->file) {
2737         bdrv_io_unplug(bs->file->bs);
2738     }
2739 }
2740 
2741 void bdrv_flush_io_queue(BlockDriverState *bs)
2742 {
2743     BlockDriver *drv = bs->drv;
2744     if (drv && drv->bdrv_flush_io_queue) {
2745         drv->bdrv_flush_io_queue(bs);
2746     } else if (bs->file) {
2747         bdrv_flush_io_queue(bs->file->bs);
2748     }
2749     bdrv_start_throttled_reqs(bs);
2750 }
2751 
2752 void bdrv_drained_begin(BlockDriverState *bs)
2753 {
2754     if (!bs->quiesce_counter++) {
2755         aio_disable_external(bdrv_get_aio_context(bs));
2756     }
2757     bdrv_drain(bs);
2758 }
2759 
2760 void bdrv_drained_end(BlockDriverState *bs)
2761 {
2762     assert(bs->quiesce_counter > 0);
2763     if (--bs->quiesce_counter > 0) {
2764         return;
2765     }
2766     aio_enable_external(bdrv_get_aio_context(bs));
2767 }
2768