xref: /openbmc/qemu/block/io.c (revision be9f8a08727e46c790adb8caa8a4525a1e8e9e73)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "trace.h"
26 #include "sysemu/qtest.h"
27 #include "block/blockjob.h"
28 #include "block/block_int.h"
29 
30 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
31 
32 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
33         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
34         BlockCompletionFunc *cb, void *opaque);
35 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
36         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
37         BlockCompletionFunc *cb, void *opaque);
38 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
39                                          int64_t sector_num, int nb_sectors,
40                                          QEMUIOVector *iov);
41 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
42                                          int64_t sector_num, int nb_sectors,
43                                          QEMUIOVector *iov);
44 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
45     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
46     BdrvRequestFlags flags);
47 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
48     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
49     BdrvRequestFlags flags);
50 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
51                                          int64_t sector_num,
52                                          QEMUIOVector *qiov,
53                                          int nb_sectors,
54                                          BdrvRequestFlags flags,
55                                          BlockCompletionFunc *cb,
56                                          void *opaque,
57                                          bool is_write);
58 static void coroutine_fn bdrv_co_do_rw(void *opaque);
59 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
60     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
61 
62 /* throttling disk I/O limits */
63 void bdrv_set_io_limits(BlockDriverState *bs,
64                         ThrottleConfig *cfg)
65 {
66     int i;
67 
68     throttle_config(&bs->throttle_state, cfg);
69 
70     for (i = 0; i < 2; i++) {
71         qemu_co_enter_next(&bs->throttled_reqs[i]);
72     }
73 }
74 
75 /* this function drains all the throttled I/Os */
76 static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
77 {
78     bool drained = false;
79     bool enabled = bs->io_limits_enabled;
80     int i;
81 
82     bs->io_limits_enabled = false;
83 
84     for (i = 0; i < 2; i++) {
85         while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
86             drained = true;
87         }
88     }
89 
90     bs->io_limits_enabled = enabled;
91 
92     return drained;
93 }
94 
95 void bdrv_io_limits_disable(BlockDriverState *bs)
96 {
97     bs->io_limits_enabled = false;
98 
99     bdrv_start_throttled_reqs(bs);
100 
101     throttle_destroy(&bs->throttle_state);
102 }
103 
104 static void bdrv_throttle_read_timer_cb(void *opaque)
105 {
106     BlockDriverState *bs = opaque;
107     qemu_co_enter_next(&bs->throttled_reqs[0]);
108 }
109 
110 static void bdrv_throttle_write_timer_cb(void *opaque)
111 {
112     BlockDriverState *bs = opaque;
113     qemu_co_enter_next(&bs->throttled_reqs[1]);
114 }
115 
116 /* should be called before bdrv_set_io_limits if a limit is set */
117 void bdrv_io_limits_enable(BlockDriverState *bs)
118 {
119     int clock_type = QEMU_CLOCK_REALTIME;
120 
121     if (qtest_enabled()) {
122         /* For testing block IO throttling only */
123         clock_type = QEMU_CLOCK_VIRTUAL;
124     }
125     assert(!bs->io_limits_enabled);
126     throttle_init(&bs->throttle_state,
127                   bdrv_get_aio_context(bs),
128                   clock_type,
129                   bdrv_throttle_read_timer_cb,
130                   bdrv_throttle_write_timer_cb,
131                   bs);
132     bs->io_limits_enabled = true;
133 }
134 
135 /* This function makes an I/O request wait if throttling requires it
136  *
137  * @bytes:      the number of bytes of the I/O
138  * @is_write:   whether the I/O is a write
139  */
140 static void bdrv_io_limits_intercept(BlockDriverState *bs,
141                                      unsigned int bytes,
142                                      bool is_write)
143 {
144     /* does this I/O have to wait? */
145     bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
146 
147     /* if it must wait, or any request of this type is throttled, queue the I/O */
148     if (must_wait ||
149         !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
150         qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
151     }
152 
153     /* the IO will be executed, do the accounting */
154     throttle_account(&bs->throttle_state, is_write, bytes);
155 
156 
157     /* if the next request must wait -> do nothing */
158     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
159         return;
160     }
161 
162     /* else queue next request for execution */
163     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
164 }
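
/* Illustrative call site (a sketch; the real callers are the request paths
 * below, e.g. bdrv_co_do_preadv()/bdrv_co_do_pwritev(), which pass false for
 * a read and true for a write):
 *
 *     if (bs->io_limits_enabled) {
 *         bdrv_io_limits_intercept(bs, bytes, false);
 *     }
 *
 * The coroutine may yield inside qemu_co_queue_wait() and is resumed either
 * by the throttle timer callbacks below or by the qemu_co_queue_next() call
 * at the end of this function.
 */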
165 
166 void bdrv_setup_io_funcs(BlockDriver *bdrv)
167 {
168     /* Block drivers without coroutine functions need emulation */
169     if (!bdrv->bdrv_co_readv) {
170         bdrv->bdrv_co_readv = bdrv_co_readv_em;
171         bdrv->bdrv_co_writev = bdrv_co_writev_em;
172 
173         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
174          * the block driver lacks aio we need to emulate that too.
175          */
176         if (!bdrv->bdrv_aio_readv) {
177             /* add AIO emulation layer */
178             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
179             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
180         }
181     }
182 }
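
/* A minimal sketch of a driver that relies on this emulation (the driver
 * name and callbacks are hypothetical):
 *
 *     static BlockDriver bdrv_example = {
 *         .format_name     = "example",
 *         .bdrv_aio_readv  = example_aio_readv,
 *         .bdrv_aio_writev = example_aio_writev,
 *     };
 *
 * Since .bdrv_co_readv is NULL, bdrv_setup_io_funcs() plugs in
 * bdrv_co_readv_em()/bdrv_co_writev_em(), which are implemented on top of
 * the driver's own AIO callbacks; the AIO emulation layer is skipped because
 * the driver already provides AIO.
 */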
183 
184 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
185 {
186     BlockDriver *drv = bs->drv;
187     Error *local_err = NULL;
188 
189     memset(&bs->bl, 0, sizeof(bs->bl));
190 
191     if (!drv) {
192         return;
193     }
194 
195     /* Take some limits from the children as a default */
196     if (bs->file) {
197         bdrv_refresh_limits(bs->file, &local_err);
198         if (local_err) {
199             error_propagate(errp, local_err);
200             return;
201         }
202         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
203         bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
204         bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment;
205         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
206     } else {
207         bs->bl.min_mem_alignment = 512;
208         bs->bl.opt_mem_alignment = getpagesize();
209     }
210 
211     if (bs->backing_hd) {
212         bdrv_refresh_limits(bs->backing_hd, &local_err);
213         if (local_err) {
214             error_propagate(errp, local_err);
215             return;
216         }
217         bs->bl.opt_transfer_length =
218             MAX(bs->bl.opt_transfer_length,
219                 bs->backing_hd->bl.opt_transfer_length);
220         bs->bl.max_transfer_length =
221             MIN_NON_ZERO(bs->bl.max_transfer_length,
222                          bs->backing_hd->bl.max_transfer_length);
223         bs->bl.opt_mem_alignment =
224             MAX(bs->bl.opt_mem_alignment,
225                 bs->backing_hd->bl.opt_mem_alignment);
226         bs->bl.min_mem_alignment =
227             MAX(bs->bl.min_mem_alignment,
228                 bs->backing_hd->bl.min_mem_alignment);
229     }
230 
231     /* Then let the driver override it */
232     if (drv->bdrv_refresh_limits) {
233         drv->bdrv_refresh_limits(bs, errp);
234     }
235 }
236 
237 /**
238  * The copy-on-read flag is actually a reference count so multiple users may
239  * use the feature without worrying about clobbering its previous state.
240  * Copy-on-read stays enabled until all users have called to disable it.
241  * Copy-on-read stays enabled until all users have disabled it.
242 void bdrv_enable_copy_on_read(BlockDriverState *bs)
243 {
244     bs->copy_on_read++;
245 }
246 
247 void bdrv_disable_copy_on_read(BlockDriverState *bs)
248 {
249     assert(bs->copy_on_read > 0);
250     bs->copy_on_read--;
251 }
252 
253 /* Check if any requests are in-flight (including throttled requests) */
254 static bool bdrv_requests_pending(BlockDriverState *bs)
255 {
256     if (!QLIST_EMPTY(&bs->tracked_requests)) {
257         return true;
258     }
259     if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
260         return true;
261     }
262     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
263         return true;
264     }
265     if (bs->file && bdrv_requests_pending(bs->file)) {
266         return true;
267     }
268     if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
269         return true;
270     }
271     return false;
272 }
273 
274 static bool bdrv_drain_one(BlockDriverState *bs)
275 {
276     bool bs_busy;
277 
278     bdrv_flush_io_queue(bs);
279     bdrv_start_throttled_reqs(bs);
280     bs_busy = bdrv_requests_pending(bs);
281     bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
282     return bs_busy;
283 }
284 
285 /*
286  * Wait for pending requests to complete on a single BlockDriverState subtree
287  *
288  * See the warning in bdrv_drain_all().  This function can only be called if
289  * you are sure nothing can generate I/O because you have op blockers
290  * installed.
291  *
292  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
293  * AioContext.
294  */
295 void bdrv_drain(BlockDriverState *bs)
296 {
297     while (bdrv_drain_one(bs)) {
298         /* Keep iterating */
299     }
300 }
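
/* Typical calling pattern (a sketch; see the comment above about op blockers
 * and AioContext ownership):
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);
 *     aio_context_release(ctx);
 */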
301 
302 /*
303  * Wait for pending requests to complete across all BlockDriverStates
304  *
305  * This function does not flush data to disk; use bdrv_flush_all() for that
306  * after calling this function.
307  *
308  * Note that completion of an asynchronous I/O operation can trigger any
309  * number of other I/O operations on other devices---for example a coroutine
310  * can be arbitrarily complex and a constant flow of I/O can come until the
311  * coroutine is complete.  Because of this, it is not possible to have a
312  * function to drain a single device's I/O queue.
313  */
314 void bdrv_drain_all(void)
315 {
316     /* Always run first iteration so any pending completion BHs run */
317     bool busy = true;
318     BlockDriverState *bs = NULL;
319 
320     while ((bs = bdrv_next(bs))) {
321         AioContext *aio_context = bdrv_get_aio_context(bs);
322 
323         aio_context_acquire(aio_context);
324         if (bs->job) {
325             block_job_pause(bs->job);
326         }
327         aio_context_release(aio_context);
328     }
329 
330     while (busy) {
331         busy = false;
332         bs = NULL;
333 
334         while ((bs = bdrv_next(bs))) {
335             AioContext *aio_context = bdrv_get_aio_context(bs);
336 
337             aio_context_acquire(aio_context);
338             busy |= bdrv_drain_one(bs);
339             aio_context_release(aio_context);
340         }
341     }
342 
343     bs = NULL;
344     while ((bs = bdrv_next(bs))) {
345         AioContext *aio_context = bdrv_get_aio_context(bs);
346 
347         aio_context_acquire(aio_context);
348         if (bs->job) {
349             block_job_resume(bs->job);
350         }
351         aio_context_release(aio_context);
352     }
353 }
354 
355 /**
356  * Remove an active request from the tracked requests list
357  *
358  * This function should be called when a tracked request is completing.
359  */
360 static void tracked_request_end(BdrvTrackedRequest *req)
361 {
362     if (req->serialising) {
363         req->bs->serialising_in_flight--;
364     }
365 
366     QLIST_REMOVE(req, list);
367     qemu_co_queue_restart_all(&req->wait_queue);
368 }
369 
370 /**
371  * Add an active request to the tracked requests list
372  */
373 static void tracked_request_begin(BdrvTrackedRequest *req,
374                                   BlockDriverState *bs,
375                                   int64_t offset,
376                                   unsigned int bytes, bool is_write)
377 {
378     *req = (BdrvTrackedRequest){
379         .bs = bs,
380         .offset         = offset,
381         .bytes          = bytes,
382         .is_write       = is_write,
383         .co             = qemu_coroutine_self(),
384         .serialising    = false,
385         .overlap_offset = offset,
386         .overlap_bytes  = bytes,
387     };
388 
389     qemu_co_queue_init(&req->wait_queue);
390 
391     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
392 }
393 
394 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
395 {
396     int64_t overlap_offset = req->offset & ~(align - 1);
397     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
398                                - overlap_offset;
399 
400     if (!req->serialising) {
401         req->bs->serialising_in_flight++;
402         req->serialising = true;
403     }
404 
405     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
406     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
407 }
408 
409 /**
410  * Round a region to cluster boundaries
411  */
412 void bdrv_round_to_clusters(BlockDriverState *bs,
413                             int64_t sector_num, int nb_sectors,
414                             int64_t *cluster_sector_num,
415                             int *cluster_nb_sectors)
416 {
417     BlockDriverInfo bdi;
418 
419     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
420         *cluster_sector_num = sector_num;
421         *cluster_nb_sectors = nb_sectors;
422     } else {
423         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
424         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
425         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
426                                             nb_sectors, c);
427     }
428 }
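
/* Worked example (assuming a 64 KiB cluster size, i.e. c = 128 sectors):
 * sector_num = 130, nb_sectors = 10 is rounded to cluster_sector_num = 128
 * and cluster_nb_sectors = 128, i.e. the whole cluster containing the
 * request.
 */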
429 
430 static int bdrv_get_cluster_size(BlockDriverState *bs)
431 {
432     BlockDriverInfo bdi;
433     int ret;
434 
435     ret = bdrv_get_info(bs, &bdi);
436     if (ret < 0 || bdi.cluster_size == 0) {
437         return bs->request_alignment;
438     } else {
439         return bdi.cluster_size;
440     }
441 }
442 
443 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
444                                      int64_t offset, unsigned int bytes)
445 {
446     /*        aaaa   bbbb */
447     if (offset >= req->overlap_offset + req->overlap_bytes) {
448         return false;
449     }
450     /* bbbb   aaaa        */
451     if (req->overlap_offset >= offset + bytes) {
452         return false;
453     }
454     return true;
455 }
456 
457 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
458 {
459     BlockDriverState *bs = self->bs;
460     BdrvTrackedRequest *req;
461     bool retry;
462     bool waited = false;
463 
464     if (!bs->serialising_in_flight) {
465         return false;
466     }
467 
468     do {
469         retry = false;
470         QLIST_FOREACH(req, &bs->tracked_requests, list) {
471             if (req == self || (!req->serialising && !self->serialising)) {
472                 continue;
473             }
474             if (tracked_request_overlaps(req, self->overlap_offset,
475                                          self->overlap_bytes))
476             {
477                 /* Hitting this means there was a reentrant request, for
478                  * example, a block driver issuing nested requests.  This must
479                  * never happen since it means deadlock.
480                  */
481                 assert(qemu_coroutine_self() != req->co);
482 
483                 /* If the request is already (indirectly) waiting for us, or
484                  * will wait for us as soon as it wakes up, then just go on
485                  * (instead of producing a deadlock in the former case). */
486                 if (!req->waiting_for) {
487                     self->waiting_for = req;
488                     qemu_co_queue_wait(&req->wait_queue);
489                     self->waiting_for = NULL;
490                     retry = true;
491                     waited = true;
492                     break;
493                 }
494             }
495         }
496     } while (retry);
497 
498     return waited;
499 }
500 
501 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
502                                    size_t size)
503 {
504     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
505         return -EIO;
506     }
507 
508     if (!bdrv_is_inserted(bs)) {
509         return -ENOMEDIUM;
510     }
511 
512     if (offset < 0) {
513         return -EIO;
514     }
515 
516     return 0;
517 }
518 
519 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
520                               int nb_sectors)
521 {
522     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
523         return -EIO;
524     }
525 
526     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
527                                    nb_sectors * BDRV_SECTOR_SIZE);
528 }
529 
530 typedef struct RwCo {
531     BlockDriverState *bs;
532     int64_t offset;
533     QEMUIOVector *qiov;
534     bool is_write;
535     int ret;
536     BdrvRequestFlags flags;
537 } RwCo;
538 
539 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
540 {
541     RwCo *rwco = opaque;
542 
543     if (!rwco->is_write) {
544         rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
545                                       rwco->qiov->size, rwco->qiov,
546                                       rwco->flags);
547     } else {
548         rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
549                                        rwco->qiov->size, rwco->qiov,
550                                        rwco->flags);
551     }
552 }
553 
554 /*
555  * Process a vectored synchronous request using coroutines
556  */
557 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
558                         QEMUIOVector *qiov, bool is_write,
559                         BdrvRequestFlags flags)
560 {
561     Coroutine *co;
562     RwCo rwco = {
563         .bs = bs,
564         .offset = offset,
565         .qiov = qiov,
566         .is_write = is_write,
567         .ret = NOT_DONE,
568         .flags = flags,
569     };
570 
571     /**
572      * In a synchronous call context, when the vcpu is blocked, the throttling
573      * timer will not fire; so I/O throttling has to be disabled here if it
574      * has been enabled.
575      */
576     if (bs->io_limits_enabled) {
577         fprintf(stderr, "Disabling I/O throttling on '%s' due "
578                         "to synchronous I/O.\n", bdrv_get_device_name(bs));
579         bdrv_io_limits_disable(bs);
580     }
581 
582     if (qemu_in_coroutine()) {
583         /* Fast-path if already in coroutine context */
584         bdrv_rw_co_entry(&rwco);
585     } else {
586         AioContext *aio_context = bdrv_get_aio_context(bs);
587 
588         co = qemu_coroutine_create(bdrv_rw_co_entry);
589         qemu_coroutine_enter(co, &rwco);
590         while (rwco.ret == NOT_DONE) {
591             aio_poll(aio_context, true);
592         }
593     }
594     return rwco.ret;
595 }
596 
597 /*
598  * Process a synchronous request using coroutines
599  */
600 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
601                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
602 {
603     QEMUIOVector qiov;
604     struct iovec iov = {
605         .iov_base = (void *)buf,
606         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
607     };
608 
609     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
610         return -EINVAL;
611     }
612 
613     qemu_iovec_init_external(&qiov, &iov, 1);
614     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
615                         &qiov, is_write, flags);
616 }
617 
618 /* return < 0 if error. See bdrv_write() for the return codes */
619 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
620               uint8_t *buf, int nb_sectors)
621 {
622     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
623 }
624 
625 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */
626 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
627                           uint8_t *buf, int nb_sectors)
628 {
629     bool enabled;
630     int ret;
631 
632     enabled = bs->io_limits_enabled;
633     bs->io_limits_enabled = false;
634     ret = bdrv_read(bs, sector_num, buf, nb_sectors);
635     bs->io_limits_enabled = enabled;
636     return ret;
637 }
638 
639 /* Return < 0 if error. Important errors are:
640   -EIO         generic I/O error (may happen for all errors)
641   -ENOMEDIUM   No media inserted.
642   -EINVAL      Invalid sector number or nb_sectors
643   -EACCES      Trying to write a read-only device
644 */
645 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
646                const uint8_t *buf, int nb_sectors)
647 {
648     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
649 }
650 
651 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
652                       int nb_sectors, BdrvRequestFlags flags)
653 {
654     return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
655                       BDRV_REQ_ZERO_WRITE | flags);
656 }
657 
658 /*
659  * Completely zero out a block device with the help of bdrv_write_zeroes.
660  * The operation is sped up by checking the block status and only writing
661  * zeroes to the device if they currently do not return zeroes. Optional
662  * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
663  *
664  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
665  */
666 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
667 {
668     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
669     int n;
670 
671     target_sectors = bdrv_nb_sectors(bs);
672     if (target_sectors < 0) {
673         return target_sectors;
674     }
675 
676     for (;;) {
677         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
678         if (nb_sectors <= 0) {
679             return 0;
680         }
681         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
682         if (ret < 0) {
683             error_report("error getting block status at sector %" PRId64 ": %s",
684                          sector_num, strerror(-ret));
685             return ret;
686         }
687         if (ret & BDRV_BLOCK_ZERO) {
688             sector_num += n;
689             continue;
690         }
691         ret = bdrv_write_zeroes(bs, sector_num, n, flags);
692         if (ret < 0) {
693             error_report("error writing zeroes at sector %" PRId64 ": %s",
694                          sector_num, strerror(-ret));
695             return ret;
696         }
697         sector_num += n;
698     }
699 }
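
/* Example invocation (a sketch): a caller that is allowed to discard blocks
 * can pass BDRV_REQ_MAY_UNMAP, as noted in the comment above:
 *
 *     ret = bdrv_make_zero(bs, BDRV_REQ_MAY_UNMAP);
 */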
700 
701 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
702 {
703     QEMUIOVector qiov;
704     struct iovec iov = {
705         .iov_base = (void *)buf,
706         .iov_len = bytes,
707     };
708     int ret;
709 
710     if (bytes < 0) {
711         return -EINVAL;
712     }
713 
714     qemu_iovec_init_external(&qiov, &iov, 1);
715     ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
716     if (ret < 0) {
717         return ret;
718     }
719 
720     return bytes;
721 }
722 
723 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
724 {
725     int ret;
726 
727     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
728     if (ret < 0) {
729         return ret;
730     }
731 
732     return qiov->size;
733 }
734 
735 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
736                 const void *buf, int bytes)
737 {
738     QEMUIOVector qiov;
739     struct iovec iov = {
740         .iov_base   = (void *) buf,
741         .iov_len    = bytes,
742     };
743 
744     if (bytes < 0) {
745         return -EINVAL;
746     }
747 
748     qemu_iovec_init_external(&qiov, &iov, 1);
749     return bdrv_pwritev(bs, offset, &qiov);
750 }
751 
752 /*
753  * Writes to the file and ensures that no writes are reordered across this
754  * request (acts as a barrier)
755  *
756  * Returns 0 on success, -errno in error cases.
757  */
758 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
759     const void *buf, int count)
760 {
761     int ret;
762 
763     ret = bdrv_pwrite(bs, offset, buf, count);
764     if (ret < 0) {
765         return ret;
766     }
767 
768     /* No flush needed for cache modes that already do it */
769     if (bs->enable_write_cache) {
770         bdrv_flush(bs);
771     }
772 
773     return 0;
774 }
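
/* Usage sketch (the offset and buffer are hypothetical): updating an image
 * header that must not be reordered with later writes:
 *
 *     ret = bdrv_pwrite_sync(bs, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;
 *     }
 */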
775 
776 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
777         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
778 {
779     /* Perform I/O through a temporary buffer so that users who scribble over
780      * their read buffer while the operation is in progress do not end up
781      * modifying the image file.  This is critical for zero-copy guest I/O
782      * where anything might happen inside guest memory.
783      */
784     void *bounce_buffer;
785 
786     BlockDriver *drv = bs->drv;
787     struct iovec iov;
788     QEMUIOVector bounce_qiov;
789     int64_t cluster_sector_num;
790     int cluster_nb_sectors;
791     size_t skip_bytes;
792     int ret;
793 
794     /* Cover the entire cluster so no additional backing file I/O is required
795      * when allocating the cluster in the image file.
796      */
797     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
798                            &cluster_sector_num, &cluster_nb_sectors);
799 
800     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
801                                    cluster_sector_num, cluster_nb_sectors);
802 
803     iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
804     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
805     if (bounce_buffer == NULL) {
806         ret = -ENOMEM;
807         goto err;
808     }
809 
810     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
811 
812     ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
813                              &bounce_qiov);
814     if (ret < 0) {
815         goto err;
816     }
817 
818     if (drv->bdrv_co_write_zeroes &&
819         buffer_is_zero(bounce_buffer, iov.iov_len)) {
820         ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
821                                       cluster_nb_sectors, 0);
822     } else {
823         /* This does not change the data on the disk, so it is not necessary
824          * to flush even in cache=writethrough mode.
825          */
826         ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
827                                   &bounce_qiov);
828     }
829 
830     if (ret < 0) {
831         /* It might be okay to ignore write errors for guest requests.  If this
832          * is a deliberate copy-on-read then we don't want to ignore the error.
833          * Simply report it in all cases.
834          */
835         goto err;
836     }
837 
838     skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
839     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
840                         nb_sectors * BDRV_SECTOR_SIZE);
841 
842 err:
843     qemu_vfree(bounce_buffer);
844     return ret;
845 }
846 
847 /*
848  * Forwards an already correctly aligned request to the BlockDriver. This
849  * handles copy on read and zeroing after EOF; any other features must be
850  * implemented by the caller.
851  */
852 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
853     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
854     int64_t align, QEMUIOVector *qiov, int flags)
855 {
856     BlockDriver *drv = bs->drv;
857     int ret;
858 
859     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
860     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
861 
862     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
863     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
864     assert(!qiov || bytes == qiov->size);
865 
866     /* Handle Copy on Read and associated serialisation */
867     if (flags & BDRV_REQ_COPY_ON_READ) {
868         /* If we touch the same cluster it counts as an overlap.  This
869          * guarantees that allocating writes will be serialized and not race
870          * with each other for the same cluster.  For example, in copy-on-read
871          * it ensures that the CoR read and write operations are atomic and
872          * guest writes cannot interleave between them. */
873         mark_request_serialising(req, bdrv_get_cluster_size(bs));
874     }
875 
876     wait_serialising_requests(req);
877 
878     if (flags & BDRV_REQ_COPY_ON_READ) {
879         int pnum;
880 
881         ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
882         if (ret < 0) {
883             goto out;
884         }
885 
886         if (!ret || pnum != nb_sectors) {
887             ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
888             goto out;
889         }
890     }
891 
892     /* Forward the request to the BlockDriver */
893     if (!bs->zero_beyond_eof) {
894         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
895     } else {
896         /* Read zeros after EOF */
897         int64_t total_sectors, max_nb_sectors;
898 
899         total_sectors = bdrv_nb_sectors(bs);
900         if (total_sectors < 0) {
901             ret = total_sectors;
902             goto out;
903         }
904 
905         max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
906                                   align >> BDRV_SECTOR_BITS);
907         if (nb_sectors < max_nb_sectors) {
908             ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
909         } else if (max_nb_sectors > 0) {
910             QEMUIOVector local_qiov;
911 
912             qemu_iovec_init(&local_qiov, qiov->niov);
913             qemu_iovec_concat(&local_qiov, qiov, 0,
914                               max_nb_sectors * BDRV_SECTOR_SIZE);
915 
916             ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
917                                      &local_qiov);
918 
919             qemu_iovec_destroy(&local_qiov);
920         } else {
921             ret = 0;
922         }
923 
924         /* Reading beyond end of file is supposed to produce zeroes */
925         if (ret == 0 && total_sectors < sector_num + nb_sectors) {
926             uint64_t offset = MAX(0, total_sectors - sector_num);
927             uint64_t bytes = (sector_num + nb_sectors - offset) *
928                               BDRV_SECTOR_SIZE;
929             qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
930         }
931     }
932 
933 out:
934     return ret;
935 }
936 
937 /*
938  * Handle a read request in coroutine context
939  */
940 static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
941     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
942     BdrvRequestFlags flags)
943 {
944     BlockDriver *drv = bs->drv;
945     BdrvTrackedRequest req;
946 
947     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
948     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
949     uint8_t *head_buf = NULL;
950     uint8_t *tail_buf = NULL;
951     QEMUIOVector local_qiov;
952     bool use_local_qiov = false;
953     int ret;
954 
955     if (!drv) {
956         return -ENOMEDIUM;
957     }
958 
959     ret = bdrv_check_byte_request(bs, offset, bytes);
960     if (ret < 0) {
961         return ret;
962     }
963 
964     if (bs->copy_on_read) {
965         flags |= BDRV_REQ_COPY_ON_READ;
966     }
967 
968     /* throttling disk I/O */
969     if (bs->io_limits_enabled) {
970         bdrv_io_limits_intercept(bs, bytes, false);
971     }
972 
973     /* Align read if necessary by padding qiov */
974     if (offset & (align - 1)) {
975         head_buf = qemu_blockalign(bs, align);
976         qemu_iovec_init(&local_qiov, qiov->niov + 2);
977         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
978         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
979         use_local_qiov = true;
980 
981         bytes += offset & (align - 1);
982         offset = offset & ~(align - 1);
983     }
984 
985     if ((offset + bytes) & (align - 1)) {
986         if (!use_local_qiov) {
987             qemu_iovec_init(&local_qiov, qiov->niov + 1);
988             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
989             use_local_qiov = true;
990         }
991         tail_buf = qemu_blockalign(bs, align);
992         qemu_iovec_add(&local_qiov, tail_buf,
993                        align - ((offset + bytes) & (align - 1)));
994 
995         bytes = ROUND_UP(bytes, align);
996     }
997 
998     tracked_request_begin(&req, bs, offset, bytes, false);
999     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1000                               use_local_qiov ? &local_qiov : qiov,
1001                               flags);
1002     tracked_request_end(&req);
1003 
1004     if (use_local_qiov) {
1005         qemu_iovec_destroy(&local_qiov);
1006         qemu_vfree(head_buf);
1007         qemu_vfree(tail_buf);
1008     }
1009 
1010     return ret;
1011 }
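
/* Worked alignment example (assuming align = 512): a request with
 * offset = 700 and bytes = 1000 gets a 188-byte head pad and a 348-byte tail
 * pad, so the driver sees offset = 512 and bytes = 1536, while the caller's
 * qiov only receives the middle 1000 bytes.
 */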
1012 
1013 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1014     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1015     BdrvRequestFlags flags)
1016 {
1017     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1018         return -EINVAL;
1019     }
1020 
1021     return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1022                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1023 }
1024 
1025 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1026     int nb_sectors, QEMUIOVector *qiov)
1027 {
1028     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1029 
1030     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1031 }
1032 
1033 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1034     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1035 {
1036     trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1037 
1038     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1039                             BDRV_REQ_COPY_ON_READ);
1040 }
1041 
1042 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1043 
1044 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1045     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
1046 {
1047     BlockDriver *drv = bs->drv;
1048     QEMUIOVector qiov;
1049     struct iovec iov = {0};
1050     int ret = 0;
1051 
1052     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
1053                                         BDRV_REQUEST_MAX_SECTORS);
1054 
1055     while (nb_sectors > 0 && !ret) {
1056         int num = nb_sectors;
1057 
1058         /* Align request.  Block drivers can expect the "bulk" of the request
1059          * to be aligned.
1060          */
1061         if (bs->bl.write_zeroes_alignment
1062             && num > bs->bl.write_zeroes_alignment) {
1063             if (sector_num % bs->bl.write_zeroes_alignment != 0) {
1064                 /* Make a small request up to the first aligned sector.  */
1065                 num = bs->bl.write_zeroes_alignment;
1066                 num -= sector_num % bs->bl.write_zeroes_alignment;
1067             } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
1068                 /* Shorten the request to the last aligned sector.  num cannot
1069                  * underflow because num > bs->bl.write_zeroes_alignment.
1070                  */
1071                 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
1072             }
1073         }
1074 
1075         /* limit request size */
1076         if (num > max_write_zeroes) {
1077             num = max_write_zeroes;
1078         }
1079 
1080         ret = -ENOTSUP;
1081         /* First try the efficient write zeroes operation */
1082         if (drv->bdrv_co_write_zeroes) {
1083             ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
1084         }
1085 
1086         if (ret == -ENOTSUP) {
1087             /* Fall back to bounce buffer if write zeroes is unsupported */
1088             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1089                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1090             num = MIN(num, max_xfer_len);
1091             iov.iov_len = num * BDRV_SECTOR_SIZE;
1092             if (iov.iov_base == NULL) {
1093                 iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
1094                 if (iov.iov_base == NULL) {
1095                     ret = -ENOMEM;
1096                     goto fail;
1097                 }
1098                 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
1099             }
1100             qemu_iovec_init_external(&qiov, &iov, 1);
1101 
1102             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
1103 
1104             /* Keep the bounce buffer around if it is big enough for
1105              * all future requests.
1106              */
1107             if (num < max_xfer_len) {
1108                 qemu_vfree(iov.iov_base);
1109                 iov.iov_base = NULL;
1110             }
1111         }
1112 
1113         sector_num += num;
1114         nb_sectors -= num;
1115     }
1116 
1117 fail:
1118     qemu_vfree(iov.iov_base);
1119     return ret;
1120 }
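
/* Worked splitting example (assuming write_zeroes_alignment = 8 sectors and
 * no smaller max_write_zeroes limit): a request for sectors 5-24 is issued
 * as three driver calls: sectors 5-7 (up to the first aligned boundary),
 * sectors 8-23 (the aligned bulk) and sector 24 (the unaligned tail).
 */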
1121 
1122 /*
1123  * Forwards an already correctly aligned write request to the BlockDriver.
1124  */
1125 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1126     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1127     QEMUIOVector *qiov, int flags)
1128 {
1129     BlockDriver *drv = bs->drv;
1130     bool waited;
1131     int ret;
1132 
1133     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1134     unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1135 
1136     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1137     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1138     assert(!qiov || bytes == qiov->size);
1139 
1140     waited = wait_serialising_requests(req);
1141     assert(!waited || !req->serialising);
1142     assert(req->overlap_offset <= offset);
1143     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1144 
1145     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1146 
1147     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1148         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
1149         qemu_iovec_is_zero(qiov)) {
1150         flags |= BDRV_REQ_ZERO_WRITE;
1151         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1152             flags |= BDRV_REQ_MAY_UNMAP;
1153         }
1154     }
1155 
1156     if (ret < 0) {
1157         /* Do nothing, write notifier decided to fail this request */
1158     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1159         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
1160         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
1161     } else {
1162         BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
1163         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1164     }
1165     BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
1166 
1167     if (ret == 0 && !bs->enable_write_cache) {
1168         ret = bdrv_co_flush(bs);
1169     }
1170 
1171     bdrv_set_dirty(bs, sector_num, nb_sectors);
1172 
1173     block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
1174 
1175     if (ret >= 0) {
1176         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
1177     }
1178 
1179     return ret;
1180 }
1181 
1182 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1183                                                 int64_t offset,
1184                                                 unsigned int bytes,
1185                                                 BdrvRequestFlags flags,
1186                                                 BdrvTrackedRequest *req)
1187 {
1188     uint8_t *buf = NULL;
1189     QEMUIOVector local_qiov;
1190     struct iovec iov;
1191     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1192     unsigned int head_padding_bytes, tail_padding_bytes;
1193     int ret = 0;
1194 
1195     head_padding_bytes = offset & (align - 1);
1196     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1197 
1198 
1199     assert(flags & BDRV_REQ_ZERO_WRITE);
1200     if (head_padding_bytes || tail_padding_bytes) {
1201         buf = qemu_blockalign(bs, align);
1202         iov = (struct iovec) {
1203             .iov_base   = buf,
1204             .iov_len    = align,
1205         };
1206         qemu_iovec_init_external(&local_qiov, &iov, 1);
1207     }
1208     if (head_padding_bytes) {
1209         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1210 
1211         /* RMW the unaligned part before head. */
1212         mark_request_serialising(req, align);
1213         wait_serialising_requests(req);
1214         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
1215         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1216                                   align, &local_qiov, 0);
1217         if (ret < 0) {
1218             goto fail;
1219         }
1220         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1221 
1222         memset(buf + head_padding_bytes, 0, zero_bytes);
1223         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1224                                    &local_qiov,
1225                                    flags & ~BDRV_REQ_ZERO_WRITE);
1226         if (ret < 0) {
1227             goto fail;
1228         }
1229         offset += zero_bytes;
1230         bytes -= zero_bytes;
1231     }
1232 
1233     assert(!bytes || (offset & (align - 1)) == 0);
1234     if (bytes >= align) {
1235         /* Write the aligned part in the middle. */
1236         uint64_t aligned_bytes = bytes & ~(align - 1);
1237         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1238                                    NULL, flags);
1239         if (ret < 0) {
1240             goto fail;
1241         }
1242         bytes -= aligned_bytes;
1243         offset += aligned_bytes;
1244     }
1245 
1246     assert(!bytes || (offset & (align - 1)) == 0);
1247     if (bytes) {
1248         assert(align == tail_padding_bytes + bytes);
1249         /* RMW the unaligned part after tail. */
1250         mark_request_serialising(req, align);
1251         wait_serialising_requests(req);
1252         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
1253         ret = bdrv_aligned_preadv(bs, req, offset, align,
1254                                   align, &local_qiov, 0);
1255         if (ret < 0) {
1256             goto fail;
1257         }
1258         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1259 
1260         memset(buf, 0, bytes);
1261         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1262                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1263     }
1264 fail:
1265     qemu_vfree(buf);
1266     return ret;
1267 
1268 }
1269 
1270 /*
1271  * Handle a write request in coroutine context
1272  */
1273 static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
1274     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1275     BdrvRequestFlags flags)
1276 {
1277     BdrvTrackedRequest req;
1278     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
1279     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
1280     uint8_t *head_buf = NULL;
1281     uint8_t *tail_buf = NULL;
1282     QEMUIOVector local_qiov;
1283     bool use_local_qiov = false;
1284     int ret;
1285 
1286     if (!bs->drv) {
1287         return -ENOMEDIUM;
1288     }
1289     if (bs->read_only) {
1290         return -EPERM;
1291     }
1292 
1293     ret = bdrv_check_byte_request(bs, offset, bytes);
1294     if (ret < 0) {
1295         return ret;
1296     }
1297 
1298     /* throttling disk I/O */
1299     if (bs->io_limits_enabled) {
1300         bdrv_io_limits_intercept(bs, bytes, true);
1301     }
1302 
1303     /*
1304      * Align write if necessary by performing a read-modify-write cycle.
1305      * Pad qiov with the read parts and be sure to have a tracked request not
1306      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1307      */
1308     tracked_request_begin(&req, bs, offset, bytes, true);
1309 
1310     if (!qiov) {
1311         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1312         goto out;
1313     }
1314 
1315     if (offset & (align - 1)) {
1316         QEMUIOVector head_qiov;
1317         struct iovec head_iov;
1318 
1319         mark_request_serialising(&req, align);
1320         wait_serialising_requests(&req);
1321 
1322         head_buf = qemu_blockalign(bs, align);
1323         head_iov = (struct iovec) {
1324             .iov_base   = head_buf,
1325             .iov_len    = align,
1326         };
1327         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1328 
1329         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
1330         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1331                                   align, &head_qiov, 0);
1332         if (ret < 0) {
1333             goto fail;
1334         }
1335         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1336 
1337         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1338         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1339         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1340         use_local_qiov = true;
1341 
1342         bytes += offset & (align - 1);
1343         offset = offset & ~(align - 1);
1344     }
1345 
1346     if ((offset + bytes) & (align - 1)) {
1347         QEMUIOVector tail_qiov;
1348         struct iovec tail_iov;
1349         size_t tail_bytes;
1350         bool waited;
1351 
1352         mark_request_serialising(&req, align);
1353         waited = wait_serialising_requests(&req);
1354         assert(!waited || !use_local_qiov);
1355 
1356         tail_buf = qemu_blockalign(bs, align);
1357         tail_iov = (struct iovec) {
1358             .iov_base   = tail_buf,
1359             .iov_len    = align,
1360         };
1361         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1362 
1363         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
1364         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1365                                   align, &tail_qiov, 0);
1366         if (ret < 0) {
1367             goto fail;
1368         }
1369         BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1370 
1371         if (!use_local_qiov) {
1372             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1373             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1374             use_local_qiov = true;
1375         }
1376 
1377         tail_bytes = (offset + bytes) & (align - 1);
1378         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1379 
1380         bytes = ROUND_UP(bytes, align);
1381     }
1382 
1383     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1384                                use_local_qiov ? &local_qiov : qiov,
1385                                flags);
1386 
1387 fail:
1388 
1389     if (use_local_qiov) {
1390         qemu_iovec_destroy(&local_qiov);
1391     }
1392     qemu_vfree(head_buf);
1393     qemu_vfree(tail_buf);
1394 out:
1395     tracked_request_end(&req);
1396     return ret;
1397 }
1398 
1399 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1400     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1401     BdrvRequestFlags flags)
1402 {
1403     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1404         return -EINVAL;
1405     }
1406 
1407     return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1408                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1409 }
1410 
1411 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1412     int nb_sectors, QEMUIOVector *qiov)
1413 {
1414     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1415 
1416     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1417 }
1418 
1419 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1420                                       int64_t sector_num, int nb_sectors,
1421                                       BdrvRequestFlags flags)
1422 {
1423     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
1424 
1425     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1426         flags &= ~BDRV_REQ_MAY_UNMAP;
1427     }
1428 
1429     return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1430                              BDRV_REQ_ZERO_WRITE | flags);
1431 }
1432 
1433 int bdrv_flush_all(void)
1434 {
1435     BlockDriverState *bs = NULL;
1436     int result = 0;
1437 
1438     while ((bs = bdrv_next(bs))) {
1439         AioContext *aio_context = bdrv_get_aio_context(bs);
1440         int ret;
1441 
1442         aio_context_acquire(aio_context);
1443         ret = bdrv_flush(bs);
1444         if (ret < 0 && !result) {
1445             result = ret;
1446         }
1447         aio_context_release(aio_context);
1448     }
1449 
1450     return result;
1451 }
1452 
1453 typedef struct BdrvCoGetBlockStatusData {
1454     BlockDriverState *bs;
1455     BlockDriverState *base;
1456     int64_t sector_num;
1457     int nb_sectors;
1458     int *pnum;
1459     int64_t ret;
1460     bool done;
1461 } BdrvCoGetBlockStatusData;
1462 
1463 /*
1464  * Returns the allocation status of the specified sectors.
1465  * Drivers not implementing the functionality are assumed to not support
1466  * backing files, hence all their sectors are reported as allocated.
1467  *
1468  * If 'sector_num' is beyond the end of the disk image the return value is 0
1469  * and 'pnum' is set to 0.
1470  *
1471  * 'pnum' is set to the number of sectors (including and immediately following
1472  * the specified sector) that are known to be in the same
1473  * allocated/unallocated state.
1474  *
1475  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1476  * beyond the end of the disk image it will be clamped.
1477  */
1478 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1479                                                      int64_t sector_num,
1480                                                      int nb_sectors, int *pnum)
1481 {
1482     int64_t total_sectors;
1483     int64_t n;
1484     int64_t ret, ret2;
1485 
1486     total_sectors = bdrv_nb_sectors(bs);
1487     if (total_sectors < 0) {
1488         return total_sectors;
1489     }
1490 
1491     if (sector_num >= total_sectors) {
1492         *pnum = 0;
1493         return 0;
1494     }
1495 
1496     n = total_sectors - sector_num;
1497     if (n < nb_sectors) {
1498         nb_sectors = n;
1499     }
1500 
1501     if (!bs->drv->bdrv_co_get_block_status) {
1502         *pnum = nb_sectors;
1503         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1504         if (bs->drv->protocol_name) {
1505             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1506         }
1507         return ret;
1508     }
1509 
1510     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
1511     if (ret < 0) {
1512         *pnum = 0;
1513         return ret;
1514     }
1515 
1516     if (ret & BDRV_BLOCK_RAW) {
1517         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1518         return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
1519                                      *pnum, pnum);
1520     }
1521 
1522     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1523         ret |= BDRV_BLOCK_ALLOCATED;
1524     } else {
1525         if (bdrv_unallocated_blocks_are_zero(bs)) {
1526             ret |= BDRV_BLOCK_ZERO;
1527         } else if (bs->backing_hd) {
1528             BlockDriverState *bs2 = bs->backing_hd;
1529             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1530             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1531                 ret |= BDRV_BLOCK_ZERO;
1532             }
1533         }
1534     }
1535 
1536     if (bs->file &&
1537         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1538         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1539         int file_pnum;
1540 
1541         ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
1542                                         *pnum, &file_pnum);
1543         if (ret2 >= 0) {
1544             /* Ignore errors.  This is just providing extra information; it
1545              * is useful but not necessary.
1546              */
1547             if (!file_pnum) {
1548                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1549                  * perfectly valid for the format block driver to point to such
1550                  * offsets, so catch it and mark everything as zero */
1551                 ret |= BDRV_BLOCK_ZERO;
1552             } else {
1553                 /* Limit request to the range reported by the protocol driver */
1554                 *pnum = file_pnum;
1555                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1556             }
1557         }
1558     }
1559 
1560     return ret;
1561 }
1562 
1563 /* Coroutine wrapper for bdrv_get_block_status() */
1564 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
1565 {
1566     BdrvCoGetBlockStatusData *data = opaque;
1567     BlockDriverState *bs = data->bs;
1568 
1569     data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
1570                                          data->pnum);
1571     data->done = true;
1572 }
1573 
1574 /*
1575  * Synchronous wrapper around bdrv_co_get_block_status().
1576  *
1577  * See bdrv_co_get_block_status() for details.
1578  */
1579 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
1580                               int nb_sectors, int *pnum)
1581 {
1582     Coroutine *co;
1583     BdrvCoGetBlockStatusData data = {
1584         .bs = bs,
1585         .sector_num = sector_num,
1586         .nb_sectors = nb_sectors,
1587         .pnum = pnum,
1588         .done = false,
1589     };
1590 
1591     if (qemu_in_coroutine()) {
1592         /* Fast-path if already in coroutine context */
1593         bdrv_get_block_status_co_entry(&data);
1594     } else {
1595         AioContext *aio_context = bdrv_get_aio_context(bs);
1596 
1597         co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
1598         qemu_coroutine_enter(co, &data);
1599         while (!data.done) {
1600             aio_poll(aio_context, true);
1601         }
1602     }
1603     return data.ret;
1604 }
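
/*
 * Illustrative sketch of a caller of the synchronous wrapper above
 * (hypothetical 'bs', 'sector' and 'remaining'):
 *
 *     int n;
 *     int64_t status = bdrv_get_block_status(bs, sector, remaining, &n);
 *     if (status >= 0 && (status & BDRV_BLOCK_ZERO)) {
 *         the next n sectors read as zeroes
 *     } else if (status >= 0 && (status & BDRV_BLOCK_OFFSET_VALID)) {
 *         the data lives at sector (status >> BDRV_SECTOR_BITS) of bs->file
 *     }
 */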
1605 
1606 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1607                                    int nb_sectors, int *pnum)
1608 {
1609     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
1610     if (ret < 0) {
1611         return ret;
1612     }
1613     return !!(ret & BDRV_BLOCK_ALLOCATED);
1614 }
1615 
1616 /*
1617  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1618  *
1619  * Return true if the given sector is allocated in any image between
1620  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
1621  * sector is allocated in any image of the chain.  Return false otherwise.
1622  *
1623  * 'pnum' is set to the number of sectors (including and immediately following
1624  *  the specified sector) that are known to be in the same
1625  *  allocated/unallocated state.
1626  *
1627  */
1628 int bdrv_is_allocated_above(BlockDriverState *top,
1629                             BlockDriverState *base,
1630                             int64_t sector_num,
1631                             int nb_sectors, int *pnum)
1632 {
1633     BlockDriverState *intermediate;
1634     int ret, n = nb_sectors;
1635 
1636     intermediate = top;
1637     while (intermediate && intermediate != base) {
1638         int pnum_inter;
1639         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1640                                 &pnum_inter);
1641         if (ret < 0) {
1642             return ret;
1643         } else if (ret) {
1644             *pnum = pnum_inter;
1645             return 1;
1646         }
1647 
1648         /*
1649          * [sector_num, nb_sectors] is unallocated on top but an intermediate
1650          * image might still have
1651          *
1652          * [sector_num+x, nb_sectors] allocated.
1653          */
1654         if (n > pnum_inter &&
1655             (intermediate == top ||
1656              sector_num + pnum_inter < intermediate->total_sectors)) {
1657             n = pnum_inter;
1658         }
1659 
1660         intermediate = intermediate->backing_hd;
1661     }
1662 
1663     *pnum = n;
1664     return 0;
1665 }
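
/*
 * Usage sketch for the helper above (illustrative only; 'top', 'base', 'sec'
 * and 'nb' are hypothetical):
 *
 *     int n;
 *     if (bdrv_is_allocated_above(top, base, sec, nb, &n) > 0) {
 *         the next n sectors are allocated somewhere in the top..base chain
 *     } else {
 *         the next n sectors are not allocated in that part of the chain
 *     }
 */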
1666 
1667 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1668                           const uint8_t *buf, int nb_sectors)
1669 {
1670     BlockDriver *drv = bs->drv;
1671     int ret;
1672 
1673     if (!drv) {
1674         return -ENOMEDIUM;
1675     }
1676     if (!drv->bdrv_write_compressed) {
1677         return -ENOTSUP;
1678     }
1679     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1680     if (ret < 0) {
1681         return ret;
1682     }
1683 
1684     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1685 
1686     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1687 }
1688 
1689 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1690                       int64_t pos, int size)
1691 {
1692     QEMUIOVector qiov;
1693     struct iovec iov = {
1694         .iov_base   = (void *) buf,
1695         .iov_len    = size,
1696     };
1697 
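    /* Wrap the flat buffer in a single-element I/O vector so the vectored
     * vmstate path below can be reused unchanged. */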
1698     qemu_iovec_init_external(&qiov, &iov, 1);
1699     return bdrv_writev_vmstate(bs, &qiov, pos);
1700 }
1701 
1702 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1703 {
1704     BlockDriver *drv = bs->drv;
1705 
1706     if (!drv) {
1707         return -ENOMEDIUM;
1708     } else if (drv->bdrv_save_vmstate) {
1709         return drv->bdrv_save_vmstate(bs, qiov, pos);
1710     } else if (bs->file) {
1711         return bdrv_writev_vmstate(bs->file, qiov, pos);
1712     }
1713 
1714     return -ENOTSUP;
1715 }
1716 
1717 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1718                       int64_t pos, int size)
1719 {
1720     BlockDriver *drv = bs->drv;
1721     if (!drv)
1722         return -ENOMEDIUM;
1723     if (drv->bdrv_load_vmstate)
1724         return drv->bdrv_load_vmstate(bs, buf, pos, size);
1725     if (bs->file)
1726         return bdrv_load_vmstate(bs->file, buf, pos, size);
1727     return -ENOTSUP;
1728 }
1729 
1730 /**************************************************************/
1731 /* async I/Os */
1732 
1733 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1734                            QEMUIOVector *qiov, int nb_sectors,
1735                            BlockCompletionFunc *cb, void *opaque)
1736 {
1737     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1738 
1739     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1740                                  cb, opaque, false);
1741 }
1742 
1743 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1744                             QEMUIOVector *qiov, int nb_sectors,
1745                             BlockCompletionFunc *cb, void *opaque)
1746 {
1747     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1748 
1749     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1750                                  cb, opaque, true);
1751 }
1752 
1753 BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
1754         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
1755         BlockCompletionFunc *cb, void *opaque)
1756 {
1757     trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
1758 
1759     return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
1760                                  BDRV_REQ_ZERO_WRITE | flags,
1761                                  cb, opaque, true);
1762 }
1763 
1764 
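/*
 * Shared completion state for one bdrv_aio_multiwrite() call: num_requests
 * counts the AIO requests still in flight after merging, num_callbacks the
 * requests originally submitted by the caller, and the flexible callbacks[]
 * array keeps each caller's cb/opaque plus any merged qiov that has to be
 * freed once the last request completes.
 */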
1765 typedef struct MultiwriteCB {
1766     int error;
1767     int num_requests;
1768     int num_callbacks;
1769     struct {
1770         BlockCompletionFunc *cb;
1771         void *opaque;
1772         QEMUIOVector *free_qiov;
1773     } callbacks[];
1774 } MultiwriteCB;
1775 
1776 static void multiwrite_user_cb(MultiwriteCB *mcb)
1777 {
1778     int i;
1779 
1780     for (i = 0; i < mcb->num_callbacks; i++) {
1781         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1782         if (mcb->callbacks[i].free_qiov) {
1783             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
1784         }
1785         g_free(mcb->callbacks[i].free_qiov);
1786     }
1787 }
1788 
1789 static void multiwrite_cb(void *opaque, int ret)
1790 {
1791     MultiwriteCB *mcb = opaque;
1792 
1793     trace_multiwrite_cb(mcb, ret);
1794 
1795     if (ret < 0 && !mcb->error) {
1796         mcb->error = ret;
1797     }
1798 
1799     mcb->num_requests--;
1800     if (mcb->num_requests == 0) {
1801         multiwrite_user_cb(mcb);
1802         g_free(mcb);
1803     }
1804 }
1805 
1806 static int multiwrite_req_compare(const void *a, const void *b)
1807 {
1808     const BlockRequest *req1 = a, *req2 = b;
1809 
1810     /*
1811      * Note that we can't simply subtract req2->sector from req1->sector
1812      * here as that could overflow the return value.
1813      */
1814     if (req1->sector > req2->sector) {
1815         return 1;
1816     } else if (req1->sector < req2->sector) {
1817         return -1;
1818     } else {
1819         return 0;
1820     }
1821 }
1822 
1823 /*
1824  * Takes a bunch of requests and tries to merge them. Returns the number of
1825  * requests that remain after merging.
1826  */
1827 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
1828     int num_reqs, MultiwriteCB *mcb)
1829 {
1830     int i, outidx;
1831 
1832     // Sort requests by start sector
1833     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
1834 
1835     // Check if adjacent requests touch the same clusters. If so, combine them,
1836     // filling up gaps with zero sectors.
1837     outidx = 0;
1838     for (i = 1; i < num_reqs; i++) {
1839         int merge = 0;
1840         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
1841 
1842         // Handle exactly sequential writes and overlapping writes.
1843         if (reqs[i].sector <= oldreq_last) {
1844             merge = 1;
1845         }
1846 
1847         if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
1848             merge = 0;
1849         }
1850 
1851         if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
1852             reqs[i].nb_sectors > bs->bl.max_transfer_length) {
1853             merge = 0;
1854         }
1855 
1856         if (merge) {
1857             size_t size;
1858             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
1859             qemu_iovec_init(qiov,
1860                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
1861 
1862             // Add the first request to the merged one. If the requests are
1863             // overlapping, drop the last sectors of the first request.
1864             size = (reqs[i].sector - reqs[outidx].sector) << 9;
1865             qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
1866 
1867             // We should not need to add any zeros between the two requests
1868             assert(reqs[i].sector <= oldreq_last);
1869 
1870             // Add the second request
1871             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
1872 
1873             // Add tail of first request, if necessary
1874             if (qiov->size < reqs[outidx].qiov->size) {
1875                 qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
1876                                   reqs[outidx].qiov->size - qiov->size);
1877             }
1878 
1879             reqs[outidx].nb_sectors = qiov->size >> 9;
1880             reqs[outidx].qiov = qiov;
1881 
1882             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
1883         } else {
1884             outidx++;
1885             reqs[outidx].sector     = reqs[i].sector;
1886             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
1887             reqs[outidx].qiov       = reqs[i].qiov;
1888         }
1889     }
1890 
1891     block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
1892 
1893     return outidx + 1;
1894 }
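
/*
 * A worked example of the merge above (illustrative numbers): a request for
 * sectors [0, 8) followed by one for [8, 16) is sorted, found adjacent
 * (reqs[i].sector == oldreq_last), and collapsed into a single 16-sector
 * request whose qiov concatenates both original vectors; each caller's
 * cb/opaque is still invoked individually through the MultiwriteCB.
 */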
1895 
1896 /*
1897  * Submit multiple AIO write requests at once.
1898  *
1899  * On success, the function returns 0 and all requests in the reqs array have
1900  * been submitted. On error this function returns -1, and each individual
1901  * request may or may not have been submitted. In particular, the callback will
1902  * be called for some of the requests and not for others. The caller must check
1903  * the error field of each BlockRequest to know which callbacks to wait for
1904  * (if error != 0, no callback will be called for that request).
1905  *
1906  * The implementation may modify the contents of the reqs array, e.g. to merge
1907  * requests. However, the fields opaque and error are left unmodified as they
1908  * are used to signal failure for a single request to the caller.
1909  */
1910 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
1911 {
1912     MultiwriteCB *mcb;
1913     int i;
1914 
1915     /* don't submit writes if we don't have a medium */
1916     if (bs->drv == NULL) {
1917         for (i = 0; i < num_reqs; i++) {
1918             reqs[i].error = -ENOMEDIUM;
1919         }
1920         return -1;
1921     }
1922 
1923     if (num_reqs == 0) {
1924         return 0;
1925     }
1926 
1927     // Create MultiwriteCB structure
1928     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
1929     mcb->num_requests = 0;
1930     mcb->num_callbacks = num_reqs;
1931 
1932     for (i = 0; i < num_reqs; i++) {
1933         mcb->callbacks[i].cb = reqs[i].cb;
1934         mcb->callbacks[i].opaque = reqs[i].opaque;
1935     }
1936 
1937     // Check for mergeable requests
1938     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
1939 
1940     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
1941 
1942     /* Run the aio requests. */
1943     mcb->num_requests = num_reqs;
1944     for (i = 0; i < num_reqs; i++) {
1945         bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
1946                               reqs[i].nb_sectors, reqs[i].flags,
1947                               multiwrite_cb, mcb,
1948                               true);
1949     }
1950 
1951     return 0;
1952 }
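
/*
 * A minimal caller sketch for bdrv_aio_multiwrite() (illustrative only;
 * 'qiov0', 'qiov1', 'my_cb' and the opaque pointers are hypothetical):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0,  .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_cb, .opaque = ctx0 },
 *         { .sector = 16, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_cb, .opaque = ctx1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         only wait for my_cb on requests whose reqs[i].error == 0
 *     }
 */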
1953 
1954 void bdrv_aio_cancel(BlockAIOCB *acb)
1955 {
1956     qemu_aio_ref(acb);
1957     bdrv_aio_cancel_async(acb);
1958     while (acb->refcnt > 1) {
1959         if (acb->aiocb_info->get_aio_context) {
1960             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
1961         } else if (acb->bs) {
1962             aio_poll(bdrv_get_aio_context(acb->bs), true);
1963         } else {
1964             abort();
1965         }
1966     }
1967     qemu_aio_unref(acb);
1968 }
1969 
1970 /* Async version of aio cancel. The caller is never blocked: if the acb
1971  * implements cancel_async, cancellation is requested; otherwise the request
1972  * completes normally. Either way the completion callback must be called. */
1973 void bdrv_aio_cancel_async(BlockAIOCB *acb)
1974 {
1975     if (acb->aiocb_info->cancel_async) {
1976         acb->aiocb_info->cancel_async(acb);
1977     }
1978 }
1979 
1980 /**************************************************************/
1981 /* async block device emulation */
1982 
1983 typedef struct BlockAIOCBSync {
1984     BlockAIOCB common;
1985     QEMUBH *bh;
1986     int ret;
1987     /* vector translation state */
1988     QEMUIOVector *qiov;
1989     uint8_t *bounce;
1990     int is_write;
1991 } BlockAIOCBSync;
1992 
1993 static const AIOCBInfo bdrv_em_aiocb_info = {
1994     .aiocb_size         = sizeof(BlockAIOCBSync),
1995 };
1996 
1997 static void bdrv_aio_bh_cb(void *opaque)
1998 {
1999     BlockAIOCBSync *acb = opaque;
2000 
2001     if (!acb->is_write && acb->ret >= 0) {
2002         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
2003     }
2004     qemu_vfree(acb->bounce);
2005     acb->common.cb(acb->common.opaque, acb->ret);
2006     qemu_bh_delete(acb->bh);
2007     acb->bh = NULL;
2008     qemu_aio_unref(acb);
2009 }
2010 
2011 static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2012                                       int64_t sector_num,
2013                                       QEMUIOVector *qiov,
2014                                       int nb_sectors,
2015                                       BlockCompletionFunc *cb,
2016                                       void *opaque,
2017                                       int is_write)
2018 
2019 {
2020     BlockAIOCBSync *acb;
2021 
2022     acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
2023     acb->is_write = is_write;
2024     acb->qiov = qiov;
2025     acb->bounce = qemu_try_blockalign(bs, qiov->size);
2026     acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
2027 
2028     if (acb->bounce == NULL) {
2029         acb->ret = -ENOMEM;
2030     } else if (is_write) {
2031         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
2032         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2033     } else {
2034         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2035     }
2036 
2037     qemu_bh_schedule(acb->bh);
2038 
2039     return &acb->common;
2040 }
2041 
2042 static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2043         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2044         BlockCompletionFunc *cb, void *opaque)
2045 {
2046     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2047 }
2048 
2049 static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2050         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2051         BlockCompletionFunc *cb, void *opaque)
2052 {
2053     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2054 }
2055 
2056 
2057 typedef struct BlockAIOCBCoroutine {
2058     BlockAIOCB common;
2059     BlockRequest req;
2060     bool is_write;
2061     bool need_bh;
2062     bool *done;
2063     QEMUBH *bh;
2064 } BlockAIOCBCoroutine;
2065 
2066 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2067     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2068 };
2069 
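/*
 * Completion is deferred while need_bh is set: bdrv_co_complete() fires the
 * caller's callback only once bdrv_co_maybe_schedule_bh() has cleared
 * need_bh, and if the request had already finished by then the completion is
 * bounced through a bottom half. This way the callback never runs before the
 * submitting function has handed the BlockAIOCB back to its caller.
 */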
2070 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2071 {
2072     if (!acb->need_bh) {
2073         acb->common.cb(acb->common.opaque, acb->req.error);
2074         qemu_aio_unref(acb);
2075     }
2076 }
2077 
2078 static void bdrv_co_em_bh(void *opaque)
2079 {
2080     BlockAIOCBCoroutine *acb = opaque;
2081 
2082     assert(!acb->need_bh);
2083     qemu_bh_delete(acb->bh);
2084     bdrv_co_complete(acb);
2085 }
2086 
2087 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2088 {
2089     acb->need_bh = false;
2090     if (acb->req.error != -EINPROGRESS) {
2091         BlockDriverState *bs = acb->common.bs;
2092 
2093         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2094         qemu_bh_schedule(acb->bh);
2095     }
2096 }
2097 
2098 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2099 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2100 {
2101     BlockAIOCBCoroutine *acb = opaque;
2102     BlockDriverState *bs = acb->common.bs;
2103 
2104     if (!acb->is_write) {
2105         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2106             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2107     } else {
2108         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2109             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2110     }
2111 
2112     bdrv_co_complete(acb);
2113 }
2114 
2115 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2116                                          int64_t sector_num,
2117                                          QEMUIOVector *qiov,
2118                                          int nb_sectors,
2119                                          BdrvRequestFlags flags,
2120                                          BlockCompletionFunc *cb,
2121                                          void *opaque,
2122                                          bool is_write)
2123 {
2124     Coroutine *co;
2125     BlockAIOCBCoroutine *acb;
2126 
2127     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2128     acb->need_bh = true;
2129     acb->req.error = -EINPROGRESS;
2130     acb->req.sector = sector_num;
2131     acb->req.nb_sectors = nb_sectors;
2132     acb->req.qiov = qiov;
2133     acb->req.flags = flags;
2134     acb->is_write = is_write;
2135 
2136     co = qemu_coroutine_create(bdrv_co_do_rw);
2137     qemu_coroutine_enter(co, acb);
2138 
2139     bdrv_co_maybe_schedule_bh(acb);
2140     return &acb->common;
2141 }
2142 
2143 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2144 {
2145     BlockAIOCBCoroutine *acb = opaque;
2146     BlockDriverState *bs = acb->common.bs;
2147 
2148     acb->req.error = bdrv_co_flush(bs);
2149     bdrv_co_complete(acb);
2150 }
2151 
2152 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2153         BlockCompletionFunc *cb, void *opaque)
2154 {
2155     trace_bdrv_aio_flush(bs, opaque);
2156 
2157     Coroutine *co;
2158     BlockAIOCBCoroutine *acb;
2159 
2160     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2161     acb->need_bh = true;
2162     acb->req.error = -EINPROGRESS;
2163 
2164     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2165     qemu_coroutine_enter(co, acb);
2166 
2167     bdrv_co_maybe_schedule_bh(acb);
2168     return &acb->common;
2169 }
2170 
2171 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2172 {
2173     BlockAIOCBCoroutine *acb = opaque;
2174     BlockDriverState *bs = acb->common.bs;
2175 
2176     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2177     bdrv_co_complete(acb);
2178 }
2179 
2180 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2181         int64_t sector_num, int nb_sectors,
2182         BlockCompletionFunc *cb, void *opaque)
2183 {
2184     Coroutine *co;
2185     BlockAIOCBCoroutine *acb;
2186 
2187     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2188 
2189     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2190     acb->need_bh = true;
2191     acb->req.error = -EINPROGRESS;
2192     acb->req.sector = sector_num;
2193     acb->req.nb_sectors = nb_sectors;
2194     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2195     qemu_coroutine_enter(co, acb);
2196 
2197     bdrv_co_maybe_schedule_bh(acb);
2198     return &acb->common;
2199 }
2200 
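/*
 * AIOCB lifetime: qemu_aio_get() hands out a block with refcnt == 1,
 * qemu_aio_ref()/qemu_aio_unref() adjust the count, and the block is released
 * when the count drops to zero.  bdrv_aio_cancel() above takes an extra
 * reference so the AIOCB stays valid while it polls for completion.
 */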
2201 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2202                    BlockCompletionFunc *cb, void *opaque)
2203 {
2204     BlockAIOCB *acb;
2205 
2206     acb = g_slice_alloc(aiocb_info->aiocb_size);
2207     acb->aiocb_info = aiocb_info;
2208     acb->bs = bs;
2209     acb->cb = cb;
2210     acb->opaque = opaque;
2211     acb->refcnt = 1;
2212     return acb;
2213 }
2214 
2215 void qemu_aio_ref(void *p)
2216 {
2217     BlockAIOCB *acb = p;
2218     acb->refcnt++;
2219 }
2220 
2221 void qemu_aio_unref(void *p)
2222 {
2223     BlockAIOCB *acb = p;
2224     assert(acb->refcnt > 0);
2225     if (--acb->refcnt == 0) {
2226         g_slice_free1(acb->aiocb_info->aiocb_size, acb);
2227     }
2228 }
2229 
2230 /**************************************************************/
2231 /* Coroutine block device emulation */
2232 
2233 typedef struct CoroutineIOCompletion {
2234     Coroutine *coroutine;
2235     int ret;
2236 } CoroutineIOCompletion;
2237 
2238 static void bdrv_co_io_em_complete(void *opaque, int ret)
2239 {
2240     CoroutineIOCompletion *co = opaque;
2241 
2242     co->ret = ret;
2243     qemu_coroutine_enter(co->coroutine, NULL);
2244 }
2245 
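/*
 * Emulate coroutine-style I/O on top of a driver's AIO interface: submit the
 * request, yield this coroutine, and let bdrv_co_io_em_complete() above
 * re-enter it with the result once the AIO callback fires.
 */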
2246 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2247                                       int nb_sectors, QEMUIOVector *iov,
2248                                       bool is_write)
2249 {
2250     CoroutineIOCompletion co = {
2251         .coroutine = qemu_coroutine_self(),
2252     };
2253     BlockAIOCB *acb;
2254 
2255     if (is_write) {
2256         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2257                                        bdrv_co_io_em_complete, &co);
2258     } else {
2259         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2260                                       bdrv_co_io_em_complete, &co);
2261     }
2262 
2263     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2264     if (!acb) {
2265         return -EIO;
2266     }
2267     qemu_coroutine_yield();
2268 
2269     return co.ret;
2270 }
2271 
2272 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2273                                          int64_t sector_num, int nb_sectors,
2274                                          QEMUIOVector *iov)
2275 {
2276     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2277 }
2278 
2279 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2280                                          int64_t sector_num, int nb_sectors,
2281                                          QEMUIOVector *iov)
2282 {
2283     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2284 }
2285 
2286 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2287 {
2288     RwCo *rwco = opaque;
2289 
2290     rwco->ret = bdrv_co_flush(rwco->bs);
2291 }
2292 
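/*
 * Flush ordering implemented below: first let the format driver write its
 * metadata and caches back to the OS (even with cache=unsafe), then, unless
 * BDRV_O_NO_FLUSH is set, force the data to stable storage through
 * bdrv_co_flush_to_disk or the driver's AIO flush, and finally recurse into
 * bs->file so the protocol layer is flushed as well.
 */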
2293 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2294 {
2295     int ret;
2296 
2297     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2298         return 0;
2299     }
2300 
2301     /* Write back cached data to the OS even with cache=unsafe */
2302     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2303     if (bs->drv->bdrv_co_flush_to_os) {
2304         ret = bs->drv->bdrv_co_flush_to_os(bs);
2305         if (ret < 0) {
2306             return ret;
2307         }
2308     }
2309 
2310     /* But don't actually force it to the disk with cache=unsafe */
2311     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2312         goto flush_parent;
2313     }
2314 
2315     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2316     if (bs->drv->bdrv_co_flush_to_disk) {
2317         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2318     } else if (bs->drv->bdrv_aio_flush) {
2319         BlockAIOCB *acb;
2320         CoroutineIOCompletion co = {
2321             .coroutine = qemu_coroutine_self(),
2322         };
2323 
2324         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2325         if (acb == NULL) {
2326             ret = -EIO;
2327         } else {
2328             qemu_coroutine_yield();
2329             ret = co.ret;
2330         }
2331     } else {
2332         /*
2333          * Some block drivers always operate in either writethrough or unsafe
2334          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2335          * know how the server works (because the behaviour is hardcoded or
2336          * depends on server-side configuration), so we can't ensure that
2337          * everything is safe on disk. Returning an error doesn't work because
2338          * that would break guests even if the server operates in writethrough
2339          * mode.
2340          *
2341          * Let's hope the user knows what they're doing.
2342          */
2343         ret = 0;
2344     }
2345     if (ret < 0) {
2346         return ret;
2347     }
2348 
2349     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2350      * in the case of cache=unsafe, so there are no useless flushes.
2351      */
2352 flush_parent:
2353     return bdrv_co_flush(bs->file);
2354 }
2355 
2356 int bdrv_flush(BlockDriverState *bs)
2357 {
2358     Coroutine *co;
2359     RwCo rwco = {
2360         .bs = bs,
2361         .ret = NOT_DONE,
2362     };
2363 
2364     if (qemu_in_coroutine()) {
2365         /* Fast-path if already in coroutine context */
2366         bdrv_flush_co_entry(&rwco);
2367     } else {
2368         AioContext *aio_context = bdrv_get_aio_context(bs);
2369 
2370         co = qemu_coroutine_create(bdrv_flush_co_entry);
2371         qemu_coroutine_enter(co, &rwco);
2372         while (rwco.ret == NOT_DONE) {
2373             aio_poll(aio_context, true);
2374         }
2375     }
2376 
2377     return rwco.ret;
2378 }
2379 
2380 typedef struct DiscardCo {
2381     BlockDriverState *bs;
2382     int64_t sector_num;
2383     int nb_sectors;
2384     int ret;
2385 } DiscardCo;
2386 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2387 {
2388     DiscardCo *rwco = opaque;
2389 
2390     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2391 }
2392 
2393 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2394                                  int nb_sectors)
2395 {
2396     int max_discard, ret;
2397 
2398     if (!bs->drv) {
2399         return -ENOMEDIUM;
2400     }
2401 
2402     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2403     if (ret < 0) {
2404         return ret;
2405     } else if (bs->read_only) {
2406         return -EPERM;
2407     }
2408 
2409     bdrv_reset_dirty(bs, sector_num, nb_sectors);
2410 
2411     /* Do nothing if disabled.  */
2412     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2413         return 0;
2414     }
2415 
2416     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2417         return 0;
2418     }
2419 
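    /* Split the discard into chunks that respect the driver's reported
     * discard_alignment and max_discard limits, issuing each chunk through
     * whichever interface (coroutine or AIO) the driver implements. */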
2420     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2421     while (nb_sectors > 0) {
2422         int ret;
2423         int num = nb_sectors;
2424 
2425         /* clip so the next chunk starts aligned to discard_alignment */
2426         if (bs->bl.discard_alignment &&
2427             num >= bs->bl.discard_alignment &&
2428             sector_num % bs->bl.discard_alignment) {
2429             if (num > bs->bl.discard_alignment) {
2430                 num = bs->bl.discard_alignment;
2431             }
2432             num -= sector_num % bs->bl.discard_alignment;
2433         }
2434 
2435         /* limit request size */
2436         if (num > max_discard) {
2437             num = max_discard;
2438         }
2439 
2440         if (bs->drv->bdrv_co_discard) {
2441             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2442         } else {
2443             BlockAIOCB *acb;
2444             CoroutineIOCompletion co = {
2445                 .coroutine = qemu_coroutine_self(),
2446             };
2447 
2448             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2449                                             bdrv_co_io_em_complete, &co);
2450             if (acb == NULL) {
2451                 return -EIO;
2452             } else {
2453                 qemu_coroutine_yield();
2454                 ret = co.ret;
2455             }
2456         }
2457         if (ret && ret != -ENOTSUP) {
2458             return ret;
2459         }
2460 
2461         sector_num += num;
2462         nb_sectors -= num;
2463     }
2464     return 0;
2465 }
2466 
2467 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2468 {
2469     Coroutine *co;
2470     DiscardCo rwco = {
2471         .bs = bs,
2472         .sector_num = sector_num,
2473         .nb_sectors = nb_sectors,
2474         .ret = NOT_DONE,
2475     };
2476 
2477     if (qemu_in_coroutine()) {
2478         /* Fast-path if already in coroutine context */
2479         bdrv_discard_co_entry(&rwco);
2480     } else {
2481         AioContext *aio_context = bdrv_get_aio_context(bs);
2482 
2483         co = qemu_coroutine_create(bdrv_discard_co_entry);
2484         qemu_coroutine_enter(co, &rwco);
2485         while (rwco.ret == NOT_DONE) {
2486             aio_poll(aio_context, true);
2487         }
2488     }
2489 
2490     return rwco.ret;
2491 }
2492 
2493 /* needed for generic scsi interface */
2494 
2495 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2496 {
2497     BlockDriver *drv = bs->drv;
2498 
2499     if (drv && drv->bdrv_ioctl)
2500         return drv->bdrv_ioctl(bs, req, buf);
2501     return -ENOTSUP;
2502 }
2503 
2504 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2505         unsigned long int req, void *buf,
2506         BlockCompletionFunc *cb, void *opaque)
2507 {
2508     BlockDriver *drv = bs->drv;
2509 
2510     if (drv && drv->bdrv_aio_ioctl)
2511         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
2512     return NULL;
2513 }
2514 
2515 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2516 {
2517     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2518 }
2519 
2520 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2521 {
2522     return memset(qemu_blockalign(bs, size), 0, size);
2523 }
2524 
2525 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2526 {
2527     size_t align = bdrv_opt_mem_align(bs);
2528 
2529     /* Ensure that NULL is never returned on success */
2530     assert(align > 0);
2531     if (size == 0) {
2532         size = align;
2533     }
2534 
2535     return qemu_try_memalign(align, size);
2536 }
2537 
2538 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2539 {
2540     void *mem = qemu_try_blockalign(bs, size);
2541 
2542     if (mem) {
2543         memset(mem, 0, size);
2544     }
2545 
2546     return mem;
2547 }
2548 
2549 /*
2550  * Check if all memory in this vector is sector aligned.
2551  */
2552 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2553 {
2554     int i;
2555     size_t alignment = bdrv_min_mem_align(bs);
2556 
2557     for (i = 0; i < qiov->niov; i++) {
2558         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2559             return false;
2560         }
2561         if (qiov->iov[i].iov_len % alignment) {
2562             return false;
2563         }
2564     }
2565 
2566     return true;
2567 }
2568 
2569 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2570                                     NotifierWithReturn *notifier)
2571 {
2572     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2573 }
2574 
2575 void bdrv_io_plug(BlockDriverState *bs)
2576 {
2577     BlockDriver *drv = bs->drv;
2578     if (drv && drv->bdrv_io_plug) {
2579         drv->bdrv_io_plug(bs);
2580     } else if (bs->file) {
2581         bdrv_io_plug(bs->file);
2582     }
2583 }
2584 
2585 void bdrv_io_unplug(BlockDriverState *bs)
2586 {
2587     BlockDriver *drv = bs->drv;
2588     if (drv && drv->bdrv_io_unplug) {
2589         drv->bdrv_io_unplug(bs);
2590     } else if (bs->file) {
2591         bdrv_io_unplug(bs->file);
2592     }
2593 }
2594 
2595 void bdrv_flush_io_queue(BlockDriverState *bs)
2596 {
2597     BlockDriver *drv = bs->drv;
2598     if (drv && drv->bdrv_flush_io_queue) {
2599         drv->bdrv_flush_io_queue(bs);
2600     } else if (bs->file) {
2601         bdrv_flush_io_queue(bs->file);
2602     }
2603 }
2604