xref: /openbmc/qemu/block/io.c (revision 35658f6e)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "trace.h"
27 #include "sysemu/block-backend.h"
28 #include "block/blockjob.h"
29 #include "block/block_int.h"
30 #include "qemu/cutils.h"
31 #include "qapi/error.h"
32 #include "qemu/error-report.h"
33 
34 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
35 
36 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
37                                          int64_t sector_num,
38                                          QEMUIOVector *qiov,
39                                          int nb_sectors,
40                                          BdrvRequestFlags flags,
41                                          BlockCompletionFunc *cb,
42                                          void *opaque,
43                                          bool is_write);
44 static void coroutine_fn bdrv_co_do_rw(void *opaque);
45 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
46     int64_t offset, int count, BdrvRequestFlags flags);
47 
48 static void bdrv_parent_drained_begin(BlockDriverState *bs)
49 {
50     BdrvChild *c;
51 
52     QLIST_FOREACH(c, &bs->parents, next_parent) {
53         if (c->role->drained_begin) {
54             c->role->drained_begin(c);
55         }
56     }
57 }
58 
59 static void bdrv_parent_drained_end(BlockDriverState *bs)
60 {
61     BdrvChild *c;
62 
63     QLIST_FOREACH(c, &bs->parents, next_parent) {
64         if (c->role->drained_end) {
65             c->role->drained_end(c);
66         }
67     }
68 }
69 
70 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
71 {
72     BlockDriver *drv = bs->drv;
73     Error *local_err = NULL;
74 
75     memset(&bs->bl, 0, sizeof(bs->bl));
76 
77     if (!drv) {
78         return;
79     }
80 
81     /* Take some limits from the children as a default */
82     if (bs->file) {
83         bdrv_refresh_limits(bs->file->bs, &local_err);
84         if (local_err) {
85             error_propagate(errp, local_err);
86             return;
87         }
88         bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
89         bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
90         bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
91         bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
92         bs->bl.max_iov = bs->file->bs->bl.max_iov;
93     } else {
94         bs->bl.min_mem_alignment = 512;
95         bs->bl.opt_mem_alignment = getpagesize();
96 
97         /* Safe default since most protocols use readv()/writev()/etc */
98         bs->bl.max_iov = IOV_MAX;
99     }
100 
101     if (bs->backing) {
102         bdrv_refresh_limits(bs->backing->bs, &local_err);
103         if (local_err) {
104             error_propagate(errp, local_err);
105             return;
106         }
107         bs->bl.opt_transfer_length =
108             MAX(bs->bl.opt_transfer_length,
109                 bs->backing->bs->bl.opt_transfer_length);
110         bs->bl.max_transfer_length =
111             MIN_NON_ZERO(bs->bl.max_transfer_length,
112                          bs->backing->bs->bl.max_transfer_length);
113         bs->bl.opt_mem_alignment =
114             MAX(bs->bl.opt_mem_alignment,
115                 bs->backing->bs->bl.opt_mem_alignment);
116         bs->bl.min_mem_alignment =
117             MAX(bs->bl.min_mem_alignment,
118                 bs->backing->bs->bl.min_mem_alignment);
119         bs->bl.max_iov =
120             MIN(bs->bl.max_iov,
121                 bs->backing->bs->bl.max_iov);
122     }
123 
124     /* Then let the driver override it */
125     if (drv->bdrv_refresh_limits) {
126         drv->bdrv_refresh_limits(bs, errp);
127     }
128 }
129 
130 /**
131  * The copy-on-read flag is actually a reference count so multiple users may
132  * use the feature without worrying about clobbering its previous state.
133  * Copy-on-read stays enabled until every user has disabled it again.
134  */
135 void bdrv_enable_copy_on_read(BlockDriverState *bs)
136 {
137     bs->copy_on_read++;
138 }
139 
140 void bdrv_disable_copy_on_read(BlockDriverState *bs)
141 {
142     assert(bs->copy_on_read > 0);
143     bs->copy_on_read--;
144 }
145 
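/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * because the flag is a reference count, independent users simply pair the
 * calls and need not know about each other, e.g.:
 *
 *     bdrv_enable_copy_on_read(bs);     // first user:  counter 0 -> 1
 *     bdrv_enable_copy_on_read(bs);     // second user: counter 1 -> 2
 *     bdrv_disable_copy_on_read(bs);    // counter 2 -> 1, COR stays enabled
 *     bdrv_disable_copy_on_read(bs);    // counter 1 -> 0, COR disabled again
 */
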
146 /* Check if any requests are in-flight (including throttled requests) */
147 bool bdrv_requests_pending(BlockDriverState *bs)
148 {
149     BdrvChild *child;
150 
151     if (!QLIST_EMPTY(&bs->tracked_requests)) {
152         return true;
153     }
154 
155     QLIST_FOREACH(child, &bs->children, next) {
156         if (bdrv_requests_pending(child->bs)) {
157             return true;
158         }
159     }
160 
161     return false;
162 }
163 
164 static void bdrv_drain_recurse(BlockDriverState *bs)
165 {
166     BdrvChild *child;
167 
168     if (bs->drv && bs->drv->bdrv_drain) {
169         bs->drv->bdrv_drain(bs);
170     }
171     QLIST_FOREACH(child, &bs->children, next) {
172         bdrv_drain_recurse(child->bs);
173     }
174 }
175 
176 typedef struct {
177     Coroutine *co;
178     BlockDriverState *bs;
179     QEMUBH *bh;
180     bool done;
181 } BdrvCoDrainData;
182 
183 static void bdrv_drain_poll(BlockDriverState *bs)
184 {
185     bool busy = true;
186 
187     while (busy) {
188         /* Keep iterating */
189         busy = bdrv_requests_pending(bs);
190         busy |= aio_poll(bdrv_get_aio_context(bs), busy);
191     }
192 }
193 
194 static void bdrv_co_drain_bh_cb(void *opaque)
195 {
196     BdrvCoDrainData *data = opaque;
197     Coroutine *co = data->co;
198 
199     qemu_bh_delete(data->bh);
200     bdrv_drain_poll(data->bs);
201     data->done = true;
202     qemu_coroutine_enter(co, NULL);
203 }
204 
205 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
206 {
207     BdrvCoDrainData data;
208 
209     /* Deferring the actual drain to a BH ensures that the current coroutine
210      * yields first, so other coroutines run if they were queued from
211      * qemu_co_queue_run_restart(). */
212 
213     assert(qemu_in_coroutine());
214     data = (BdrvCoDrainData) {
215         .co = qemu_coroutine_self(),
216         .bs = bs,
217         .done = false,
218         .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data),
219     };
220     qemu_bh_schedule(data.bh);
221 
222     qemu_coroutine_yield();
223     /* If we are resumed from some other event (such as an aio completion or a
224      * timer callback), it is a bug in the caller that should be fixed. */
225     assert(data.done);
226 }
227 
228 void bdrv_drained_begin(BlockDriverState *bs)
229 {
230     if (!bs->quiesce_counter++) {
231         aio_disable_external(bdrv_get_aio_context(bs));
232         bdrv_parent_drained_begin(bs);
233     }
234 
235     bdrv_io_unplugged_begin(bs);
236     bdrv_drain_recurse(bs);
237     if (qemu_in_coroutine()) {
238         bdrv_co_yield_to_drain(bs);
239     } else {
240         bdrv_drain_poll(bs);
241     }
242     bdrv_io_unplugged_end(bs);
243 }
244 
245 void bdrv_drained_end(BlockDriverState *bs)
246 {
247     assert(bs->quiesce_counter > 0);
248     if (--bs->quiesce_counter > 0) {
249         return;
250     }
251 
252     bdrv_parent_drained_end(bs);
253     aio_enable_external(bdrv_get_aio_context(bs));
254 }
255 
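/*
 * Illustrative sketch (editorial addition): callers that need the device to
 * be quiescent bracket their work with a drained section; quiesce_counter
 * lets such sections nest:
 *
 *     bdrv_drained_begin(bs);
 *     ... operate on bs while no new external requests are accepted ...
 *     bdrv_drained_end(bs);
 */
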
256 /*
257  * Wait for pending requests to complete on a single BlockDriverState subtree,
258  * and suspend the block driver's internal I/O until the next request arrives.
259  *
260  * Note that unlike bdrv_drain_all(), the caller must hold the AioContext of
261  * the BlockDriverState.
262  *
263  * Only this BlockDriverState's AioContext is run, so in-flight requests must
264  * not depend on events in other AioContexts.  If they do, use
265  * bdrv_drain_all() instead.
266  */
267 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
268 {
269     assert(qemu_in_coroutine());
270     bdrv_drained_begin(bs);
271     bdrv_drained_end(bs);
272 }
273 
274 void bdrv_drain(BlockDriverState *bs)
275 {
276     bdrv_drained_begin(bs);
277     bdrv_drained_end(bs);
278 }
279 
280 /*
281  * Wait for pending requests to complete across all BlockDriverStates
282  *
283  * This function does not flush data to disk, use bdrv_flush_all() for that
284  * after calling this function.
285  */
286 void bdrv_drain_all(void)
287 {
288     /* Always run first iteration so any pending completion BHs run */
289     bool busy = true;
290     BlockDriverState *bs;
291     BdrvNextIterator it;
292     BlockJob *job = NULL;
293     GSList *aio_ctxs = NULL, *ctx;
294 
295     while ((job = block_job_next(job))) {
296         AioContext *aio_context = blk_get_aio_context(job->blk);
297 
298         aio_context_acquire(aio_context);
299         block_job_pause(job);
300         aio_context_release(aio_context);
301     }
302 
303     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
304         AioContext *aio_context = bdrv_get_aio_context(bs);
305 
306         aio_context_acquire(aio_context);
307         bdrv_parent_drained_begin(bs);
308         bdrv_io_unplugged_begin(bs);
309         bdrv_drain_recurse(bs);
310         aio_context_release(aio_context);
311 
312         if (!g_slist_find(aio_ctxs, aio_context)) {
313             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
314         }
315     }
316 
317     /* Note that completion of an asynchronous I/O operation can trigger any
318      * number of other I/O operations on other devices---for example a
319      * coroutine can submit an I/O request to another device in response to
320      * request completion.  Therefore we must keep looping until there was no
321      * more activity rather than simply draining each device independently.
322      */
323     while (busy) {
324         busy = false;
325 
326         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
327             AioContext *aio_context = ctx->data;
328 
329             aio_context_acquire(aio_context);
330             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
331                 if (aio_context == bdrv_get_aio_context(bs)) {
332                     if (bdrv_requests_pending(bs)) {
333                         busy = true;
334                         aio_poll(aio_context, busy);
335                     }
336                 }
337             }
338             busy |= aio_poll(aio_context, false);
339             aio_context_release(aio_context);
340         }
341     }
342 
343     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
344         AioContext *aio_context = bdrv_get_aio_context(bs);
345 
346         aio_context_acquire(aio_context);
347         bdrv_io_unplugged_end(bs);
348         bdrv_parent_drained_end(bs);
349         aio_context_release(aio_context);
350     }
351     g_slist_free(aio_ctxs);
352 
353     job = NULL;
354     while ((job = block_job_next(job))) {
355         AioContext *aio_context = blk_get_aio_context(job->blk);
356 
357         aio_context_acquire(aio_context);
358         block_job_resume(job);
359         aio_context_release(aio_context);
360     }
361 }
362 
363 /**
364  * Remove an active request from the tracked requests list
365  *
366  * This function should be called when a tracked request is completing.
367  */
368 static void tracked_request_end(BdrvTrackedRequest *req)
369 {
370     if (req->serialising) {
371         req->bs->serialising_in_flight--;
372     }
373 
374     QLIST_REMOVE(req, list);
375     qemu_co_queue_restart_all(&req->wait_queue);
376 }
377 
378 /**
379  * Add an active request to the tracked requests list
380  */
381 static void tracked_request_begin(BdrvTrackedRequest *req,
382                                   BlockDriverState *bs,
383                                   int64_t offset,
384                                   unsigned int bytes,
385                                   enum BdrvTrackedRequestType type)
386 {
387     *req = (BdrvTrackedRequest){
388         .bs = bs,
389         .offset         = offset,
390         .bytes          = bytes,
391         .type           = type,
392         .co             = qemu_coroutine_self(),
393         .serialising    = false,
394         .overlap_offset = offset,
395         .overlap_bytes  = bytes,
396     };
397 
398     qemu_co_queue_init(&req->wait_queue);
399 
400     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
401 }
402 
403 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
404 {
405     int64_t overlap_offset = req->offset & ~(align - 1);
406     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
407                                - overlap_offset;
408 
409     if (!req->serialising) {
410         req->bs->serialising_in_flight++;
411         req->serialising = true;
412     }
413 
414     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
415     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
416 }
417 
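/*
 * Worked example (editorial addition): with req->offset = 4097,
 * req->bytes = 100 and align = 4096, the serialising window is widened to
 *
 *     overlap_offset = 4097 & ~4095                = 4096
 *     overlap_bytes  = ROUND_UP(4197, 4096) - 4096 = 4096
 *
 * so any other request touching the same 4 KiB block counts as overlapping
 * in tracked_request_overlaps() and has to wait.
 */
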
418 /**
419  * Round a region to cluster boundaries (sector-based)
420  */
421 void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
422                                     int64_t sector_num, int nb_sectors,
423                                     int64_t *cluster_sector_num,
424                                     int *cluster_nb_sectors)
425 {
426     BlockDriverInfo bdi;
427 
428     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
429         *cluster_sector_num = sector_num;
430         *cluster_nb_sectors = nb_sectors;
431     } else {
432         int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
433         *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
434         *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
435                                             nb_sectors, c);
436     }
437 }
438 
439 /**
440  * Round a region to cluster boundaries
441  */
442 void bdrv_round_to_clusters(BlockDriverState *bs,
443                             int64_t offset, unsigned int bytes,
444                             int64_t *cluster_offset,
445                             unsigned int *cluster_bytes)
446 {
447     BlockDriverInfo bdi;
448 
449     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
450         *cluster_offset = offset;
451         *cluster_bytes = bytes;
452     } else {
453         int64_t c = bdi.cluster_size;
454         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
455         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
456     }
457 }
458 
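/*
 * Worked example (editorial addition): with a 64 KiB cluster size,
 * offset = 70000 and bytes = 1000 are rounded out to
 *
 *     cluster_offset = QEMU_ALIGN_DOWN(70000, 65536)               = 65536
 *     cluster_bytes  = QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536)  = 65536
 *
 * i.e. exactly the cluster that contains the request.
 */
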
459 static int bdrv_get_cluster_size(BlockDriverState *bs)
460 {
461     BlockDriverInfo bdi;
462     int ret;
463 
464     ret = bdrv_get_info(bs, &bdi);
465     if (ret < 0 || bdi.cluster_size == 0) {
466         return bs->request_alignment;
467     } else {
468         return bdi.cluster_size;
469     }
470 }
471 
472 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
473                                      int64_t offset, unsigned int bytes)
474 {
475     /*        aaaa   bbbb */
476     if (offset >= req->overlap_offset + req->overlap_bytes) {
477         return false;
478     }
479     /* bbbb   aaaa        */
480     if (req->overlap_offset >= offset + bytes) {
481         return false;
482     }
483     return true;
484 }
485 
486 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
487 {
488     BlockDriverState *bs = self->bs;
489     BdrvTrackedRequest *req;
490     bool retry;
491     bool waited = false;
492 
493     if (!bs->serialising_in_flight) {
494         return false;
495     }
496 
497     do {
498         retry = false;
499         QLIST_FOREACH(req, &bs->tracked_requests, list) {
500             if (req == self || (!req->serialising && !self->serialising)) {
501                 continue;
502             }
503             if (tracked_request_overlaps(req, self->overlap_offset,
504                                          self->overlap_bytes))
505             {
506                 /* Hitting this means there was a reentrant request, for
507                  * example, a block driver issuing nested requests.  This must
508                  * never happen since it means deadlock.
509                  */
510                 assert(qemu_coroutine_self() != req->co);
511 
512                 /* If the request is already (indirectly) waiting for us, or
513                  * will wait for us as soon as it wakes up, then just go on
514                  * (instead of producing a deadlock in the former case). */
515                 if (!req->waiting_for) {
516                     self->waiting_for = req;
517                     qemu_co_queue_wait(&req->wait_queue);
518                     self->waiting_for = NULL;
519                     retry = true;
520                     waited = true;
521                     break;
522                 }
523             }
524         }
525     } while (retry);
526 
527     return waited;
528 }
529 
530 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
531                                    size_t size)
532 {
533     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
534         return -EIO;
535     }
536 
537     if (!bdrv_is_inserted(bs)) {
538         return -ENOMEDIUM;
539     }
540 
541     if (offset < 0) {
542         return -EIO;
543     }
544 
545     return 0;
546 }
547 
548 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
549                               int nb_sectors)
550 {
551     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
552         return -EIO;
553     }
554 
555     return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
556                                    nb_sectors * BDRV_SECTOR_SIZE);
557 }
558 
559 typedef struct RwCo {
560     BlockDriverState *bs;
561     int64_t offset;
562     QEMUIOVector *qiov;
563     bool is_write;
564     int ret;
565     BdrvRequestFlags flags;
566 } RwCo;
567 
568 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
569 {
570     RwCo *rwco = opaque;
571 
572     if (!rwco->is_write) {
573         rwco->ret = bdrv_co_preadv(rwco->bs, rwco->offset,
574                                    rwco->qiov->size, rwco->qiov,
575                                    rwco->flags);
576     } else {
577         rwco->ret = bdrv_co_pwritev(rwco->bs, rwco->offset,
578                                     rwco->qiov->size, rwco->qiov,
579                                     rwco->flags);
580     }
581 }
582 
583 /*
584  * Process a vectored synchronous request using coroutines
585  */
586 static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
587                         QEMUIOVector *qiov, bool is_write,
588                         BdrvRequestFlags flags)
589 {
590     Coroutine *co;
591     RwCo rwco = {
592         .bs = bs,
593         .offset = offset,
594         .qiov = qiov,
595         .is_write = is_write,
596         .ret = NOT_DONE,
597         .flags = flags,
598     };
599 
600     if (qemu_in_coroutine()) {
601         /* Fast-path if already in coroutine context */
602         bdrv_rw_co_entry(&rwco);
603     } else {
604         AioContext *aio_context = bdrv_get_aio_context(bs);
605 
606         co = qemu_coroutine_create(bdrv_rw_co_entry);
607         qemu_coroutine_enter(co, &rwco);
608         while (rwco.ret == NOT_DONE) {
609             aio_poll(aio_context, true);
610         }
611     }
612     return rwco.ret;
613 }
614 
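/*
 * Editorial note: outside coroutine context the synchronous wrappers spin on
 * rwco.ret == NOT_DONE while running aio_poll(); NOT_DONE is a sentinel that
 * no completed request stores (results here are 0 or a negative errno), so
 * the loop ends exactly when bdrv_rw_co_entry() has written the final result.
 */
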
615 /*
616  * Process a synchronous request using coroutines
617  */
618 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
619                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
620 {
621     QEMUIOVector qiov;
622     struct iovec iov = {
623         .iov_base = (void *)buf,
624         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
625     };
626 
627     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
628         return -EINVAL;
629     }
630 
631     qemu_iovec_init_external(&qiov, &iov, 1);
632     return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
633                         &qiov, is_write, flags);
634 }
635 
636 /* return < 0 if error. See bdrv_write() for the return codes */
637 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
638               uint8_t *buf, int nb_sectors)
639 {
640     return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
641 }
642 
643 /* Return < 0 if error. Important errors are:
644   -EIO         generic I/O error (may happen for all errors)
645   -ENOMEDIUM   No media inserted.
646   -EINVAL      Invalid sector number or nb_sectors
647   -EPERM       Trying to write a read-only device
648 */
649 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
650                const uint8_t *buf, int nb_sectors)
651 {
652     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
653 }
654 
655 int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
656                        int count, BdrvRequestFlags flags)
657 {
658     QEMUIOVector qiov;
659     struct iovec iov = {
660         .iov_base = NULL,
661         .iov_len = count,
662     };
663 
664     qemu_iovec_init_external(&qiov, &iov, 1);
665     return bdrv_prwv_co(bs, offset, &qiov, true,
666                         BDRV_REQ_ZERO_WRITE | flags);
667 }
668 
669 /*
670  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
671  * The operation is sped up by checking the block status and only writing
672  * zeroes to the device if they currently do not return zeroes. Optional
673  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
674  * BDRV_REQ_FUA).
675  *
676  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
677  */
678 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
679 {
680     int64_t target_sectors, ret, nb_sectors, sector_num = 0;
681     BlockDriverState *file;
682     int n;
683 
684     target_sectors = bdrv_nb_sectors(bs);
685     if (target_sectors < 0) {
686         return target_sectors;
687     }
688 
689     for (;;) {
690         nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
691         if (nb_sectors <= 0) {
692             return 0;
693         }
694         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
695         if (ret < 0) {
696             error_report("error getting block status at sector %" PRId64 ": %s",
697                          sector_num, strerror(-ret));
698             return ret;
699         }
700         if (ret & BDRV_BLOCK_ZERO) {
701             sector_num += n;
702             continue;
703         }
704         ret = bdrv_pwrite_zeroes(bs, sector_num << BDRV_SECTOR_BITS,
705                                  n << BDRV_SECTOR_BITS, flags);
706         if (ret < 0) {
707             error_report("error writing zeroes at sector %" PRId64 ": %s",
708                          sector_num, strerror(-ret));
709             return ret;
710         }
711         sector_num += n;
712     }
713 }
714 
715 int bdrv_preadv(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
716 {
717     int ret;
718 
719     ret = bdrv_prwv_co(bs, offset, qiov, false, 0);
720     if (ret < 0) {
721         return ret;
722     }
723 
724     return qiov->size;
725 }
726 
727 int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
728 {
729     QEMUIOVector qiov;
730     struct iovec iov = {
731         .iov_base = (void *)buf,
732         .iov_len = bytes,
733     };
734 
735     if (bytes < 0) {
736         return -EINVAL;
737     }
738 
739     qemu_iovec_init_external(&qiov, &iov, 1);
740     return bdrv_preadv(bs, offset, &qiov);
741 }
742 
743 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
744 {
745     int ret;
746 
747     ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
748     if (ret < 0) {
749         return ret;
750     }
751 
752     return qiov->size;
753 }
754 
755 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
756                 const void *buf, int bytes)
757 {
758     QEMUIOVector qiov;
759     struct iovec iov = {
760         .iov_base   = (void *) buf,
761         .iov_len    = bytes,
762     };
763 
764     if (bytes < 0) {
765         return -EINVAL;
766     }
767 
768     qemu_iovec_init_external(&qiov, &iov, 1);
769     return bdrv_pwritev(bs, offset, &qiov);
770 }
771 
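/*
 * Illustrative sketch (editorial addition, assuming an already-opened
 * BlockDriverState *bs and non-coroutine context): the byte-based helpers
 * above make a simple synchronous header update look like
 *
 *     uint8_t header[512];
 *
 *     if (bdrv_pread(bs, 0, header, sizeof(header)) < 0) {
 *         ... handle read error ...
 *     }
 *     header[0] = 0x51;
 *     if (bdrv_pwrite_sync(bs, 0, header, sizeof(header)) < 0) {
 *         ... handle write error ...
 *     }
 */
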
772 /*
773  * Writes to the file and ensures that no writes are reordered across this
774  * request (acts as a barrier)
775  *
776  * Returns 0 on success, -errno in error cases.
777  */
778 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
779     const void *buf, int count)
780 {
781     int ret;
782 
783     ret = bdrv_pwrite(bs, offset, buf, count);
784     if (ret < 0) {
785         return ret;
786     }
787 
788     ret = bdrv_flush(bs);
789     if (ret < 0) {
790         return ret;
791     }
792 
793     return 0;
794 }
795 
796 typedef struct CoroutineIOCompletion {
797     Coroutine *coroutine;
798     int ret;
799 } CoroutineIOCompletion;
800 
801 static void bdrv_co_io_em_complete(void *opaque, int ret)
802 {
803     CoroutineIOCompletion *co = opaque;
804 
805     co->ret = ret;
806     qemu_coroutine_enter(co->coroutine, NULL);
807 }
808 
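/*
 * Hand a read to the driver, preferring the most capable callback it
 * implements: byte-based .bdrv_co_preadv first, then sector-based
 * .bdrv_co_readv, and finally emulation on top of .bdrv_aio_readv, where the
 * coroutine yields until bdrv_co_io_em_complete() re-enters it.
 */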
809 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
810                                            uint64_t offset, uint64_t bytes,
811                                            QEMUIOVector *qiov, int flags)
812 {
813     BlockDriver *drv = bs->drv;
814     int64_t sector_num;
815     unsigned int nb_sectors;
816 
817     assert(!(flags & ~BDRV_REQ_MASK));
818 
819     if (drv->bdrv_co_preadv) {
820         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
821     }
822 
823     sector_num = offset >> BDRV_SECTOR_BITS;
824     nb_sectors = bytes >> BDRV_SECTOR_BITS;
825 
826     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
827     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
828     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
829 
830     if (drv->bdrv_co_readv) {
831         return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
832     } else {
833         BlockAIOCB *acb;
834         CoroutineIOCompletion co = {
835             .coroutine = qemu_coroutine_self(),
836         };
837 
838         acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
839                                       bdrv_co_io_em_complete, &co);
840         if (acb == NULL) {
841             return -EIO;
842         } else {
843             qemu_coroutine_yield();
844             return co.ret;
845         }
846     }
847 }
848 
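/*
 * Hand a write to the driver, preferring .bdrv_co_pwritev, then
 * .bdrv_co_writev_flags, then plain .bdrv_co_writev, and finally emulation
 * on top of .bdrv_aio_writev.  Flags the driver does not support (notably
 * BDRV_REQ_FUA) are emulated afterwards with bdrv_co_flush().
 */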
849 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
850                                             uint64_t offset, uint64_t bytes,
851                                             QEMUIOVector *qiov, int flags)
852 {
853     BlockDriver *drv = bs->drv;
854     int64_t sector_num;
855     unsigned int nb_sectors;
856     int ret;
857 
858     assert(!(flags & ~BDRV_REQ_MASK));
859 
860     if (drv->bdrv_co_pwritev) {
861         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
862                                    flags & bs->supported_write_flags);
863         flags &= ~bs->supported_write_flags;
864         goto emulate_flags;
865     }
866 
867     sector_num = offset >> BDRV_SECTOR_BITS;
868     nb_sectors = bytes >> BDRV_SECTOR_BITS;
869 
870     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
871     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
872     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
873 
874     if (drv->bdrv_co_writev_flags) {
875         ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
876                                         flags & bs->supported_write_flags);
877         flags &= ~bs->supported_write_flags;
878     } else if (drv->bdrv_co_writev) {
879         assert(!bs->supported_write_flags);
880         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
881     } else {
882         BlockAIOCB *acb;
883         CoroutineIOCompletion co = {
884             .coroutine = qemu_coroutine_self(),
885         };
886 
887         acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
888                                        bdrv_co_io_em_complete, &co);
889         if (acb == NULL) {
890             ret = -EIO;
891         } else {
892             qemu_coroutine_yield();
893             ret = co.ret;
894         }
895     }
896 
897 emulate_flags:
898     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
899         ret = bdrv_co_flush(bs);
900     }
901 
902     return ret;
903 }
904 
905 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
906         int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
907 {
908     /* Perform I/O through a temporary buffer so that users who scribble over
909      * their read buffer while the operation is in progress do not end up
910      * modifying the image file.  This is critical for zero-copy guest I/O
911      * where anything might happen inside guest memory.
912      */
913     void *bounce_buffer;
914 
915     BlockDriver *drv = bs->drv;
916     struct iovec iov;
917     QEMUIOVector bounce_qiov;
918     int64_t cluster_offset;
919     unsigned int cluster_bytes;
920     size_t skip_bytes;
921     int ret;
922 
923     /* Cover the entire cluster so that no additional backing file I/O is
924      * required when allocating the cluster in the image file.
925      */
926     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
927 
928     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
929                                    cluster_offset, cluster_bytes);
930 
931     iov.iov_len = cluster_bytes;
932     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
933     if (bounce_buffer == NULL) {
934         ret = -ENOMEM;
935         goto err;
936     }
937 
938     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
939 
940     ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
941                              &bounce_qiov, 0);
942     if (ret < 0) {
943         goto err;
944     }
945 
946     if (drv->bdrv_co_pwrite_zeroes &&
947         buffer_is_zero(bounce_buffer, iov.iov_len)) {
948         ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
949     } else {
950         /* This does not change the data on the disk, it is not necessary
951          * to flush even in cache=writethrough mode.
952          */
953         ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
954                                   &bounce_qiov, 0);
955     }
956 
957     if (ret < 0) {
958         /* It might be okay to ignore write errors for guest requests.  If this
959          * is a deliberate copy-on-read then we don't want to ignore the error.
960          * Simply report it in all cases.
961          */
962         goto err;
963     }
964 
965     skip_bytes = offset - cluster_offset;
966     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
967 
968 err:
969     qemu_vfree(bounce_buffer);
970     return ret;
971 }
972 
973 /*
974  * Forwards an already correctly aligned request to the BlockDriver. This
975  * handles copy on read and zeroing after EOF; any other features must be
976  * implemented by the caller.
977  */
978 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
979     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
980     int64_t align, QEMUIOVector *qiov, int flags)
981 {
982     int64_t total_bytes, max_bytes;
983     int ret;
984 
985     assert(is_power_of_2(align));
986     assert((offset & (align - 1)) == 0);
987     assert((bytes & (align - 1)) == 0);
988     assert(!qiov || bytes == qiov->size);
989     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
990     assert(!(flags & ~BDRV_REQ_MASK));
991 
992     /* Handle Copy on Read and associated serialisation */
993     if (flags & BDRV_REQ_COPY_ON_READ) {
994         /* If we touch the same cluster it counts as an overlap.  This
995          * guarantees that allocating writes will be serialized and not race
996          * with each other for the same cluster.  For example, in copy-on-read
997          * it ensures that the CoR read and write operations are atomic and
998          * guest writes cannot interleave between them. */
999         mark_request_serialising(req, bdrv_get_cluster_size(bs));
1000     }
1001 
1002     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1003         wait_serialising_requests(req);
1004     }
1005 
1006     if (flags & BDRV_REQ_COPY_ON_READ) {
1007         int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1008         int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1009         unsigned int nb_sectors = end_sector - start_sector;
1010         int pnum;
1011 
1012         ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
1013         if (ret < 0) {
1014             goto out;
1015         }
1016 
1017         if (!ret || pnum != nb_sectors) {
1018             ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov);
1019             goto out;
1020         }
1021     }
1022 
1023     /* Forward the request to the BlockDriver */
1024     total_bytes = bdrv_getlength(bs);
1025     if (total_bytes < 0) {
1026         ret = total_bytes;
1027         goto out;
1028     }
1029 
1030     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1031     if (bytes < max_bytes) {
1032         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1033     } else if (max_bytes > 0) {
1034         QEMUIOVector local_qiov;
1035 
1036         qemu_iovec_init(&local_qiov, qiov->niov);
1037         qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
1038 
1039         ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
1040 
1041         qemu_iovec_destroy(&local_qiov);
1042     } else {
1043         ret = 0;
1044     }
1045 
1046     /* Reading beyond the end of the file is supposed to produce zeroes */
1047     if (ret == 0 && total_bytes < offset + bytes) {
1048         uint64_t zero_offset = MAX(0, total_bytes - offset);
1049         uint64_t zero_bytes = offset + bytes - zero_offset;
1050         qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
1051     }
1052 
1053 out:
1054     return ret;
1055 }
1056 
1057 /*
1058  * Handle a read request in coroutine context
1059  */
1060 int coroutine_fn bdrv_co_preadv(BlockDriverState *bs,
1061     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1062     BdrvRequestFlags flags)
1063 {
1064     BlockDriver *drv = bs->drv;
1065     BdrvTrackedRequest req;
1066 
1067     uint64_t align = bs->request_alignment;
1068     uint8_t *head_buf = NULL;
1069     uint8_t *tail_buf = NULL;
1070     QEMUIOVector local_qiov;
1071     bool use_local_qiov = false;
1072     int ret;
1073 
1074     if (!drv) {
1075         return -ENOMEDIUM;
1076     }
1077 
1078     ret = bdrv_check_byte_request(bs, offset, bytes);
1079     if (ret < 0) {
1080         return ret;
1081     }
1082 
1083     /* Don't do copy-on-read if the data is only read ahead of a write operation */
1084     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
1085         flags |= BDRV_REQ_COPY_ON_READ;
1086     }
1087 
1088     /* Align read if necessary by padding qiov */
1089     if (offset & (align - 1)) {
1090         head_buf = qemu_blockalign(bs, align);
1091         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1092         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1093         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1094         use_local_qiov = true;
1095 
1096         bytes += offset & (align - 1);
1097         offset = offset & ~(align - 1);
1098     }
1099 
1100     if ((offset + bytes) & (align - 1)) {
1101         if (!use_local_qiov) {
1102             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1103             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1104             use_local_qiov = true;
1105         }
1106         tail_buf = qemu_blockalign(bs, align);
1107         qemu_iovec_add(&local_qiov, tail_buf,
1108                        align - ((offset + bytes) & (align - 1)));
1109 
1110         bytes = ROUND_UP(bytes, align);
1111     }
1112 
1113     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1114     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1115                               use_local_qiov ? &local_qiov : qiov,
1116                               flags);
1117     tracked_request_end(&req);
1118 
1119     if (use_local_qiov) {
1120         qemu_iovec_destroy(&local_qiov);
1121         qemu_vfree(head_buf);
1122         qemu_vfree(tail_buf);
1123     }
1124 
1125     return ret;
1126 }
1127 
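/*
 * Worked example (editorial addition): with bs->request_alignment = 512, a
 * guest read of offset = 1000, bytes = 3000 is widened above to
 *
 *     head padding : 1000 & 511         = 488  -> offset becomes 512
 *     bytes        : 3000 + 488         = 3488
 *     tail padding : 512 - (4000 & 511) = 96   -> bytes become 3584
 *
 * so the driver sees one aligned request covering [512, 4096) and the
 * padding is read into head_buf/tail_buf and then discarded.
 */
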
1128 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1129     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1130     BdrvRequestFlags flags)
1131 {
1132     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1133         return -EINVAL;
1134     }
1135 
1136     return bdrv_co_preadv(bs, sector_num << BDRV_SECTOR_BITS,
1137                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1138 }
1139 
1140 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1141     int nb_sectors, QEMUIOVector *qiov)
1142 {
1143     trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1144 
1145     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1146 }
1147 
1148 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
1149 
1150 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1151     int64_t offset, int count, BdrvRequestFlags flags)
1152 {
1153     BlockDriver *drv = bs->drv;
1154     QEMUIOVector qiov;
1155     struct iovec iov = {0};
1156     int ret = 0;
1157     bool need_flush = false;
1158     int head = 0;
1159     int tail = 0;
1160 
1161     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1162     int alignment = MAX(bs->bl.pwrite_zeroes_alignment ?: 1,
1163                         bs->request_alignment);
1164 
1165     assert(is_power_of_2(alignment));
1166     head = offset & (alignment - 1);
1167     tail = (offset + count) & (alignment - 1);
1168     max_write_zeroes &= ~(alignment - 1);
1169 
1170     while (count > 0 && !ret) {
1171         int num = count;
1172 
1173         /* Align request.  Block drivers can expect the "bulk" of the request
1174          * to be aligned, and that unaligned requests do not cross cluster
1175          * boundaries.
1176          */
1177         if (head) {
1178             /* Make a small request up to the first aligned sector.  */
1179             num = MIN(count, alignment - head);
1180             head = 0;
1181         } else if (tail && num > alignment) {
1182             /* Shorten the request to the last aligned sector.  */
1183             num -= tail;
1184         }
1185 
1186         /* limit request size */
1187         if (num > max_write_zeroes) {
1188             num = max_write_zeroes;
1189         }
1190 
1191         ret = -ENOTSUP;
1192         /* First try the efficient write zeroes operation */
1193         if (drv->bdrv_co_pwrite_zeroes) {
1194             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1195                                              flags & bs->supported_zero_flags);
1196             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1197                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1198                 need_flush = true;
1199             }
1200         } else {
1201             assert(!bs->supported_zero_flags);
1202         }
1203 
1204         if (ret == -ENOTSUP) {
1205             /* Fall back to bounce buffer if write zeroes is unsupported */
1206             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
1207                                             MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1208             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1209 
1210             if ((flags & BDRV_REQ_FUA) &&
1211                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1212                 /* No need for bdrv_driver_pwritev() to do a fallback
1213                  * flush on each chunk; use just one at the end */
1214                 write_flags &= ~BDRV_REQ_FUA;
1215                 need_flush = true;
1216             }
1217             num = MIN(num, max_xfer_len << BDRV_SECTOR_BITS);
1218             iov.iov_len = num;
1219             if (iov.iov_base == NULL) {
1220                 iov.iov_base = qemu_try_blockalign(bs, num);
1221                 if (iov.iov_base == NULL) {
1222                     ret = -ENOMEM;
1223                     goto fail;
1224                 }
1225                 memset(iov.iov_base, 0, num);
1226             }
1227             qemu_iovec_init_external(&qiov, &iov, 1);
1228 
1229             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1230 
1231             /* Keep the bounce buffer around if it is big enough for
1232              * all future requests.
1233              */
1234             if (num < max_xfer_len << BDRV_SECTOR_BITS) {
1235                 qemu_vfree(iov.iov_base);
1236                 iov.iov_base = NULL;
1237             }
1238         }
1239 
1240         offset += num;
1241         count -= num;
1242     }
1243 
1244 fail:
1245     if (ret == 0 && need_flush) {
1246         ret = bdrv_co_flush(bs);
1247     }
1248     qemu_vfree(iov.iov_base);
1249     return ret;
1250 }
1251 
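/*
 * Worked example (editorial addition): with alignment = 4096, a zero-write
 * of offset = 4097, count = 10000 is issued to the driver in three pieces:
 *
 *     [4097, 8192)     4095 bytes   unaligned head
 *     [8192, 12288)    4096 bytes   aligned bulk
 *     [12288, 14097)   1809 bytes   unaligned tail
 *
 * so the bulk is aligned and each unaligned piece stays inside a single
 * alignment block.
 */
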
1252 /*
1253  * Forwards an already correctly aligned write request to the BlockDriver.
1254  */
1255 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1256     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1257     QEMUIOVector *qiov, int flags)
1258 {
1259     BlockDriver *drv = bs->drv;
1260     bool waited;
1261     int ret;
1262 
1263     int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1264     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1265 
1266     assert(!qiov || bytes == qiov->size);
1267     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1268     assert(!(flags & ~BDRV_REQ_MASK));
1269 
1270     waited = wait_serialising_requests(req);
1271     assert(!waited || !req->serialising);
1272     assert(req->overlap_offset <= offset);
1273     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1274 
1275     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1276 
1277     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1278         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1279         qemu_iovec_is_zero(qiov)) {
1280         flags |= BDRV_REQ_ZERO_WRITE;
1281         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1282             flags |= BDRV_REQ_MAY_UNMAP;
1283         }
1284     }
1285 
1286     if (ret < 0) {
1287         /* Do nothing, write notifier decided to fail this request */
1288     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1289         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1290         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1291     } else {
1292         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1293         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1294     }
1295     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1296 
1297     bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
1298 
1299     if (bs->wr_highest_offset < offset + bytes) {
1300         bs->wr_highest_offset = offset + bytes;
1301     }
1302 
1303     if (ret >= 0) {
1304         bs->total_sectors = MAX(bs->total_sectors, end_sector);
1305     }
1306 
1307     return ret;
1308 }
1309 
1310 static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1311                                                 int64_t offset,
1312                                                 unsigned int bytes,
1313                                                 BdrvRequestFlags flags,
1314                                                 BdrvTrackedRequest *req)
1315 {
1316     uint8_t *buf = NULL;
1317     QEMUIOVector local_qiov;
1318     struct iovec iov;
1319     uint64_t align = bs->request_alignment;
1320     unsigned int head_padding_bytes, tail_padding_bytes;
1321     int ret = 0;
1322 
1323     head_padding_bytes = offset & (align - 1);
1324     tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1325 
1326 
1327     assert(flags & BDRV_REQ_ZERO_WRITE);
1328     if (head_padding_bytes || tail_padding_bytes) {
1329         buf = qemu_blockalign(bs, align);
1330         iov = (struct iovec) {
1331             .iov_base   = buf,
1332             .iov_len    = align,
1333         };
1334         qemu_iovec_init_external(&local_qiov, &iov, 1);
1335     }
1336     if (head_padding_bytes) {
1337         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1338 
1339         /* RMW the unaligned part before head. */
1340         mark_request_serialising(req, align);
1341         wait_serialising_requests(req);
1342         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1343         ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1344                                   align, &local_qiov, 0);
1345         if (ret < 0) {
1346             goto fail;
1347         }
1348         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1349 
1350         memset(buf + head_padding_bytes, 0, zero_bytes);
1351         ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1352                                    &local_qiov,
1353                                    flags & ~BDRV_REQ_ZERO_WRITE);
1354         if (ret < 0) {
1355             goto fail;
1356         }
1357         offset += zero_bytes;
1358         bytes -= zero_bytes;
1359     }
1360 
1361     assert(!bytes || (offset & (align - 1)) == 0);
1362     if (bytes >= align) {
1363         /* Write the aligned part in the middle. */
1364         uint64_t aligned_bytes = bytes & ~(align - 1);
1365         ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
1366                                    NULL, flags);
1367         if (ret < 0) {
1368             goto fail;
1369         }
1370         bytes -= aligned_bytes;
1371         offset += aligned_bytes;
1372     }
1373 
1374     assert(!bytes || (offset & (align - 1)) == 0);
1375     if (bytes) {
1376         assert(align == tail_padding_bytes + bytes);
1377         /* RMW the unaligned part after tail. */
1378         mark_request_serialising(req, align);
1379         wait_serialising_requests(req);
1380         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1381         ret = bdrv_aligned_preadv(bs, req, offset, align,
1382                                   align, &local_qiov, 0);
1383         if (ret < 0) {
1384             goto fail;
1385         }
1386         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1387 
1388         memset(buf, 0, bytes);
1389         ret = bdrv_aligned_pwritev(bs, req, offset, align,
1390                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1391     }
1392 fail:
1393     qemu_vfree(buf);
1394     return ret;
1395 
1396 }
1397 
1398 /*
1399  * Handle a write request in coroutine context
1400  */
1401 int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs,
1402     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1403     BdrvRequestFlags flags)
1404 {
1405     BdrvTrackedRequest req;
1406     uint64_t align = bs->request_alignment;
1407     uint8_t *head_buf = NULL;
1408     uint8_t *tail_buf = NULL;
1409     QEMUIOVector local_qiov;
1410     bool use_local_qiov = false;
1411     int ret;
1412 
1413     if (!bs->drv) {
1414         return -ENOMEDIUM;
1415     }
1416     if (bs->read_only) {
1417         return -EPERM;
1418     }
1419     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1420 
1421     ret = bdrv_check_byte_request(bs, offset, bytes);
1422     if (ret < 0) {
1423         return ret;
1424     }
1425 
1426     /*
1427      * Align write if necessary by performing a read-modify-write cycle.
1428      * Pad qiov with the read parts and be sure to have a tracked request not
1429      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1430      */
1431     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1432 
1433     if (!qiov) {
1434         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1435         goto out;
1436     }
1437 
1438     if (offset & (align - 1)) {
1439         QEMUIOVector head_qiov;
1440         struct iovec head_iov;
1441 
1442         mark_request_serialising(&req, align);
1443         wait_serialising_requests(&req);
1444 
1445         head_buf = qemu_blockalign(bs, align);
1446         head_iov = (struct iovec) {
1447             .iov_base   = head_buf,
1448             .iov_len    = align,
1449         };
1450         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1451 
1452         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1453         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1454                                   align, &head_qiov, 0);
1455         if (ret < 0) {
1456             goto fail;
1457         }
1458         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1459 
1460         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1461         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1462         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1463         use_local_qiov = true;
1464 
1465         bytes += offset & (align - 1);
1466         offset = offset & ~(align - 1);
1467 
1468         /* We have read the tail already if the request is smaller
1469          * than one aligned block.
1470          */
1471         if (bytes < align) {
1472             qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1473             bytes = align;
1474         }
1475     }
1476 
1477     if ((offset + bytes) & (align - 1)) {
1478         QEMUIOVector tail_qiov;
1479         struct iovec tail_iov;
1480         size_t tail_bytes;
1481         bool waited;
1482 
1483         mark_request_serialising(&req, align);
1484         waited = wait_serialising_requests(&req);
1485         assert(!waited || !use_local_qiov);
1486 
1487         tail_buf = qemu_blockalign(bs, align);
1488         tail_iov = (struct iovec) {
1489             .iov_base   = tail_buf,
1490             .iov_len    = align,
1491         };
1492         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1493 
1494         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1495         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1496                                   align, &tail_qiov, 0);
1497         if (ret < 0) {
1498             goto fail;
1499         }
1500         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1501 
1502         if (!use_local_qiov) {
1503             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1504             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1505             use_local_qiov = true;
1506         }
1507 
1508         tail_bytes = (offset + bytes) & (align - 1);
1509         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1510 
1511         bytes = ROUND_UP(bytes, align);
1512     }
1513 
1514     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
1515                                use_local_qiov ? &local_qiov : qiov,
1516                                flags);
1517 
1518 fail:
1519 
1520     if (use_local_qiov) {
1521         qemu_iovec_destroy(&local_qiov);
1522     }
1523     qemu_vfree(head_buf);
1524     qemu_vfree(tail_buf);
1525 out:
1526     tracked_request_end(&req);
1527     return ret;
1528 }
1529 
1530 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1531     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1532     BdrvRequestFlags flags)
1533 {
1534     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1535         return -EINVAL;
1536     }
1537 
1538     return bdrv_co_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
1539                            nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1540 }
1541 
1542 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1543     int nb_sectors, QEMUIOVector *qiov)
1544 {
1545     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1546 
1547     return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1548 }
1549 
1550 int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs,
1551                                        int64_t offset, int count,
1552                                        BdrvRequestFlags flags)
1553 {
1554     trace_bdrv_co_pwrite_zeroes(bs, offset, count, flags);
1555 
1556     if (!(bs->open_flags & BDRV_O_UNMAP)) {
1557         flags &= ~BDRV_REQ_MAY_UNMAP;
1558     }
1559 
1560     return bdrv_co_pwritev(bs, offset, count, NULL,
1561                            BDRV_REQ_ZERO_WRITE | flags);
1562 }
1563 
1564 typedef struct BdrvCoGetBlockStatusData {
1565     BlockDriverState *bs;
1566     BlockDriverState *base;
1567     BlockDriverState **file;
1568     int64_t sector_num;
1569     int nb_sectors;
1570     int *pnum;
1571     int64_t ret;
1572     bool done;
1573 } BdrvCoGetBlockStatusData;
1574 
1575 /*
1576  * Returns the allocation status of the specified sectors.
1577  * Drivers not implementing the functionality are assumed to not support
1578  * backing files, hence all their sectors are reported as allocated.
1579  *
1580  * If 'sector_num' is beyond the end of the disk image the return value is 0
1581  * and 'pnum' is set to 0.
1582  *
1583  * 'pnum' is set to the number of sectors (including and immediately following
1584  * the specified sector) that are known to be in the same
1585  * allocated/unallocated state.
1586  *
1587  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1588  * beyond the end of the disk image it will be clamped.
1589  *
1590  * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is
1591  * set, 'file' points to the BDS in which the sector range is allocated.
1592  */
1593 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1594                                                      int64_t sector_num,
1595                                                      int nb_sectors, int *pnum,
1596                                                      BlockDriverState **file)
1597 {
1598     int64_t total_sectors;
1599     int64_t n;
1600     int64_t ret, ret2;
1601 
1602     total_sectors = bdrv_nb_sectors(bs);
1603     if (total_sectors < 0) {
1604         return total_sectors;
1605     }
1606 
1607     if (sector_num >= total_sectors) {
1608         *pnum = 0;
1609         return 0;
1610     }
1611 
1612     n = total_sectors - sector_num;
1613     if (n < nb_sectors) {
1614         nb_sectors = n;
1615     }
1616 
1617     if (!bs->drv->bdrv_co_get_block_status) {
1618         *pnum = nb_sectors;
1619         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1620         if (bs->drv->protocol_name) {
1621             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1622         }
1623         return ret;
1624     }
1625 
1626     *file = NULL;
1627     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1628                                             file);
1629     if (ret < 0) {
1630         *pnum = 0;
1631         return ret;
1632     }
1633 
1634     if (ret & BDRV_BLOCK_RAW) {
1635         assert(ret & BDRV_BLOCK_OFFSET_VALID);
1636         return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1637                                      *pnum, pnum, file);
1638     }
1639 
1640     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1641         ret |= BDRV_BLOCK_ALLOCATED;
1642     } else {
1643         if (bdrv_unallocated_blocks_are_zero(bs)) {
1644             ret |= BDRV_BLOCK_ZERO;
1645         } else if (bs->backing) {
1646             BlockDriverState *bs2 = bs->backing->bs;
1647             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1648             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1649                 ret |= BDRV_BLOCK_ZERO;
1650             }
1651         }
1652     }
1653 
1654     if (*file && *file != bs &&
1655         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1656         (ret & BDRV_BLOCK_OFFSET_VALID)) {
1657         BlockDriverState *file2;
1658         int file_pnum;
1659 
1660         ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1661                                         *pnum, &file_pnum, &file2);
1662         if (ret2 >= 0) {
1663             /* Ignore errors.  This only provides extra information; it
1664              * is useful but not necessary.
1665              */
1666             if (!file_pnum) {
1667                 /* !file_pnum indicates an offset at or beyond the EOF; it is
1668                  * perfectly valid for the format block driver to point to such
1669                  * offsets, so catch it and mark everything as zero */
1670                 ret |= BDRV_BLOCK_ZERO;
1671             } else {
1672                 /* Limit request to the range reported by the protocol driver */
1673                 *pnum = file_pnum;
1674                 ret |= (ret2 & BDRV_BLOCK_ZERO);
1675             }
1676         }
1677     }
1678 
1679     return ret;
1680 }
1681 
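     /*
      * Like bdrv_co_get_block_status(), but walks the backing chain from 'bs'
      * down to (but not including) 'base', stopping at the first layer that
      * reports the range as allocated or returns an error.
      */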
1682 static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1683         BlockDriverState *base,
1684         int64_t sector_num,
1685         int nb_sectors,
1686         int *pnum,
1687         BlockDriverState **file)
1688 {
1689     BlockDriverState *p;
1690     int64_t ret = 0;
1691 
1692     assert(bs != base);
1693     for (p = bs; p != base; p = backing_bs(p)) {
1694         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1695         if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1696             break;
1697         }
1698         /* [sector_num, sector_num + *pnum) is unallocated on this layer, which
1699          * could be only the first part of [sector_num, sector_num + nb_sectors).  */
1700         nb_sectors = MIN(nb_sectors, *pnum);
1701     }
1702     return ret;
1703 }
1704 
1705 /* Coroutine wrapper for bdrv_get_block_status_above() */
1706 static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1707 {
1708     BdrvCoGetBlockStatusData *data = opaque;
1709 
1710     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1711                                                data->sector_num,
1712                                                data->nb_sectors,
1713                                                data->pnum,
1714                                                data->file);
1715     data->done = true;
1716 }
1717 
1718 /*
1719  * Synchronous wrapper around bdrv_co_get_block_status_above().
1720  *
1721  * See bdrv_co_get_block_status_above() for details.
1722  */
1723 int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1724                                     BlockDriverState *base,
1725                                     int64_t sector_num,
1726                                     int nb_sectors, int *pnum,
1727                                     BlockDriverState **file)
1728 {
1729     Coroutine *co;
1730     BdrvCoGetBlockStatusData data = {
1731         .bs = bs,
1732         .base = base,
1733         .file = file,
1734         .sector_num = sector_num,
1735         .nb_sectors = nb_sectors,
1736         .pnum = pnum,
1737         .done = false,
1738     };
1739 
1740     if (qemu_in_coroutine()) {
1741         /* Fast-path if already in coroutine context */
1742         bdrv_get_block_status_above_co_entry(&data);
1743     } else {
1744         AioContext *aio_context = bdrv_get_aio_context(bs);
1745 
1746         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
1747         qemu_coroutine_enter(co, &data);
1748         while (!data.done) {
1749             aio_poll(aio_context, true);
1750         }
1751     }
1752     return data.ret;
1753 }
1754 
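     /* Convenience wrapper that queries only 'bs' itself, i.e. it stops at the
      * backing file of 'bs'. */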
1755 int64_t bdrv_get_block_status(BlockDriverState *bs,
1756                               int64_t sector_num,
1757                               int nb_sectors, int *pnum,
1758                               BlockDriverState **file)
1759 {
1760     return bdrv_get_block_status_above(bs, backing_bs(bs),
1761                                        sector_num, nb_sectors, pnum, file);
1762 }
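
     /*
      * Usage sketch (hypothetical caller, not part of this file): walk an image
      * and report which sector ranges read as zeroes.
      *
      *     int64_t sector = 0;
      *     int64_t total = bdrv_nb_sectors(bs);
      *     while (sector < total) {
      *         BlockDriverState *file;
      *         int pnum;
      *         int64_t ret = bdrv_get_block_status(bs, sector,
      *                                             BDRV_REQUEST_MAX_SECTORS,
      *                                             &pnum, &file);
      *         if (ret < 0 || pnum == 0) {
      *             break;
      *         }
      *         printf("%" PRId64 "+%d: %s\n", sector, pnum,
      *                (ret & BDRV_BLOCK_ZERO) ? "zero" : "data");
      *         sector += pnum;
      *     }
      */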
1763 
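     /*
      * Return 1 if the given sectors are allocated in 'bs' itself, 0 if they
      * are only present in a backing file (or nowhere), or a negative errno on
      * error.  '*pnum' is set as described for bdrv_co_get_block_status().
      */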
1764 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1765                                    int nb_sectors, int *pnum)
1766 {
1767     BlockDriverState *file;
1768     int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1769                                         &file);
1770     if (ret < 0) {
1771         return ret;
1772     }
1773     return !!(ret & BDRV_BLOCK_ALLOCATED);
1774 }
1775 
1776 /*
1777  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1778  *
1779  * Return true if the given sector is allocated in any image between BASE and
1780  * TOP (TOP inclusive, BASE excluded).  BASE can be NULL to check whether the
1781  * sector is allocated in any image of the chain.  Return false otherwise.
1782  *
1783  * 'pnum' is set to the number of sectors (including and immediately following
1784  *  the specified sector) that are known to be in the same
1785  *  allocated/unallocated state.
1786  *
1787  */
1788 int bdrv_is_allocated_above(BlockDriverState *top,
1789                             BlockDriverState *base,
1790                             int64_t sector_num,
1791                             int nb_sectors, int *pnum)
1792 {
1793     BlockDriverState *intermediate;
1794     int ret, n = nb_sectors;
1795 
1796     intermediate = top;
1797     while (intermediate && intermediate != base) {
1798         int pnum_inter;
1799         ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1800                                 &pnum_inter);
1801         if (ret < 0) {
1802             return ret;
1803         } else if (ret) {
1804             *pnum = pnum_inter;
1805             return 1;
1806         }
1807 
1808         /*
1809          * [sector_num, sector_num + nb_sectors) is unallocated on top but
1810          * intermediate might have
1811          *
1812          * [sector_num + x, sector_num + nb_sectors) allocated.
1813          */
1814         if (n > pnum_inter &&
1815             (intermediate == top ||
1816              sector_num + pnum_inter < intermediate->total_sectors)) {
1817             n = pnum_inter;
1818         }
1819 
1820         intermediate = backing_bs(intermediate);
1821     }
1822 
1823     *pnum = n;
1824     return 0;
1825 }
1826 
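     /* Compressed write; only supported by drivers that implement
      * .bdrv_write_compressed, and only while no dirty bitmaps are active
      * (this is asserted). */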
1827 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1828                           const uint8_t *buf, int nb_sectors)
1829 {
1830     BlockDriver *drv = bs->drv;
1831     int ret;
1832 
1833     if (!drv) {
1834         return -ENOMEDIUM;
1835     }
1836     if (!drv->bdrv_write_compressed) {
1837         return -ENOTSUP;
1838     }
1839     ret = bdrv_check_request(bs, sector_num, nb_sectors);
1840     if (ret < 0) {
1841         return ret;
1842     }
1843 
1844     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1845 
1846     return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1847 }
1848 
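     /* Argument bundle for bdrv_co_rw_vmstate_entry(), used by the synchronous
      * wrapper bdrv_rw_vmstate() below. */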
1849 typedef struct BdrvVmstateCo {
1850     BlockDriverState    *bs;
1851     QEMUIOVector        *qiov;
1852     int64_t             pos;
1853     bool                is_read;
1854     int                 ret;
1855 } BdrvVmstateCo;
1856 
1857 static int coroutine_fn
1858 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1859                    bool is_read)
1860 {
1861     BlockDriver *drv = bs->drv;
1862 
1863     if (!drv) {
1864         return -ENOMEDIUM;
1865     } else if (drv->bdrv_load_vmstate) {
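             /* A driver that implements .bdrv_load_vmstate is assumed to
              * implement .bdrv_save_vmstate as well. */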
1866         return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
1867                        : drv->bdrv_save_vmstate(bs, qiov, pos);
1868     } else if (bs->file) {
1869         return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
1870     }
1871 
1872     return -ENOTSUP;
1873 }
1874 
1875 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
1876 {
1877     BdrvVmstateCo *co = opaque;
1878     co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
1879 }
1880 
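     /*
      * Synchronous wrapper around bdrv_co_rw_vmstate(): run it directly when
      * already in coroutine context, otherwise spawn a coroutine and poll the
      * BDS's AioContext until it finishes.
      */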
1881 static inline int
1882 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1883                 bool is_read)
1884 {
1885     if (qemu_in_coroutine()) {
1886         return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
1887     } else {
1888         BdrvVmstateCo data = {
1889             .bs         = bs,
1890             .qiov       = qiov,
1891             .pos        = pos,
1892             .is_read    = is_read,
1893             .ret        = -EINPROGRESS,
1894         };
1895         Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry);
1896 
1897         qemu_coroutine_enter(co, &data);
1898         while (data.ret == -EINPROGRESS) {
1899             aio_poll(bdrv_get_aio_context(bs), true);
1900         }
1901         return data.ret;
1902     }
1903 }
1904 
1905 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1906                       int64_t pos, int size)
1907 {
1908     QEMUIOVector qiov;
1909     struct iovec iov = {
1910         .iov_base   = (void *) buf,
1911         .iov_len    = size,
1912     };
1913     int ret;
1914 
1915     qemu_iovec_init_external(&qiov, &iov, 1);
1916 
1917     ret = bdrv_writev_vmstate(bs, &qiov, pos);
1918     if (ret < 0) {
1919         return ret;
1920     }
1921 
1922     return size;
1923 }
1924 
1925 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1926 {
1927     return bdrv_rw_vmstate(bs, qiov, pos, false);
1928 }
1929 
1930 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1931                       int64_t pos, int size)
1932 {
1933     QEMUIOVector qiov;
1934     struct iovec iov = {
1935         .iov_base   = buf,
1936         .iov_len    = size,
1937     };
1938     int ret;
1939 
1940     qemu_iovec_init_external(&qiov, &iov, 1);
1941     ret = bdrv_readv_vmstate(bs, &qiov, pos);
1942     if (ret < 0) {
1943         return ret;
1944     }
1945 
1946     return size;
1947 }
1948 
1949 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1950 {
1951     return bdrv_rw_vmstate(bs, qiov, pos, true);
1952 }
1953 
1954 /**************************************************************/
1955 /* async I/Os */
1956 
1957 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
1958                            QEMUIOVector *qiov, int nb_sectors,
1959                            BlockCompletionFunc *cb, void *opaque)
1960 {
1961     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
1962 
1963     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1964                                  cb, opaque, false);
1965 }
1966 
1967 BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
1968                             QEMUIOVector *qiov, int nb_sectors,
1969                             BlockCompletionFunc *cb, void *opaque)
1970 {
1971     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
1972 
1973     return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
1974                                  cb, opaque, true);
1975 }
1976 
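     /* Synchronous cancellation: request asynchronous cancel and then poll the
      * request's AioContext until ours is the only remaining reference to the
      * acb. */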
1977 void bdrv_aio_cancel(BlockAIOCB *acb)
1978 {
1979     qemu_aio_ref(acb);
1980     bdrv_aio_cancel_async(acb);
1981     while (acb->refcnt > 1) {
1982         if (acb->aiocb_info->get_aio_context) {
1983             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
1984         } else if (acb->bs) {
1985             aio_poll(bdrv_get_aio_context(acb->bs), true);
1986         } else {
1987             abort();
1988         }
1989     }
1990     qemu_aio_unref(acb);
1991 }
1992 
1993 /* Asynchronous version of AIO cancel.  The caller is not blocked if the acb
1994  * implements cancel_async; otherwise we do nothing and let the request
1995  * complete normally.  In either case the completion callback must be called. */
1996 void bdrv_aio_cancel_async(BlockAIOCB *acb)
1997 {
1998     if (acb->aiocb_info->cancel_async) {
1999         acb->aiocb_info->cancel_async(acb);
2000     }
2001 }
2002 
2003 /**************************************************************/
2004 /* async block device emulation */
2005 
2006 typedef struct BlockRequest {
2007     union {
2008         /* Used during read, write, trim */
2009         struct {
2010             int64_t sector;
2011             int nb_sectors;
2012             int flags;
2013             QEMUIOVector *qiov;
2014         };
2015         /* Used during ioctl */
2016         struct {
2017             int req;
2018             void *buf;
2019         };
2020     };
2021     BlockCompletionFunc *cb;
2022     void *opaque;
2023 
2024     int error;
2025 } BlockRequest;
2026 
2027 typedef struct BlockAIOCBCoroutine {
2028     BlockAIOCB common;
2029     BlockRequest req;
2030     bool is_write;
2031     bool need_bh;
2032     bool *done;
2033     QEMUBH* bh;
2034 } BlockAIOCBCoroutine;
2035 
2036 static const AIOCBInfo bdrv_em_co_aiocb_info = {
2037     .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2038 };
2039 
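     /*
      * Completion path for the coroutine AIO emulation.  While need_bh is set
      * (i.e. the request finished before bdrv_co_aio_rw_vector() returned),
      * bdrv_co_complete() defers to bdrv_co_maybe_schedule_bh(), which delivers
      * the callback from a bottom half so that it never runs before the aio
      * function has returned to its caller.
      */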
2040 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2041 {
2042     if (!acb->need_bh) {
2043         acb->common.cb(acb->common.opaque, acb->req.error);
2044         qemu_aio_unref(acb);
2045     }
2046 }
2047 
2048 static void bdrv_co_em_bh(void *opaque)
2049 {
2050     BlockAIOCBCoroutine *acb = opaque;
2051 
2052     assert(!acb->need_bh);
2053     qemu_bh_delete(acb->bh);
2054     bdrv_co_complete(acb);
2055 }
2056 
2057 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2058 {
2059     acb->need_bh = false;
2060     if (acb->req.error != -EINPROGRESS) {
2061         BlockDriverState *bs = acb->common.bs;
2062 
2063         acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2064         qemu_bh_schedule(acb->bh);
2065     }
2066 }
2067 
2068 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2069 static void coroutine_fn bdrv_co_do_rw(void *opaque)
2070 {
2071     BlockAIOCBCoroutine *acb = opaque;
2072     BlockDriverState *bs = acb->common.bs;
2073 
2074     if (!acb->is_write) {
2075         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2076             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2077     } else {
2078         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2079             acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
2080     }
2081 
2082     bdrv_co_complete(acb);
2083 }
2084 
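     /* Create a coroutine-backed AIOCB for a read or write and start the
      * request.  The completion callback runs either directly from the
      * coroutine or, if the request finished before this function returned,
      * from a bottom half. */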
2085 static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2086                                          int64_t sector_num,
2087                                          QEMUIOVector *qiov,
2088                                          int nb_sectors,
2089                                          BdrvRequestFlags flags,
2090                                          BlockCompletionFunc *cb,
2091                                          void *opaque,
2092                                          bool is_write)
2093 {
2094     Coroutine *co;
2095     BlockAIOCBCoroutine *acb;
2096 
2097     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2098     acb->need_bh = true;
2099     acb->req.error = -EINPROGRESS;
2100     acb->req.sector = sector_num;
2101     acb->req.nb_sectors = nb_sectors;
2102     acb->req.qiov = qiov;
2103     acb->req.flags = flags;
2104     acb->is_write = is_write;
2105 
2106     co = qemu_coroutine_create(bdrv_co_do_rw);
2107     qemu_coroutine_enter(co, acb);
2108 
2109     bdrv_co_maybe_schedule_bh(acb);
2110     return &acb->common;
2111 }
2112 
2113 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2114 {
2115     BlockAIOCBCoroutine *acb = opaque;
2116     BlockDriverState *bs = acb->common.bs;
2117 
2118     acb->req.error = bdrv_co_flush(bs);
2119     bdrv_co_complete(acb);
2120 }
2121 
2122 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2123         BlockCompletionFunc *cb, void *opaque)
2124 {
2125     trace_bdrv_aio_flush(bs, opaque);
2126 
2127     Coroutine *co;
2128     BlockAIOCBCoroutine *acb;
2129 
2130     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2131     acb->need_bh = true;
2132     acb->req.error = -EINPROGRESS;
2133 
2134     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2135     qemu_coroutine_enter(co, acb);
2136 
2137     bdrv_co_maybe_schedule_bh(acb);
2138     return &acb->common;
2139 }
2140 
2141 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2142 {
2143     BlockAIOCBCoroutine *acb = opaque;
2144     BlockDriverState *bs = acb->common.bs;
2145 
2146     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2147     bdrv_co_complete(acb);
2148 }
2149 
2150 BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2151         int64_t sector_num, int nb_sectors,
2152         BlockCompletionFunc *cb, void *opaque)
2153 {
2154     Coroutine *co;
2155     BlockAIOCBCoroutine *acb;
2156 
2157     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2158 
2159     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2160     acb->need_bh = true;
2161     acb->req.error = -EINPROGRESS;
2162     acb->req.sector = sector_num;
2163     acb->req.nb_sectors = nb_sectors;
2164     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2165     qemu_coroutine_enter(co, acb);
2166 
2167     bdrv_co_maybe_schedule_bh(acb);
2168     return &acb->common;
2169 }
2170 
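     /* Allocate an AIOCB of the size given by @aiocb_info and take the initial
      * reference; release it with qemu_aio_unref(). */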
2171 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2172                    BlockCompletionFunc *cb, void *opaque)
2173 {
2174     BlockAIOCB *acb;
2175 
2176     acb = g_malloc(aiocb_info->aiocb_size);
2177     acb->aiocb_info = aiocb_info;
2178     acb->bs = bs;
2179     acb->cb = cb;
2180     acb->opaque = opaque;
2181     acb->refcnt = 1;
2182     return acb;
2183 }
2184 
2185 void qemu_aio_ref(void *p)
2186 {
2187     BlockAIOCB *acb = p;
2188     acb->refcnt++;
2189 }
2190 
2191 void qemu_aio_unref(void *p)
2192 {
2193     BlockAIOCB *acb = p;
2194     assert(acb->refcnt > 0);
2195     if (--acb->refcnt == 0) {
2196         g_free(acb);
2197     }
2198 }
2199 
2200 /**************************************************************/
2201 /* Coroutine block device emulation */
2202 
2203 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2204 {
2205     RwCo *rwco = opaque;
2206 
2207     rwco->ret = bdrv_co_flush(rwco->bs);
2208 }
2209 
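     /*
      * Flush 'bs': prefer the driver's all-in-one .bdrv_co_flush; otherwise
      * flush to the OS (.bdrv_co_flush_to_os) and, unless BDRV_O_NO_FLUSH is
      * set, to the disk (.bdrv_co_flush_to_disk or .bdrv_aio_flush), finally
      * recursing into bs->file.
      */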
2210 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2211 {
2212     int ret;
2213     BdrvTrackedRequest req;
2214 
2215     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2216         bdrv_is_sg(bs)) {
2217         return 0;
2218     }
2219 
2220     tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2221 
2222     /* Write back all layers by calling one driver function */
2223     if (bs->drv->bdrv_co_flush) {
2224         ret = bs->drv->bdrv_co_flush(bs);
2225         goto out;
2226     }
2227 
2228     /* Write back cached data to the OS even with cache=unsafe */
2229     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2230     if (bs->drv->bdrv_co_flush_to_os) {
2231         ret = bs->drv->bdrv_co_flush_to_os(bs);
2232         if (ret < 0) {
2233             goto out;
2234         }
2235     }
2236 
2237     /* But don't actually force it to the disk with cache=unsafe */
2238     if (bs->open_flags & BDRV_O_NO_FLUSH) {
2239         goto flush_parent;
2240     }
2241 
2242     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2243     if (bs->drv->bdrv_co_flush_to_disk) {
2244         ret = bs->drv->bdrv_co_flush_to_disk(bs);
2245     } else if (bs->drv->bdrv_aio_flush) {
2246         BlockAIOCB *acb;
2247         CoroutineIOCompletion co = {
2248             .coroutine = qemu_coroutine_self(),
2249         };
2250 
2251         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2252         if (acb == NULL) {
2253             ret = -EIO;
2254         } else {
2255             qemu_coroutine_yield();
2256             ret = co.ret;
2257         }
2258     } else {
2259         /*
2260          * Some block drivers always operate in either writethrough or unsafe
2261          * mode and therefore don't support bdrv_flush.  Usually QEMU doesn't
2262          * know how the server works (because the behaviour is hardcoded or
2263          * depends on server-side configuration), so we can't ensure that
2264          * everything is safe on disk. Returning an error doesn't work because
2265          * that would break guests even if the server operates in writethrough
2266          * mode.
2267          *
2268          * Let's hope the user knows what they're doing.
2269          */
2270         ret = 0;
2271     }
2272     if (ret < 0) {
2273         goto out;
2274     }
2275 
2276     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2277      * in the case of cache=unsafe, so there are no useless flushes.
2278      */
2279 flush_parent:
2280     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2281 out:
2282     tracked_request_end(&req);
2283     return ret;
2284 }
2285 
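     /* Synchronous wrapper around bdrv_co_flush(), using the usual in-coroutine
      * fast path or an aio_poll() loop otherwise. */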
2286 int bdrv_flush(BlockDriverState *bs)
2287 {
2288     Coroutine *co;
2289     RwCo rwco = {
2290         .bs = bs,
2291         .ret = NOT_DONE,
2292     };
2293 
2294     if (qemu_in_coroutine()) {
2295         /* Fast-path if already in coroutine context */
2296         bdrv_flush_co_entry(&rwco);
2297     } else {
2298         AioContext *aio_context = bdrv_get_aio_context(bs);
2299 
2300         co = qemu_coroutine_create(bdrv_flush_co_entry);
2301         qemu_coroutine_enter(co, &rwco);
2302         while (rwco.ret == NOT_DONE) {
2303             aio_poll(aio_context, true);
2304         }
2305     }
2306 
2307     return rwco.ret;
2308 }
2309 
2310 typedef struct DiscardCo {
2311     BlockDriverState *bs;
2312     int64_t sector_num;
2313     int nb_sectors;
2314     int ret;
2315 } DiscardCo;
2316 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2317 {
2318     DiscardCo *rwco = opaque;
2319 
2320     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2321 }
2322 
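     /*
      * Discard (unmap) sectors.  The request is split according to the driver's
      * discard_alignment and max_discard limits and handed to either
      * .bdrv_co_discard or .bdrv_aio_discard.  A driver returning -ENOTSUP is
      * tolerated; discard is only advisory.
      */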
2323 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2324                                  int nb_sectors)
2325 {
2326     BdrvTrackedRequest req;
2327     int max_discard, ret;
2328 
2329     if (!bs->drv) {
2330         return -ENOMEDIUM;
2331     }
2332 
2333     ret = bdrv_check_request(bs, sector_num, nb_sectors);
2334     if (ret < 0) {
2335         return ret;
2336     } else if (bs->read_only) {
2337         return -EPERM;
2338     }
2339     assert(!(bs->open_flags & BDRV_O_INACTIVE));
2340 
2341     /* Do nothing if disabled.  */
2342     if (!(bs->open_flags & BDRV_O_UNMAP)) {
2343         return 0;
2344     }
2345 
2346     if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
2347         return 0;
2348     }
2349 
2350     tracked_request_begin(&req, bs, sector_num << BDRV_SECTOR_BITS,
2351                           nb_sectors << BDRV_SECTOR_BITS, BDRV_TRACKED_DISCARD);
2352 
2353     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2354     if (ret < 0) {
2355         goto out;
2356     }
2357 
2358     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
2359     while (nb_sectors > 0) {
2360         int ret;
2361         int num = nb_sectors;
2362 
2363         /* align request */
2364         if (bs->bl.discard_alignment &&
2365             num >= bs->bl.discard_alignment &&
2366             sector_num % bs->bl.discard_alignment) {
2367             if (num > bs->bl.discard_alignment) {
2368                 num = bs->bl.discard_alignment;
2369             }
2370             num -= sector_num % bs->bl.discard_alignment;
2371         }
2372 
2373         /* limit request size */
2374         if (num > max_discard) {
2375             num = max_discard;
2376         }
2377 
2378         if (bs->drv->bdrv_co_discard) {
2379             ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
2380         } else {
2381             BlockAIOCB *acb;
2382             CoroutineIOCompletion co = {
2383                 .coroutine = qemu_coroutine_self(),
2384             };
2385 
2386             acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
2387                                             bdrv_co_io_em_complete, &co);
2388             if (acb == NULL) {
2389                 ret = -EIO;
2390                 goto out;
2391             } else {
2392                 qemu_coroutine_yield();
2393                 ret = co.ret;
2394             }
2395         }
2396         if (ret && ret != -ENOTSUP) {
2397             goto out;
2398         }
2399 
2400         sector_num += num;
2401         nb_sectors -= num;
2402     }
2403     ret = 0;
2404 out:
2405     bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
2406                    req.bytes >> BDRV_SECTOR_BITS);
2407     tracked_request_end(&req);
2408     return ret;
2409 }
2410 
2411 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2412 {
2413     Coroutine *co;
2414     DiscardCo rwco = {
2415         .bs = bs,
2416         .sector_num = sector_num,
2417         .nb_sectors = nb_sectors,
2418         .ret = NOT_DONE,
2419     };
2420 
2421     if (qemu_in_coroutine()) {
2422         /* Fast-path if already in coroutine context */
2423         bdrv_discard_co_entry(&rwco);
2424     } else {
2425         AioContext *aio_context = bdrv_get_aio_context(bs);
2426 
2427         co = qemu_coroutine_create(bdrv_discard_co_entry);
2428         qemu_coroutine_enter(co, &rwco);
2429         while (rwco.ret == NOT_DONE) {
2430             aio_poll(aio_context, true);
2431         }
2432     }
2433 
2434     return rwco.ret;
2435 }
2436 
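     /* Issue an ioctl through the driver's .bdrv_aio_ioctl and yield until the
      * completion callback wakes us; tracked as BDRV_TRACKED_IOCTL. */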
2437 static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2438 {
2439     BlockDriver *drv = bs->drv;
2440     BdrvTrackedRequest tracked_req;
2441     CoroutineIOCompletion co = {
2442         .coroutine = qemu_coroutine_self(),
2443     };
2444     BlockAIOCB *acb;
2445 
2446     tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2447     if (!drv || !drv->bdrv_aio_ioctl) {
2448         co.ret = -ENOTSUP;
2449         goto out;
2450     }
2451 
2452     acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2453     if (!acb) {
2454         co.ret = -ENOTSUP;
2455         goto out;
2456     }
2457     qemu_coroutine_yield();
2458 out:
2459     tracked_request_end(&tracked_req);
2460     return co.ret;
2461 }
2462 
2463 typedef struct {
2464     BlockDriverState *bs;
2465     int req;
2466     void *buf;
2467     int ret;
2468 } BdrvIoctlCoData;
2469 
2470 static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2471 {
2472     BdrvIoctlCoData *data = opaque;
2473     data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2474 }
2475 
2476 /* needed for generic scsi interface */
2477 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2478 {
2479     BdrvIoctlCoData data = {
2480         .bs = bs,
2481         .req = req,
2482         .buf = buf,
2483         .ret = -EINPROGRESS,
2484     };
2485 
2486     if (qemu_in_coroutine()) {
2487         /* Fast-path if already in coroutine context */
2488         bdrv_co_ioctl_entry(&data);
2489     } else {
2490         Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
2491 
2492         qemu_coroutine_enter(co, &data);
2493         while (data.ret == -EINPROGRESS) {
2494             aio_poll(bdrv_get_aio_context(bs), true);
2495         }
2496     }
2497     return data.ret;
2498 }
2499 
2500 static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2501 {
2502     BlockAIOCBCoroutine *acb = opaque;
2503     acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2504                                       acb->req.req, acb->req.buf);
2505     bdrv_co_complete(acb);
2506 }
2507 
2508 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2509         unsigned long int req, void *buf,
2510         BlockCompletionFunc *cb, void *opaque)
2511 {
2512     BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2513                                             bs, cb, opaque);
2514     Coroutine *co;
2515 
2516     acb->need_bh = true;
2517     acb->req.error = -EINPROGRESS;
2518     acb->req.req = req;
2519     acb->req.buf = buf;
2520     co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
2521     qemu_coroutine_enter(co, acb);
2522 
2523     bdrv_co_maybe_schedule_bh(acb);
2524     return &acb->common;
2525 }
2526 
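     /* Buffer allocation helpers aligned to the BDS's optimal memory alignment.
      * The *0 variants also zero the buffer; the try_* variants return NULL
      * instead of aborting when allocation fails. */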
2527 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2528 {
2529     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2530 }
2531 
2532 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2533 {
2534     return memset(qemu_blockalign(bs, size), 0, size);
2535 }
2536 
2537 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2538 {
2539     size_t align = bdrv_opt_mem_align(bs);
2540 
2541     /* Ensure that NULL is never returned on success */
2542     assert(align > 0);
2543     if (size == 0) {
2544         size = align;
2545     }
2546 
2547     return qemu_try_memalign(align, size);
2548 }
2549 
2550 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2551 {
2552     void *mem = qemu_try_blockalign(bs, size);
2553 
2554     if (mem) {
2555         memset(mem, 0, size);
2556     }
2557 
2558     return mem;
2559 }
2560 
2561 /*
2562  * Check if all memory in this vector is sector aligned.
2563  */
2564 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2565 {
2566     int i;
2567     size_t alignment = bdrv_min_mem_align(bs);
2568 
2569     for (i = 0; i < qiov->niov; i++) {
2570         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2571             return false;
2572         }
2573         if (qiov->iov[i].iov_len % alignment) {
2574             return false;
2575         }
2576     }
2577 
2578     return true;
2579 }
2580 
2581 void bdrv_add_before_write_notifier(BlockDriverState *bs,
2582                                     NotifierWithReturn *notifier)
2583 {
2584     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2585 }
2586 
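     /*
      * I/O plugging: batch request submission by invoking the driver's
      * .bdrv_io_plug when the first user plugs a BDS (recursing into its
      * children) and .bdrv_io_unplug when the last user unplugs it.  The
      * bdrv_io_unplugged_begin()/bdrv_io_unplugged_end() pair temporarily
      * disables plugging while it is in effect.
      */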
2587 void bdrv_io_plug(BlockDriverState *bs)
2588 {
2589     BdrvChild *child;
2590 
2591     QLIST_FOREACH(child, &bs->children, next) {
2592         bdrv_io_plug(child->bs);
2593     }
2594 
2595     if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) {
2596         BlockDriver *drv = bs->drv;
2597         if (drv && drv->bdrv_io_plug) {
2598             drv->bdrv_io_plug(bs);
2599         }
2600     }
2601 }
2602 
2603 void bdrv_io_unplug(BlockDriverState *bs)
2604 {
2605     BdrvChild *child;
2606 
2607     assert(bs->io_plugged);
2608     if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) {
2609         BlockDriver *drv = bs->drv;
2610         if (drv && drv->bdrv_io_unplug) {
2611             drv->bdrv_io_unplug(bs);
2612         }
2613     }
2614 
2615     QLIST_FOREACH(child, &bs->children, next) {
2616         bdrv_io_unplug(child->bs);
2617     }
2618 }
2619 
2620 void bdrv_io_unplugged_begin(BlockDriverState *bs)
2621 {
2622     BdrvChild *child;
2623 
2624     if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) {
2625         BlockDriver *drv = bs->drv;
2626         if (drv && drv->bdrv_io_unplug) {
2627             drv->bdrv_io_unplug(bs);
2628         }
2629     }
2630 
2631     QLIST_FOREACH(child, &bs->children, next) {
2632         bdrv_io_unplugged_begin(child->bs);
2633     }
2634 }
2635 
2636 void bdrv_io_unplugged_end(BlockDriverState *bs)
2637 {
2638     BdrvChild *child;
2639 
2640     assert(bs->io_plug_disabled);
2641     QLIST_FOREACH(child, &bs->children, next) {
2642         bdrv_io_unplugged_end(child->bs);
2643     }
2644 
2645     if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) {
2646         BlockDriver *drv = bs->drv;
2647         if (drv && drv->bdrv_io_plug) {
2648             drv->bdrv_io_plug(bs);
2649         }
2650     }
2651 }
2652