xref: /openbmc/qemu/block/io.c (revision 259ebed4)
1 /*
2  * Block layer I/O functions
3  *
4  * Copyright (c) 2003 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "trace.h"
27 #include "sysemu/block-backend.h"
28 #include "block/aio-wait.h"
29 #include "block/blockjob.h"
30 #include "block/blockjob_int.h"
31 #include "block/block_int.h"
32 #include "block/coroutines.h"
33 #include "block/dirty-bitmap.h"
34 #include "block/write-threshold.h"
35 #include "qemu/cutils.h"
36 #include "qemu/memalign.h"
37 #include "qapi/error.h"
38 #include "qemu/error-report.h"
39 #include "qemu/main-loop.h"
40 #include "sysemu/replay.h"
41 
42 /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes (16 MiB) */
43 #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
44 
45 static void coroutine_fn GRAPH_RDLOCK
46 bdrv_parent_cb_resize(BlockDriverState *bs);
47 
48 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
49     int64_t offset, int64_t bytes, BdrvRequestFlags flags);
50 
51 static void GRAPH_RDLOCK
52 bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
53 {
54     BdrvChild *c, *next;
55     IO_OR_GS_CODE();
56     assert_bdrv_graph_readable();
57 
58     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
59         if (c == ignore) {
60             continue;
61         }
62         bdrv_parent_drained_begin_single(c);
63     }
64 }
65 
66 void bdrv_parent_drained_end_single(BdrvChild *c)
67 {
68     GLOBAL_STATE_CODE();
69 
70     assert(c->quiesced_parent);
71     c->quiesced_parent = false;
72 
73     if (c->klass->drained_end) {
74         c->klass->drained_end(c);
75     }
76 }
77 
78 static void GRAPH_RDLOCK
79 bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
80 {
81     BdrvChild *c;
82     IO_OR_GS_CODE();
83     assert_bdrv_graph_readable();
84 
85     QLIST_FOREACH(c, &bs->parents, next_parent) {
86         if (c == ignore) {
87             continue;
88         }
89         bdrv_parent_drained_end_single(c);
90     }
91 }
92 
93 bool bdrv_parent_drained_poll_single(BdrvChild *c)
94 {
95     IO_OR_GS_CODE();
96 
97     if (c->klass->drained_poll) {
98         return c->klass->drained_poll(c);
99     }
100     return false;
101 }
102 
103 static bool GRAPH_RDLOCK
104 bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
105                          bool ignore_bds_parents)
106 {
107     BdrvChild *c, *next;
108     bool busy = false;
109     IO_OR_GS_CODE();
110     assert_bdrv_graph_readable();
111 
112     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
113         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
114             continue;
115         }
116         busy |= bdrv_parent_drained_poll_single(c);
117     }
118 
119     return busy;
120 }
121 
122 void bdrv_parent_drained_begin_single(BdrvChild *c)
123 {
124     GLOBAL_STATE_CODE();
125 
126     assert(!c->quiesced_parent);
127     c->quiesced_parent = true;
128 
129     if (c->klass->drained_begin) {
130         /* called with rdlock taken, but it doesn't really need it. */
131         c->klass->drained_begin(c);
132     }
133 }
134 
135 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
136 {
137     dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
138                                   src->pdiscard_alignment);
139     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
140     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
141     dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
142                                         src->max_hw_transfer);
143     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
144                                  src->opt_mem_alignment);
145     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
146                                  src->min_mem_alignment);
147     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
148     dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
149 }
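
/*
 * Worked example, for exposition: merging a child whose limits are
 * max_transfer = 64 KiB and opt_transfer = 4 KiB into a parent whose
 * BlockLimits are still zero-initialised gives
 *     dst->max_transfer = MIN_NON_ZERO(0, 65536) = 65536
 *     dst->opt_transfer = MAX(0, 4096)           = 4096
 * i.e. zero means "no limit", maxima shrink to the most restrictive non-zero
 * value, and alignments/optima grow to the largest requirement reported by
 * any child.
 */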
150 
151 typedef struct BdrvRefreshLimitsState {
152     BlockDriverState *bs;
153     BlockLimits old_bl;
154 } BdrvRefreshLimitsState;
155 
156 static void bdrv_refresh_limits_abort(void *opaque)
157 {
158     BdrvRefreshLimitsState *s = opaque;
159 
160     s->bs->bl = s->old_bl;
161 }
162 
163 static TransactionActionDrv bdrv_refresh_limits_drv = {
164     .abort = bdrv_refresh_limits_abort,
165     .clean = g_free,
166 };
167 
168 /* @tran is allowed to be NULL, in this case no rollback is possible. */
169 void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
170 {
171     ERRP_GUARD();
172     BlockDriver *drv = bs->drv;
173     BdrvChild *c;
174     bool have_limits;
175 
176     GLOBAL_STATE_CODE();
177 
178     if (tran) {
179         BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
180         *s = (BdrvRefreshLimitsState) {
181             .bs = bs,
182             .old_bl = bs->bl,
183         };
184         tran_add(tran, &bdrv_refresh_limits_drv, s);
185     }
186 
187     memset(&bs->bl, 0, sizeof(bs->bl));
188 
189     if (!drv) {
190         return;
191     }
192 
193     /* Default alignment based on whether driver has byte interface */
194     bs->bl.request_alignment = (drv->bdrv_co_preadv ||
195                                 drv->bdrv_aio_preadv ||
196                                 drv->bdrv_co_preadv_part) ? 1 : 512;
197 
198     /* Take some limits from the children as a default */
199     have_limits = false;
200     QLIST_FOREACH(c, &bs->children, next) {
201         if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
202         {
203             bdrv_merge_limits(&bs->bl, &c->bs->bl);
204             have_limits = true;
205         }
206 
207         if (c->role & BDRV_CHILD_FILTERED) {
208             bs->bl.has_variable_length |= c->bs->bl.has_variable_length;
209         }
210     }
211 
212     if (!have_limits) {
213         bs->bl.min_mem_alignment = 512;
214         bs->bl.opt_mem_alignment = qemu_real_host_page_size();
215 
216         /* Safe default since most protocols use readv()/writev()/etc */
217         bs->bl.max_iov = IOV_MAX;
218     }
219 
220     /* Then let the driver override it */
221     if (drv->bdrv_refresh_limits) {
222         drv->bdrv_refresh_limits(bs, errp);
223         if (*errp) {
224             return;
225         }
226     }
227 
228     if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
229         error_setg(errp, "Driver requires too large request alignment");
230     }
231 }
232 
233 /**
234  * The copy-on-read flag is actually a reference count so multiple users may
235  * use the feature without worrying about clobbering its previous state.
236  * Copy-on-read stays enabled until all users have called to disable it.
237  */
238 void bdrv_enable_copy_on_read(BlockDriverState *bs)
239 {
240     IO_CODE();
241     qatomic_inc(&bs->copy_on_read);
242 }
243 
244 void bdrv_disable_copy_on_read(BlockDriverState *bs)
245 {
246     int old = qatomic_fetch_dec(&bs->copy_on_read);
247     IO_CODE();
248     assert(old >= 1);
249 }
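
/*
 * Usage sketch, for exposition only (wrapped in #if 0 so it is not compiled):
 * the doc comment above describes copy-on-read as a reference count, so
 * independent users may enable and disable it without coordinating.  A valid
 * BlockDriverState *bs is assumed to come from the caller.
 */
#if 0
static void example_copy_on_read_refcount(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);   /* user A: count 0 -> 1, COR active   */
    bdrv_enable_copy_on_read(bs);   /* user B: count 1 -> 2, still active */
    bdrv_disable_copy_on_read(bs);  /* user A done: count 2 -> 1          */
    bdrv_disable_copy_on_read(bs);  /* user B done: count 1 -> 0, COR off */
}
#endif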
250 
251 typedef struct {
252     Coroutine *co;
253     BlockDriverState *bs;
254     bool done;
255     bool begin;
256     bool poll;
257     BdrvChild *parent;
258 } BdrvCoDrainData;
259 
260 /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
261 bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
262                      bool ignore_bds_parents)
263 {
264     GLOBAL_STATE_CODE();
265 
266     if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
267         return true;
268     }
269 
270     if (qatomic_read(&bs->in_flight)) {
271         return true;
272     }
273 
274     return false;
275 }
276 
277 static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
278                                       BdrvChild *ignore_parent)
279 {
280     GLOBAL_STATE_CODE();
281     GRAPH_RDLOCK_GUARD_MAINLOOP();
282 
283     return bdrv_drain_poll(bs, ignore_parent, false);
284 }
285 
286 static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
287                                   bool poll);
288 static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
289 
290 static void bdrv_co_drain_bh_cb(void *opaque)
291 {
292     BdrvCoDrainData *data = opaque;
293     Coroutine *co = data->co;
294     BlockDriverState *bs = data->bs;
295 
296     if (bs) {
297         AioContext *ctx = bdrv_get_aio_context(bs);
298         aio_context_acquire(ctx);
299         bdrv_dec_in_flight(bs);
300         if (data->begin) {
301             bdrv_do_drained_begin(bs, data->parent, data->poll);
302         } else {
303             assert(!data->poll);
304             bdrv_do_drained_end(bs, data->parent);
305         }
306         aio_context_release(ctx);
307     } else {
308         assert(data->begin);
309         bdrv_drain_all_begin();
310     }
311 
312     data->done = true;
313     aio_co_wake(co);
314 }
315 
316 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
317                                                 bool begin,
318                                                 BdrvChild *parent,
319                                                 bool poll)
320 {
321     BdrvCoDrainData data;
322     Coroutine *self = qemu_coroutine_self();
323     AioContext *ctx = bdrv_get_aio_context(bs);
324     AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
325 
326     /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
327      * other coroutines run if they were queued by aio_co_enter(). */
328 
329     assert(qemu_in_coroutine());
330     data = (BdrvCoDrainData) {
331         .co = self,
332         .bs = bs,
333         .done = false,
334         .begin = begin,
335         .parent = parent,
336         .poll = poll,
337     };
338 
339     if (bs) {
340         bdrv_inc_in_flight(bs);
341     }
342 
343     /*
344      * Temporarily drop the lock across yield or we would get deadlocks.
345      * bdrv_co_drain_bh_cb() reacquires the lock as needed.
346      *
347      * When we yield below, the lock for the current context will be
348      * released, so if this is actually the lock that protects bs, don't drop
349      * it a second time.
350      */
351     if (ctx != co_ctx) {
352         aio_context_release(ctx);
353     }
354     replay_bh_schedule_oneshot_event(qemu_get_aio_context(),
355                                      bdrv_co_drain_bh_cb, &data);
356 
357     qemu_coroutine_yield();
358     /* If we are resumed from some other event (such as an aio completion or a
359      * timer callback), it is a bug in the caller that should be fixed. */
360     assert(data.done);
361 
362     /* Reacquire the AioContext of bs if we dropped it */
363     if (ctx != co_ctx) {
364         aio_context_acquire(ctx);
365     }
366 }
367 
368 static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
369                                   bool poll)
370 {
371     IO_OR_GS_CODE();
372 
373     if (qemu_in_coroutine()) {
374         bdrv_co_yield_to_drain(bs, true, parent, poll);
375         return;
376     }
377 
378     GLOBAL_STATE_CODE();
379 
380     /* Stop things in parent-to-child order */
381     if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
382         GRAPH_RDLOCK_GUARD_MAINLOOP();
383         bdrv_parent_drained_begin(bs, parent);
384         if (bs->drv && bs->drv->bdrv_drain_begin) {
385             bs->drv->bdrv_drain_begin(bs);
386         }
387     }
388 
389     /*
390      * Wait for drained requests to finish.
391      *
392      * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
393      * call is needed so things in this AioContext can make progress even
394      * though we don't return to the main AioContext loop - this automatically
395      * includes other nodes in the same AioContext and therefore all child
396      * nodes.
397      */
398     if (poll) {
399         BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
400     }
401 }
402 
403 void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent)
404 {
405     bdrv_do_drained_begin(bs, parent, false);
406 }
407 
408 void coroutine_mixed_fn
409 bdrv_drained_begin(BlockDriverState *bs)
410 {
411     IO_OR_GS_CODE();
412     bdrv_do_drained_begin(bs, NULL, true);
413 }
414 
415 /**
416  * This function does not poll, nor must any of its recursively called
417  * functions.
418  */
419 static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
420 {
421     int old_quiesce_counter;
422 
423     IO_OR_GS_CODE();
424 
425     if (qemu_in_coroutine()) {
426         bdrv_co_yield_to_drain(bs, false, parent, false);
427         return;
428     }
429 
430     /* At this point, we should always be running in the main loop. */
431     GLOBAL_STATE_CODE();
432     assert(bs->quiesce_counter > 0);
434 
435     /* Re-enable things in child-to-parent order */
436     old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
437     if (old_quiesce_counter == 1) {
438         GRAPH_RDLOCK_GUARD_MAINLOOP();
439         if (bs->drv && bs->drv->bdrv_drain_end) {
440             bs->drv->bdrv_drain_end(bs);
441         }
442         bdrv_parent_drained_end(bs, parent);
443     }
444 }
445 
446 void bdrv_drained_end(BlockDriverState *bs)
447 {
448     IO_OR_GS_CODE();
449     bdrv_do_drained_end(bs, NULL);
450 }
451 
452 void bdrv_drain(BlockDriverState *bs)
453 {
454     IO_OR_GS_CODE();
455     bdrv_drained_begin(bs);
456     bdrv_drained_end(bs);
457 }
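
/*
 * Usage sketch, for exposition only (not compiled): a "drained section"
 * brackets work that must not race with new parent requests or in-flight I/O.
 * bdrv_drain() above is simply an empty drained section.  do_graph_change()
 * is a hypothetical placeholder for whatever the caller needs to do.
 */
#if 0
static void example_drained_section(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);   /* quiesce parents, wait for in-flight I/O */
    do_graph_change(bs);      /* hypothetical: safe while bs is drained  */
    bdrv_drained_end(bs);     /* resume things in child-to-parent order  */
}
#endif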
458 
459 static void bdrv_drain_assert_idle(BlockDriverState *bs)
460 {
461     BdrvChild *child, *next;
462     GLOBAL_STATE_CODE();
463     GRAPH_RDLOCK_GUARD_MAINLOOP();
464 
465     assert(qatomic_read(&bs->in_flight) == 0);
466     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
467         bdrv_drain_assert_idle(child->bs);
468     }
469 }
470 
471 unsigned int bdrv_drain_all_count = 0;
472 
473 static bool bdrv_drain_all_poll(void)
474 {
475     BlockDriverState *bs = NULL;
476     bool result = false;
477 
478     GLOBAL_STATE_CODE();
479     GRAPH_RDLOCK_GUARD_MAINLOOP();
480 
481     /* bdrv_drain_poll() can't make changes to the graph and we are holding the
482      * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
483     while ((bs = bdrv_next_all_states(bs))) {
484         AioContext *aio_context = bdrv_get_aio_context(bs);
485         aio_context_acquire(aio_context);
486         result |= bdrv_drain_poll(bs, NULL, true);
487         aio_context_release(aio_context);
488     }
489 
490     return result;
491 }
492 
493 /*
494  * Wait for pending requests to complete across all BlockDriverStates
495  *
496  * This function does not flush data to disk, use bdrv_flush_all() for that
497  * after calling this function.
498  *
499  * This pauses all block jobs and disables external clients. It must
500  * be paired with bdrv_drain_all_end().
501  *
502  * NOTE: no new block jobs or BlockDriverStates can be created between
503  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
504  */
505 void bdrv_drain_all_begin_nopoll(void)
506 {
507     BlockDriverState *bs = NULL;
508     GLOBAL_STATE_CODE();
509 
510     /*
511      * The bdrv queue is managed by record/replay;
512      * waiting for the in-flight I/O requests to
513      * finish could block forever.
514      */
515     if (replay_events_enabled()) {
516         return;
517     }
518 
519     /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
520      * loop AioContext, so make sure we're in the main context. */
521     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
522     assert(bdrv_drain_all_count < INT_MAX);
523     bdrv_drain_all_count++;
524 
525     /* Quiesce all nodes, without polling in-flight requests yet. The graph
526      * cannot change during this loop. */
527     while ((bs = bdrv_next_all_states(bs))) {
528         AioContext *aio_context = bdrv_get_aio_context(bs);
529 
530         aio_context_acquire(aio_context);
531         bdrv_do_drained_begin(bs, NULL, false);
532         aio_context_release(aio_context);
533     }
534 }
535 
536 void coroutine_mixed_fn bdrv_drain_all_begin(void)
537 {
538     BlockDriverState *bs = NULL;
539 
540     if (qemu_in_coroutine()) {
541         bdrv_co_yield_to_drain(NULL, true, NULL, true);
542         return;
543     }
544 
545     /*
546      * The bdrv queue is managed by record/replay;
547      * waiting for the in-flight I/O requests to
548      * finish could block forever.
549      */
550     if (replay_events_enabled()) {
551         return;
552     }
553 
554     bdrv_drain_all_begin_nopoll();
555 
556     /* Now poll the in-flight requests */
557     AIO_WAIT_WHILE_UNLOCKED(NULL, bdrv_drain_all_poll());
558 
559     while ((bs = bdrv_next_all_states(bs))) {
560         bdrv_drain_assert_idle(bs);
561     }
562 }
563 
564 void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
565 {
566     GLOBAL_STATE_CODE();
567 
568     g_assert(bs->quiesce_counter > 0);
569     g_assert(!bs->refcnt);
570 
571     while (bs->quiesce_counter) {
572         bdrv_do_drained_end(bs, NULL);
573     }
574 }
575 
576 void bdrv_drain_all_end(void)
577 {
578     BlockDriverState *bs = NULL;
579     GLOBAL_STATE_CODE();
580 
581     /*
582      * The bdrv queue is managed by record/replay;
583      * waiting for the in-flight I/O requests to
584      * finish could block forever.
585      */
586     if (replay_events_enabled()) {
587         return;
588     }
589 
590     while ((bs = bdrv_next_all_states(bs))) {
591         AioContext *aio_context = bdrv_get_aio_context(bs);
592 
593         aio_context_acquire(aio_context);
594         bdrv_do_drained_end(bs, NULL);
595         aio_context_release(aio_context);
596     }
597 
598     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
599     assert(bdrv_drain_all_count > 0);
600     bdrv_drain_all_count--;
601 }
602 
603 void bdrv_drain_all(void)
604 {
605     GLOBAL_STATE_CODE();
606     bdrv_drain_all_begin();
607     bdrv_drain_all_end();
608 }
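
/*
 * Usage sketch, for exposition only (not compiled): as the comment above
 * bdrv_drain_all_begin_nopoll() notes, draining does not flush data to disk,
 * so callers that need durability typically pair the drained section with
 * bdrv_flush_all() (declared in the block layer headers).
 */
#if 0
static void example_drain_all_then_flush(void)
{
    int ret;

    bdrv_drain_all_begin();   /* quiesce every node, wait for in-flight I/O */
    ret = bdrv_flush_all();   /* flush data to disk, 0 on success           */
    bdrv_drain_all_end();     /* resume all nodes                           */
    (void)ret;
}
#endif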
609 
610 /**
611  * Remove an active request from the tracked requests list
612  *
613  * This function should be called when a tracked request is completing.
614  */
615 static void coroutine_fn tracked_request_end(BdrvTrackedRequest *req)
616 {
617     if (req->serialising) {
618         qatomic_dec(&req->bs->serialising_in_flight);
619     }
620 
621     qemu_mutex_lock(&req->bs->reqs_lock);
622     QLIST_REMOVE(req, list);
623     qemu_mutex_unlock(&req->bs->reqs_lock);
624 
625     /*
626      * At this point qemu_co_queue_wait(&req->wait_queue, ...) won't be called
627      * anymore because the request has been removed from the list, so it's safe
628      * to restart the queue outside reqs_lock to minimize the critical section.
629      */
630     qemu_co_queue_restart_all(&req->wait_queue);
631 }
632 
633 /**
634  * Add an active request to the tracked requests list
635  */
636 static void coroutine_fn tracked_request_begin(BdrvTrackedRequest *req,
637                                                BlockDriverState *bs,
638                                                int64_t offset,
639                                                int64_t bytes,
640                                                enum BdrvTrackedRequestType type)
641 {
642     bdrv_check_request(offset, bytes, &error_abort);
643 
644     *req = (BdrvTrackedRequest){
645         .bs = bs,
646         .offset         = offset,
647         .bytes          = bytes,
648         .type           = type,
649         .co             = qemu_coroutine_self(),
650         .serialising    = false,
651         .overlap_offset = offset,
652         .overlap_bytes  = bytes,
653     };
654 
655     qemu_co_queue_init(&req->wait_queue);
656 
657     qemu_mutex_lock(&bs->reqs_lock);
658     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
659     qemu_mutex_unlock(&bs->reqs_lock);
660 }
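
/*
 * Usage sketch, for exposition only (not compiled): how the read/write paths
 * later in this file typically pair the helpers above.  Coroutine context and
 * a valid bs are assumed; the actual I/O is elided.
 */
#if 0
static int coroutine_fn example_tracked_io(BlockDriverState *bs,
                                           int64_t offset, int64_t bytes)
{
    BdrvTrackedRequest req;
    int ret;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = 0;                   /* ... perform the actual request here ... */
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
#endif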
661 
662 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
663                                      int64_t offset, int64_t bytes)
664 {
665     bdrv_check_request(offset, bytes, &error_abort);
666 
667     /*        aaaa   bbbb */
668     if (offset >= req->overlap_offset + req->overlap_bytes) {
669         return false;
670     }
671     /* bbbb   aaaa        */
672     if (req->overlap_offset >= offset + bytes) {
673         return false;
674     }
675     return true;
676 }
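
/*
 * Worked example, for exposition: with req->overlap_offset = 4096 and
 * req->overlap_bytes = 4096 (window [4096, 8192)), a request at
 * offset = 8192, bytes = 512 does not overlap (the first check fires),
 * while offset = 8191, bytes = 1 does.
 */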
677 
678 /* Called with self->bs->reqs_lock held */
679 static coroutine_fn BdrvTrackedRequest *
680 bdrv_find_conflicting_request(BdrvTrackedRequest *self)
681 {
682     BdrvTrackedRequest *req;
683 
684     QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
685         if (req == self || (!req->serialising && !self->serialising)) {
686             continue;
687         }
688         if (tracked_request_overlaps(req, self->overlap_offset,
689                                      self->overlap_bytes))
690         {
691             /*
692              * Hitting this means there was a reentrant request, for
693              * example, a block driver issuing nested requests.  This must
694              * never happen since it means deadlock.
695              */
696             assert(qemu_coroutine_self() != req->co);
697 
698             /*
699              * If the request is already (indirectly) waiting for us, or
700              * will wait for us as soon as it wakes up, then just go on
701              * (instead of producing a deadlock in the former case).
702              */
703             if (!req->waiting_for) {
704                 return req;
705             }
706         }
707     }
708 
709     return NULL;
710 }
711 
712 /* Called with self->bs->reqs_lock held */
713 static void coroutine_fn
714 bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
715 {
716     BdrvTrackedRequest *req;
717 
718     while ((req = bdrv_find_conflicting_request(self))) {
719         self->waiting_for = req;
720         qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
721         self->waiting_for = NULL;
722     }
723 }
724 
725 /* Called with req->bs->reqs_lock held */
726 static void tracked_request_set_serialising(BdrvTrackedRequest *req,
727                                             uint64_t align)
728 {
729     int64_t overlap_offset = req->offset & ~(align - 1);
730     int64_t overlap_bytes =
731         ROUND_UP(req->offset + req->bytes, align) - overlap_offset;
732 
733     bdrv_check_request(req->offset, req->bytes, &error_abort);
734 
735     if (!req->serialising) {
736         qatomic_inc(&req->bs->serialising_in_flight);
737         req->serialising = true;
738     }
739 
740     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
741     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
742 }
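
/*
 * Worked example, for exposition: with align = 4096, req->offset = 5000 and
 * req->bytes = 1000, the serialised window becomes
 *     overlap_offset = 5000 & ~4095                = 4096
 *     overlap_bytes  = ROUND_UP(6000, 4096) - 4096 = 4096
 * so any overlapping request in [4096, 8192) now conflicts with this one
 * according to tracked_request_overlaps().
 */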
743 
744 /**
745  * Return the tracked request on @bs for the current coroutine, or
746  * NULL if there is none.
747  */
748 BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
749 {
750     BdrvTrackedRequest *req;
751     Coroutine *self = qemu_coroutine_self();
752     IO_CODE();
753 
754     QLIST_FOREACH(req, &bs->tracked_requests, list) {
755         if (req->co == self) {
756             return req;
757         }
758     }
759 
760     return NULL;
761 }
762 
763 /**
764  * Round a region to subcluster (if supported) or cluster boundaries
765  */
766 void coroutine_fn GRAPH_RDLOCK
767 bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
768                           int64_t *align_offset, int64_t *align_bytes)
769 {
770     BlockDriverInfo bdi;
771     IO_CODE();
772     if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) {
773         *align_offset = offset;
774         *align_bytes = bytes;
775     } else {
776         int64_t c = bdi.subcluster_size;
777         *align_offset = QEMU_ALIGN_DOWN(offset, c);
778         *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c);
779     }
780 }
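
/*
 * Worked example, for exposition: with bdi.subcluster_size = 4096,
 * offset = 5000 and bytes = 1000:
 *     *align_offset = QEMU_ALIGN_DOWN(5000, 4096)             = 4096
 *     *align_bytes  = QEMU_ALIGN_UP(5000 - 4096 + 1000, 4096) = 4096
 * i.e. the region is widened to the single 4 KiB subcluster [4096, 8192)
 * that contains the original [5000, 6000) range.
 */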
781 
782 static int coroutine_fn GRAPH_RDLOCK bdrv_get_cluster_size(BlockDriverState *bs)
783 {
784     BlockDriverInfo bdi;
785     int ret;
786 
787     ret = bdrv_co_get_info(bs, &bdi);
788     if (ret < 0 || bdi.cluster_size == 0) {
789         return bs->bl.request_alignment;
790     } else {
791         return bdi.cluster_size;
792     }
793 }
794 
795 void bdrv_inc_in_flight(BlockDriverState *bs)
796 {
797     IO_CODE();
798     qatomic_inc(&bs->in_flight);
799 }
800 
801 void bdrv_wakeup(BlockDriverState *bs)
802 {
803     IO_CODE();
804     aio_wait_kick();
805 }
806 
807 void bdrv_dec_in_flight(BlockDriverState *bs)
808 {
809     IO_CODE();
810     qatomic_dec(&bs->in_flight);
811     bdrv_wakeup(bs);
812 }
813 
814 static void coroutine_fn
815 bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
816 {
817     BlockDriverState *bs = self->bs;
818 
819     if (!qatomic_read(&bs->serialising_in_flight)) {
820         return;
821     }
822 
823     qemu_mutex_lock(&bs->reqs_lock);
824     bdrv_wait_serialising_requests_locked(self);
825     qemu_mutex_unlock(&bs->reqs_lock);
826 }
827 
828 void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
829                                                 uint64_t align)
830 {
831     IO_CODE();
832 
833     qemu_mutex_lock(&req->bs->reqs_lock);
834 
835     tracked_request_set_serialising(req, align);
836     bdrv_wait_serialising_requests_locked(req);
837 
838     qemu_mutex_unlock(&req->bs->reqs_lock);
839 }
840 
841 int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
842                             QEMUIOVector *qiov, size_t qiov_offset,
843                             Error **errp)
844 {
845     /*
846      * Check generic offset/bytes correctness
847      */
848 
849     if (offset < 0) {
850         error_setg(errp, "offset is negative: %" PRIi64, offset);
851         return -EIO;
852     }
853 
854     if (bytes < 0) {
855         error_setg(errp, "bytes is negative: %" PRIi64, bytes);
856         return -EIO;
857     }
858 
859     if (bytes > BDRV_MAX_LENGTH) {
860         error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
861                    bytes, BDRV_MAX_LENGTH);
862         return -EIO;
863     }
864 
865     if (offset > BDRV_MAX_LENGTH) {
866         error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
867                    offset, BDRV_MAX_LENGTH);
868         return -EIO;
869     }
870 
871     if (offset > BDRV_MAX_LENGTH - bytes) {
872         error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
873                    "exceeds maximum(%" PRIi64 ")", offset, bytes,
874                    BDRV_MAX_LENGTH);
875         return -EIO;
876     }
877 
878     if (!qiov) {
879         return 0;
880     }
881 
882     /*
883      * Check qiov and qiov_offset
884      */
885 
886     if (qiov_offset > qiov->size) {
887         error_setg(errp, "qiov_offset(%zu) overflows io vector size(%zu)",
888                    qiov_offset, qiov->size);
889         return -EIO;
890     }
891 
892     if (bytes > qiov->size - qiov_offset) {
893         error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
894                    "vector size(%zu)", bytes, qiov_offset, qiov->size);
895         return -EIO;
896     }
897 
898     return 0;
899 }
900 
901 int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
902 {
903     return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
904 }
905 
906 static int bdrv_check_request32(int64_t offset, int64_t bytes,
907                                 QEMUIOVector *qiov, size_t qiov_offset)
908 {
909     int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
910     if (ret < 0) {
911         return ret;
912     }
913 
914     if (bytes > BDRV_REQUEST_MAX_BYTES) {
915         return -EIO;
916     }
917 
918     return 0;
919 }
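
/*
 * Illustrative sketch, for exposition only (not compiled): what the checks
 * above accept and reject.  BDRV_MAX_LENGTH is the block layer's maximum
 * request length from the common block headers.
 */
#if 0
static void example_request_checks(void)
{
    assert(bdrv_check_request(0, 4096, NULL) == 0);      /* valid request   */
    assert(bdrv_check_request(-1, 4096, NULL) == -EIO);  /* negative offset */
    /* offset + bytes would exceed BDRV_MAX_LENGTH */
    assert(bdrv_check_request(BDRV_MAX_LENGTH, 1, NULL) == -EIO);
}
#endif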
920 
921 /*
922  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
923  * The operation is sped up by checking the block status and only writing
924  * zeroes to the device if they currently do not return zeroes. Optional
925  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
926  * BDRV_REQ_FUA).
927  *
928  * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
929  */
930 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
931 {
932     int ret;
933     int64_t target_size, bytes, offset = 0;
934     BlockDriverState *bs = child->bs;
935     IO_CODE();
936 
937     target_size = bdrv_getlength(bs);
938     if (target_size < 0) {
939         return target_size;
940     }
941 
942     for (;;) {
943         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
944         if (bytes <= 0) {
945             return 0;
946         }
947         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
948         if (ret < 0) {
949             return ret;
950         }
951         if (ret & BDRV_BLOCK_ZERO) {
952             offset += bytes;
953             continue;
954         }
955         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
956         if (ret < 0) {
957             return ret;
958         }
959         offset += bytes;
960     }
961 }
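
/*
 * Usage sketch, for exposition only (not compiled): zeroing an entire device
 * through its BdrvChild, as described in the comment above.
 * BDRV_REQ_MAY_UNMAP lets drivers unmap/punch holes instead of writing
 * explicit zeroes, as long as the result still reads back as zero.
 */
#if 0
static int example_zero_whole_device(BdrvChild *child)
{
    return bdrv_make_zero(child, BDRV_REQ_MAY_UNMAP);
}
#endif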
962 
963 /*
964  * Writes to the file and ensures that no writes are reordered across this
965  * request (acts as a barrier)
966  *
967  * Returns 0 on success, -errno in error cases.
968  */
969 int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset,
970                                      int64_t bytes, const void *buf,
971                                      BdrvRequestFlags flags)
972 {
973     int ret;
974     IO_CODE();
975     assert_bdrv_graph_readable();
976 
977     ret = bdrv_co_pwrite(child, offset, bytes, buf, flags);
978     if (ret < 0) {
979         return ret;
980     }
981 
982     ret = bdrv_co_flush(child->bs);
983     if (ret < 0) {
984         return ret;
985     }
986 
987     return 0;
988 }
989 
990 typedef struct CoroutineIOCompletion {
991     Coroutine *coroutine;
992     int ret;
993 } CoroutineIOCompletion;
994 
995 static void bdrv_co_io_em_complete(void *opaque, int ret)
996 {
997     CoroutineIOCompletion *co = opaque;
998 
999     co->ret = ret;
1000     aio_co_wake(co->coroutine);
1001 }
1002 
1003 static int coroutine_fn GRAPH_RDLOCK
1004 bdrv_driver_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
1005                    QEMUIOVector *qiov, size_t qiov_offset, int flags)
1006 {
1007     BlockDriver *drv = bs->drv;
1008     int64_t sector_num;
1009     unsigned int nb_sectors;
1010     QEMUIOVector local_qiov;
1011     int ret;
1012     assert_bdrv_graph_readable();
1013 
1014     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1015     assert(!(flags & ~bs->supported_read_flags));
1016 
1017     if (!drv) {
1018         return -ENOMEDIUM;
1019     }
1020 
1021     if (drv->bdrv_co_preadv_part) {
1022         return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
1023                                         flags);
1024     }
1025 
1026     if (qiov_offset > 0 || bytes != qiov->size) {
1027         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1028         qiov = &local_qiov;
1029     }
1030 
1031     if (drv->bdrv_co_preadv) {
1032         ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1033         goto out;
1034     }
1035 
1036     if (drv->bdrv_aio_preadv) {
1037         BlockAIOCB *acb;
1038         CoroutineIOCompletion co = {
1039             .coroutine = qemu_coroutine_self(),
1040         };
1041 
1042         acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
1043                                    bdrv_co_io_em_complete, &co);
1044         if (acb == NULL) {
1045             ret = -EIO;
1046             goto out;
1047         } else {
1048             qemu_coroutine_yield();
1049             ret = co.ret;
1050             goto out;
1051         }
1052     }
1053 
1054     sector_num = offset >> BDRV_SECTOR_BITS;
1055     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1056 
1057     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1058     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1059     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1060     assert(drv->bdrv_co_readv);
1061 
1062     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1063 
1064 out:
1065     if (qiov == &local_qiov) {
1066         qemu_iovec_destroy(&local_qiov);
1067     }
1068 
1069     return ret;
1070 }
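
/*
 * Note on the dispatch order above: bdrv_driver_preadv() prefers
 * .bdrv_co_preadv_part (byte-based and qiov_offset-aware), then
 * .bdrv_co_preadv, then the callback-based .bdrv_aio_preadv (bridged back
 * into the coroutine via bdrv_co_io_em_complete()), and finally the legacy
 * sector-based .bdrv_co_readv, which requires 512-byte alignment.
 */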
1071 
1072 static int coroutine_fn GRAPH_RDLOCK
1073 bdrv_driver_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
1074                     QEMUIOVector *qiov, size_t qiov_offset,
1075                     BdrvRequestFlags flags)
1076 {
1077     BlockDriver *drv = bs->drv;
1078     bool emulate_fua = false;
1079     int64_t sector_num;
1080     unsigned int nb_sectors;
1081     QEMUIOVector local_qiov;
1082     int ret;
1083     assert_bdrv_graph_readable();
1084 
1085     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1086 
1087     if (!drv) {
1088         return -ENOMEDIUM;
1089     }
1090 
1091     if ((flags & BDRV_REQ_FUA) &&
1092         (~bs->supported_write_flags & BDRV_REQ_FUA)) {
1093         flags &= ~BDRV_REQ_FUA;
1094         emulate_fua = true;
1095     }
1096 
1097     flags &= bs->supported_write_flags;
1098 
1099     if (drv->bdrv_co_pwritev_part) {
1100         ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
1101                                         flags);
1102         goto emulate_flags;
1103     }
1104 
1105     if (qiov_offset > 0 || bytes != qiov->size) {
1106         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1107         qiov = &local_qiov;
1108     }
1109 
1110     if (drv->bdrv_co_pwritev) {
1111         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
1112         goto emulate_flags;
1113     }
1114 
1115     if (drv->bdrv_aio_pwritev) {
1116         BlockAIOCB *acb;
1117         CoroutineIOCompletion co = {
1118             .coroutine = qemu_coroutine_self(),
1119         };
1120 
1121         acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags,
1122                                     bdrv_co_io_em_complete, &co);
1123         if (acb == NULL) {
1124             ret = -EIO;
1125         } else {
1126             qemu_coroutine_yield();
1127             ret = co.ret;
1128         }
1129         goto emulate_flags;
1130     }
1131 
1132     sector_num = offset >> BDRV_SECTOR_BITS;
1133     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1134 
1135     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1136     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1137     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1138 
1139     assert(drv->bdrv_co_writev);
1140     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags);
1141 
1142 emulate_flags:
1143     if (ret == 0 && emulate_fua) {
1144         ret = bdrv_co_flush(bs);
1145     }
1146 
1147     if (qiov == &local_qiov) {
1148         qemu_iovec_destroy(&local_qiov);
1149     }
1150 
1151     return ret;
1152 }
1153 
1154 static int coroutine_fn GRAPH_RDLOCK
1155 bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset,
1156                                int64_t bytes, QEMUIOVector *qiov,
1157                                size_t qiov_offset)
1158 {
1159     BlockDriver *drv = bs->drv;
1160     QEMUIOVector local_qiov;
1161     int ret;
1162     assert_bdrv_graph_readable();
1163 
1164     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1165 
1166     if (!drv) {
1167         return -ENOMEDIUM;
1168     }
1169 
1170     if (!block_driver_can_compress(drv)) {
1171         return -ENOTSUP;
1172     }
1173 
1174     if (drv->bdrv_co_pwritev_compressed_part) {
1175         return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
1176                                                     qiov, qiov_offset);
1177     }
1178 
1179     if (qiov_offset == 0) {
1180         return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1181     }
1182 
1183     qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1184     ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
1185     qemu_iovec_destroy(&local_qiov);
1186 
1187     return ret;
1188 }
1189 
1190 static int coroutine_fn GRAPH_RDLOCK
1191 bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
1192                          QEMUIOVector *qiov, size_t qiov_offset, int flags)
1193 {
1194     BlockDriverState *bs = child->bs;
1195 
1196     /* Perform I/O through a temporary buffer so that users who scribble over
1197      * their read buffer while the operation is in progress do not end up
1198      * modifying the image file.  This is critical for zero-copy guest I/O
1199      * where anything might happen inside guest memory.
1200      */
1201     void *bounce_buffer = NULL;
1202 
1203     BlockDriver *drv = bs->drv;
1204     int64_t align_offset;
1205     int64_t align_bytes;
1206     int64_t skip_bytes;
1207     int ret;
1208     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1209                                     BDRV_REQUEST_MAX_BYTES);
1210     int64_t progress = 0;
1211     bool skip_write;
1212 
1213     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1214 
1215     if (!drv) {
1216         return -ENOMEDIUM;
1217     }
1218 
1219     /*
1220      * Do not write anything when the BDS is inactive.  That is not
1221      * allowed, and it would not help.
1222      */
1223     skip_write = (bs->open_flags & BDRV_O_INACTIVE);
1224 
1225     /* FIXME We cannot require callers to have write permissions when all they
1226      * are doing is a read request. If we did things right, write permissions
1227      * would be obtained anyway, but internally by the copy-on-read code. As
1228      * long as it is implemented here rather than in a separate filter driver,
1229      * the copy-on-read code doesn't have its own BdrvChild, however, for which
1230      * it could request permissions. Therefore we have to bypass the permission
1231      * system for the moment. */
1232     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1233 
1234     /* Cover entire cluster so no additional backing file I/O is required when
1235      * allocating a cluster in the image file.  Note that this value may exceed
1236      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1237      * is one reason we loop rather than doing it all at once.
1238      */
1239     bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes);
1240     skip_bytes = offset - align_offset;
1241 
1242     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1243                                    align_offset, align_bytes);
1244 
1245     while (align_bytes) {
1246         int64_t pnum;
1247 
1248         if (skip_write) {
1249             ret = 1; /* "already allocated", so nothing will be copied */
1250             pnum = MIN(align_bytes, max_transfer);
1251         } else {
1252             ret = bdrv_co_is_allocated(bs, align_offset,
1253                                        MIN(align_bytes, max_transfer), &pnum);
1254             if (ret < 0) {
1255                 /*
1256                  * Safe to treat errors in querying allocation as if
1257                  * unallocated; we'll probably fail again soon on the
1258                  * read, but at least that will set a decent errno.
1259                  */
1260                 pnum = MIN(align_bytes, max_transfer);
1261             }
1262 
1263             /* Stop at EOF if the image ends in the middle of the cluster */
1264             if (ret == 0 && pnum == 0) {
1265                 assert(progress >= bytes);
1266                 break;
1267             }
1268 
1269             assert(skip_bytes < pnum);
1270         }
1271 
1272         if (ret <= 0) {
1273             QEMUIOVector local_qiov;
1274 
1275             /* Must copy-on-read; use the bounce buffer */
1276             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1277             if (!bounce_buffer) {
1278                 int64_t max_we_need = MAX(pnum, align_bytes - pnum);
1279                 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
1280                 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
1281 
1282                 bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
1283                 if (!bounce_buffer) {
1284                     ret = -ENOMEM;
1285                     goto err;
1286                 }
1287             }
1288             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1289 
1290             ret = bdrv_driver_preadv(bs, align_offset, pnum,
1291                                      &local_qiov, 0, 0);
1292             if (ret < 0) {
1293                 goto err;
1294             }
1295 
1296             bdrv_co_debug_event(bs, BLKDBG_COR_WRITE);
1297             if (drv->bdrv_co_pwrite_zeroes &&
1298                 buffer_is_zero(bounce_buffer, pnum)) {
1299                 /* FIXME: Should we (perhaps conditionally) be setting
1300                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1301                  * that still correctly reads as zero? */
1302                 ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum,
1303                                                BDRV_REQ_WRITE_UNCHANGED);
1304             } else {
1305                 /* This does not change the data on the disk, it is not
1306                  * necessary to flush even in cache=writethrough mode.
1307                  */
1308                 ret = bdrv_driver_pwritev(bs, align_offset, pnum,
1309                                           &local_qiov, 0,
1310                                           BDRV_REQ_WRITE_UNCHANGED);
1311             }
1312 
1313             if (ret < 0) {
1314                 /* It might be okay to ignore write errors for guest
1315                  * requests.  If this is a deliberate copy-on-read
1316                  * then we don't want to ignore the error.  Simply
1317                  * report it in all cases.
1318                  */
1319                 goto err;
1320             }
1321 
1322             if (!(flags & BDRV_REQ_PREFETCH)) {
1323                 qemu_iovec_from_buf(qiov, qiov_offset + progress,
1324                                     bounce_buffer + skip_bytes,
1325                                     MIN(pnum - skip_bytes, bytes - progress));
1326             }
1327         } else if (!(flags & BDRV_REQ_PREFETCH)) {
1328             /* Read directly into the destination */
1329             ret = bdrv_driver_preadv(bs, offset + progress,
1330                                      MIN(pnum - skip_bytes, bytes - progress),
1331                                      qiov, qiov_offset + progress, 0);
1332             if (ret < 0) {
1333                 goto err;
1334             }
1335         }
1336 
1337         align_offset += pnum;
1338         align_bytes -= pnum;
1339         progress += pnum - skip_bytes;
1340         skip_bytes = 0;
1341     }
1342     ret = 0;
1343 
1344 err:
1345     qemu_vfree(bounce_buffer);
1346     return ret;
1347 }
1348 
1349 /*
1350  * Forwards an already correctly aligned request to the BlockDriver. This
1351  * handles copy on read, zeroing after EOF, and fragmentation of large
1352  * reads; any other features must be implemented by the caller.
1353  */
1354 static int coroutine_fn GRAPH_RDLOCK
1355 bdrv_aligned_preadv(BdrvChild *child, BdrvTrackedRequest *req,
1356                     int64_t offset, int64_t bytes, int64_t align,
1357                     QEMUIOVector *qiov, size_t qiov_offset, int flags)
1358 {
1359     BlockDriverState *bs = child->bs;
1360     int64_t total_bytes, max_bytes;
1361     int ret = 0;
1362     int64_t bytes_remaining = bytes;
1363     int max_transfer;
1364 
1365     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1366     assert(is_power_of_2(align));
1367     assert((offset & (align - 1)) == 0);
1368     assert((bytes & (align - 1)) == 0);
1369     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1370     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1371                                    align);
1372 
1373     /*
1374      * TODO: We would need a per-BDS .supported_read_flags and
1375      * potential fallback support, if we ever implement any read flags
1376      * to pass through to drivers.  For now, there aren't any
1377      * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint.
1378      */
1379     assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH |
1380                        BDRV_REQ_REGISTERED_BUF)));
1381 
1382     /* Handle Copy on Read and associated serialisation */
1383     if (flags & BDRV_REQ_COPY_ON_READ) {
1384         /* If we touch the same cluster it counts as an overlap.  This
1385          * guarantees that allocating writes will be serialized and not race
1386          * with each other for the same cluster.  For example, in copy-on-read
1387          * it ensures that the CoR read and write operations are atomic and
1388          * guest writes cannot interleave between them. */
1389         bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
1390     } else {
1391         bdrv_wait_serialising_requests(req);
1392     }
1393 
1394     if (flags & BDRV_REQ_COPY_ON_READ) {
1395         int64_t pnum;
1396 
1397         /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
1398         flags &= ~BDRV_REQ_COPY_ON_READ;
1399 
1400         ret = bdrv_co_is_allocated(bs, offset, bytes, &pnum);
1401         if (ret < 0) {
1402             goto out;
1403         }
1404 
1405         if (!ret || pnum != bytes) {
1406             ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
1407                                            qiov, qiov_offset, flags);
1408             goto out;
1409         } else if (flags & BDRV_REQ_PREFETCH) {
1410             goto out;
1411         }
1412     }
1413 
1414     /* Forward the request to the BlockDriver, possibly fragmenting it */
1415     total_bytes = bdrv_co_getlength(bs);
1416     if (total_bytes < 0) {
1417         ret = total_bytes;
1418         goto out;
1419     }
1420 
1421     assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF)));
1422 
1423     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1424     if (bytes <= max_bytes && bytes <= max_transfer) {
1425         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags);
1426         goto out;
1427     }
1428 
1429     while (bytes_remaining) {
1430         int64_t num;
1431 
1432         if (max_bytes) {
1433             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1434             assert(num);
1435 
1436             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1437                                      num, qiov,
1438                                      qiov_offset + bytes - bytes_remaining,
1439                                      flags);
1440             max_bytes -= num;
1441         } else {
1442             num = bytes_remaining;
1443             ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
1444                                     0, bytes_remaining);
1445         }
1446         if (ret < 0) {
1447             goto out;
1448         }
1449         bytes_remaining -= num;
1450     }
1451 
1452 out:
1453     return ret < 0 ? ret : 0;
1454 }
1455 
1456 /*
1457  * Request padding
1458  *
1459  *  |<---- align ----->|                     |<----- align ---->|
1460  *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
1461  *  |          |       |                     |     |            |
1462  * -*----------$-------*-------- ... --------*-----$------------*---
1463  *  |          |       |                     |     |            |
1464  *  |          offset  |                     |     end          |
1465  *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
1466  *  [buf   ... )                             [tail_buf          )
1467  *
1468  * @buf is an aligned allocation needed to store @head and @tail paddings. @head
1469  * is placed at the beginning of @buf and @tail at its end.
1470  *
1471  * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk
1472  * around tail, if tail exists.
1473  *
1474  * @merge_reads is true for small requests,
1475  * if @buf_len == @head + bytes + @tail. In this case it is possible that both
1476  * head and tail exist but @buf_len == align and @tail_buf == @buf.
1477  *
1478  * @write is true for write requests, false for read requests.
1479  *
1480  * If padding makes the vector too long (exceeding IOV_MAX), then we need to
1481  * merge existing vector elements into a single one.  @collapse_bounce_buf acts
1482  * as the bounce buffer in such cases.  @pre_collapse_qiov has the pre-collapse
1483  * I/O vector elements so for read requests, the data can be copied back after
1484  * the read is done.
1485  */
1486 typedef struct BdrvRequestPadding {
1487     uint8_t *buf;
1488     size_t buf_len;
1489     uint8_t *tail_buf;
1490     size_t head;
1491     size_t tail;
1492     bool merge_reads;
1493     bool write;
1494     QEMUIOVector local_qiov;
1495 
1496     uint8_t *collapse_bounce_buf;
1497     size_t collapse_len;
1498     QEMUIOVector pre_collapse_qiov;
1499 } BdrvRequestPadding;
1500 
1501 static bool bdrv_init_padding(BlockDriverState *bs,
1502                               int64_t offset, int64_t bytes,
1503                               bool write,
1504                               BdrvRequestPadding *pad)
1505 {
1506     int64_t align = bs->bl.request_alignment;
1507     int64_t sum;
1508 
1509     bdrv_check_request(offset, bytes, &error_abort);
1510     assert(align <= INT_MAX); /* documented in block/block_int.h */
1511     assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */
1512 
1513     memset(pad, 0, sizeof(*pad));
1514 
1515     pad->head = offset & (align - 1);
1516     pad->tail = ((offset + bytes) & (align - 1));
1517     if (pad->tail) {
1518         pad->tail = align - pad->tail;
1519     }
1520 
1521     if (!pad->head && !pad->tail) {
1522         return false;
1523     }
1524 
1525     assert(bytes); /* Nothing good in aligning zero-length requests */
1526 
1527     sum = pad->head + bytes + pad->tail;
1528     pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
1529     pad->buf = qemu_blockalign(bs, pad->buf_len);
1530     pad->merge_reads = sum == pad->buf_len;
1531     if (pad->tail) {
1532         pad->tail_buf = pad->buf + pad->buf_len - align;
1533     }
1534 
1535     pad->write = write;
1536 
1537     return true;
1538 }
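
/*
 * Worked example, for exposition: with request_alignment = 512,
 * offset = 1000 and bytes = 2000:
 *     head    = 1000 & 511                  = 488
 *     tail    = 512 - ((1000 + 2000) & 511) = 72
 *     sum     = 488 + 2000 + 72             = 2560
 *     buf_len = 2 * 512                     = 1024  (sum > align, head and
 *                                                    tail both present)
 *     merge_reads = (sum == buf_len)        = false
 * so the padded request covers the aligned range [512, 3072).
 */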
1539 
1540 static int coroutine_fn GRAPH_RDLOCK
1541 bdrv_padding_rmw_read(BdrvChild *child, BdrvTrackedRequest *req,
1542                       BdrvRequestPadding *pad, bool zero_middle)
1543 {
1544     QEMUIOVector local_qiov;
1545     BlockDriverState *bs = child->bs;
1546     uint64_t align = bs->bl.request_alignment;
1547     int ret;
1548 
1549     assert(req->serialising && pad->buf);
1550 
1551     if (pad->head || pad->merge_reads) {
1552         int64_t bytes = pad->merge_reads ? pad->buf_len : align;
1553 
1554         qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
1555 
1556         if (pad->head) {
1557             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1558         }
1559         if (pad->merge_reads && pad->tail) {
1560             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1561         }
1562         ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
1563                                   align, &local_qiov, 0, 0);
1564         if (ret < 0) {
1565             return ret;
1566         }
1567         if (pad->head) {
1568             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1569         }
1570         if (pad->merge_reads && pad->tail) {
1571             bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1572         }
1573 
1574         if (pad->merge_reads) {
1575             goto zero_mem;
1576         }
1577     }
1578 
1579     if (pad->tail) {
1580         qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
1581 
1582         bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1583         ret = bdrv_aligned_preadv(
1584                 child, req,
1585                 req->overlap_offset + req->overlap_bytes - align,
1586                 align, align, &local_qiov, 0, 0);
1587         if (ret < 0) {
1588             return ret;
1589         }
1590         bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1591     }
1592 
1593 zero_mem:
1594     if (zero_middle) {
1595         memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
1596     }
1597 
1598     return 0;
1599 }
1600 
1601 /**
1602  * Free *pad's associated buffers, and perform any necessary finalization steps.
1603  */
1604 static void bdrv_padding_finalize(BdrvRequestPadding *pad)
1605 {
1606     if (pad->collapse_bounce_buf) {
1607         if (!pad->write) {
1608             /*
1609              * If padding required elements in the vector to be collapsed into a
1610              * bounce buffer, copy the bounce buffer content back
1611              */
1612             qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
1613                                 pad->collapse_bounce_buf, pad->collapse_len);
1614         }
1615         qemu_vfree(pad->collapse_bounce_buf);
1616         qemu_iovec_destroy(&pad->pre_collapse_qiov);
1617     }
1618     if (pad->buf) {
1619         qemu_vfree(pad->buf);
1620         qemu_iovec_destroy(&pad->local_qiov);
1621     }
1622     memset(pad, 0, sizeof(*pad));
1623 }
1624 
1625 /*
1626  * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
1627  * ensuring that the resulting vector will not exceed IOV_MAX elements.
1628  *
1629  * To ensure this, when necessary, the first two or three elements of @iov are
1630  * merged into pad->collapse_bounce_buf and replaced by a reference to that
1631  * bounce buffer in pad->local_qiov.
1632  *
1633  * After performing a read request, the data from the bounce buffer must be
1634  * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
1635  */
1636 static int bdrv_create_padded_qiov(BlockDriverState *bs,
1637                                    BdrvRequestPadding *pad,
1638                                    struct iovec *iov, int niov,
1639                                    size_t iov_offset, size_t bytes)
1640 {
1641     int padded_niov, surplus_count, collapse_count;
1642 
1643     /* Assert this invariant */
1644     assert(niov <= IOV_MAX);
1645 
1646     /*
1647      * Cannot pad if resulting length would exceed SIZE_MAX.  Returning an error
1648      * to the guest is not ideal, but there is little else we can do.  At least
1649      * this will practically never happen on 64-bit systems.
1650      */
1651     if (SIZE_MAX - pad->head < bytes ||
1652         SIZE_MAX - pad->head - bytes < pad->tail)
1653     {
1654         return -EINVAL;
1655     }
1656 
1657     /* Length of the resulting IOV if we just concatenated everything */
1658     padded_niov = !!pad->head + niov + !!pad->tail;
1659 
1660     qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
1661 
1662     if (pad->head) {
1663         qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
1664     }
1665 
1666     /*
1667      * If padded_niov > IOV_MAX, we cannot just concatenate everything.
1668      * Instead, merge the first two or three elements of @iov to reduce the
1669      * number of vector elements as necessary.
1670      */
1671     if (padded_niov > IOV_MAX) {
1672         /*
1673          * Only the head and tail can have caused the number of entries to exceed
1674          * IOV_MAX, so we can exceed it by at most the head and tail.  We need
1675          * to reduce the number of elements by `surplus_count`, so we merge that
1676          * many elements plus one into a single element.
1677          */
1678         surplus_count = padded_niov - IOV_MAX;
1679         assert(surplus_count <= !!pad->head + !!pad->tail);
1680         collapse_count = surplus_count + 1;
1681 
1682         /*
1683          * Move the elements to collapse into `pad->pre_collapse_qiov`, then
1684          * advance `iov` (and associated variables) by those elements.
1685          */
1686         qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
1687         qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
1688                               collapse_count, iov_offset, SIZE_MAX);
1689         iov += collapse_count;
1690         iov_offset = 0;
1691         niov -= collapse_count;
1692         bytes -= pad->pre_collapse_qiov.size;
1693 
1694         /*
1695          * Construct the bounce buffer to match the length of the to-collapse
1696          * vector elements, and for write requests, initialize it with the data
1697          * from those elements.  Then add it to `pad->local_qiov`.
1698          */
1699         pad->collapse_len = pad->pre_collapse_qiov.size;
1700         pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
1701         if (pad->write) {
1702             qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
1703                               pad->collapse_bounce_buf, pad->collapse_len);
1704         }
1705         qemu_iovec_add(&pad->local_qiov,
1706                        pad->collapse_bounce_buf, pad->collapse_len);
1707     }
1708 
1709     qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
1710 
1711     if (pad->tail) {
1712         qemu_iovec_add(&pad->local_qiov,
1713                        pad->buf + pad->buf_len - pad->tail, pad->tail);
1714     }
1715 
1716     assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
1717     return 0;
1718 }
1719 
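/*
 * Worked example for the collapsing logic above (the numbers are
 * illustrative only): assume IOV_MAX == 1024 and a request that already
 * uses 1024 input iovec elements, with both head and tail padding needed.
 * Then
 *   padded_niov    = 1 + 1024 + 1 = 1026
 *   surplus_count  = 1026 - 1024  = 2
 *   collapse_count = 2 + 1        = 3
 * so the first three input elements are merged into one bounce buffer, and
 * the final vector is head + bounce buffer + 1021 remaining elements + tail,
 * i.e. 1024 elements, which fits IOV_MAX again.
 */
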
1720 /*
1721  * bdrv_pad_request
1722  *
1723  * Exchange request parameters with padded request if needed. Don't include RMW
1724  * read of padding, bdrv_padding_rmw_read() should be called separately if
1725  * needed.
1726  *
1727  * @write is true for write requests, false for read requests.
1728  *
1729  * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
1730  *  - on function start they represent original request
1731  *  - on failure or when padding is not needed they are unchanged
1732  *  - on success when padding is needed they represent padded request
1733  */
1734 static int bdrv_pad_request(BlockDriverState *bs,
1735                             QEMUIOVector **qiov, size_t *qiov_offset,
1736                             int64_t *offset, int64_t *bytes,
1737                             bool write,
1738                             BdrvRequestPadding *pad, bool *padded,
1739                             BdrvRequestFlags *flags)
1740 {
1741     int ret;
1742     struct iovec *sliced_iov;
1743     int sliced_niov;
1744     size_t sliced_head, sliced_tail;
1745 
1746     /* Should have been checked by the caller already */
1747     ret = bdrv_check_request32(*offset, *bytes, *qiov, *qiov_offset);
1748     if (ret < 0) {
1749         return ret;
1750     }
1751 
1752     if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
1753         if (padded) {
1754             *padded = false;
1755         }
1756         return 0;
1757     }
1758 
1759     sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
1760                                   &sliced_head, &sliced_tail,
1761                                   &sliced_niov);
1762 
1763     /* Guaranteed by bdrv_check_request32() */
1764     assert(*bytes <= SIZE_MAX);
1765     ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
1766                                   sliced_head, *bytes);
1767     if (ret < 0) {
1768         bdrv_padding_finalize(pad);
1769         return ret;
1770     }
1771     *bytes += pad->head + pad->tail;
1772     *offset -= pad->head;
1773     *qiov = &pad->local_qiov;
1774     *qiov_offset = 0;
1775     if (padded) {
1776         *padded = true;
1777     }
1778     if (flags) {
1779         /* Can't use optimization hint with bounce buffer */
1780         *flags &= ~BDRV_REQ_REGISTERED_BUF;
1781     }
1782 
1783     return 0;
1784 }
1785 
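/*
 * Illustration of the parameter exchange done by bdrv_pad_request() (the
 * concrete numbers are made up and assume request_alignment == 4096): a
 * read with offset == 5120 and bytes == 512 gets pad.head == 1024 and
 * pad.tail == 2560, and leaves the function as offset == 4096,
 * bytes == 4096, with *qiov pointing to pad.local_qiov and *qiov_offset
 * reset to 0.  The caller issues the aligned request and finally calls
 * bdrv_padding_finalize() to release the padding buffers.
 */
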
1786 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1787     int64_t offset, int64_t bytes, QEMUIOVector *qiov,
1788     BdrvRequestFlags flags)
1789 {
1790     IO_CODE();
1791     return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
1792 }
1793 
1794 int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
1795     int64_t offset, int64_t bytes,
1796     QEMUIOVector *qiov, size_t qiov_offset,
1797     BdrvRequestFlags flags)
1798 {
1799     BlockDriverState *bs = child->bs;
1800     BdrvTrackedRequest req;
1801     BdrvRequestPadding pad;
1802     int ret;
1803     IO_CODE();
1804 
1805     trace_bdrv_co_preadv_part(bs, offset, bytes, flags);
1806 
1807     if (!bdrv_co_is_inserted(bs)) {
1808         return -ENOMEDIUM;
1809     }
1810 
1811     ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
1812     if (ret < 0) {
1813         return ret;
1814     }
1815 
1816     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
1817         /*
1818          * Aligning a zero-length request is pointless. Even if the driver gives
1819          * zero-length requests a special meaning (as
1820          * qcow2_co_pwritev_compressed_part does), we can't pass such a request
1821          * to the driver because of request_alignment.
1822          *
1823          * Still, an occasional unaligned zero-length read is no reason to fail.
1824          */
1825         return 0;
1826     }
1827 
1828     bdrv_inc_in_flight(bs);
1829 
1830     /* Don't do copy-on-read if we read data before write operation */
1831     if (qatomic_read(&bs->copy_on_read)) {
1832         flags |= BDRV_REQ_COPY_ON_READ;
1833     }
1834 
1835     ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
1836                            &pad, NULL, &flags);
1837     if (ret < 0) {
1838         goto fail;
1839     }
1840 
1841     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1842     ret = bdrv_aligned_preadv(child, &req, offset, bytes,
1843                               bs->bl.request_alignment,
1844                               qiov, qiov_offset, flags);
1845     tracked_request_end(&req);
1846     bdrv_padding_finalize(&pad);
1847 
1848 fail:
1849     bdrv_dec_in_flight(bs);
1850 
1851     return ret;
1852 }
1853 
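/*
 * Minimal usage sketch for the read path above (coroutine context only;
 * locking and error handling of a real caller are omitted, so treat this
 * as an illustration rather than a template):
 *
 *   uint8_t buf[512];
 *   QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, sizeof(buf));
 *   int ret = bdrv_co_preadv(child, 0, sizeof(buf), &qiov, 0);
 *   if (ret < 0) {
 *       return ret;
 *   }
 */
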
1854 static int coroutine_fn GRAPH_RDLOCK
1855 bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
1856                          BdrvRequestFlags flags)
1857 {
1858     BlockDriver *drv = bs->drv;
1859     QEMUIOVector qiov;
1860     void *buf = NULL;
1861     int ret = 0;
1862     bool need_flush = false;
1863     int head = 0;
1864     int tail = 0;
1865 
1866     int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes,
1867                                             INT64_MAX);
1868     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1869                         bs->bl.request_alignment);
1870     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1871 
1872     assert_bdrv_graph_readable();
1873     bdrv_check_request(offset, bytes, &error_abort);
1874 
1875     if (!drv) {
1876         return -ENOMEDIUM;
1877     }
1878 
1879     if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1880         return -ENOTSUP;
1881     }
1882 
1883     /* By definition there is no user buffer so this flag doesn't make sense */
1884     if (flags & BDRV_REQ_REGISTERED_BUF) {
1885         return -EINVAL;
1886     }
1887 
1888     /* Invalidate the cached block-status data range if this write overlaps */
1889     bdrv_bsc_invalidate_range(bs, offset, bytes);
1890 
1891     assert(alignment % bs->bl.request_alignment == 0);
1892     head = offset % alignment;
1893     tail = (offset + bytes) % alignment;
1894     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1895     assert(max_write_zeroes >= bs->bl.request_alignment);
1896 
1897     while (bytes > 0 && !ret) {
1898         int64_t num = bytes;
1899 
1900         /* Align request.  Block drivers can expect the "bulk" of the request
1901          * to be aligned, and that unaligned requests do not cross cluster
1902          * boundaries.
1903          */
1904         if (head) {
1905             /* Make a small request up to the first aligned sector. For
1906              * convenience, limit this request to max_transfer even if
1907              * we don't need to fall back to writes.  */
1908             num = MIN(MIN(bytes, max_transfer), alignment - head);
1909             head = (head + num) % alignment;
1910             assert(num < max_write_zeroes);
1911         } else if (tail && num > alignment) {
1912             /* Shorten the request to the last aligned sector.  */
1913             num -= tail;
1914         }
1915 
1916         /* limit request size */
1917         if (num > max_write_zeroes) {
1918             num = max_write_zeroes;
1919         }
1920 
1921         ret = -ENOTSUP;
1922         /* First try the efficient write zeroes operation */
1923         if (drv->bdrv_co_pwrite_zeroes) {
1924             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1925                                              flags & bs->supported_zero_flags);
1926             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1927                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1928                 need_flush = true;
1929             }
1930         } else {
1931             assert(!bs->supported_zero_flags);
1932         }
1933 
1934         if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
1935             /* Fall back to bounce buffer if write zeroes is unsupported */
1936             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1937 
1938             if ((flags & BDRV_REQ_FUA) &&
1939                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1940                 /* No need for bdrv_driver_pwrite() to do a fallback
1941                  * flush on each chunk; use just one at the end */
1942                 write_flags &= ~BDRV_REQ_FUA;
1943                 need_flush = true;
1944             }
1945             num = MIN(num, max_transfer);
1946             if (buf == NULL) {
1947                 buf = qemu_try_blockalign0(bs, num);
1948                 if (buf == NULL) {
1949                     ret = -ENOMEM;
1950                     goto fail;
1951                 }
1952             }
1953             qemu_iovec_init_buf(&qiov, buf, num);
1954 
1955             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
1956 
1957             /* Keep the bounce buffer around if it is big enough for
1958              * all future requests.
1959              */
1960             if (num < max_transfer) {
1961                 qemu_vfree(buf);
1962                 buf = NULL;
1963             }
1964         }
1965 
1966         offset += num;
1967         bytes -= num;
1968     }
1969 
1970 fail:
1971     if (ret == 0 && need_flush) {
1972         ret = bdrv_co_flush(bs);
1973     }
1974     qemu_vfree(buf);
1975     return ret;
1976 }
1977 
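/*
 * Worked example for the alignment handling above (illustrative values,
 * assuming alignment == 4096 and generously large max_write_zeroes and
 * max_transfer): a zero request with offset == 5120 and bytes == 65536 has
 *   head = 5120 % 4096           = 1024
 *   tail = (5120 + 65536) % 4096 = 1024
 * and is issued as three driver calls:
 *   1) 3072 bytes at offset 5120  (up to the first aligned boundary, 8192)
 *   2) 61440 bytes at offset 8192 (the aligned bulk, shortened by the tail)
 *   3) 1024 bytes at offset 69632 (the unaligned tail)
 */
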
1978 static inline int coroutine_fn GRAPH_RDLOCK
1979 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
1980                           BdrvTrackedRequest *req, int flags)
1981 {
1982     BlockDriverState *bs = child->bs;
1983 
1984     bdrv_check_request(offset, bytes, &error_abort);
1985 
1986     if (bdrv_is_read_only(bs)) {
1987         return -EPERM;
1988     }
1989 
1990     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1991     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1992     assert(!(flags & ~BDRV_REQ_MASK));
1993     assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));
1994 
1995     if (flags & BDRV_REQ_SERIALISING) {
1996         QEMU_LOCK_GUARD(&bs->reqs_lock);
1997 
1998         tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));
1999 
2000         if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
2001             return -EBUSY;
2002         }
2003 
2004         bdrv_wait_serialising_requests_locked(req);
2005     } else {
2006         bdrv_wait_serialising_requests(req);
2007     }
2008 
2009     assert(req->overlap_offset <= offset);
2010     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
2011     assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE ||
2012            child->perm & BLK_PERM_RESIZE);
2013 
2014     switch (req->type) {
2015     case BDRV_TRACKED_WRITE:
2016     case BDRV_TRACKED_DISCARD:
2017         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
2018             assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
2019         } else {
2020             assert(child->perm & BLK_PERM_WRITE);
2021         }
2022         bdrv_write_threshold_check_write(bs, offset, bytes);
2023         return 0;
2024     case BDRV_TRACKED_TRUNCATE:
2025         assert(child->perm & BLK_PERM_RESIZE);
2026         return 0;
2027     default:
2028         abort();
2029     }
2030 }
2031 
2032 static inline void coroutine_fn GRAPH_RDLOCK
2033 bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes,
2034                          BdrvTrackedRequest *req, int ret)
2035 {
2036     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
2037     BlockDriverState *bs = child->bs;
2038 
2039     bdrv_check_request(offset, bytes, &error_abort);
2040 
2041     qatomic_inc(&bs->write_gen);
2042 
2043     /*
2044      * Discard cannot extend the image, but in error handling cases, such as
2045      * when reverting a qcow2 cluster allocation, the discarded range can
2046      * extend past the end of the image file, so we cannot assert about
2047      * BDRV_TRACKED_DISCARD here. Instead, just skip it, since semantically a
2048      * discard request beyond EOF cannot expand the image anyway.
2049      */
2050     if (ret == 0 &&
2051         (req->type == BDRV_TRACKED_TRUNCATE ||
2052          end_sector > bs->total_sectors) &&
2053         req->type != BDRV_TRACKED_DISCARD) {
2054         bs->total_sectors = end_sector;
2055         bdrv_parent_cb_resize(bs);
2056         bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
2057     }
2058     if (req->bytes) {
2059         switch (req->type) {
2060         case BDRV_TRACKED_WRITE:
2061             stat64_max(&bs->wr_highest_offset, offset + bytes);
2062             /* fall through, to set dirty bits */
2063         case BDRV_TRACKED_DISCARD:
2064             bdrv_set_dirty(bs, offset, bytes);
2065             break;
2066         default:
2067             break;
2068         }
2069     }
2070 }
2071 
2072 /*
2073  * Forwards an already correctly aligned write request to the BlockDriver,
2074  * after possibly fragmenting it.
2075  */
2076 static int coroutine_fn GRAPH_RDLOCK
2077 bdrv_aligned_pwritev(BdrvChild *child, BdrvTrackedRequest *req,
2078                      int64_t offset, int64_t bytes, int64_t align,
2079                      QEMUIOVector *qiov, size_t qiov_offset,
2080                      BdrvRequestFlags flags)
2081 {
2082     BlockDriverState *bs = child->bs;
2083     BlockDriver *drv = bs->drv;
2084     int ret;
2085 
2086     int64_t bytes_remaining = bytes;
2087     int max_transfer;
2088 
2089     bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
2090 
2091     if (!drv) {
2092         return -ENOMEDIUM;
2093     }
2094 
2095     if (bdrv_has_readonly_bitmaps(bs)) {
2096         return -EPERM;
2097     }
2098 
2099     assert(is_power_of_2(align));
2100     assert((offset & (align - 1)) == 0);
2101     assert((bytes & (align - 1)) == 0);
2102     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
2103                                    align);
2104 
2105     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
2106 
2107     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
2108         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
2109         qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
2110         flags |= BDRV_REQ_ZERO_WRITE;
2111         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
2112             flags |= BDRV_REQ_MAY_UNMAP;
2113         }
2114 
2115         /* Can't use optimization hint with bufferless zero write */
2116         flags &= ~BDRV_REQ_REGISTERED_BUF;
2117     }
2118 
2119     if (ret < 0) {
2120         /* Do nothing: bdrv_co_write_req_prepare() decided to fail this request */
2121     } else if (flags & BDRV_REQ_ZERO_WRITE) {
2122         bdrv_co_debug_event(bs, BLKDBG_PWRITEV_ZERO);
2123         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
2124     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
2125         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
2126                                              qiov, qiov_offset);
2127     } else if (bytes <= max_transfer) {
2128         bdrv_co_debug_event(bs, BLKDBG_PWRITEV);
2129         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
2130     } else {
2131         bdrv_co_debug_event(bs, BLKDBG_PWRITEV);
2132         while (bytes_remaining) {
2133             int num = MIN(bytes_remaining, max_transfer);
2134             int local_flags = flags;
2135 
2136             assert(num);
2137             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
2138                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
2139                 /* If FUA is going to be emulated by flush, we only
2140                  * need to flush on the last iteration */
2141                 local_flags &= ~BDRV_REQ_FUA;
2142             }
2143 
2144             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
2145                                       num, qiov,
2146                                       qiov_offset + bytes - bytes_remaining,
2147                                       local_flags);
2148             if (ret < 0) {
2149                 break;
2150             }
2151             bytes_remaining -= num;
2152         }
2153     }
2154     bdrv_co_debug_event(bs, BLKDBG_PWRITEV_DONE);
2155 
2156     if (ret >= 0) {
2157         ret = 0;
2158     }
2159     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
2160 
2161     return ret;
2162 }
2163 
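/*
 * Illustration of the FUA handling in the fragmentation loop above (the
 * numbers are only an example): with max_transfer == 1 MiB, a 2.5 MiB
 * BDRV_REQ_FUA write to a driver without native FUA support is split into
 * chunks of 1 MiB, 1 MiB and 0.5 MiB.  BDRV_REQ_FUA is stripped from the
 * first two chunks and kept only on the last one, so the emulated flush
 * happens once instead of once per chunk.
 */
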
2164 static int coroutine_fn GRAPH_RDLOCK
2165 bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
2166                         BdrvRequestFlags flags, BdrvTrackedRequest *req)
2167 {
2168     BlockDriverState *bs = child->bs;
2169     QEMUIOVector local_qiov;
2170     uint64_t align = bs->bl.request_alignment;
2171     int ret = 0;
2172     bool padding;
2173     BdrvRequestPadding pad;
2174 
2175     /* This flag doesn't make sense for padding or zero writes */
2176     flags &= ~BDRV_REQ_REGISTERED_BUF;
2177 
2178     padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
2179     if (padding) {
2180         assert(!(flags & BDRV_REQ_NO_WAIT));
2181         bdrv_make_request_serialising(req, align);
2182 
2183         bdrv_padding_rmw_read(child, req, &pad, true);
2184 
2185         if (pad.head || pad.merge_reads) {
2186             int64_t aligned_offset = offset & ~(align - 1);
2187             int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
2188 
2189             qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
2190             ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
2191                                        align, &local_qiov, 0,
2192                                        flags & ~BDRV_REQ_ZERO_WRITE);
2193             if (ret < 0 || pad.merge_reads) {
2194                 /* Error or all work is done */
2195                 goto out;
2196             }
2197             offset += write_bytes - pad.head;
2198             bytes -= write_bytes - pad.head;
2199         }
2200     }
2201 
2202     assert(!bytes || (offset & (align - 1)) == 0);
2203     if (bytes >= align) {
2204         /* Write the aligned part in the middle. */
2205         int64_t aligned_bytes = bytes & ~(align - 1);
2206         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
2207                                    NULL, 0, flags);
2208         if (ret < 0) {
2209             goto out;
2210         }
2211         bytes -= aligned_bytes;
2212         offset += aligned_bytes;
2213     }
2214 
2215     assert(!bytes || (offset & (align - 1)) == 0);
2216     if (bytes) {
2217         assert(align == pad.tail + bytes);
2218 
2219         qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
2220         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
2221                                    &local_qiov, 0,
2222                                    flags & ~BDRV_REQ_ZERO_WRITE);
2223     }
2224 
2225 out:
2226     bdrv_padding_finalize(&pad);
2227 
2228     return ret;
2229 }
2230 
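/*
 * Worked example for the zero-write path above (illustrative, assuming
 * request_alignment == 4096): a zero request for [1024, 11264) is carried
 * out as
 *   1) an RMW write of the head block [0, 4096) from pad.buf,
 *   2) an aligned zero write of [4096, 8192),
 *   3) an RMW write of the tail block [8192, 12288) from pad.tail_buf,
 * so that only whole request_alignment blocks ever reach the driver.
 */
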
2231 /*
2232  * Handle a write request in coroutine context
2233  */
2234 int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
2235     int64_t offset, int64_t bytes, QEMUIOVector *qiov,
2236     BdrvRequestFlags flags)
2237 {
2238     IO_CODE();
2239     return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
2240 }
2241 
2242 int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
2243     int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
2244     BdrvRequestFlags flags)
2245 {
2246     BlockDriverState *bs = child->bs;
2247     BdrvTrackedRequest req;
2248     uint64_t align = bs->bl.request_alignment;
2249     BdrvRequestPadding pad;
2250     int ret;
2251     bool padded = false;
2252     IO_CODE();
2253 
2254     trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags);
2255 
2256     if (!bdrv_co_is_inserted(bs)) {
2257         return -ENOMEDIUM;
2258     }
2259 
2260     if (flags & BDRV_REQ_ZERO_WRITE) {
2261         ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
2262     } else {
2263         ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
2264     }
2265     if (ret < 0) {
2266         return ret;
2267     }
2268 
2269     /* If the request is misaligned then we can't make it efficient */
2270     if ((flags & BDRV_REQ_NO_FALLBACK) &&
2271         !QEMU_IS_ALIGNED(offset | bytes, align))
2272     {
2273         return -ENOTSUP;
2274     }
2275 
2276     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
2277         /*
2278          * Aligning a zero-length request is pointless. Even if the driver gives
2279          * zero-length requests a special meaning (as
2280          * qcow2_co_pwritev_compressed_part does), we can't pass such a request
2281          * to the driver because of request_alignment.
2282          *
2283          * Still, an occasional unaligned zero-length write is no reason to fail.
2284          */
2285         return 0;
2286     }
2287 
2288     if (!(flags & BDRV_REQ_ZERO_WRITE)) {
2289         /*
2290          * Pad request for following read-modify-write cycle.
2291          * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
2292          * alignment only if there is no ZERO flag.
2293          */
2294         ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
2295                                &pad, &padded, &flags);
2296         if (ret < 0) {
2297             return ret;
2298         }
2299     }
2300 
2301     bdrv_inc_in_flight(bs);
2302     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
2303 
2304     if (flags & BDRV_REQ_ZERO_WRITE) {
2305         assert(!padded);
2306         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
2307         goto out;
2308     }
2309 
2310     if (padded) {
2311         /*
2312          * Request was unaligned to request_alignment and therefore
2313          * padded.  We are going to do read-modify-write, and must
2314          * serialize the request to prevent interactions of the
2315          * widened region with other transactions.
2316          */
2317         assert(!(flags & BDRV_REQ_NO_WAIT));
2318         bdrv_make_request_serialising(&req, align);
2319         bdrv_padding_rmw_read(child, &req, &pad, false);
2320     }
2321 
2322     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
2323                                qiov, qiov_offset, flags);
2324 
2325     bdrv_padding_finalize(&pad);
2326 
2327 out:
2328     tracked_request_end(&req);
2329     bdrv_dec_in_flight(bs);
2330 
2331     return ret;
2332 }
2333 
2334 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2335                                        int64_t bytes, BdrvRequestFlags flags)
2336 {
2337     IO_CODE();
2338     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
2339     assert_bdrv_graph_readable();
2340 
2341     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
2342         flags &= ~BDRV_REQ_MAY_UNMAP;
2343     }
2344 
2345     return bdrv_co_pwritev(child, offset, bytes, NULL,
2346                            BDRV_REQ_ZERO_WRITE | flags);
2347 }
2348 
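/*
 * Usage sketch (coroutine context, illustrative only): zeroing a range
 * while allowing the driver to unmap it where possible is simply
 *
 *   ret = bdrv_co_pwrite_zeroes(child, offset, bytes, BDRV_REQ_MAY_UNMAP);
 *
 * As shown above, the MAY_UNMAP hint is dropped automatically when the node
 * was not opened with BDRV_O_UNMAP, and the request is delegated to
 * bdrv_co_pwritev() with a NULL qiov and BDRV_REQ_ZERO_WRITE set.
 */
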
2349 /*
2350  * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
2351  */
2352 int bdrv_flush_all(void)
2353 {
2354     BdrvNextIterator it;
2355     BlockDriverState *bs = NULL;
2356     int result = 0;
2357 
2358     GLOBAL_STATE_CODE();
2359     GRAPH_RDLOCK_GUARD_MAINLOOP();
2360 
2361     /*
2362      * The bdrv queue is managed by record/replay;
2363      * creating a new flush request when stopping
2364      * the VM may break determinism.
2365      */
2366     if (replay_events_enabled()) {
2367         return result;
2368     }
2369 
2370     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
2371         AioContext *aio_context = bdrv_get_aio_context(bs);
2372         int ret;
2373 
2374         aio_context_acquire(aio_context);
2375         ret = bdrv_flush(bs);
2376         if (ret < 0 && !result) {
2377             result = ret;
2378         }
2379         aio_context_release(aio_context);
2380     }
2381 
2382     return result;
2383 }
2384 
2385 /*
2386  * Returns the allocation status of the specified byte range.
2387  * Drivers not implementing the functionality are assumed to not support
2388  * backing files, hence their entire contents are reported as allocated.
2389  *
2390  * If 'want_zero' is true, the caller is querying for mapping
2391  * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2392  * _ZERO where possible; otherwise, the result favors larger 'pnum',
2393  * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2394  *
2395  * If 'offset' is beyond the end of the disk image the return value is
2396  * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2397  *
2398  * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2399  * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2400  * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2401  *
2402  * 'pnum' is set to the number of bytes (including and immediately
2403  * following the specified offset) that are easily known to be in the
2404  * same allocated/unallocated state.  Note that a second call starting
2405  * at the original offset plus returned pnum may have the same status.
2406  * The returned value is non-zero on success except at end-of-file.
2407  *
2408  * Returns negative errno on failure.  Otherwise, if the
2409  * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2410  * set to the host mapping and BDS corresponding to the guest offset.
2411  */
2412 static int coroutine_fn GRAPH_RDLOCK
2413 bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
2414                         int64_t offset, int64_t bytes,
2415                         int64_t *pnum, int64_t *map, BlockDriverState **file)
2416 {
2417     int64_t total_size;
2418     int64_t n; /* bytes */
2419     int ret;
2420     int64_t local_map = 0;
2421     BlockDriverState *local_file = NULL;
2422     int64_t aligned_offset, aligned_bytes;
2423     uint32_t align;
2424     bool has_filtered_child;
2425 
2426     assert(pnum);
2427     assert_bdrv_graph_readable();
2428     *pnum = 0;
2429     total_size = bdrv_co_getlength(bs);
2430     if (total_size < 0) {
2431         ret = total_size;
2432         goto early_out;
2433     }
2434 
2435     if (offset >= total_size) {
2436         ret = BDRV_BLOCK_EOF;
2437         goto early_out;
2438     }
2439     if (!bytes) {
2440         ret = 0;
2441         goto early_out;
2442     }
2443 
2444     n = total_size - offset;
2445     if (n < bytes) {
2446         bytes = n;
2447     }
2448 
2449     /* Must be non-NULL or bdrv_co_getlength() would have failed */
2450     assert(bs->drv);
2451     has_filtered_child = bdrv_filter_child(bs);
2452     if (!bs->drv->bdrv_co_block_status && !has_filtered_child) {
2453         *pnum = bytes;
2454         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2455         if (offset + bytes == total_size) {
2456             ret |= BDRV_BLOCK_EOF;
2457         }
2458         if (bs->drv->protocol_name) {
2459             ret |= BDRV_BLOCK_OFFSET_VALID;
2460             local_map = offset;
2461             local_file = bs;
2462         }
2463         goto early_out;
2464     }
2465 
2466     bdrv_inc_in_flight(bs);
2467 
2468     /* Round out to request_alignment boundaries */
2469     align = bs->bl.request_alignment;
2470     aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2471     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2472 
2473     if (bs->drv->bdrv_co_block_status) {
2474         /*
2475          * Use the block-status cache only for protocol nodes: Format
2476          * drivers are generally quick to inquire the status, but protocol
2477          * drivers often need to get information from outside of qemu, so
2478          * we do not have control over the actual implementation.  There
2479          * have been cases where inquiring the status took an unreasonably
2480          * long time, and we can do nothing in qemu to fix it.
2481          * This is especially problematic for images with large data areas,
2482          * because finding the few holes in them and giving them special
2483          * treatment does not gain much performance.  Therefore, we try to
2484          * cache the last-identified data region.
2485          *
2486          * Second, limiting ourselves to protocol nodes allows us to assume
2487          * the block status for data regions to be DATA | OFFSET_VALID, and
2488          * that the host offset is the same as the guest offset.
2489          *
2490          * Note that it is possible that external writers zero parts of
2491          * the cached regions without the cache being invalidated, and so
2492          * we may report zeroes as data.  This is not catastrophic,
2493          * however, because reporting zeroes as data is fine.
2494          */
2495         if (QLIST_EMPTY(&bs->children) &&
2496             bdrv_bsc_is_data(bs, aligned_offset, pnum))
2497         {
2498             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2499             local_file = bs;
2500             local_map = aligned_offset;
2501         } else {
2502             ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2503                                                 aligned_bytes, pnum, &local_map,
2504                                                 &local_file);
2505 
2506             /*
2507              * Note that checking QLIST_EMPTY(&bs->children) is also done when
2508              * the cache is queried above.  Technically, we do not need to check
2509              * it here; the worst that can happen is that we fill the cache for
2510              * non-protocol nodes, and then it is never used.  However, filling
2511              * the cache requires an RCU update, so double check here to avoid
2512              * such an update if possible.
2513              *
2514              * Check want_zero, because we only want to update the cache when we
2515              * have accurate information about what is zero and what is data.
2516              */
2517             if (want_zero &&
2518                 ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
2519                 QLIST_EMPTY(&bs->children))
2520             {
2521                 /*
2522                  * When a protocol driver reports BLOCK_OFFSET_VALID, the
2523                  * returned local_map value must be the same as the offset we
2524                  * have passed (aligned_offset), and local_file must be the node
2525                  * itself.
2526                  * Assert this, because we follow this rule when reading from
2527                  * the cache (see the `local_file = bs` and
2528                  * `local_map = aligned_offset` assignments above), and the
2529                  * result the cache delivers must be the same as the driver
2530                  * would deliver.
2531                  */
2532                 assert(local_file == bs);
2533                 assert(local_map == aligned_offset);
2534                 bdrv_bsc_fill(bs, aligned_offset, *pnum);
2535             }
2536         }
2537     } else {
2538         /* Default code for filters */
2539 
2540         local_file = bdrv_filter_bs(bs);
2541         assert(local_file);
2542 
2543         *pnum = aligned_bytes;
2544         local_map = aligned_offset;
2545         ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2546     }
2547     if (ret < 0) {
2548         *pnum = 0;
2549         goto out;
2550     }
2551 
2552     /*
2553      * The driver's result must be a non-zero multiple of request_alignment.
2554      * Clamp pnum and adjust map to original request.
2555      */
2556     assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2557            align > offset - aligned_offset);
2558     if (ret & BDRV_BLOCK_RECURSE) {
2559         assert(ret & BDRV_BLOCK_DATA);
2560         assert(ret & BDRV_BLOCK_OFFSET_VALID);
2561         assert(!(ret & BDRV_BLOCK_ZERO));
2562     }
2563 
2564     *pnum -= offset - aligned_offset;
2565     if (*pnum > bytes) {
2566         *pnum = bytes;
2567     }
2568     if (ret & BDRV_BLOCK_OFFSET_VALID) {
2569         local_map += offset - aligned_offset;
2570     }
2571 
2572     if (ret & BDRV_BLOCK_RAW) {
2573         assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2574         ret = bdrv_co_do_block_status(local_file, want_zero, local_map,
2575                                       *pnum, pnum, &local_map, &local_file);
2576         goto out;
2577     }
2578 
2579     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2580         ret |= BDRV_BLOCK_ALLOCATED;
2581     } else if (bs->drv->supports_backing) {
2582         BlockDriverState *cow_bs = bdrv_cow_bs(bs);
2583 
2584         if (!cow_bs) {
2585             ret |= BDRV_BLOCK_ZERO;
2586         } else if (want_zero) {
2587             int64_t size2 = bdrv_co_getlength(cow_bs);
2588 
2589             if (size2 >= 0 && offset >= size2) {
2590                 ret |= BDRV_BLOCK_ZERO;
2591             }
2592         }
2593     }
2594 
2595     if (want_zero && ret & BDRV_BLOCK_RECURSE &&
2596         local_file && local_file != bs &&
2597         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2598         (ret & BDRV_BLOCK_OFFSET_VALID)) {
2599         int64_t file_pnum;
2600         int ret2;
2601 
2602         ret2 = bdrv_co_do_block_status(local_file, want_zero, local_map,
2603                                        *pnum, &file_pnum, NULL, NULL);
2604         if (ret2 >= 0) {
2605             /* Ignore errors.  This is just providing extra information; it
2606              * is useful but not necessary.
2607              */
2608             if (ret2 & BDRV_BLOCK_EOF &&
2609                 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2610                 /*
2611                  * It is valid for the format block driver to read
2612                  * beyond the end of the underlying file's current
2613                  * size; such areas read as zero.
2614                  */
2615                 ret |= BDRV_BLOCK_ZERO;
2616             } else {
2617                 /* Limit request to the range reported by the protocol driver */
2618                 *pnum = file_pnum;
2619                 ret |= (ret2 & BDRV_BLOCK_ZERO);
2620             }
2621         }
2622     }
2623 
2624 out:
2625     bdrv_dec_in_flight(bs);
2626     if (ret >= 0 && offset + *pnum == total_size) {
2627         ret |= BDRV_BLOCK_EOF;
2628     }
2629 early_out:
2630     if (file) {
2631         *file = local_file;
2632     }
2633     if (map) {
2634         *map = local_map;
2635     }
2636     return ret;
2637 }
2638 
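/*
 * Typical consumer pattern for the block-status functions below (a sketch
 * only; a real caller needs whatever locking its context requires):
 *
 *   int64_t offset = 0, pnum, map;
 *   BlockDriverState *file;
 *   while (offset < total_size) {
 *       int ret = bdrv_co_block_status(bs, offset, total_size - offset,
 *                                      &pnum, &map, &file);
 *       if (ret < 0) {
 *           break;
 *       }
 *       ... handle [offset, offset + pnum) based on the BDRV_BLOCK_* bits
 *           in ret ...
 *       offset += pnum;
 *   }
 *
 * 'pnum' is 0 only once end-of-file is reached, so the loop makes progress.
 */
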
2639 int coroutine_fn
2640 bdrv_co_common_block_status_above(BlockDriverState *bs,
2641                                   BlockDriverState *base,
2642                                   bool include_base,
2643                                   bool want_zero,
2644                                   int64_t offset,
2645                                   int64_t bytes,
2646                                   int64_t *pnum,
2647                                   int64_t *map,
2648                                   BlockDriverState **file,
2649                                   int *depth)
2650 {
2651     int ret;
2652     BlockDriverState *p;
2653     int64_t eof = 0;
2654     int dummy;
2655     IO_CODE();
2656 
2657     assert(!include_base || base); /* Can't include NULL base */
2658     assert_bdrv_graph_readable();
2659 
2660     if (!depth) {
2661         depth = &dummy;
2662     }
2663     *depth = 0;
2664 
2665     if (!include_base && bs == base) {
2666         *pnum = bytes;
2667         return 0;
2668     }
2669 
2670     ret = bdrv_co_do_block_status(bs, want_zero, offset, bytes, pnum,
2671                                   map, file);
2672     ++*depth;
2673     if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
2674         return ret;
2675     }
2676 
2677     if (ret & BDRV_BLOCK_EOF) {
2678         eof = offset + *pnum;
2679     }
2680 
2681     assert(*pnum <= bytes);
2682     bytes = *pnum;
2683 
2684     for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
2685          p = bdrv_filter_or_cow_bs(p))
2686     {
2687         ret = bdrv_co_do_block_status(p, want_zero, offset, bytes, pnum,
2688                                       map, file);
2689         ++*depth;
2690         if (ret < 0) {
2691             return ret;
2692         }
2693         if (*pnum == 0) {
2694             /*
2695              * The top layer deferred to this layer, and because this layer is
2696              * short, any zeroes that we synthesize beyond EOF behave as if they
2697              * were allocated at this layer.
2698              *
2699              * We don't include BDRV_BLOCK_EOF in ret, as the upper layer may be
2700              * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
2701              * below.
2702              */
2703             assert(ret & BDRV_BLOCK_EOF);
2704             *pnum = bytes;
2705             if (file) {
2706                 *file = p;
2707             }
2708             ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
2709             break;
2710         }
2711         if (ret & BDRV_BLOCK_ALLOCATED) {
2712             /*
2713              * We've found the node and the status, so we must break.
2714              *
2715              * Drop BDRV_BLOCK_EOF, as it's not for the upper layer, which may be
2716              * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
2717              * below.
2718              */
2719             ret &= ~BDRV_BLOCK_EOF;
2720             break;
2721         }
2722 
2723         if (p == base) {
2724             assert(include_base);
2725             break;
2726         }
2727 
2728         /*
2729          * OK, the [offset, offset + *pnum) region is unallocated on this layer,
2730          * so let's continue descending the chain.
2731          */
2732         assert(*pnum <= bytes);
2733         bytes = *pnum;
2734     }
2735 
2736     if (offset + *pnum == eof) {
2737         ret |= BDRV_BLOCK_EOF;
2738     }
2739 
2740     return ret;
2741 }
2742 
2743 int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2744                                             BlockDriverState *base,
2745                                             int64_t offset, int64_t bytes,
2746                                             int64_t *pnum, int64_t *map,
2747                                             BlockDriverState **file)
2748 {
2749     IO_CODE();
2750     return bdrv_co_common_block_status_above(bs, base, false, true, offset,
2751                                              bytes, pnum, map, file, NULL);
2752 }
2753 
2754 int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, int64_t offset,
2755                                       int64_t bytes, int64_t *pnum,
2756                                       int64_t *map, BlockDriverState **file)
2757 {
2758     IO_CODE();
2759     return bdrv_co_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
2760                                       offset, bytes, pnum, map, file);
2761 }
2762 
2763 /*
2764  * Check @bs (and its backing chain) to see if the range defined
2765  * by @offset and @bytes is known to read as zeroes.
2766  * Return 1 if that is the case, 0 otherwise and -errno on error.
2767  * This test is meant to be fast rather than accurate so returning 0
2768  * does not guarantee non-zero data.
2769  */
2770 int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
2771                                       int64_t bytes)
2772 {
2773     int ret;
2774     int64_t pnum = bytes;
2775     IO_CODE();
2776 
2777     if (!bytes) {
2778         return 1;
2779     }
2780 
2781     ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset,
2782                                             bytes, &pnum, NULL, NULL, NULL);
2783 
2784     if (ret < 0) {
2785         return ret;
2786     }
2787 
2788     return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
2789 }
2790 
2791 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
2792                                       int64_t bytes, int64_t *pnum)
2793 {
2794     int ret;
2795     int64_t dummy;
2796     IO_CODE();
2797 
2798     ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset,
2799                                             bytes, pnum ? pnum : &dummy, NULL,
2800                                             NULL, NULL);
2801     if (ret < 0) {
2802         return ret;
2803     }
2804     return !!(ret & BDRV_BLOCK_ALLOCATED);
2805 }
2806 
2807 /*
2808  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2809  *
2810  * Return a positive depth if (a prefix of) the given range is allocated
2811  * in any image between BASE and TOP (BASE is only included if include_base
2812  * is set).  Depth 1 is TOP, 2 is the first backing layer, and so forth.
2813  * BASE can be NULL to check if the given offset is allocated in any
2814  * image of the chain.  Return 0 otherwise, or negative errno on
2815  * failure.
2816  *
2817  * 'pnum' is set to the number of bytes (including and immediately
2818  * following the specified offset) that are known to be in the same
2819  * allocated/unallocated state.  Note that a subsequent call starting
2820  * at 'offset + *pnum' may return the same allocation status (in other
2821  * words, the result is not necessarily the maximum possible range);
2822  * but 'pnum' will only be 0 when end of file is reached.
2823  */
2824 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *bs,
2825                                             BlockDriverState *base,
2826                                             bool include_base, int64_t offset,
2827                                             int64_t bytes, int64_t *pnum)
2828 {
2829     int depth;
2830     int ret;
2831     IO_CODE();
2832 
2833     ret = bdrv_co_common_block_status_above(bs, base, include_base, false,
2834                                             offset, bytes, pnum, NULL, NULL,
2835                                             &depth);
2836     if (ret < 0) {
2837         return ret;
2838     }
2839 
2840     if (ret & BDRV_BLOCK_ALLOCATED) {
2841         return depth;
2842     }
2843     return 0;
2844 }
2845 
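/*
 * Example for the depth value above: for an illustrative chain
 * base <- mid <- top, calling bdrv_co_is_allocated_above(top, base, false,
 * offset, bytes, &pnum) returns 1 if (a prefix of) the range is allocated
 * in top, 2 if it is only allocated in mid, and 0 if neither allocates it
 * (base itself is excluded because include_base is false).
 */
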
2846 int coroutine_fn
2847 bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2848 {
2849     BlockDriver *drv = bs->drv;
2850     BlockDriverState *child_bs = bdrv_primary_bs(bs);
2851     int ret;
2852     IO_CODE();
2853     assert_bdrv_graph_readable();
2854 
2855     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
2856     if (ret < 0) {
2857         return ret;
2858     }
2859 
2860     if (!drv) {
2861         return -ENOMEDIUM;
2862     }
2863 
2864     bdrv_inc_in_flight(bs);
2865 
2866     if (drv->bdrv_co_load_vmstate) {
2867         ret = drv->bdrv_co_load_vmstate(bs, qiov, pos);
2868     } else if (child_bs) {
2869         ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
2870     } else {
2871         ret = -ENOTSUP;
2872     }
2873 
2874     bdrv_dec_in_flight(bs);
2875 
2876     return ret;
2877 }
2878 
2879 int coroutine_fn
2880 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2881 {
2882     BlockDriver *drv = bs->drv;
2883     BlockDriverState *child_bs = bdrv_primary_bs(bs);
2884     int ret;
2885     IO_CODE();
2886     assert_bdrv_graph_readable();
2887 
2888     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
2889     if (ret < 0) {
2890         return ret;
2891     }
2892 
2893     if (!drv) {
2894         return -ENOMEDIUM;
2895     }
2896 
2897     bdrv_inc_in_flight(bs);
2898 
2899     if (drv->bdrv_co_save_vmstate) {
2900         ret = drv->bdrv_co_save_vmstate(bs, qiov, pos);
2901     } else if (child_bs) {
2902         ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
2903     } else {
2904         ret = -ENOTSUP;
2905     }
2906 
2907     bdrv_dec_in_flight(bs);
2908 
2909     return ret;
2910 }
2911 
2912 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2913                       int64_t pos, int size)
2914 {
2915     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2916     int ret = bdrv_writev_vmstate(bs, &qiov, pos);
2917     IO_CODE();
2918 
2919     return ret < 0 ? ret : size;
2920 }
2921 
2922 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2923                       int64_t pos, int size)
2924 {
2925     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2926     int ret = bdrv_readv_vmstate(bs, &qiov, pos);
2927     IO_CODE();
2928 
2929     return ret < 0 ? ret : size;
2930 }
2931 
2932 /**************************************************************/
2933 /* async I/Os */
2934 
2935 /**
2936  * Synchronously cancels an acb. Must be called with the BQL held and the acb
2937  * must be processed with the BQL held too (IOThreads are not allowed).
2938  *
2939  * Use bdrv_aio_cancel_async() instead when possible.
2940  */
2941 void bdrv_aio_cancel(BlockAIOCB *acb)
2942 {
2943     GLOBAL_STATE_CODE();
2944     qemu_aio_ref(acb);
2945     bdrv_aio_cancel_async(acb);
2946     AIO_WAIT_WHILE_UNLOCKED(NULL, acb->refcnt > 1);
2947     qemu_aio_unref(acb);
2948 }
2949 
2950 /* Async version of aio cancel. The caller is not blocked if the acb implements
2951  * cancel_async, otherwise we do nothing and let the request complete normally.
2952  * In either case the completion callback must be called. */
2953 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2954 {
2955     IO_CODE();
2956     if (acb->aiocb_info->cancel_async) {
2957         acb->aiocb_info->cancel_async(acb);
2958     }
2959 }
2960 
2961 /**************************************************************/
2962 /* Coroutine block device emulation */
2963 
2964 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2965 {
2966     BdrvChild *primary_child = bdrv_primary_child(bs);
2967     BdrvChild *child;
2968     int current_gen;
2969     int ret = 0;
2970     IO_CODE();
2971 
2972     assert_bdrv_graph_readable();
2973     bdrv_inc_in_flight(bs);
2974 
2975     if (!bdrv_co_is_inserted(bs) || bdrv_is_read_only(bs) ||
2976         bdrv_is_sg(bs)) {
2977         goto early_exit;
2978     }
2979 
2980     qemu_mutex_lock(&bs->reqs_lock);
2981     current_gen = qatomic_read(&bs->write_gen);
2982 
2983     /* Wait until any previous flushes are completed */
2984     while (bs->active_flush_req) {
2985         qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2986     }
2987 
2988     /* Flushes reach this point in nondecreasing current_gen order.  */
2989     bs->active_flush_req = true;
2990     qemu_mutex_unlock(&bs->reqs_lock);
2991 
2992     /* Write back all layers by calling one driver function */
2993     if (bs->drv->bdrv_co_flush) {
2994         ret = bs->drv->bdrv_co_flush(bs);
2995         goto out;
2996     }
2997 
2998     /* Write back cached data to the OS even with cache=unsafe */
2999     BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
3000     if (bs->drv->bdrv_co_flush_to_os) {
3001         ret = bs->drv->bdrv_co_flush_to_os(bs);
3002         if (ret < 0) {
3003             goto out;
3004         }
3005     }
3006 
3007     /* But don't actually force it to the disk with cache=unsafe */
3008     if (bs->open_flags & BDRV_O_NO_FLUSH) {
3009         goto flush_children;
3010     }
3011 
3012     /* Check if we really need to flush anything */
3013     if (bs->flushed_gen == current_gen) {
3014         goto flush_children;
3015     }
3016 
3017     BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
3018     if (!bs->drv) {
3019         /* bs->drv->bdrv_co_flush() might have ejected the BDS
3020          * (even in case of apparent success) */
3021         ret = -ENOMEDIUM;
3022         goto out;
3023     }
3024     if (bs->drv->bdrv_co_flush_to_disk) {
3025         ret = bs->drv->bdrv_co_flush_to_disk(bs);
3026     } else if (bs->drv->bdrv_aio_flush) {
3027         BlockAIOCB *acb;
3028         CoroutineIOCompletion co = {
3029             .coroutine = qemu_coroutine_self(),
3030         };
3031 
3032         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3033         if (acb == NULL) {
3034             ret = -EIO;
3035         } else {
3036             qemu_coroutine_yield();
3037             ret = co.ret;
3038         }
3039     } else {
3040         /*
3041          * Some block drivers always operate in either writethrough or unsafe
3042          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3043          * know how the server works (because the behaviour is hardcoded or
3044          * depends on server-side configuration), so we can't ensure that
3045          * everything is safe on disk. Returning an error doesn't work because
3046          * that would break guests even if the server operates in writethrough
3047          * mode.
3048          *
3049          * Let's hope the user knows what he's doing.
3050          */
3051         ret = 0;
3052     }
3053 
3054     if (ret < 0) {
3055         goto out;
3056     }
3057 
3058     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
3059      * set in the case of cache=unsafe, so there are no useless flushes.
3060      */
3061 flush_children:
3062     ret = 0;
3063     QLIST_FOREACH(child, &bs->children, next) {
3064         if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
3065             int this_child_ret = bdrv_co_flush(child->bs);
3066             if (!ret) {
3067                 ret = this_child_ret;
3068             }
3069         }
3070     }
3071 
3072 out:
3073     /* Notify any pending flushes that we have completed */
3074     if (ret == 0) {
3075         bs->flushed_gen = current_gen;
3076     }
3077 
3078     qemu_mutex_lock(&bs->reqs_lock);
3079     bs->active_flush_req = false;
3080     /* Return value is ignored - it's ok if wait queue is empty */
3081     qemu_co_queue_next(&bs->flush_queue);
3082     qemu_mutex_unlock(&bs->reqs_lock);
3083 
3084 early_exit:
3085     bdrv_dec_in_flight(bs);
3086     return ret;
3087 }
3088 
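/*
 * Illustration of the write-generation tracking used above: every tracked
 * write bumps bs->write_gen when it finishes (see
 * bdrv_co_write_req_finish()).  Suppose writes have advanced write_gen to 7
 * and a flush then completes successfully; flushed_gen becomes 7.  A second
 * flush issued before any further write sees flushed_gen == current_gen,
 * skips the flush-to-disk step and only recurses into the children.
 */
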
3089 int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
3090                                   int64_t bytes)
3091 {
3092     BdrvTrackedRequest req;
3093     int ret;
3094     int64_t max_pdiscard;
3095     int head, tail, align;
3096     BlockDriverState *bs = child->bs;
3097     IO_CODE();
3098     assert_bdrv_graph_readable();
3099 
3100     if (!bs || !bs->drv || !bdrv_co_is_inserted(bs)) {
3101         return -ENOMEDIUM;
3102     }
3103 
3104     if (bdrv_has_readonly_bitmaps(bs)) {
3105         return -EPERM;
3106     }
3107 
3108     ret = bdrv_check_request(offset, bytes, NULL);
3109     if (ret < 0) {
3110         return ret;
3111     }
3112 
3113     /* Do nothing if disabled.  */
3114     if (!(bs->open_flags & BDRV_O_UNMAP)) {
3115         return 0;
3116     }
3117 
3118     if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
3119         return 0;
3120     }
3121 
3122     /* Invalidate the cached block-status data range if this discard overlaps */
3123     bdrv_bsc_invalidate_range(bs, offset, bytes);
3124 
3125     /* Discard is advisory, but some devices track and coalesce
3126      * unaligned requests, so we must pass everything down rather than
3127      * rounding here.  Still, most devices will just silently ignore
3128      * unaligned requests (by returning -ENOTSUP), so we must fragment
3129      * the request accordingly.  */
3130     align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
3131     assert(align % bs->bl.request_alignment == 0);
3132     head = offset % align;
3133     tail = (offset + bytes) % align;
3134 
3135     bdrv_inc_in_flight(bs);
3136     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
3137 
3138     ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
3139     if (ret < 0) {
3140         goto out;
3141     }
3142 
3143     max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX),
3144                                    align);
3145     assert(max_pdiscard >= bs->bl.request_alignment);
3146 
3147     while (bytes > 0) {
3148         int64_t num = bytes;
3149 
3150         if (head) {
3151             /* Make small requests to get to alignment boundaries. */
3152             num = MIN(bytes, align - head);
3153             if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
3154                 num %= bs->bl.request_alignment;
3155             }
3156             head = (head + num) % align;
3157             assert(num < max_pdiscard);
3158         } else if (tail) {
3159             if (num > align) {
3160                 /* Shorten the request to the last aligned cluster.  */
3161                 num -= tail;
3162             } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
3163                        tail > bs->bl.request_alignment) {
3164                 tail %= bs->bl.request_alignment;
3165                 num -= tail;
3166             }
3167         }
3168         /* limit request size */
3169         if (num > max_pdiscard) {
3170             num = max_pdiscard;
3171         }
3172 
3173         if (!bs->drv) {
3174             ret = -ENOMEDIUM;
3175             goto out;
3176         }
3177         if (bs->drv->bdrv_co_pdiscard) {
3178             ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
3179         } else {
3180             BlockAIOCB *acb;
3181             CoroutineIOCompletion co = {
3182                 .coroutine = qemu_coroutine_self(),
3183             };
3184 
3185             acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
3186                                              bdrv_co_io_em_complete, &co);
3187             if (acb == NULL) {
3188                 ret = -EIO;
3189                 goto out;
3190             } else {
3191                 qemu_coroutine_yield();
3192                 ret = co.ret;
3193             }
3194         }
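             /* Unaligned fragments may fail with -ENOTSUP; discard is only
              * advisory, so treat that as success and keep going */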
3195         if (ret && ret != -ENOTSUP) {
3196             goto out;
3197         }
3198 
3199         offset += num;
3200         bytes -= num;
3201     }
3202     ret = 0;
3203 out:
3204     bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
3205     tracked_request_end(&req);
3206     bdrv_dec_in_flight(bs);
3207     return ret;
3208 }
3209 
3210 int coroutine_fn bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
3211 {
3212     BlockDriver *drv = bs->drv;
3213     CoroutineIOCompletion co = {
3214         .coroutine = qemu_coroutine_self(),
3215     };
3216     BlockAIOCB *acb;
3217     IO_CODE();
3218     assert_bdrv_graph_readable();
3219 
3220     bdrv_inc_in_flight(bs);
3221     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
3222         co.ret = -ENOTSUP;
3223         goto out;
3224     }
3225 
3226     if (drv->bdrv_co_ioctl) {
3227         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
3228     } else {
3229         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
3230         if (!acb) {
3231             co.ret = -ENOTSUP;
3232             goto out;
3233         }
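             /* Yield until bdrv_co_io_em_complete() reenters us with the result */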
3234         qemu_coroutine_yield();
3235     }
3236 out:
3237     bdrv_dec_in_flight(bs);
3238     return co.ret;
3239 }
3240 
3241 int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
3242                         unsigned int *nr_zones,
3243                         BlockZoneDescriptor *zones)
3244 {
3245     BlockDriver *drv = bs->drv;
3246     CoroutineIOCompletion co = {
3247             .coroutine = qemu_coroutine_self(),
3248     };
3249     IO_CODE();
3250 
3251     bdrv_inc_in_flight(bs);
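         /* Zone reporting needs a zoned device whose driver implements it */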
3252     if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) {
3253         co.ret = -ENOTSUP;
3254         goto out;
3255     }
3256     co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
3257 out:
3258     bdrv_dec_in_flight(bs);
3259     return co.ret;
3260 }
3261 
3262 int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
3263         int64_t offset, int64_t len)
3264 {
3265     BlockDriver *drv = bs->drv;
3266     CoroutineIOCompletion co = {
3267             .coroutine = qemu_coroutine_self(),
3268     };
3269     IO_CODE();
3270 
3271     bdrv_inc_in_flight(bs);
3272     if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) {
3273         co.ret = -ENOTSUP;
3274         goto out;
3275     }
3276     co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
3277 out:
3278     bdrv_dec_in_flight(bs);
3279     return co.ret;
3280 }
3281 
3282 int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
3283                         QEMUIOVector *qiov,
3284                         BdrvRequestFlags flags)
3285 {
3286     int ret;
3287     BlockDriver *drv = bs->drv;
3288     CoroutineIOCompletion co = {
3289             .coroutine = qemu_coroutine_self(),
3290     };
3291     IO_CODE();
3292 
3293     ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
3294     if (ret < 0) {
3295         return ret;
3296     }
3297 
3298     bdrv_inc_in_flight(bs);
3299     if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
3300         co.ret = -ENOTSUP;
3301         goto out;
3302     }
3303     co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
3304 out:
3305     bdrv_dec_in_flight(bs);
3306     return co.ret;
3307 }
3308 
3309 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3310 {
3311     IO_CODE();
3312     return qemu_memalign(bdrv_opt_mem_align(bs), size);
3313 }
3314 
3315 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
3316 {
3317     IO_CODE();
3318     return memset(qemu_blockalign(bs, size), 0, size);
3319 }
3320 
3321 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
3322 {
3323     size_t align = bdrv_opt_mem_align(bs);
3324     IO_CODE();
3325 
3326     /* Ensure that NULL is never returned on success */
3327     assert(align > 0);
3328     if (size == 0) {
3329         size = align;
3330     }
3331 
3332     return qemu_try_memalign(align, size);
3333 }
3334 
3335 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
3336 {
3337     void *mem = qemu_try_blockalign(bs, size);
3338     IO_CODE();
3339 
3340     if (mem) {
3341         memset(mem, 0, size);
3342     }
3343 
3344     return mem;
3345 }
3346 
3347 /* Helper that undoes bdrv_register_buf() when it fails partway through */
3348 static void GRAPH_RDLOCK
3349 bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
3350                            BdrvChild *final_child)
3351 {
3352     BdrvChild *child;
3353 
3354     GLOBAL_STATE_CODE();
3355     assert_bdrv_graph_readable();
3356 
3357     QLIST_FOREACH(child, &bs->children, next) {
3358         if (child == final_child) {
3359             break;
3360         }
3361 
3362         bdrv_unregister_buf(child->bs, host, size);
3363     }
3364 
3365     if (bs->drv && bs->drv->bdrv_unregister_buf) {
3366         bs->drv->bdrv_unregister_buf(bs, host, size);
3367     }
3368 }
3369 
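     /*
      * Register @host/@size with this node's driver (if it supports buffer
      * registration) and recursively with all children.  If any registration
      * fails, roll back the ones that already succeeded.
      */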
3370 bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size,
3371                        Error **errp)
3372 {
3373     BdrvChild *child;
3374 
3375     GLOBAL_STATE_CODE();
3376     GRAPH_RDLOCK_GUARD_MAINLOOP();
3377 
3378     if (bs->drv && bs->drv->bdrv_register_buf) {
3379         if (!bs->drv->bdrv_register_buf(bs, host, size, errp)) {
3380             return false;
3381         }
3382     }
3383     QLIST_FOREACH(child, &bs->children, next) {
3384         if (!bdrv_register_buf(child->bs, host, size, errp)) {
3385             bdrv_register_buf_rollback(bs, host, size, child);
3386             return false;
3387         }
3388     }
3389     return true;
3390 }
3391 
3392 void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size)
3393 {
3394     BdrvChild *child;
3395 
3396     GLOBAL_STATE_CODE();
3397     GRAPH_RDLOCK_GUARD_MAINLOOP();
3398 
3399     if (bs->drv && bs->drv->bdrv_unregister_buf) {
3400         bs->drv->bdrv_unregister_buf(bs, host, size);
3401     }
3402     QLIST_FOREACH(child, &bs->children, next) {
3403         bdrv_unregister_buf(child->bs, host, size);
3404     }
3405 }
3406 
3407 static int coroutine_fn GRAPH_RDLOCK bdrv_co_copy_range_internal(
3408         BdrvChild *src, int64_t src_offset, BdrvChild *dst,
3409         int64_t dst_offset, int64_t bytes,
3410         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
3411         bool recurse_src)
3412 {
3413     BdrvTrackedRequest req;
3414     int ret;
3415     assert_bdrv_graph_readable();
3416 
3417     /* TODO We can support BDRV_REQ_NO_FALLBACK here */
3418     assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
3419     assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
3420     assert(!(read_flags & BDRV_REQ_NO_WAIT));
3421     assert(!(write_flags & BDRV_REQ_NO_WAIT));
3422 
3423     if (!dst || !dst->bs || !bdrv_co_is_inserted(dst->bs)) {
3424         return -ENOMEDIUM;
3425     }
3426     ret = bdrv_check_request32(dst_offset, bytes, NULL, 0);
3427     if (ret) {
3428         return ret;
3429     }
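         /* A zero write needs no source data, so route it to pwrite_zeroes */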
3430     if (write_flags & BDRV_REQ_ZERO_WRITE) {
3431         return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3432     }
3433 
3434     if (!src || !src->bs || !bdrv_co_is_inserted(src->bs)) {
3435         return -ENOMEDIUM;
3436     }
3437     ret = bdrv_check_request32(src_offset, bytes, NULL, 0);
3438     if (ret) {
3439         return ret;
3440     }
3441 
3442     if (!src->bs->drv->bdrv_co_copy_range_from
3443         || !dst->bs->drv->bdrv_co_copy_range_to
3444         || src->bs->encrypted || dst->bs->encrypted) {
3445         return -ENOTSUP;
3446     }
3447 
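         /*
          * Exactly one side drives the copy: with recurse_src the request is
          * tracked as a read on the source and handed to bdrv_co_copy_range_from;
          * otherwise it is tracked as a write on the destination and handed to
          * bdrv_co_copy_range_to.
          */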
3448     if (recurse_src) {
3449         bdrv_inc_in_flight(src->bs);
3450         tracked_request_begin(&req, src->bs, src_offset, bytes,
3451                               BDRV_TRACKED_READ);
3452 
3453         /* BDRV_REQ_SERIALISING is only for write operations */
3454         assert(!(read_flags & BDRV_REQ_SERIALISING));
3455         bdrv_wait_serialising_requests(&req);
3456 
3457         ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3458                                                     src, src_offset,
3459                                                     dst, dst_offset,
3460                                                     bytes,
3461                                                     read_flags, write_flags);
3462 
3463         tracked_request_end(&req);
3464         bdrv_dec_in_flight(src->bs);
3465     } else {
3466         bdrv_inc_in_flight(dst->bs);
3467         tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3468                               BDRV_TRACKED_WRITE);
3469         ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
3470                                         write_flags);
3471         if (!ret) {
3472             ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3473                                                       src, src_offset,
3474                                                       dst, dst_offset,
3475                                                       bytes,
3476                                                       read_flags, write_flags);
3477         }
3478         bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3479         tracked_request_end(&req);
3480         bdrv_dec_in_flight(dst->bs);
3481     }
3482 
3483     return ret;
3484 }
3485 
3486 /* Copy range from @src to @dst.
3487  *
3488  * See the documentation of bdrv_co_copy_range() for the parameter and return
3489  * value semantics. */
3490 int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
3491                                          BdrvChild *dst, int64_t dst_offset,
3492                                          int64_t bytes,
3493                                          BdrvRequestFlags read_flags,
3494                                          BdrvRequestFlags write_flags)
3495 {
3496     IO_CODE();
3497     assert_bdrv_graph_readable();
3498     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3499                                   read_flags, write_flags);
3500     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3501                                        bytes, read_flags, write_flags, true);
3502 }
3503 
3504 /* Copy range from @src to @dst.
3505  *
3506  * See the documentation of bdrv_co_copy_range() for the parameter and return
3507  * value semantics. */
3508 int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
3509                                        BdrvChild *dst, int64_t dst_offset,
3510                                        int64_t bytes,
3511                                        BdrvRequestFlags read_flags,
3512                                        BdrvRequestFlags write_flags)
3513 {
3514     IO_CODE();
3515     assert_bdrv_graph_readable();
3516     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3517                                 read_flags, write_flags);
3518     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3519                                        bytes, read_flags, write_flags, false);
3520 }
3521 
3522 int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
3523                                     BdrvChild *dst, int64_t dst_offset,
3524                                     int64_t bytes, BdrvRequestFlags read_flags,
3525                                     BdrvRequestFlags write_flags)
3526 {
3527     IO_CODE();
3528     assert_bdrv_graph_readable();
3529 
3530     return bdrv_co_copy_range_from(src, src_offset,
3531                                    dst, dst_offset,
3532                                    bytes, read_flags, write_flags);
3533 }
3534 
3535 static void coroutine_fn GRAPH_RDLOCK
3536 bdrv_parent_cb_resize(BlockDriverState *bs)
3537 {
3538     BdrvChild *c;
3539 
3540     assert_bdrv_graph_readable();
3541 
3542     QLIST_FOREACH(c, &bs->parents, next_parent) {
3543         if (c->klass->resize) {
3544             c->klass->resize(c);
3545         }
3546     }
3547 }
3548 
3549 /**
3550  * Truncate file to 'offset' bytes (needed only for file protocols)
3551  *
3552  * If 'exact' is true, the file must be resized to exactly the given
3553  * 'offset'.  Otherwise, it is sufficient for the node to be at least
3554  * 'offset' bytes in length.
3555  */
3556 int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
3557                                   PreallocMode prealloc, BdrvRequestFlags flags,
3558                                   Error **errp)
3559 {
3560     BlockDriverState *bs = child->bs;
3561     BdrvChild *filtered, *backing;
3562     BlockDriver *drv = bs->drv;
3563     BdrvTrackedRequest req;
3564     int64_t old_size, new_bytes;
3565     int ret;
3566     IO_CODE();
3567     assert_bdrv_graph_readable();
3568 
3569     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
3570     if (!drv) {
3571         error_setg(errp, "No medium inserted");
3572         return -ENOMEDIUM;
3573     }
3574     if (offset < 0) {
3575         error_setg(errp, "Image size cannot be negative");
3576         return -EINVAL;
3577     }
3578 
3579     ret = bdrv_check_request(offset, 0, errp);
3580     if (ret < 0) {
3581         return ret;
3582     }
3583 
3584     old_size = bdrv_co_getlength(bs);
3585     if (old_size < 0) {
3586         error_setg_errno(errp, -old_size, "Failed to get old image size");
3587         return old_size;
3588     }
3589 
3590     if (bdrv_is_read_only(bs)) {
3591         error_setg(errp, "Image is read-only");
3592         return -EACCES;
3593     }
3594 
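         /* new_bytes is the size of the area added when growing the image;
          * it stays 0 when shrinking or keeping the current size */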
3595     if (offset > old_size) {
3596         new_bytes = offset - old_size;
3597     } else {
3598         new_bytes = 0;
3599     }
3600 
3601     bdrv_inc_in_flight(bs);
3602     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3603                           BDRV_TRACKED_TRUNCATE);
3604 
3605     /* If we are growing the image and potentially using preallocation for the
3606      * new area, we need to make sure that no write requests are made to it
3607      * concurrently or they might be overwritten by preallocation. */
3608     if (new_bytes) {
3609         bdrv_make_request_serialising(&req, 1);
3610     }
3611     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3612                                     0);
3613     if (ret < 0) {
3614         error_setg_errno(errp, -ret,
3615                          "Failed to prepare request for truncation");
3616         goto out;
3617     }
3618 
3619     filtered = bdrv_filter_child(bs);
3620     backing = bdrv_cow_child(bs);
3621 
3622     /*
3623      * If the image has a backing file that is large enough that it would
3624      * provide data for the new area, we cannot leave it unallocated because
3625      * then the backing file content would become visible. Instead, zero-fill
3626      * the new area.
3627      *
3628      * Note that if the image has a backing file, but was opened without the
3629      * backing file, taking care of keeping things consistent with that backing
3630      * file is the user's responsibility.
3631      */
3632     if (new_bytes && backing) {
3633         int64_t backing_len;
3634 
3635         backing_len = bdrv_co_getlength(backing->bs);
3636         if (backing_len < 0) {
3637             ret = backing_len;
3638             error_setg_errno(errp, -ret, "Could not get backing file size");
3639             goto out;
3640         }
3641 
3642         if (backing_len > old_size) {
3643             flags |= BDRV_REQ_ZERO_WRITE;
3644         }
3645     }
3646 
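         /* Prefer the driver's own truncate implementation; otherwise forward
          * the request through a filter child, if there is one */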
3647     if (drv->bdrv_co_truncate) {
3648         if (flags & ~bs->supported_truncate_flags) {
3649             error_setg(errp, "Block driver does not support requested flags");
3650             ret = -ENOTSUP;
3651             goto out;
3652         }
3653         ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
3654     } else if (filtered) {
3655         ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
3656     } else {
3657         error_setg(errp, "Image format driver does not support resize");
3658         ret = -ENOTSUP;
3659         goto out;
3660     }
3661     if (ret < 0) {
3662         goto out;
3663     }
3664 
3665     ret = bdrv_co_refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3666     if (ret < 0) {
3667         error_setg_errno(errp, -ret, "Could not refresh total sector count");
3668     } else {
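             /* Pick up the size the node actually has after the resize */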
3669         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3670     }
3671     /*
3672      * Truncation may have succeeded even though bdrv_co_refresh_total_sectors()
3673      * failed; that failure doesn't affect how we should finish the request.
3674      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled.
3675      */
3676     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3677 
3678 out:
3679     tracked_request_end(&req);
3680     bdrv_dec_in_flight(bs);
3681 
3682     return ret;
3683 }
3684 
3685 void bdrv_cancel_in_flight(BlockDriverState *bs)
3686 {
3687     GLOBAL_STATE_CODE();
3688     if (!bs || !bs->drv) {
3689         return;
3690     }
3691 
3692     if (bs->drv->bdrv_cancel_in_flight) {
3693         bs->drv->bdrv_cancel_in_flight(bs);
3694     }
3695 }
3696 
3697 int coroutine_fn
3698 bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
3699                         QEMUIOVector *qiov, size_t qiov_offset)
3700 {
3701     BlockDriverState *bs = child->bs;
3702     BlockDriver *drv = bs->drv;
3703     int ret;
3704     IO_CODE();
3705     assert_bdrv_graph_readable();
3706 
3707     if (!drv) {
3708         return -ENOMEDIUM;
3709     }
3710 
3711     if (!drv->bdrv_co_preadv_snapshot) {
3712         return -ENOTSUP;
3713     }
3714 
3715     bdrv_inc_in_flight(bs);
3716     ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset);
3717     bdrv_dec_in_flight(bs);
3718 
3719     return ret;
3720 }
3721 
3722 int coroutine_fn
3723 bdrv_co_snapshot_block_status(BlockDriverState *bs,
3724                               bool want_zero, int64_t offset, int64_t bytes,
3725                               int64_t *pnum, int64_t *map,
3726                               BlockDriverState **file)
3727 {
3728     BlockDriver *drv = bs->drv;
3729     int ret;
3730     IO_CODE();
3731     assert_bdrv_graph_readable();
3732 
3733     if (!drv) {
3734         return -ENOMEDIUM;
3735     }
3736 
3737     if (!drv->bdrv_co_snapshot_block_status) {
3738         return -ENOTSUP;
3739     }
3740 
3741     bdrv_inc_in_flight(bs);
3742     ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
3743                                              pnum, map, file);
3744     bdrv_dec_in_flight(bs);
3745 
3746     return ret;
3747 }
3748 
3749 int coroutine_fn
3750 bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
3751 {
3752     BlockDriver *drv = bs->drv;
3753     int ret;
3754     IO_CODE();
3755     assert_bdrv_graph_readable();
3756 
3757     if (!drv) {
3758         return -ENOMEDIUM;
3759     }
3760 
3761     if (!drv->bdrv_co_pdiscard_snapshot) {
3762         return -ENOTSUP;
3763     }
3764 
3765     bdrv_inc_in_flight(bs);
3766     ret = drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes);
3767     bdrv_dec_in_flight(bs);
3768 
3769     return ret;
3770 }
3771