/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes (16 MiB) */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    c->parent_quiesce_counter++;
    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}
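
/*
 * Worked example (illustrative, not part of the original file): merging a
 * child with max_transfer = 0 (unlimited) into a parent with
 * max_transfer = 1 MiB keeps 1 MiB, because MIN_NON_ZERO() treats zero as
 * "no limit"; the alignment fields instead take the stricter (larger) of
 * the two values via MAX().
 */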

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether the driver has a byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it again.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
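
/*
 * Illustrative sketch (not part of the original file): because the flag is a
 * reference count, nested users compose without clobbering each other.  A
 * hypothetical caller bracketing two overlapping operations might do:
 */
static void G_GNUC_UNUSED example_copy_on_read_section(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);   /* count 0 -> 1: copy-on-read turns on */
    bdrv_enable_copy_on_read(bs);   /* count 1 -> 2: still on, no clobbering */
    bdrv_disable_copy_on_read(bs);  /* count 2 -> 1: still on */
    bdrv_disable_copy_on_read(bs);  /* count 1 -> 0: copy-on-read turns off */
}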

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    atomic_mb_set(&data->done, true);
    if (!data->begin) {
        atomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        atomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
                                     bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

/**
 * This function does not poll, nor must any of its recursively called
 * functions.  The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles.  Therefore, the pointer must remain valid
 * until the pointee reaches 0.  That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}
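
/*
 * Illustrative sketch (not part of the original file): the begin/end pair
 * brackets code that must not run concurrently with in-flight I/O, such as
 * graph manipulation:
 */
static void G_GNUC_UNUSED example_drained_section(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);  /* quiesce parents, wait for in-flight requests */
    /* ... reconfigure the node while no new I/O can arrive ... */
    bdrv_drained_end(bs);    /* resume, polling until all callbacks settle */
}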

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the AioContext lock
 * of the BlockDriverState.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay, so waiting for the
     * in-flight I/O requests to finish may never terminate
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;

    /*
     * The bdrv queue is managed by record/replay, so waiting for the
     * in-flight I/O requests to finish may never terminate
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
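
/*
 * Worked example (illustrative, not part of the original file): a request
 * with overlap_offset = 4096 and overlap_bytes = 4096 covers [4096, 8192).
 * A candidate range offset = 8192, bytes = 512 starts exactly at the end,
 * so the first test rejects it; offset = 8191, bytes = 2 overlaps.
 */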

static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
                                      BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);
    return waited;
}

bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    BlockDriverState *bs = req->bs;
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;
    bool waited;

    qemu_co_mutex_lock(&bs->reqs_lock);
    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
    waited = bdrv_wait_serialising_requests_locked(bs, req);
    qemu_co_mutex_unlock(&bs->reqs_lock);
    return waited;
}
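
/*
 * Worked example (illustrative, not part of the original file): a write at
 * offset = 1536 with bytes = 512, marked serialising with align = 4096,
 * widens its overlap window to [0, ROUND_UP(2048, 4096)) = [0, 4096), so
 * overlapping requests within that aligned window must now wait for it.
 */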

/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
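
/*
 * Worked example (illustrative, not part of the original file): with
 * cluster_size = 65536, a request at offset = 65540 with bytes = 100 rounds
 * to *cluster_offset = 65536 and
 * *cluster_bytes = QEMU_ALIGN_UP(4 + 100, 65536) = 65536, i.e. exactly the
 * one cluster containing the request.
 */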

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(bs, self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef int coroutine_fn BdrvRequestEntry(void *opaque);
typedef struct BdrvRunCo {
    BdrvRequestEntry *entry;
    void *opaque;
    int ret;
    bool done;
    Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */
} BdrvRunCo;

static void coroutine_fn bdrv_run_co_entry(void *opaque)
{
    BdrvRunCo *arg = opaque;

    arg->ret = arg->entry(arg->opaque);
    arg->done = true;
    aio_wait_kick();
}

static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry,
                       void *opaque)
{
    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        return entry(opaque);
    } else {
        BdrvRunCo s = { .entry = entry, .opaque = opaque };

        s.co = qemu_coroutine_create(bdrv_run_co_entry, &s);
        bdrv_coroutine_enter(bs, s.co);

        BDRV_POLL_WHILE(bs, !s.done);

        return s.ret;
    }
}
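
/*
 * Illustrative sketch (not part of the original file): how a synchronous
 * wrapper is typically built on bdrv_run_co().  The entry function and its
 * argument struct below are hypothetical.
 */
typedef struct ExampleFlushData {
    BlockDriverState *bs;
} ExampleFlushData;

static int coroutine_fn example_flush_entry(void *opaque)
{
    ExampleFlushData *d = opaque;

    return bdrv_co_flush(d->bs); /* runs in coroutine context */
}

static int G_GNUC_UNUSED example_flush(BlockDriverState *bs)
{
    ExampleFlushData d = { .bs = bs };

    /* Runs inline if already in a coroutine, else spawns one and polls */
    return bdrv_run_co(bs, example_flush_entry, &d);
}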

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    BdrvRequestFlags flags;
} RwCo;

static int coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        return bdrv_co_preadv(rwco->child, rwco->offset,
                              rwco->qiov->size, rwco->qiov,
                              rwco->flags);
    } else {
        return bdrv_co_pwritev(rwco->child, rwco->offset,
                               rwco->qiov->size, rwco->qiov,
                               rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .flags = flags,
    };

    return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

/* return < 0 if error. See bdrv_pwrite() for the return codes */
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* Return the number of bytes on success or < 0 on error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid offset or number of bytes
  -EACCES      Trying to write to a read-only device
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_pwritev(child, offset, &qiov);
}
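
/*
 * Illustrative sketch (not part of the original file): the synchronous
 * helpers above are typically used for small metadata accesses.  The header
 * layout here is hypothetical; on success this returns the byte count, like
 * bdrv_pwrite().
 */
static int G_GNUC_UNUSED example_update_header(BdrvChild *child)
{
    uint8_t header[512];
    int ret;

    ret = bdrv_pread(child, 0, header, sizeof(header));
    if (ret < 0) {
        return ret;
    }

    header[0] ^= 1; /* flip a hypothetical feature bit */

    return bdrv_pwrite(child, 0, header, sizeof(header));
}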

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev_part) {
        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}
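
/*
 * Illustrative note (not part of the original file): flags the driver claims
 * in bs->supported_write_flags are passed through to it; anything left over
 * is handled on the emulate_flags path.  For example, a driver without
 * native FUA support gets a plain write followed by a full bdrv_co_flush().
 */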

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov,
                               size_t qiov_offset)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector local_qiov;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!block_driver_can_compress(drv)) {
        return -ENOTSUP;
    }

    if (drv->bdrv_co_pwritev_compressed_part) {
        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
                                                    qiov, qiov_offset);
    }

    if (qiov_offset == 0) {
        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
    }

    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
        size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer = NULL;

    BlockDriver *drv = bs->drv;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;
    bool skip_write;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /*
     * Do not write anything when the BDS is inactive.  That is not
     * allowed, and it would not help.
     */
    skip_write = (bs->open_flags & BDRV_O_INACTIVE);

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    while (cluster_bytes) {
        int64_t pnum;

        if (skip_write) {
            ret = 1; /* "already allocated", so nothing will be copied */
            pnum = MIN(cluster_bytes, max_transfer);
        } else {
            ret = bdrv_is_allocated(bs, cluster_offset,
                                    MIN(cluster_bytes, max_transfer), &pnum);
            if (ret < 0) {
                /*
                 * Safe to treat errors in querying allocation as if
                 * unallocated; we'll probably fail again soon on the
                 * read, but at least that will set a decent errno.
                 */
                pnum = MIN(cluster_bytes, max_transfer);
            }

            /* Stop at EOF if the image ends in the middle of the cluster */
            if (ret == 0 && pnum == 0) {
                assert(progress >= bytes);
                break;
            }

            assert(skip_bytes < pnum);
        }

        if (ret <= 0) {
            QEMUIOVector local_qiov;

            /* Must copy-on-read; use the bounce buffer */
            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            if (!bounce_buffer) {
                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);

                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
                if (!bounce_buffer) {
                    ret = -ENOMEM;
                    goto err;
                }
            }
            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            if (!(flags & BDRV_REQ_PREFETCH)) {
                qemu_iovec_from_buf(qiov, qiov_offset + progress,
                                    bounce_buffer + skip_bytes,
                                    MIN(pnum - skip_bytes, bytes - progress));
            }
        } else if (!(flags & BDRV_REQ_PREFETCH)) {
            /* Read directly into the destination */
            ret = bdrv_driver_preadv(bs, offset + progress,
                                     MIN(pnum - skip_bytes, bytes - progress),
                                     qiov, qiov_offset + progress, 0);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
                                           qiov, qiov_offset, flags);
            goto out;
        } else if (flags & BDRV_REQ_PREFETCH) {
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, qiov,
                                     qiov_offset + bytes - bytes_remaining, 0);
            max_bytes -= num;
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
                                    0, bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}
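
/*
 * Worked example (illustrative, not part of the original file): with
 * bs->bl.max_transfer = 1 MiB, an aligned 3 MiB read that lies entirely
 * within the file is forwarded as three 1 MiB driver calls; any part of a
 * request beyond the rounded-up end of file is not read at all but zeroed
 * directly in the destination qiov.
 */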

/*
 * Request padding
 *
 *  |<---- align ----->|                     |<----- align ---->|
 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
 *  |          |       |                     |     |            |
 * -*----------$-------*-------- ... --------*-----$------------*---
 *  |          |       |                     |     |            |
 *  |          offset  |                     |     end          |
 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
 *  [buf   ... )                             [tail_buf          )
 *
 * @buf is an aligned allocation needed to store @head and @tail paddings. @head
 * is placed at the beginning of @buf and @tail at the end.
 *
 * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk
 * around tail, if tail exists.
 *
 * @merge_reads is true for small requests,
 * if @buf_len == @head + bytes + @tail. In this case it is possible that both
 * head and tail exist but @buf_len == align and @tail_buf == @buf.
 */
typedef struct BdrvRequestPadding {
    uint8_t *buf;
    size_t buf_len;
    uint8_t *tail_buf;
    size_t head;
    size_t tail;
    bool merge_reads;
    QEMUIOVector local_qiov;
} BdrvRequestPadding;

static bool bdrv_init_padding(BlockDriverState *bs,
                              int64_t offset, int64_t bytes,
                              BdrvRequestPadding *pad)
{
    uint64_t align = bs->bl.request_alignment;
    size_t sum;

    memset(pad, 0, sizeof(*pad));

    pad->head = offset & (align - 1);
    pad->tail = ((offset + bytes) & (align - 1));
    if (pad->tail) {
        pad->tail = align - pad->tail;
    }

    if (!pad->head && !pad->tail) {
        return false;
    }

    assert(bytes); /* Nothing good in aligning zero-length requests */

    sum = pad->head + bytes + pad->tail;
    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
    pad->buf = qemu_blockalign(bs, pad->buf_len);
    pad->merge_reads = sum == pad->buf_len;
    if (pad->tail) {
        pad->tail_buf = pad->buf + pad->buf_len - align;
    }

    return true;
}
1608  
1609  static int bdrv_padding_rmw_read(BdrvChild *child,
1610                                   BdrvTrackedRequest *req,
1611                                   BdrvRequestPadding *pad,
1612                                   bool zero_middle)
1613  {
1614      QEMUIOVector local_qiov;
1615      BlockDriverState *bs = child->bs;
1616      uint64_t align = bs->bl.request_alignment;
1617      int ret;
1618  
1619      assert(req->serialising && pad->buf);
1620  
1621      if (pad->head || pad->merge_reads) {
1622          uint64_t bytes = pad->merge_reads ? pad->buf_len : align;
1623  
1624          qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
1625  
1626          if (pad->head) {
1627              bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1628          }
1629          if (pad->merge_reads && pad->tail) {
1630              bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1631          }
1632          ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
1633                                    align, &local_qiov, 0, 0);
1634          if (ret < 0) {
1635              return ret;
1636          }
1637          if (pad->head) {
1638              bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1639          }
1640          if (pad->merge_reads && pad->tail) {
1641              bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1642          }
1643  
1644          if (pad->merge_reads) {
1645              goto zero_mem;
1646          }
1647      }
1648  
1649      if (pad->tail) {
1650          qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
1651  
1652          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1653          ret = bdrv_aligned_preadv(
1654                  child, req,
1655                  req->overlap_offset + req->overlap_bytes - align,
1656                  align, align, &local_qiov, 0, 0);
1657          if (ret < 0) {
1658              return ret;
1659          }
1660          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1661      }
1662  
1663  zero_mem:
1664      if (zero_middle) {
1665          memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
1666      }
1667  
1668      return 0;
1669  }
1670  
1671  static void bdrv_padding_destroy(BdrvRequestPadding *pad)
1672  {
1673      if (pad->buf) {
1674          qemu_vfree(pad->buf);
1675          qemu_iovec_destroy(&pad->local_qiov);
1676      }
1677  }
1678  
1679  /*
1680   * bdrv_pad_request
1681   *
 * Exchange the request parameters with the padded request, if padding is
 * needed.  This does not perform the RMW read of the padding;
 * bdrv_padding_rmw_read() should be called separately if needed.
 *
 * All parameters except @bs are in-out: they describe the original request on
 * entry and the padded request (if padding was needed) on return.
 *
 * This function always succeeds.
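 *
 * As a worked example (illustrative numbers only): with a request alignment
 * of 512, a request at offset 1000 for 3000 bytes becomes a padded request
 * at offset 512 for 3584 bytes (head == 488, tail == 96), with *qiov
 * pointing at pad->local_qiov and *qiov_offset reset to 0.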
1690   */
1691  static bool bdrv_pad_request(BlockDriverState *bs,
1692                               QEMUIOVector **qiov, size_t *qiov_offset,
1693                               int64_t *offset, unsigned int *bytes,
1694                               BdrvRequestPadding *pad)
1695  {
1696      if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
1697          return false;
1698      }
1699  
1700      qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
1701                               *qiov, *qiov_offset, *bytes,
1702                               pad->buf + pad->buf_len - pad->tail, pad->tail);
1703      *bytes += pad->head + pad->tail;
1704      *offset -= pad->head;
1705      *qiov = &pad->local_qiov;
1706      *qiov_offset = 0;
1707  
1708      return true;
1709  }
1710  
1711  int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1712      int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1713      BdrvRequestFlags flags)
1714  {
1715      return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
1716  }
1717  
1718  int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
1719      int64_t offset, unsigned int bytes,
1720      QEMUIOVector *qiov, size_t qiov_offset,
1721      BdrvRequestFlags flags)
1722  {
1723      BlockDriverState *bs = child->bs;
1724      BdrvTrackedRequest req;
1725      BdrvRequestPadding pad;
1726      int ret;
1727  
1728      trace_bdrv_co_preadv(bs, offset, bytes, flags);
1729  
1730      ret = bdrv_check_byte_request(bs, offset, bytes);
1731      if (ret < 0) {
1732          return ret;
1733      }
1734  
1735      if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
1736          /*
         * Aligning a zero-length request is nonsense.  Even if a driver gives
         * special meaning to zero-length requests (like
         * qcow2_co_pwritev_compressed_part), we can't pass such a request to
         * the driver because of request_alignment.
         *
         * Still, there is no reason to return an error if someone does an
         * unaligned zero-length read occasionally.
1743           */
1744          return 0;
1745      }
1746  
1747      bdrv_inc_in_flight(bs);
1748  
    /* Turn this read into a copy-on-read request if copy-on-read is enabled */
1750      if (atomic_read(&bs->copy_on_read)) {
1751          flags |= BDRV_REQ_COPY_ON_READ;
1752      }
1753  
1754      bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);
1755  
1756      tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1757      ret = bdrv_aligned_preadv(child, &req, offset, bytes,
1758                                bs->bl.request_alignment,
1759                                qiov, qiov_offset, flags);
1760      tracked_request_end(&req);
1761      bdrv_dec_in_flight(bs);
1762  
1763      bdrv_padding_destroy(&pad);
1764  
1765      return ret;
1766  }
1767  
1768  static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1769      int64_t offset, int bytes, BdrvRequestFlags flags)
1770  {
1771      BlockDriver *drv = bs->drv;
1772      QEMUIOVector qiov;
1773      void *buf = NULL;
1774      int ret = 0;
1775      bool need_flush = false;
1776      int head = 0;
1777      int tail = 0;
1778  
1779      int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1780      int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1781                          bs->bl.request_alignment);
1782      int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1783  
1784      if (!drv) {
1785          return -ENOMEDIUM;
1786      }
1787  
1788      if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1789          return -ENOTSUP;
1790      }
1791  
1792      assert(alignment % bs->bl.request_alignment == 0);
1793      head = offset % alignment;
1794      tail = (offset + bytes) % alignment;
1795      max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1796      assert(max_write_zeroes >= bs->bl.request_alignment);
1797  
1798      while (bytes > 0 && !ret) {
1799          int num = bytes;
1800  
        /* Align the request.  Block drivers can expect that the "bulk" of
         * the request is aligned, and that unaligned requests do not cross
         * cluster boundaries.
         */
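        /*
         * Worked example (illustrative numbers only, assuming max_transfer
         * and max_write_zeroes are large enough): with alignment == 64K,
         * offset == 10K and bytes == 200K, the loop issues a 54K request up
         * to the first 64K boundary, then a 128K aligned request, and
         * finally an 18K unaligned tail request.
         */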
1805          if (head) {
1806              /* Make a small request up to the first aligned sector. For
1807               * convenience, limit this request to max_transfer even if
1808               * we don't need to fall back to writes.  */
1809              num = MIN(MIN(bytes, max_transfer), alignment - head);
1810              head = (head + num) % alignment;
1811              assert(num < max_write_zeroes);
1812          } else if (tail && num > alignment) {
1813              /* Shorten the request to the last aligned sector.  */
1814              num -= tail;
1815          }
1816  
1817          /* limit request size */
1818          if (num > max_write_zeroes) {
1819              num = max_write_zeroes;
1820          }
1821  
1822          ret = -ENOTSUP;
1823          /* First try the efficient write zeroes operation */
1824          if (drv->bdrv_co_pwrite_zeroes) {
1825              ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1826                                               flags & bs->supported_zero_flags);
1827              if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1828                  !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1829                  need_flush = true;
1830              }
1831          } else {
1832              assert(!bs->supported_zero_flags);
1833          }
1834  
1835          if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
1836              /* Fall back to bounce buffer if write zeroes is unsupported */
1837              BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1838  
1839              if ((flags & BDRV_REQ_FUA) &&
1840                  !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwritev() to do a fallback
                 * flush on each chunk; use just one at the end */
1843                  write_flags &= ~BDRV_REQ_FUA;
1844                  need_flush = true;
1845              }
1846              num = MIN(num, max_transfer);
1847              if (buf == NULL) {
1848                  buf = qemu_try_blockalign0(bs, num);
1849                  if (buf == NULL) {
1850                      ret = -ENOMEM;
1851                      goto fail;
1852                  }
1853              }
1854              qemu_iovec_init_buf(&qiov, buf, num);
1855  
1856              ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
1857  
            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
1861              if (num < max_transfer) {
1862                  qemu_vfree(buf);
1863                  buf = NULL;
1864              }
1865          }
1866  
1867          offset += num;
1868          bytes -= num;
1869      }
1870  
1871  fail:
1872      if (ret == 0 && need_flush) {
1873          ret = bdrv_co_flush(bs);
1874      }
1875      qemu_vfree(buf);
1876      return ret;
1877  }
1878  
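/*
 * Common preparation for write-type requests (writes, discards, truncates):
 * check that the node is writable, serialise against or wait for overlapping
 * requests, and fire the before-write notifiers.
 */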
1879  static inline int coroutine_fn
1880  bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
1881                            BdrvTrackedRequest *req, int flags)
1882  {
1883      BlockDriverState *bs = child->bs;
1884      bool waited;
1885      int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1886  
1887      if (bs->read_only) {
1888          return -EPERM;
1889      }
1890  
1891      assert(!(bs->open_flags & BDRV_O_INACTIVE));
1892      assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1893      assert(!(flags & ~BDRV_REQ_MASK));
1894  
1895      if (flags & BDRV_REQ_SERIALISING) {
1896          waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
1897          /*
1898           * For a misaligned request we should have already waited earlier,
1899           * because we come after bdrv_padding_rmw_read which must be called
1900           * with the request already marked as serialising.
1901           */
1902          assert(!waited ||
1903                 (req->offset == req->overlap_offset &&
1904                  req->bytes == req->overlap_bytes));
1905      } else {
1906          bdrv_wait_serialising_requests(req);
1907      }
1908  
1909      assert(req->overlap_offset <= offset);
1910      assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1911      assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1912  
1913      switch (req->type) {
1914      case BDRV_TRACKED_WRITE:
1915      case BDRV_TRACKED_DISCARD:
1916          if (flags & BDRV_REQ_WRITE_UNCHANGED) {
1917              assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1918          } else {
1919              assert(child->perm & BLK_PERM_WRITE);
1920          }
1921          return notifier_with_return_list_notify(&bs->before_write_notifiers,
1922                                                  req);
1923      case BDRV_TRACKED_TRUNCATE:
1924          assert(child->perm & BLK_PERM_RESIZE);
1925          return 0;
1926      default:
1927          abort();
1928      }
1929  }
1930  
1931  static inline void coroutine_fn
1932  bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
1933                           BdrvTrackedRequest *req, int ret)
1934  {
1935      int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1936      BlockDriverState *bs = child->bs;
1937  
1938      atomic_inc(&bs->write_gen);
1939  
    /*
     * Discard cannot extend the image, but in error handling cases, such as
     * when reverting a qcow2 cluster allocation, the discarded range can
     * extend past the end of the image file, so we cannot assert about
     * BDRV_TRACKED_DISCARD here.  Instead, just skip it, since semantically
     * a discard request beyond EOF cannot expand the image anyway.
     */
1947      if (ret == 0 &&
1948          (req->type == BDRV_TRACKED_TRUNCATE ||
1949           end_sector > bs->total_sectors) &&
1950          req->type != BDRV_TRACKED_DISCARD) {
1951          bs->total_sectors = end_sector;
1952          bdrv_parent_cb_resize(bs);
1953          bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
1954      }
1955      if (req->bytes) {
1956          switch (req->type) {
1957          case BDRV_TRACKED_WRITE:
1958              stat64_max(&bs->wr_highest_offset, offset + bytes);
1959              /* fall through, to set dirty bits */
1960          case BDRV_TRACKED_DISCARD:
1961              bdrv_set_dirty(bs, offset, bytes);
1962              break;
1963          default:
1964              break;
1965          }
1966      }
1967  }
1968  
1969  /*
1970   * Forwards an already correctly aligned write request to the BlockDriver,
1971   * after possibly fragmenting it.
1972   */
1973  static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1974      BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1975      int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
1976  {
1977      BlockDriverState *bs = child->bs;
1978      BlockDriver *drv = bs->drv;
1979      int ret;
1980  
1981      uint64_t bytes_remaining = bytes;
1982      int max_transfer;
1983  
1984      if (!drv) {
1985          return -ENOMEDIUM;
1986      }
1987  
1988      if (bdrv_has_readonly_bitmaps(bs)) {
1989          return -EPERM;
1990      }
1991  
1992      assert(is_power_of_2(align));
1993      assert((offset & (align - 1)) == 0);
1994      assert((bytes & (align - 1)) == 0);
1995      assert(!qiov || qiov_offset + bytes <= qiov->size);
1996      max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1997                                     align);
1998  
1999      ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
2000  
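    /*
     * If the request was prepared successfully, zero detection is enabled,
     * the driver supports efficient zero writes and the payload is entirely
     * zero, convert this write into a zero write (unmapping when
     * detect-zeroes=unmap).
     */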
2001      if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
2002          !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
2003          qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
2004          flags |= BDRV_REQ_ZERO_WRITE;
2005          if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
2006              flags |= BDRV_REQ_MAY_UNMAP;
2007          }
2008      }
2009  
2010      if (ret < 0) {
2011          /* Do nothing, write notifier decided to fail this request */
2012      } else if (flags & BDRV_REQ_ZERO_WRITE) {
2013          bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
2014          ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
2015      } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
2016          ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
2017                                               qiov, qiov_offset);
2018      } else if (bytes <= max_transfer) {
2019          bdrv_debug_event(bs, BLKDBG_PWRITEV);
2020          ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
2021      } else {
2022          bdrv_debug_event(bs, BLKDBG_PWRITEV);
2023          while (bytes_remaining) {
2024              int num = MIN(bytes_remaining, max_transfer);
2025              int local_flags = flags;
2026  
2027              assert(num);
2028              if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
2029                  !(bs->supported_write_flags & BDRV_REQ_FUA)) {
2030                  /* If FUA is going to be emulated by flush, we only
2031                   * need to flush on the last iteration */
2032                  local_flags &= ~BDRV_REQ_FUA;
2033              }
2034  
2035              ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
2036                                        num, qiov,
2037                                        qiov_offset + bytes - bytes_remaining,
2038                                        local_flags);
2039              if (ret < 0) {
2040                  break;
2041              }
2042              bytes_remaining -= num;
2043          }
2044      }
2045      bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
2046  
2047      if (ret >= 0) {
2048          ret = 0;
2049      }
2050      bdrv_co_write_req_finish(child, offset, bytes, req, ret);
2051  
2052      return ret;
2053  }
2054  
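/*
 * Handle a zero write to a range that is not aligned to
 * bs->bl.request_alignment: pad the head and tail with a serialising RMW
 * cycle (zeroing the middle of the bounce buffer), write the padded head
 * and tail as data, and issue the aligned bulk of the request as an
 * efficient zero write.
 */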
2055  static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
2056                                                  int64_t offset,
2057                                                  unsigned int bytes,
2058                                                  BdrvRequestFlags flags,
2059                                                  BdrvTrackedRequest *req)
2060  {
2061      BlockDriverState *bs = child->bs;
2062      QEMUIOVector local_qiov;
2063      uint64_t align = bs->bl.request_alignment;
2064      int ret = 0;
2065      bool padding;
2066      BdrvRequestPadding pad;
2067  
2068      padding = bdrv_init_padding(bs, offset, bytes, &pad);
2069      if (padding) {
2070          bdrv_mark_request_serialising(req, align);
2071  
2072          bdrv_padding_rmw_read(child, req, &pad, true);
2073  
2074          if (pad.head || pad.merge_reads) {
2075              int64_t aligned_offset = offset & ~(align - 1);
2076              int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
2077  
2078              qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
2079              ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
2080                                         align, &local_qiov, 0,
2081                                         flags & ~BDRV_REQ_ZERO_WRITE);
2082              if (ret < 0 || pad.merge_reads) {
2083                  /* Error or all work is done */
2084                  goto out;
2085              }
2086              offset += write_bytes - pad.head;
2087              bytes -= write_bytes - pad.head;
2088          }
2089      }
2090  
2091      assert(!bytes || (offset & (align - 1)) == 0);
2092      if (bytes >= align) {
2093          /* Write the aligned part in the middle. */
2094          uint64_t aligned_bytes = bytes & ~(align - 1);
2095          ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
2096                                     NULL, 0, flags);
2097          if (ret < 0) {
2098              goto out;
2099          }
2100          bytes -= aligned_bytes;
2101          offset += aligned_bytes;
2102      }
2103  
2104      assert(!bytes || (offset & (align - 1)) == 0);
2105      if (bytes) {
2106          assert(align == pad.tail + bytes);
2107  
2108          qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
2109          ret = bdrv_aligned_pwritev(child, req, offset, align, align,
2110                                     &local_qiov, 0,
2111                                     flags & ~BDRV_REQ_ZERO_WRITE);
2112      }
2113  
2114  out:
2115      bdrv_padding_destroy(&pad);
2116  
2117      return ret;
2118  }
2119  
2120  /*
2121   * Handle a write request in coroutine context
2122   */
2123  int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
2124      int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2125      BdrvRequestFlags flags)
2126  {
2127      return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
2128  }
2129  
2130  int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
2131      int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
2132      BdrvRequestFlags flags)
2133  {
2134      BlockDriverState *bs = child->bs;
2135      BdrvTrackedRequest req;
2136      uint64_t align = bs->bl.request_alignment;
2137      BdrvRequestPadding pad;
2138      int ret;
2139  
2140      trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
2141  
2142      if (!bs->drv) {
2143          return -ENOMEDIUM;
2144      }
2145  
2146      ret = bdrv_check_byte_request(bs, offset, bytes);
2147      if (ret < 0) {
2148          return ret;
2149      }
2150  
2151      /* If the request is misaligned then we can't make it efficient */
2152      if ((flags & BDRV_REQ_NO_FALLBACK) &&
2153          !QEMU_IS_ALIGNED(offset | bytes, align))
2154      {
2155          return -ENOTSUP;
2156      }
2157  
2158      if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
2159          /*
         * Aligning a zero-length request is nonsense.  Even if a driver gives
         * special meaning to zero-length requests (like
         * qcow2_co_pwritev_compressed_part), we can't pass such a request to
         * the driver because of request_alignment.
         *
         * Still, there is no reason to return an error if someone does an
         * unaligned zero-length write occasionally.
2166           */
2167          return 0;
2168      }
2169  
2170      bdrv_inc_in_flight(bs);
2171      /*
2172       * Align write if necessary by performing a read-modify-write cycle.
2173       * Pad qiov with the read parts and be sure to have a tracked request not
2174       * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
2175       */
2176      tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
2177  
2178      if (flags & BDRV_REQ_ZERO_WRITE) {
2179          ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
2180          goto out;
2181      }
2182  
2183      if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
2184          bdrv_mark_request_serialising(&req, align);
2185          bdrv_padding_rmw_read(child, &req, &pad, false);
2186      }
2187  
2188      ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
2189                                 qiov, qiov_offset, flags);
2190  
2191      bdrv_padding_destroy(&pad);
2192  
2193  out:
2194      tracked_request_end(&req);
2195      bdrv_dec_in_flight(bs);
2196  
2197      return ret;
2198  }
2199  
2200  int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2201                                         int bytes, BdrvRequestFlags flags)
2202  {
2203      trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
2204  
2205      if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
2206          flags &= ~BDRV_REQ_MAY_UNMAP;
2207      }
2208  
2209      return bdrv_co_pwritev(child, offset, bytes, NULL,
2210                             BDRV_REQ_ZERO_WRITE | flags);
2211  }
2212  
2213  /*
 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
2215   */
2216  int bdrv_flush_all(void)
2217  {
2218      BdrvNextIterator it;
2219      BlockDriverState *bs = NULL;
2220      int result = 0;
2221  
    /*
     * The bdrv queue is managed by record/replay; creating a new flush
     * request for stopping the VM may break determinism.
     */
2227      if (replay_events_enabled()) {
2228          return result;
2229      }
2230  
2231      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
2232          AioContext *aio_context = bdrv_get_aio_context(bs);
2233          int ret;
2234  
2235          aio_context_acquire(aio_context);
2236          ret = bdrv_flush(bs);
2237          if (ret < 0 && !result) {
2238              result = ret;
2239          }
2240          aio_context_release(aio_context);
2241      }
2242  
2243      return result;
2244  }
2245  
2246  
2247  typedef struct BdrvCoBlockStatusData {
2248      BlockDriverState *bs;
2249      BlockDriverState *base;
2250      bool want_zero;
2251      int64_t offset;
2252      int64_t bytes;
2253      int64_t *pnum;
2254      int64_t *map;
2255      BlockDriverState **file;
2256  } BdrvCoBlockStatusData;
2257  
2258  int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
2259                                                  bool want_zero,
2260                                                  int64_t offset,
2261                                                  int64_t bytes,
2262                                                  int64_t *pnum,
2263                                                  int64_t *map,
2264                                                  BlockDriverState **file)
2265  {
2266      assert(bs->file && bs->file->bs);
2267      *pnum = bytes;
2268      *map = offset;
2269      *file = bs->file->bs;
2270      return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2271  }
2272  
2273  int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
2274                                                     bool want_zero,
2275                                                     int64_t offset,
2276                                                     int64_t bytes,
2277                                                     int64_t *pnum,
2278                                                     int64_t *map,
2279                                                     BlockDriverState **file)
2280  {
2281      assert(bs->backing && bs->backing->bs);
2282      *pnum = bytes;
2283      *map = offset;
2284      *file = bs->backing->bs;
2285      return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2286  }
2287  
2288  /*
 * Returns the allocation status of the specified byte range.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their bytes are reported as allocated.
2292   *
2293   * If 'want_zero' is true, the caller is querying for mapping
2294   * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2295   * _ZERO where possible; otherwise, the result favors larger 'pnum',
2296   * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2297   *
2298   * If 'offset' is beyond the end of the disk image the return value is
2299   * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2300   *
2301   * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2302   * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2303   * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2304   *
2305   * 'pnum' is set to the number of bytes (including and immediately
2306   * following the specified offset) that are easily known to be in the
2307   * same allocated/unallocated state.  Note that a second call starting
2308   * at the original offset plus returned pnum may have the same status.
2309   * The returned value is non-zero on success except at end-of-file.
2310   *
2311   * Returns negative errno on failure.  Otherwise, if the
2312   * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2313   * set to the host mapping and BDS corresponding to the guest offset.
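 *
 * For example, a protocol driver that does not implement
 * bdrv_co_block_status reports BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED |
 * BDRV_BLOCK_OFFSET_VALID with 'map' == 'offset' and 'file' == bs for any
 * offset within the image (see the fallback path below).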
2314   */
2315  static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2316                                               bool want_zero,
2317                                               int64_t offset, int64_t bytes,
2318                                               int64_t *pnum, int64_t *map,
2319                                               BlockDriverState **file)
2320  {
2321      int64_t total_size;
2322      int64_t n; /* bytes */
2323      int ret;
2324      int64_t local_map = 0;
2325      BlockDriverState *local_file = NULL;
2326      int64_t aligned_offset, aligned_bytes;
2327      uint32_t align;
2328  
2329      assert(pnum);
2330      *pnum = 0;
2331      total_size = bdrv_getlength(bs);
2332      if (total_size < 0) {
2333          ret = total_size;
2334          goto early_out;
2335      }
2336  
2337      if (offset >= total_size) {
2338          ret = BDRV_BLOCK_EOF;
2339          goto early_out;
2340      }
2341      if (!bytes) {
2342          ret = 0;
2343          goto early_out;
2344      }
2345  
2346      n = total_size - offset;
2347      if (n < bytes) {
2348          bytes = n;
2349      }
2350  
2351      /* Must be non-NULL or bdrv_getlength() would have failed */
2352      assert(bs->drv);
2353      if (!bs->drv->bdrv_co_block_status) {
2354          *pnum = bytes;
2355          ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2356          if (offset + bytes == total_size) {
2357              ret |= BDRV_BLOCK_EOF;
2358          }
2359          if (bs->drv->protocol_name) {
2360              ret |= BDRV_BLOCK_OFFSET_VALID;
2361              local_map = offset;
2362              local_file = bs;
2363          }
2364          goto early_out;
2365      }
2366  
2367      bdrv_inc_in_flight(bs);
2368  
2369      /* Round out to request_alignment boundaries */
2370      align = bs->bl.request_alignment;
2371      aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2372      aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2373  
2374      ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2375                                          aligned_bytes, pnum, &local_map,
2376                                          &local_file);
2377      if (ret < 0) {
2378          *pnum = 0;
2379          goto out;
2380      }
2381  
2382      /*
2383       * The driver's result must be a non-zero multiple of request_alignment.
2384       * Clamp pnum and adjust map to original request.
2385       */
2386      assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2387             align > offset - aligned_offset);
2388      if (ret & BDRV_BLOCK_RECURSE) {
2389          assert(ret & BDRV_BLOCK_DATA);
2390          assert(ret & BDRV_BLOCK_OFFSET_VALID);
2391          assert(!(ret & BDRV_BLOCK_ZERO));
2392      }
2393  
2394      *pnum -= offset - aligned_offset;
2395      if (*pnum > bytes) {
2396          *pnum = bytes;
2397      }
2398      if (ret & BDRV_BLOCK_OFFSET_VALID) {
2399          local_map += offset - aligned_offset;
2400      }
2401  
2402      if (ret & BDRV_BLOCK_RAW) {
2403          assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2404          ret = bdrv_co_block_status(local_file, want_zero, local_map,
2405                                     *pnum, pnum, &local_map, &local_file);
2406          goto out;
2407      }
2408  
2409      if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2410          ret |= BDRV_BLOCK_ALLOCATED;
2411      } else if (want_zero && bs->drv->supports_backing) {
2412          if (bs->backing) {
2413              BlockDriverState *bs2 = bs->backing->bs;
2414              int64_t size2 = bdrv_getlength(bs2);
2415  
2416              if (size2 >= 0 && offset >= size2) {
2417                  ret |= BDRV_BLOCK_ZERO;
2418              }
2419          } else {
2420              ret |= BDRV_BLOCK_ZERO;
2421          }
2422      }
2423  
2424      if (want_zero && ret & BDRV_BLOCK_RECURSE &&
2425          local_file && local_file != bs &&
2426          (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2427          (ret & BDRV_BLOCK_OFFSET_VALID)) {
2428          int64_t file_pnum;
2429          int ret2;
2430  
2431          ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
2432                                      *pnum, &file_pnum, NULL, NULL);
2433          if (ret2 >= 0) {
            /* Ignore errors.  This just provides extra information; it
             * is useful but not necessary.
             */
2437              if (ret2 & BDRV_BLOCK_EOF &&
2438                  (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2439                  /*
2440                   * It is valid for the format block driver to read
2441                   * beyond the end of the underlying file's current
2442                   * size; such areas read as zero.
2443                   */
2444                  ret |= BDRV_BLOCK_ZERO;
2445              } else {
2446                  /* Limit request to the range reported by the protocol driver */
2447                  *pnum = file_pnum;
2448                  ret |= (ret2 & BDRV_BLOCK_ZERO);
2449              }
2450          }
2451      }
2452  
2453  out:
2454      bdrv_dec_in_flight(bs);
2455      if (ret >= 0 && offset + *pnum == total_size) {
2456          ret |= BDRV_BLOCK_EOF;
2457      }
2458  early_out:
2459      if (file) {
2460          *file = local_file;
2461      }
2462      if (map) {
2463          *map = local_map;
2464      }
2465      return ret;
2466  }
2467  
2468  static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2469                                                     BlockDriverState *base,
2470                                                     bool want_zero,
2471                                                     int64_t offset,
2472                                                     int64_t bytes,
2473                                                     int64_t *pnum,
2474                                                     int64_t *map,
2475                                                     BlockDriverState **file)
2476  {
2477      BlockDriverState *p;
2478      int ret = 0;
2479      bool first = true;
2480  
2481      assert(bs != base);
2482      for (p = bs; p != base; p = backing_bs(p)) {
2483          ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2484                                     file);
2485          if (ret < 0) {
2486              break;
2487          }
2488          if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2489              /*
2490               * Reading beyond the end of the file continues to read
2491               * zeroes, but we can only widen the result to the
2492               * unallocated length we learned from an earlier
2493               * iteration.
2494               */
2495              *pnum = bytes;
2496          }
2497          if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2498              break;
2499          }
        /* [offset, offset + *pnum) is unallocated on this layer, which
         * could be only the first part of [offset, offset + bytes).  */
2502          bytes = MIN(bytes, *pnum);
2503          first = false;
2504      }
2505      return ret;
2506  }
2507  
2508  /* Coroutine wrapper for bdrv_block_status_above() */
2509  static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2510  {
2511      BdrvCoBlockStatusData *data = opaque;
2512  
2513      return bdrv_co_block_status_above(data->bs, data->base,
2514                                        data->want_zero,
2515                                        data->offset, data->bytes,
2516                                        data->pnum, data->map, data->file);
2517  }
2518  
2519  /*
2520   * Synchronous wrapper around bdrv_co_block_status_above().
2521   *
2522   * See bdrv_co_block_status_above() for details.
2523   */
2524  static int bdrv_common_block_status_above(BlockDriverState *bs,
2525                                            BlockDriverState *base,
2526                                            bool want_zero, int64_t offset,
2527                                            int64_t bytes, int64_t *pnum,
2528                                            int64_t *map,
2529                                            BlockDriverState **file)
2530  {
2531      BdrvCoBlockStatusData data = {
2532          .bs = bs,
2533          .base = base,
2534          .want_zero = want_zero,
2535          .offset = offset,
2536          .bytes = bytes,
2537          .pnum = pnum,
2538          .map = map,
2539          .file = file,
2540      };
2541  
2542      return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data);
2543  }
2544  
2545  int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2546                              int64_t offset, int64_t bytes, int64_t *pnum,
2547                              int64_t *map, BlockDriverState **file)
2548  {
2549      return bdrv_common_block_status_above(bs, base, true, offset, bytes,
2550                                            pnum, map, file);
2551  }
2552  
2553  int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2554                        int64_t *pnum, int64_t *map, BlockDriverState **file)
2555  {
2556      return bdrv_block_status_above(bs, backing_bs(bs),
2557                                     offset, bytes, pnum, map, file);
2558  }
2559  
2560  int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2561                                     int64_t bytes, int64_t *pnum)
2562  {
2563      int ret;
2564      int64_t dummy;
2565  
2566      ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
2567                                           bytes, pnum ? pnum : &dummy, NULL,
2568                                           NULL);
2569      if (ret < 0) {
2570          return ret;
2571      }
2572      return !!(ret & BDRV_BLOCK_ALLOCATED);
2573  }
2574  
2575  /*
2576   * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2577   *
2578   * Return 1 if (a prefix of) the given range is allocated in any image
2579   * between BASE and TOP (BASE is only included if include_base is set).
2580   * BASE can be NULL to check if the given offset is allocated in any
2581   * image of the chain.  Return 0 otherwise, or negative errno on
2582   * failure.
2583   *
2584   * 'pnum' is set to the number of bytes (including and immediately
2585   * following the specified offset) that are known to be in the same
2586   * allocated/unallocated state.  Note that a subsequent call starting
2587   * at 'offset + *pnum' may return the same allocation status (in other
2588   * words, the result is not necessarily the maximum possible range);
2589   * but 'pnum' will only be 0 when end of file is reached.
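 *
 * For example, in the chain [BASE] -> [INTER1] -> [TOP], a range allocated
 * only in INTER1 makes bdrv_is_allocated_above(TOP, BASE, ...) return 1,
 * while bdrv_is_allocated(TOP, ...) alone returns 0.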
2590   *
2591   */
2592  int bdrv_is_allocated_above(BlockDriverState *top,
2593                              BlockDriverState *base,
2594                              bool include_base, int64_t offset,
2595                              int64_t bytes, int64_t *pnum)
2596  {
2597      BlockDriverState *intermediate;
2598      int ret;
2599      int64_t n = bytes;
2600  
2601      assert(base || !include_base);
2602  
2603      intermediate = top;
2604      while (include_base || intermediate != base) {
2605          int64_t pnum_inter;
2606          int64_t size_inter;
2607  
2608          assert(intermediate);
2609          ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2610          if (ret < 0) {
2611              return ret;
2612          }
2613          if (ret) {
2614              *pnum = pnum_inter;
2615              return 1;
2616          }
2617  
2618          size_inter = bdrv_getlength(intermediate);
2619          if (size_inter < 0) {
2620              return size_inter;
2621          }
2622          if (n > pnum_inter &&
2623              (intermediate == top || offset + pnum_inter < size_inter)) {
2624              n = pnum_inter;
2625          }
2626  
2627          if (intermediate == base) {
2628              break;
2629          }
2630  
2631          intermediate = backing_bs(intermediate);
2632      }
2633  
2634      *pnum = n;
2635      return 0;
2636  }
2637  
2638  typedef struct BdrvVmstateCo {
2639      BlockDriverState    *bs;
2640      QEMUIOVector        *qiov;
2641      int64_t             pos;
2642      bool                is_read;
2643  } BdrvVmstateCo;
2644  
2645  static int coroutine_fn
2646  bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2647                     bool is_read)
2648  {
2649      BlockDriver *drv = bs->drv;
2650      int ret = -ENOTSUP;
2651  
2652      bdrv_inc_in_flight(bs);
2653  
2654      if (!drv) {
2655          ret = -ENOMEDIUM;
2656      } else if (drv->bdrv_load_vmstate) {
2657          if (is_read) {
2658              ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2659          } else {
2660              ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2661          }
2662      } else if (bs->file) {
2663          ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2664      }
2665  
2666      bdrv_dec_in_flight(bs);
2667      return ret;
2668  }
2669  
2670  static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2671  {
2672      BdrvVmstateCo *co = opaque;
2673  
2674      return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2675  }
2676  
2677  static inline int
2678  bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2679                  bool is_read)
2680  {
2681      BdrvVmstateCo data = {
2682          .bs         = bs,
2683          .qiov       = qiov,
2684          .pos        = pos,
2685          .is_read    = is_read,
2686      };
2687  
2688      return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data);
2689  }
2690  
2691  int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2692                        int64_t pos, int size)
2693  {
2694      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2695      int ret;
2696  
2697      ret = bdrv_writev_vmstate(bs, &qiov, pos);
2698      if (ret < 0) {
2699          return ret;
2700      }
2701  
2702      return size;
2703  }
2704  
2705  int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2706  {
2707      return bdrv_rw_vmstate(bs, qiov, pos, false);
2708  }
2709  
2710  int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2711                        int64_t pos, int size)
2712  {
2713      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2714      int ret;
2715  
2716      ret = bdrv_readv_vmstate(bs, &qiov, pos);
2717      if (ret < 0) {
2718          return ret;
2719      }
2720  
2721      return size;
2722  }
2723  
2724  int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2725  {
2726      return bdrv_rw_vmstate(bs, qiov, pos, true);
2727  }
2728  
2729  /**************************************************************/
2730  /* async I/Os */
2731  
2732  void bdrv_aio_cancel(BlockAIOCB *acb)
2733  {
2734      qemu_aio_ref(acb);
2735      bdrv_aio_cancel_async(acb);
2736      while (acb->refcnt > 1) {
2737          if (acb->aiocb_info->get_aio_context) {
2738              aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2739          } else if (acb->bs) {
2740              /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2741               * assert that we're not using an I/O thread.  Thread-safe
2742               * code should use bdrv_aio_cancel_async exclusively.
2743               */
2744              assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2745              aio_poll(bdrv_get_aio_context(acb->bs), true);
2746          } else {
2747              abort();
2748          }
2749      }
2750      qemu_aio_unref(acb);
2751  }
2752  
/* Async version of aio cancel.  The caller is not blocked if the acb
 * implements cancel_async; otherwise we do nothing and let the request
 * complete normally.  In either case the completion callback must be called. */
2756  void bdrv_aio_cancel_async(BlockAIOCB *acb)
2757  {
2758      if (acb->aiocb_info->cancel_async) {
2759          acb->aiocb_info->cancel_async(acb);
2760      }
2761  }
2762  
2763  /**************************************************************/
2764  /* Coroutine block device emulation */
2765  
2766  static int coroutine_fn bdrv_flush_co_entry(void *opaque)
2767  {
2768      return bdrv_co_flush(opaque);
2769  }
2770  
2771  int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2772  {
2773      int current_gen;
2774      int ret = 0;
2775  
2776      bdrv_inc_in_flight(bs);
2777  
2778      if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2779          bdrv_is_sg(bs)) {
2780          goto early_exit;
2781      }
2782  
2783      qemu_co_mutex_lock(&bs->reqs_lock);
2784      current_gen = atomic_read(&bs->write_gen);
2785  
2786      /* Wait until any previous flushes are completed */
2787      while (bs->active_flush_req) {
2788          qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2789      }
2790  
2791      /* Flushes reach this point in nondecreasing current_gen order.  */
2792      bs->active_flush_req = true;
2793      qemu_co_mutex_unlock(&bs->reqs_lock);
2794  
2795      /* Write back all layers by calling one driver function */
2796      if (bs->drv->bdrv_co_flush) {
2797          ret = bs->drv->bdrv_co_flush(bs);
2798          goto out;
2799      }
2800  
2801      /* Write back cached data to the OS even with cache=unsafe */
2802      BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2803      if (bs->drv->bdrv_co_flush_to_os) {
2804          ret = bs->drv->bdrv_co_flush_to_os(bs);
2805          if (ret < 0) {
2806              goto out;
2807          }
2808      }
2809  
2810      /* But don't actually force it to the disk with cache=unsafe */
2811      if (bs->open_flags & BDRV_O_NO_FLUSH) {
2812          goto flush_parent;
2813      }
2814  
2815      /* Check if we really need to flush anything */
2816      if (bs->flushed_gen == current_gen) {
2817          goto flush_parent;
2818      }
2819  
2820      BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2821      if (!bs->drv) {
        /* bs->drv->bdrv_co_flush_to_os() might have ejected the BDS
         * (even in case of apparent success) */
2824          ret = -ENOMEDIUM;
2825          goto out;
2826      }
2827      if (bs->drv->bdrv_co_flush_to_disk) {
2828          ret = bs->drv->bdrv_co_flush_to_disk(bs);
2829      } else if (bs->drv->bdrv_aio_flush) {
2830          BlockAIOCB *acb;
2831          CoroutineIOCompletion co = {
2832              .coroutine = qemu_coroutine_self(),
2833          };
2834  
2835          acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2836          if (acb == NULL) {
2837              ret = -EIO;
2838          } else {
2839              qemu_coroutine_yield();
2840              ret = co.ret;
2841          }
2842      } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work
         * because that would break guests even if the server operates in
         * writethrough mode.
         *
         * Let's hope the user knows what they're doing.
         */
2854          ret = 0;
2855      }
2856  
2857      if (ret < 0) {
2858          goto out;
2859      }
2860  
2861      /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2862       * in the case of cache=unsafe, so there are no useless flushes.
2863       */
2864  flush_parent:
2865      ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2866  out:
2867      /* Notify any pending flushes that we have completed */
2868      if (ret == 0) {
2869          bs->flushed_gen = current_gen;
2870      }
2871  
2872      qemu_co_mutex_lock(&bs->reqs_lock);
2873      bs->active_flush_req = false;
2874      /* Return value is ignored - it's ok if wait queue is empty */
2875      qemu_co_queue_next(&bs->flush_queue);
2876      qemu_co_mutex_unlock(&bs->reqs_lock);
2877  
2878  early_exit:
2879      bdrv_dec_in_flight(bs);
2880      return ret;
2881  }
2882  
2883  int bdrv_flush(BlockDriverState *bs)
2884  {
2885      return bdrv_run_co(bs, bdrv_flush_co_entry, bs);
2886  }
2887  
2888  typedef struct DiscardCo {
2889      BdrvChild *child;
2890      int64_t offset;
2891      int64_t bytes;
2892  } DiscardCo;
2893  
2894  static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2895  {
2896      DiscardCo *rwco = opaque;
2897  
2898      return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
2899  }
2900  
2901  int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
2902                                    int64_t bytes)
2903  {
2904      BdrvTrackedRequest req;
2905      int max_pdiscard, ret;
2906      int head, tail, align;
2907      BlockDriverState *bs = child->bs;
2908  
2909      if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
2910          return -ENOMEDIUM;
2911      }
2912  
2913      if (bdrv_has_readonly_bitmaps(bs)) {
2914          return -EPERM;
2915      }
2916  
2917      if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) {
2918          return -EIO;
2919      }
2920  
2921      /* Do nothing if disabled.  */
2922      if (!(bs->open_flags & BDRV_O_UNMAP)) {
2923          return 0;
2924      }
2925  
2926      if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2927          return 0;
2928      }
2929  
    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * rounding here.  Still, most devices reject unaligned requests
     * (by returning -ENOTSUP), so we must fragment the request
     * accordingly.  */
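    /*
     * Worked example (illustrative numbers only, assuming request_alignment
     * divides each piece and max_pdiscard is large enough): with
     * align == 64K, a discard at offset 10K for 200K bytes is fragmented
     * into a 54K head, a 128K aligned middle and an 18K tail.
     */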
2935      align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2936      assert(align % bs->bl.request_alignment == 0);
2937      head = offset % align;
2938      tail = (offset + bytes) % align;
2939  
2940      bdrv_inc_in_flight(bs);
2941      tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2942  
2943      ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
2944      if (ret < 0) {
2945          goto out;
2946      }
2947  
2948      max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2949                                     align);
2950      assert(max_pdiscard >= bs->bl.request_alignment);
2951  
2952      while (bytes > 0) {
2953          int64_t num = bytes;
2954  
2955          if (head) {
2956              /* Make small requests to get to alignment boundaries. */
2957              num = MIN(bytes, align - head);
2958              if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2959                  num %= bs->bl.request_alignment;
2960              }
2961              head = (head + num) % align;
2962              assert(num < max_pdiscard);
2963          } else if (tail) {
2964              if (num > align) {
2965                  /* Shorten the request to the last aligned cluster.  */
2966                  num -= tail;
2967              } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2968                         tail > bs->bl.request_alignment) {
2969                  tail %= bs->bl.request_alignment;
2970                  num -= tail;
2971              }
2972          }
2973          /* limit request size */
2974          if (num > max_pdiscard) {
2975              num = max_pdiscard;
2976          }
2977  
2978          if (!bs->drv) {
2979              ret = -ENOMEDIUM;
2980              goto out;
2981          }
2982          if (bs->drv->bdrv_co_pdiscard) {
2983              ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2984          } else {
2985              BlockAIOCB *acb;
2986              CoroutineIOCompletion co = {
2987                  .coroutine = qemu_coroutine_self(),
2988              };
2989  
2990              acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2991                                               bdrv_co_io_em_complete, &co);
2992              if (acb == NULL) {
2993                  ret = -EIO;
2994                  goto out;
2995              } else {
2996                  qemu_coroutine_yield();
2997                  ret = co.ret;
2998              }
2999          }
3000          if (ret && ret != -ENOTSUP) {
3001              goto out;
3002          }
3003  
3004          offset += num;
3005          bytes -= num;
3006      }
3007      ret = 0;
3008  out:
3009      bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
3010      tracked_request_end(&req);
3011      bdrv_dec_in_flight(bs);
3012      return ret;
3013  }
3014  
3015  int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
3016  {
3017      DiscardCo rwco = {
3018          .child = child,
3019          .offset = offset,
3020          .bytes = bytes,
3021      };
3022  
3023      return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco);
3024  }
3025  
3026  int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
3027  {
3028      BlockDriver *drv = bs->drv;
3029      CoroutineIOCompletion co = {
3030          .coroutine = qemu_coroutine_self(),
3031      };
3032      BlockAIOCB *acb;
3033  
3034      bdrv_inc_in_flight(bs);
3035      if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
3036          co.ret = -ENOTSUP;
3037          goto out;
3038      }
3039  
3040      if (drv->bdrv_co_ioctl) {
3041          co.ret = drv->bdrv_co_ioctl(bs, req, buf);
3042      } else {
3043          acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
3044          if (!acb) {
3045              co.ret = -ENOTSUP;
3046              goto out;
3047          }
3048          qemu_coroutine_yield();
3049      }
3050  out:
3051      bdrv_dec_in_flight(bs);
3052      return co.ret;
3053  }
3054  
3055  void *qemu_blockalign(BlockDriverState *bs, size_t size)
3056  {
3057      return qemu_memalign(bdrv_opt_mem_align(bs), size);
3058  }
3059  
3060  void *qemu_blockalign0(BlockDriverState *bs, size_t size)
3061  {
3062      return memset(qemu_blockalign(bs, size), 0, size);
3063  }
3064  
3065  void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
3066  {
3067      size_t align = bdrv_opt_mem_align(bs);
3068  
3069      /* Ensure that NULL is never returned on success */
3070      assert(align > 0);
3071      if (size == 0) {
3072          size = align;
3073      }
3074  
3075      return qemu_try_memalign(align, size);
3076  }
3077  
3078  void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
3079  {
3080      void *mem = qemu_try_blockalign(bs, size);
3081  
3082      if (mem) {
3083          memset(mem, 0, size);
3084      }
3085  
3086      return mem;
3087  }
3088  
3089  /*
 * Check if all memory in this vector meets the BDS's minimum memory alignment.
3091   */
3092  bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
3093  {
3094      int i;
3095      size_t alignment = bdrv_min_mem_align(bs);
3096  
3097      for (i = 0; i < qiov->niov; i++) {
3098          if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
3099              return false;
3100          }
3101          if (qiov->iov[i].iov_len % alignment) {
3102              return false;
3103          }
3104      }
3105  
3106      return true;
3107  }
3108  
3109  void bdrv_add_before_write_notifier(BlockDriverState *bs,
3110                                      NotifierWithReturn *notifier)
3111  {
3112      notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
3113  }
3114  
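/*
 * Recursively plug all children first, then this node; the driver callback
 * is invoked only when the plug counter transitions from 0 to 1.
 * bdrv_io_unplug() reverses the order: the node is unplugged before its
 * children.
 */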
3115  void bdrv_io_plug(BlockDriverState *bs)
3116  {
3117      BdrvChild *child;
3118  
3119      QLIST_FOREACH(child, &bs->children, next) {
3120          bdrv_io_plug(child->bs);
3121      }
3122  
3123      if (atomic_fetch_inc(&bs->io_plugged) == 0) {
3124          BlockDriver *drv = bs->drv;
3125          if (drv && drv->bdrv_io_plug) {
3126              drv->bdrv_io_plug(bs);
3127          }
3128      }
3129  }
3130  
3131  void bdrv_io_unplug(BlockDriverState *bs)
3132  {
3133      BdrvChild *child;
3134  
3135      assert(bs->io_plugged);
3136      if (atomic_fetch_dec(&bs->io_plugged) == 1) {
3137          BlockDriver *drv = bs->drv;
3138          if (drv && drv->bdrv_io_unplug) {
3139              drv->bdrv_io_unplug(bs);
3140          }
3141      }
3142  
3143      QLIST_FOREACH(child, &bs->children, next) {
3144          bdrv_io_unplug(child->bs);
3145      }
3146  }
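
/*
 * Illustrative pairing in a hypothetical caller submitting a batch of
 * requests (submit_one_request() is a made-up helper):
 *
 *     bdrv_io_plug(bs);
 *     for (i = 0; i < n; i++) {
 *         submit_one_request(bs, &reqs[i]);
 *     }
 *     bdrv_io_unplug(bs);    <-- the driver may flush the batch here
 */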
3147  
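/*
 * Pre-register (or unregister) a host buffer with every driver in the
 * subtree that supports it, e.g. so that a driver can map the memory for
 * DMA ahead of time.  Both operations are best-effort and recurse into all
 * children.
 */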
3148  void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
3149  {
3150      BdrvChild *child;
3151  
3152      if (bs->drv && bs->drv->bdrv_register_buf) {
3153          bs->drv->bdrv_register_buf(bs, host, size);
3154      }
3155      QLIST_FOREACH(child, &bs->children, next) {
3156          bdrv_register_buf(child->bs, host, size);
3157      }
3158  }
3159  
3160  void bdrv_unregister_buf(BlockDriverState *bs, void *host)
3161  {
3162      BdrvChild *child;
3163  
3164      if (bs->drv && bs->drv->bdrv_unregister_buf) {
3165          bs->drv->bdrv_unregister_buf(bs, host);
3166      }
3167      QLIST_FOREACH(child, &bs->children, next) {
3168          bdrv_unregister_buf(child->bs, host);
3169      }
3170  }
3171  
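/*
 * Common implementation of bdrv_co_copy_range_from/to().  It validates both
 * endpoints, turns BDRV_REQ_ZERO_WRITE into bdrv_co_pwrite_zeroes() on the
 * destination, and tracks the request on the source side (@recurse_src is
 * true) or on the destination side (@recurse_src is false).
 */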
3172  static int coroutine_fn bdrv_co_copy_range_internal(
3173          BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
3174          uint64_t dst_offset, uint64_t bytes,
3175          BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
3176          bool recurse_src)
3177  {
3178      BdrvTrackedRequest req;
3179      int ret;
3180  
3181      /* TODO We can support BDRV_REQ_NO_FALLBACK here */
3182      assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
3183      assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
3184  
3185      if (!dst || !dst->bs) {
3186          return -ENOMEDIUM;
3187      }
3188      ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
3189      if (ret) {
3190          return ret;
3191      }
3192      if (write_flags & BDRV_REQ_ZERO_WRITE) {
3193          return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3194      }
3195  
3196      if (!src || !src->bs) {
3197          return -ENOMEDIUM;
3198      }
3199      ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
3200      if (ret) {
3201          return ret;
3202      }
3203  
3204      if (!src->bs->drv->bdrv_co_copy_range_from
3205          || !dst->bs->drv->bdrv_co_copy_range_to
3206          || src->bs->encrypted || dst->bs->encrypted) {
3207          return -ENOTSUP;
3208      }
3209  
3210      if (recurse_src) {
3211          bdrv_inc_in_flight(src->bs);
3212          tracked_request_begin(&req, src->bs, src_offset, bytes,
3213                                BDRV_TRACKED_READ);
3214  
3215          /* BDRV_REQ_SERIALISING is only for write operations */
3216          assert(!(read_flags & BDRV_REQ_SERIALISING));
3217          bdrv_wait_serialising_requests(&req);
3218  
3219          ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3220                                                      src, src_offset,
3221                                                      dst, dst_offset,
3222                                                      bytes,
3223                                                      read_flags, write_flags);
3224  
3225          tracked_request_end(&req);
3226          bdrv_dec_in_flight(src->bs);
3227      } else {
3228          bdrv_inc_in_flight(dst->bs);
3229          tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3230                                BDRV_TRACKED_WRITE);
3231          ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
3232                                          write_flags);
3233          if (!ret) {
3234              ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3235                                                        src, src_offset,
3236                                                        dst, dst_offset,
3237                                                        bytes,
3238                                                        read_flags, write_flags);
3239          }
3240          bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3241          tracked_request_end(&req);
3242          bdrv_dec_in_flight(dst->bs);
3243      }
3244  
3245      return ret;
3246  }
3247  
3248  /* Copy range from @src to @dst.
3249   *
3250   * See the comment on bdrv_co_copy_range() for the parameter and return
3251   * value semantics. */
3252  int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3253                                           BdrvChild *dst, uint64_t dst_offset,
3254                                           uint64_t bytes,
3255                                           BdrvRequestFlags read_flags,
3256                                           BdrvRequestFlags write_flags)
3257  {
3258      trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3259                                    read_flags, write_flags);
3260      return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3261                                         bytes, read_flags, write_flags, true);
3262  }
3263  
3264  /* Copy range from @src to @dst.
3265   *
3266   * See the comment on bdrv_co_copy_range() for the parameter and return
3267   * value semantics. */
3268  int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3269                                         BdrvChild *dst, uint64_t dst_offset,
3270                                         uint64_t bytes,
3271                                         BdrvRequestFlags read_flags,
3272                                         BdrvRequestFlags write_flags)
3273  {
3274      trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3275                                  read_flags, write_flags);
3276      return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3277                                         bytes, read_flags, write_flags, false);
3278  }
3279  
3280  int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3281                                      BdrvChild *dst, uint64_t dst_offset,
3282                                      uint64_t bytes, BdrvRequestFlags read_flags,
3283                                      BdrvRequestFlags write_flags)
3284  {
3285      return bdrv_co_copy_range_from(src, src_offset,
3286                                     dst, dst_offset,
3287                                     bytes, read_flags, write_flags);
3288  }
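
/*
 * Illustrative offloaded copy in a hypothetical coroutine caller (e.g. a
 * block job), falling back to ordinary I/O when offloading is unsupported:
 *
 *     ret = bdrv_co_copy_range(src, src_off, dst, dst_off, bytes, 0, 0);
 *     if (ret == -ENOTSUP) {
 *         ... read into a bounce buffer and write it out instead ...
 *     }
 */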
3289  
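/* Notify all parents of @bs that it has been resized. */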
3290  static void bdrv_parent_cb_resize(BlockDriverState *bs)
3291  {
3292      BdrvChild *c;
3293      QLIST_FOREACH(c, &bs->parents, next_parent) {
3294          if (c->klass->resize) {
3295              c->klass->resize(c);
3296          }
3297      }
3298  }
3299  
3300  /**
3301   * Truncate file to 'offset' bytes (needed only for file protocols)
3302   *
3303   * If 'exact' is true, the file must be resized to exactly the given
3304   * 'offset'.  Otherwise, it is sufficient for the node to be at least
3305   * 'offset' bytes in length.
3306   */
3307  int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
3308                                    PreallocMode prealloc, BdrvRequestFlags flags,
3309                                    Error **errp)
3310  {
3311      BlockDriverState *bs = child->bs;
3312      BlockDriver *drv = bs->drv;
3313      BdrvTrackedRequest req;
3314      int64_t old_size, new_bytes;
3315      int ret;
3316
3318      /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
3319      if (!drv) {
3320          error_setg(errp, "No medium inserted");
3321          return -ENOMEDIUM;
3322      }
3323      if (offset < 0) {
3324          error_setg(errp, "Image size cannot be negative");
3325          return -EINVAL;
3326      }
3327  
3328      old_size = bdrv_getlength(bs);
3329      if (old_size < 0) {
3330          error_setg_errno(errp, -old_size, "Failed to get old image size");
3331          return old_size;
3332      }
3333  
3334      if (offset > old_size) {
3335          new_bytes = offset - old_size;
3336      } else {
3337          new_bytes = 0;
3338      }
3339  
3340      bdrv_inc_in_flight(bs);
3341      tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3342                            BDRV_TRACKED_TRUNCATE);
3343  
3344      /* If we are growing the image and potentially using preallocation for the
3345       * new area, we need to make sure that no write requests are made to it
3346       * concurrently or they might be overwritten by preallocation. */
3347      if (new_bytes) {
3348          bdrv_mark_request_serialising(&req, 1);
3349      }
3350      if (bs->read_only) {
3351          error_setg(errp, "Image is read-only");
3352          ret = -EACCES;
3353          goto out;
3354      }
3355      ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3356                                      0);
3357      if (ret < 0) {
3358          error_setg_errno(errp, -ret,
3359                           "Failed to prepare request for truncation");
3360          goto out;
3361      }
3362  
3363      /*
3364       * If the image has a backing file that is large enough that it would
3365       * provide data for the new area, we cannot leave it unallocated because
3366       * then the backing file content would become visible. Instead, zero-fill
3367       * the new area.
3368       *
3369   * Note that if the image has a backing file but was opened without it,
3370   * keeping things consistent with that backing file is the user's
3371   * responsibility.
3372       */
3373      if (new_bytes && bs->backing) {
3374          int64_t backing_len;
3375  
3376          backing_len = bdrv_getlength(backing_bs(bs));
3377          if (backing_len < 0) {
3378              ret = backing_len;
3379              error_setg_errno(errp, -ret, "Could not get backing file size");
3380              goto out;
3381          }
3382  
3383          if (backing_len > old_size) {
3384              flags |= BDRV_REQ_ZERO_WRITE;
3385          }
3386      }
3387  
3388      if (drv->bdrv_co_truncate) {
3389          if (flags & ~bs->supported_truncate_flags) {
3390              error_setg(errp, "Block driver does not support requested flags");
3391              ret = -ENOTSUP;
3392              goto out;
3393          }
3394          ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
3395      } else if (bs->file && drv->is_filter) {
3396          ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
3397      } else {
3398          error_setg(errp, "Image format driver does not support resize");
3399          ret = -ENOTSUP;
3400          goto out;
3401      }
3402      if (ret < 0) {
3403          goto out;
3404      }
3405  
3406      ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3407      if (ret < 0) {
3408          error_setg_errno(errp, -ret, "Could not refresh total sector count");
3409      } else {
3410          offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3411      }
3412      /* Truncation may have succeeded even if refresh_total_sectors failed;
3413       * the latter doesn't affect how we should finish the request.  Pass 0
3414       * as the last parameter so that dirty bitmaps etc. are handled. */
3415      bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3416  
3417  out:
3418      tracked_request_end(&req);
3419      bdrv_dec_in_flight(bs);
3420  
3421      return ret;
3422  }
3423  
3424  typedef struct TruncateCo {
3425      BdrvChild *child;
3426      int64_t offset;
3427      bool exact;
3428      PreallocMode prealloc;
3429      BdrvRequestFlags flags;
3430      Error **errp;
3431  } TruncateCo;
3432  
3433  static int coroutine_fn bdrv_truncate_co_entry(void *opaque)
3434  {
3435      TruncateCo *tco = opaque;
3436  
3437      return bdrv_co_truncate(tco->child, tco->offset, tco->exact,
3438                              tco->prealloc, tco->flags, tco->errp);
3439  }
3440  
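/*
 * Synchronous wrapper around bdrv_co_truncate(), following the same
 * run-a-coroutine pattern as bdrv_pdiscard() above.
 */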
3441  int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
3442                    PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
3443  {
3444      TruncateCo tco = {
3445          .child      = child,
3446          .offset     = offset,
3447          .exact      = exact,
3448          .prealloc   = prealloc,
3449          .flags      = flags,
3450          .errp       = errp,
3451      };
3452  
3453      return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco);
3454  }
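
/*
 * Illustrative synchronous resize from a hypothetical caller: grow the node
 * to new_size bytes without preallocation, letting the driver round up if
 * needed:
 *
 *     ret = bdrv_truncate(child, new_size, false, PREALLOC_MODE_OFF, 0, errp);
 */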
3455